# File size: 4,480 Bytes | 0d1e8bd
import os
import csv
import re
def find_last_occurrence(lines, pattern):
    """
    Return capture group 1 of the last line in ``lines`` that matches ``pattern``.

    The lines are scanned from the end towards the beginning. Each line is
    stripped of surrounding whitespace and then matched with ``match``
    semantics, i.e. the pattern is anchored at the start of the stripped line.

    Args:
        lines: Sequence of text lines to search (must support ``reversed``).
        pattern: Regular expression (string or compiled pattern) that
            contains at least one capture group.

    Returns:
        The text captured by group 1 on the last matching line, or an empty
        string when no line matches or group 1 did not take part in the match.
    """
    # Compile once up front instead of resolving the pattern for every line.
    regex = re.compile(pattern)
    for line in reversed(lines):
        match = regex.match(line.strip())
        if match:
            # group(1) is None when an optional group did not participate;
            # normalise so callers can always treat the result as a string.
            return match.group(1) or ""
    return ""  # Nothing matched
def _key_line_pattern(key):
    """Build the regex for one ``"key": "value"`` line, tolerating underscores in the key."""
    # Any run of underscores may sit between two consecutive key letters, so
    # "coverprompt", "cover_prompt" and "c_o_v_e_r__prompt" are all accepted.
    spaced_key = "_*".join(re.escape(ch) for ch in key)
    # (?i) has to be the very first token of the expression: a global inline
    # flag placed after "^" is deprecated and raises re.error on Python 3.11+.
    return r'(?i)^\s*"' + spaced_key + r'"\s*:\s*"(.*)",?\s*$'


def extract_prompts_from_file(file_path):
    """
    Extract the last "title", "cover_prompt" and "video_prompt" values from a text file.

    The file is read as UTF-8 and searched from the end for lines of the form
    ``"key": "value"`` (an optional trailing comma is allowed).
    - Key matching is case insensitive.
    - Keys may contain any number of underscores between their letters.

    If the extracted title equals
    "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall."
    all three values are replaced by empty strings.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        A ``(title, cover_prompt, video_prompt)`` tuple of strings; a value
        is an empty string when its key was not found.
    """
    title_pattern = _key_line_pattern("title")
    cover_pattern = _key_line_pattern("coverprompt")
    video_pattern = _key_line_pattern("videoprompt")

    # Read the text
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Search for the three items from the end
    title = find_last_occurrence(lines, title_pattern)
    cover_prompt = find_last_occurrence(lines, cover_pattern)
    video_prompt = find_last_occurrence(lines, video_pattern)

    # If the found title is the specified text, set all three to empty
    if title.strip() == "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall.":
        return "", "", ""

    return title, cover_prompt, video_prompt
def process_txt_files(input_folder, output_csv):
    """
    Collect the prompts of every .txt file in a folder into one CSV file.

    1. Find all .txt files (extension compared case-insensitively) directly
       inside input_folder.
    2. For each file, search backwards for the last occurrence of
       "title": "...", "cover_prompt": "...", "video_prompt": "..."
    3. Output CSV: user prompt, title, cover prompt, video prompt

    Args:
        input_folder: Folder containing the .txt files.
        output_csv: Path of the CSV file to write; its parent folder is
            created when missing and an existing file is overwritten.
    """
    # List the inputs before touching the output so that a missing input
    # folder fails without leaving a header-only CSV behind. os.listdir
    # returns entries in arbitrary order; sort for reproducible output.
    txt_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith(".txt")
    )

    out_dir = os.path.dirname(output_csv)
    if out_dir:
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(out_dir, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["user prompt", "title", "cover prompt", "video prompt"])

        for filename in txt_files:
            full_path = os.path.join(input_folder, filename)

            # Extract text corresponding to JSON keys
            title, cover_prompt, video_prompt = extract_prompts_from_file(full_path)

            # User prompt is the filename without the extension
            user_prompt = os.path.splitext(filename)[0]

            writer.writerow([user_prompt, title, cover_prompt, video_prompt])
if __name__ == "__main__":
    # Script entry point: convert every (type, rag) combination listed below.
    #
    # Input folders used by earlier runs, kept for reference:
    #   creation_outputs_ai_concrete_rag_50_testset
    #   baseline_concrete_outputs_2
    #   creation_outputs_ai_concrete_rag_50_tags_4_testset
    #   creation_rag_cot_prompt_ai_abstract_rag_50_testset_deepseek
    #   creation_rag_cot_prompt_ai_concrete_rag_50_testset_deepseekr1
    #   baseline_concrete_outputs_deepseekr1
    #
    # Output CSV paths used by earlier runs, kept for reference:
    #   output_prompt_baseline/prompt_baseline_abstract_2.csv
    #   output_prompt_rag_more
    #   output_prompt_baseline/prompt_baseline_concrete_gpt4o.csv
    #   output_prompt_rag_more/prompt_ai_concrete_rag_50_testset_deepseekr1.csv
    prompt_types = ["concrete"]  # "concrete",
    rag_sizes = [50]  # 0,20,40,60,80,120,140

    for prompt_type in prompt_types:
        for rag_size in rag_sizes:
            # 1) Folder holding the .txt files for this combination
            source_dir = f"creation_rag_cot_prompt_ai_{prompt_type}_rag_{rag_size}_tags_1_testset"
            # 2) CSV file that receives the extracted prompts
            target_csv = f"output_prompt_rag_more/prompt_ai_{prompt_type}_rag_{rag_size}_tags_1_testset.csv"

            process_txt_files(source_dir, target_csv)
            print("Processing complete! Results written to:", target_csv)
|