|
|
|
|
|
import os |
|
|
import argparse |
|
|
import hashlib |
|
|
import json |
|
|
from openai import OpenAI |
|
|
from tqdm import tqdm |
|
|
|
|
|
from src.open_storyline.utils.prompts import get_prompt |
|
|
from src.open_storyline.utils.parse_json import parse_json_dict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# DeepSeek API key taken from the environment; empty string when the
# DEEPSEEK_API_KEY variable is not set.
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")

# Client built with the OpenAI SDK but pointed at the DeepSeek base URL.
# Stays None when no API key is configured, so importing this module never
# requires credentials.
client = (
    OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com/v1")
    if API_KEY
    else None
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def file_md5(path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash in 4096-byte
    pieces, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
|
|
def label_template(path: str, system_prompt: str) -> dict:
    """Ask the LLM to label one text template and return the parsed reply.

    The file at ``path`` is read as UTF-8 and sent as the user message, with
    ``system_prompt`` as the system message, in a single non-streaming
    ``deepseek-chat`` chat completion. The text of the first choice is then
    handed to ``parse_json_dict``.

    Args:
        path: Path of the template file to label.
        system_prompt: Instruction text sent as the system message.

    Returns:
        Whatever ``parse_json_dict`` produces for the model reply
        (annotated as a dict; the parser itself is defined elsewhere).

    Raises:
        RuntimeError: If the module-level ``client`` is not set.
    """
    if not client:
        raise RuntimeError("API client not initialized")

    with open(path, "r", encoding="utf-8") as template_file:
        template_text = template_file.read()

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": template_text},
    ]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        stream=False,
    )

    reply_text = completion.choices[0].message.content
    return parse_json_dict(reply_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Label every ``.txt`` template under ``--input_dir`` and write ``meta.json``.

    Command-line options:
        --input_dir: Folder that is walked recursively for ``.txt`` templates.
        --output_json: Path of the JSON file to read the cache from and to
            write the refreshed metadata to.

    Each template is identified by the MD5 of its content. Templates whose
    hash already appears in the existing output file are reused without
    calling the LLM; all others are labeled via ``label_template``. Files
    that fail to label (or cannot be labeled because no API client is
    configured) are left out of the output. The output file is always
    rewritten with the entries collected in this run.

    Raises:
        ValueError: If the existing output file does not contain a JSON list.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        type=str,
        default="resource/script_templates",
        help="Folder containing .txt style templates",
    )
    parser.add_argument(
        "--output_json",
        type=str,
        default="resource/script_templates/meta.json",
        help="Output meta.json path",
    )
    args = parser.parse_args()

    input_dir = args.input_dir
    output_json = args.output_json

    # Load the previous run's results so unchanged templates are not sent to
    # the LLM again.
    if os.path.exists(output_json):
        with open(output_json, "r", encoding="utf-8") as f:
            meta_data = json.load(f)
    else:
        meta_data = []

    if not isinstance(meta_data, list):
        raise ValueError(
            f"{output_json} must contain a JSON list, "
            f"got {type(meta_data).__name__}"
        )

    # Index cached entries by content hash. Entries that are not dicts or
    # have no "id" can never match a file, so they are skipped instead of
    # aborting the run with a KeyError/TypeError.
    md5_map = {
        item["id"]: item
        for item in meta_data
        if isinstance(item, dict) and "id" in item
    }

    system_prompt = get_prompt("scripts.script_template_label", lang="zh")

    files = []
    for root, _, filenames in os.walk(input_dir):
        for name in filenames:
            if name.lower().endswith(".txt"):
                files.append(os.path.join(root, name))
    # os.walk order depends on the filesystem; sort so the output file is
    # stable between runs.
    files.sort()

    updated_meta = []
    needs_processing = False

    # Stored paths are expressed relative to the directory two levels above
    # input_dir.
    resource_root = os.path.abspath(os.path.join(input_dir, "../.."))

    for file_path in tqdm(files, desc="Labeling templates", unit="file"):
        md5 = file_md5(file_path)
        rel_path = os.path.relpath(file_path, start=resource_root).replace("\\", "/")

        if md5 in md5_map:
            # Reuse the cached labels but refresh the path: the same content
            # may have been moved or renamed, or may exist at several paths.
            # Copy so that duplicate-content files get independent entries.
            cached = dict(md5_map[md5])
            cached["path"] = rel_path
            updated_meta.append(cached)
            continue

        needs_processing = True
        tqdm.write(f"Processing {rel_path} ...")

        if not client:
            continue

        try:
            res = label_template(file_path, system_prompt)
        except Exception as e:
            # Best effort: one failing template must not abort the batch.
            tqdm.write(f"⚠️ Failed on {rel_path}: {e}")
            continue

        if not isinstance(res, dict):
            # Item assignment below needs a dict; report instead of crashing.
            tqdm.write(
                f"⚠️ Failed on {rel_path}: "
                f"expected a JSON object, got {type(res).__name__}"
            )
            continue

        res["id"] = md5
        res["path"] = rel_path
        updated_meta.append(res)
        # Remember the result so another file with identical content in this
        # run does not trigger a second LLM call.
        md5_map[md5] = res

    if not client and needs_processing:
        print("⚠️ Warning: API key missing, new/changed templates were not labeled.")

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(updated_meta, f, ensure_ascii=False, indent=2)

    print(f"✅ Done! meta.json saved to {output_json}")
|
|
|
|
|
|
|
|
# Run the labeling CLI only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()