DouDou commited on
Commit
0b75f52
·
verified ·
1 Parent(s): 96eab9a

Upload data3/main.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/main.py +256 -0
data3/main.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import csv
4
+ import shutil
5
+ import threading
6
+ import logging
7
+ import signal
8
+ import sys
9
+ from pathlib import Path
10
+ from datetime import datetime
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+
13
+ # ================= 配置区域 =================
14
+ # OpenAI API Key
15
+ OPENAI_API_KEY = "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"
16
+
17
+ # 源文件夹路径
18
+ SOURCE_REPOS_DIR = Path("/home/weifengsun/tangou1/domain_code/src/workdir/repos_raw").resolve()
19
+
20
+ # 基础输出路径
21
+ BASE_OUTPUT_DIR = Path("~/chemrepo").expanduser().resolve()
22
+
23
+ # 全局失败日志路径
24
+ GLOBAL_ERROR_LOG = BASE_OUTPUT_DIR / "failures.log"
25
+
26
+ # CSV 记录路径
27
+ CSV_FILE = BASE_OUTPUT_DIR / "run.csv"
28
+
29
+ # 并发数量
30
+ MAX_WORKERS = 256
31
+ # ===========================================
32
+
33
+ # 设置环境变量
34
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
35
+
36
+ # 确保基础输出目录存在
37
+ BASE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
38
+
39
+ # --- 全局锁与状态追踪 ---
40
+ # 用于写入 failures.log 的锁
41
+ error_log_lock = threading.Lock()
42
+ # 用于追踪当前正在处理的项目(用于中断时清理)
43
+ active_projects = set()
44
+ active_projects_lock = threading.Lock()
45
+
46
+ def add_active_project(name):
47
+ with active_projects_lock:
48
+ active_projects.add(name)
49
+
50
+ def remove_active_project(name):
51
+ with active_projects_lock:
52
+ active_projects.discard(name)
53
+
54
+ def log_failure_globally(project_name, content, extra_info=""):
55
+ """将失败信息写入全局日志"""
56
+ with error_log_lock:
57
+ with open(GLOBAL_ERROR_LOG, "a", encoding="utf-8") as g_log:
58
+ g_log.write(f"\n{'='*40}\n")
59
+ g_log.write(f"PROJECT: {project_name}\n")
60
+ g_log.write(f"TIME: {datetime.now()}\n")
61
+ g_log.write(f"STATUS: Failed/Interrupted\n")
62
+ g_log.write(f"{'='*40}\n")
63
+ g_log.write(content)
64
+ if extra_info:
65
+ g_log.write(f"\n[Details]: {extra_info}\n")
66
+ g_log.write(f"\n{'='*40}\n")
67
+
68
+ def cleanup_project_folder(project_name):
69
+ """删除项目输出文件夹"""
70
+ project_out_dir = BASE_OUTPUT_DIR / project_name
71
+ if project_out_dir.exists():
72
+ try:
73
+ shutil.rmtree(project_out_dir)
74
+ print(f"🗑️ Deleted failed/interrupted directory: {project_out_dir}")
75
+ except OSError as e:
76
+ print(f"⚠️ Failed to delete directory {project_out_dir}: {e}")
77
+
78
+ def process_single_project(project_path):
79
+ """
80
+ 处理单个项目文件夹的任务函数
81
+ """
82
+ project_name = project_path.name
83
+ start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
84
+
85
+ # 定义输出路径
86
+ project_out_dir = BASE_OUTPUT_DIR / project_name
87
+ hp_dir = project_out_dir / "hp"
88
+ mdp_dir = project_out_dir / "mdp"
89
+ local_log_file = project_out_dir / "process.log"
90
+
91
+ # --- 1. 检查输出文件夹是否存在 (断点续传) ---
92
+ # 如果 hp 和 mdp 存在,且 mdp 不为空,才算跳过;如果为空,重新跑一遍可能也没意义,
93
+ # 但根据逻辑这里只要文件夹在就跳过。如果你想重试空项目,可以把这里改一下。
94
+ if hp_dir.exists() and mdp_dir.exists():
95
+ return {
96
+ "project": project_name,
97
+ "status": "Skipped",
98
+ "start_time": start_time,
99
+ "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
100
+ }
101
+
102
+ # 标记为活跃项目
103
+ add_active_project(project_name)
104
+
105
+ # 创建项目主目录
106
+ project_out_dir.mkdir(parents=True, exist_ok=True)
107
+
108
+ status = "Failed"
109
+ python_error = None
110
+
111
+ # 使用 try-finally 确保即使线程崩溃也能从 active 列表移除
112
+ try:
113
+ with open(local_log_file, "w", encoding="utf-8") as log_f:
114
+ try:
115
+ log_f.write(f"[{datetime.now()}] Processing project: {project_name}\n")
116
+
117
+ # --- 2. 确保 .gitignore 存在 ---
118
+ gitignore_path = project_path / ".gitignore"
119
+ if not gitignore_path.exists():
120
+ gitignore_path.touch()
121
+ log_f.write(f"[{datetime.now()}] Created .gitignore file.\n")
122
+
123
+ # --- 3. 构建命令 ---
124
+ cmd = [
125
+ "repoagent", "run",
126
+ "-m", "gpt-5.1-2025-11-13",
127
+ "-r", "1",
128
+ "-tp", str(project_path.absolute()),
129
+ "--print-hierarchy",
130
+ "-hp", str(hp_dir),
131
+ "-mdp", str(mdp_dir)
132
+ ]
133
+
134
+ log_f.write(f"[{datetime.now()}] Command: {' '.join(cmd)}\n")
135
+ log_f.write(f"[{datetime.now()}] Starting RepoAgent...\n")
136
+ log_f.flush()
137
+
138
+ # --- 4. 执行命令 ---
139
+ subprocess.run(cmd, stdout=log_f, stderr=subprocess.STDOUT, check=True)
140
+
141
+ # --- 5. 检查是否生成了文档 (新增逻辑) ---
142
+ has_docs = False
143
+ if mdp_dir.exists():
144
+ # 检查目录下是否有任何文件
145
+ if any(mdp_dir.iterdir()):
146
+ has_docs = True
147
+
148
+ if has_docs:
149
+ status = "Success"
150
+ log_f.write(f"\n[{datetime.now()}] Completed successfully.\n")
151
+ else:
152
+ status = "EmptyProject"
153
+ log_f.write(f"\n[{datetime.now()}] Finished, but mdp folder is EMPTY. Marked as EmptyProject.\n")
154
+
155
+ except Exception as e:
156
+ status = "Failed"
157
+ python_error = str(e)
158
+ try: log_f.write(f"\n[{datetime.now()}] ERROR: {python_error}\n")
159
+ except: pass
160
+ print(f"❌ Error processing {project_name}: {python_error}")
161
+
162
+ # --- 6. 失败处理逻辑 ---
163
+ if status == "Failed":
164
+ # 读取日志内容
165
+ failed_log_content = ""
166
+ if local_log_file.exists():
167
+ try:
168
+ with open(local_log_file, "r", encoding="utf-8", errors='ignore') as f:
169
+ failed_log_content = f.read()
170
+ except: failed_log_content = "Read Error"
171
+
172
+ # 写入全局日志
173
+ log_failure_globally(project_name, failed_log_content, python_error)
174
+
175
+ # 删除文件夹
176
+ cleanup_project_folder(project_name)
177
+
178
+ except Exception:
179
+ # 兜底捕获
180
+ pass
181
+ finally:
182
+ remove_active_project(project_name)
183
+
184
+ return {
185
+ "project": project_name,
186
+ "status": status,
187
+ "start_time": start_time,
188
+ "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
189
+ }
190
+
191
+ def main():
192
+ if not SOURCE_REPOS_DIR.exists():
193
+ print(f"Error: Source directory {SOURCE_REPOS_DIR} does not exist.")
194
+ return
195
+
196
+ # CSV 头部
197
+ csv_headers = ["project", "status", "start_time", "end_time"]
198
+
199
+ # 初始化 CSV
200
+ file_exists = CSV_FILE.exists()
201
+ with open(CSV_FILE, mode='a', newline='', encoding='utf-8') as f:
202
+ writer = csv.DictWriter(f, fieldnames=csv_headers)
203
+ if not file_exists:
204
+ writer.writeheader()
205
+
206
+ # --- 1. 获取项目并按首字母排序 ---
207
+ projects = sorted([p for p in SOURCE_REPOS_DIR.iterdir() if p.is_dir()], key=lambda x: x.name)
208
+
209
+ print(f"Found {len(projects)} projects (Sorted A-Z).\nOutput Dir: {BASE_OUTPUT_DIR}")
210
+ print(f"Failures Log: {GLOBAL_ERROR_LOG}")
211
+ print(f"Starting concurrent processing with {MAX_WORKERS} workers...\n")
212
+ print(f"💡 Press Ctrl+C to stop. Interrupted projects will be cleaned up automatically.\n")
213
+
214
+ executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
215
+
216
+ try:
217
+ future_to_project = {executor.submit(process_single_project, p): p for p in projects}
218
+
219
+ with open(CSV_FILE, mode='a', newline='', encoding='utf-8') as f:
220
+ writer = csv.DictWriter(f, fieldnames=csv_headers)
221
+
222
+ for future in as_completed(future_to_project):
223
+ result = future.result()
224
+ writer.writerow(result)
225
+ f.flush()
226
+
227
+ # 控制台输出增加 EmptyProject 的显示
228
+ if result["status"] == "Success":
229
+ print(f"✅ {result['project']} Finished.")
230
+ elif result["status"] == "EmptyProject":
231
+ print(f"⚠️ {result['project']} Finished (Empty - No Docs Generated).")
232
+ elif result["status"] == "Skipped":
233
+ print(f"⏭️ {result['project']} Skipped.")
234
+ else:
235
+ print(f"❌ {result['project']} Failed.")
236
+
237
+ except KeyboardInterrupt:
238
+ print("\n\n🛑 KeyboardInterrupt detected! Stopping workers...")
239
+
240
+ executor.shutdown(wait=False, cancel_futures=True)
241
+
242
+ print("🧹 Cleaning up active incomplete projects...")
243
+ with active_projects_lock:
244
+ projects_to_clean = list(active_projects)
245
+
246
+ for proj_name in projects_to_clean:
247
+ log_failure_globally(proj_name, "Process terminated by User (KeyboardInterrupt).")
248
+ cleanup_project_folder(proj_name)
249
+
250
+ print("Done. Exiting.")
251
+ sys.exit(0)
252
+
253
+ print(f"\nAll tasks completed. \nCSV: {CSV_FILE}")
254
+
255
+ if __name__ == "__main__":
256
+ main()