DouDou committed on
Commit
657cb02
·
verified ·
1 Parent(s): 2116622

Upload data2/instruction_generation/summarize_repo_readme.py with huggingface_hub

Browse files
data2/instruction_generation/summarize_repo_readme.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
README/Markdown Summarization Script

Scans README files in each repository, uses an LLM to generate summaries,
and writes them back to README_SUMMARY.md in each repository directory.
"""

import os
import sys
import json
import asyncio
import argparse
from pathlib import Path
from typing import List, Dict, Optional
from tqdm import tqdm
import hashlib
from dotenv import load_dotenv

# Load .env file (must happen before importing modules that may read env vars)
env_file = Path(__file__).parent / ".env"
if env_file.exists():
    load_dotenv(env_file)
elif (Path(__file__).parent.parent / ".env").exists():
    # If not in current directory, try loading from project root
    load_dotenv(Path(__file__).parent.parent / ".env")

# Add current directory to path (for importing schemas)
sys.path.insert(0, str(Path(__file__).parent))
# Add domain_code/src to path for reusing util functions
sys.path.insert(0, str(Path(__file__).parent.parent / "domain_code" / "src"))
from util import call_llm, init_logger, logger
from schemas import READMESummary


# Default output filename (written back to repository directory)
SUMMARY_FILENAME = "README_SUMMARY.md"
38
def find_readme_files(repo_dir: Path) -> List[Path]:
    """
    Find README files in the repository.

    Matches any Markdown file whose name starts with "readme",
    case-insensitively — so "README.md", "readme.markdown" and also
    mixed-case names like "Readme.md" (which the previous glob-based
    patterns missed) are all found. Searches the repository root and
    the docs/ subdirectory.

    Args:
        repo_dir: Repository root directory

    Returns:
        Sorted, de-duplicated list of README file paths
    """
    markdown_suffixes = {".md", ".markdown"}

    # Directories to scan: repository root, plus docs/ when it exists.
    search_dirs = [repo_dir]
    docs_dir = repo_dir / "docs"
    if docs_dir.is_dir():
        search_dirs.append(docs_dir)

    readme_files = set()
    for directory in search_dirs:
        for entry in directory.iterdir():
            if (
                entry.is_file()
                and entry.name.lower().startswith("readme")
                and entry.suffix.lower() in markdown_suffixes
            ):
                readme_files.add(entry)

    # Set construction already de-duplicates (also on case-insensitive
    # filesystems, where the old double glob could yield duplicates).
    return sorted(readme_files)
67
+
68
+
69
def read_readme_content(readme_files: List[Path]) -> str:
    """
    Read and merge all README file contents.

    Each readable, non-empty file becomes a section headed by its
    filename; sections are joined with a horizontal-rule separator.
    Unreadable files are logged and skipped.

    Args:
        readme_files: List of README file paths

    Returns:
        Merged README content (empty string if nothing was readable)
    """
    sections = []
    for path in readme_files:
        try:
            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                text = fh.read().strip()
        except Exception as exc:
            logger.warning(f"Unable to read file {path}: {exc}")
            continue
        if text:
            sections.append(f"## File: {path.name}\n\n{text}")

    return "\n\n---\n\n".join(sections)
90
+
91
+
92
async def summarize_readme(
    readme_content: str,
    base_url: str,
    model: str,
    api_key: str,
    log_file: str,
) -> Optional[Dict]:
    """
    Use the LLM to summarize README content.

    Args:
        readme_content: README file content
        base_url: LLM API base URL
        model: Model name
        api_key: API key
        log_file: Log file path

    Returns:
        README summary (dict), or None on any failure (missing prompt
        template, failed LLM call, or unparseable response)
    """
    # Load the prompt template shipped alongside this script.
    template_path = Path(__file__).parent / "prompts" / "readme_summary.txt"
    try:
        with open(template_path, "r", encoding="utf-8") as fh:
            template = fh.read()
    except Exception as e:
        logger.error(f"Unable to read prompt template: {e}")
        return None

    # Fill the template and issue a single-turn chat request.
    messages = [{"role": "user", "content": template.format(readme_content=readme_content)}]

    try:
        result = await call_llm(
            messages=messages,
            model=model,
            base_url=base_url,
            api_key=api_key,
            pydantic_object=READMESummary,
            log_file=log_file,
        )

        if result is None:
            logger.warning("LLM call returned None, skipping")
            return None

        # Some backends hand back the structured object as a raw JSON string.
        if isinstance(result, str):
            try:
                result = json.loads(result)
            except json.JSONDecodeError:
                logger.warning(f"Unable to parse JSON from LLM response: {result[:200]}")
                return None

        return result
    except Exception as e:
        logger.error(f"LLM call failed: {e}")
        return None
153
+
154
+
155
def write_summary_file(
    repo_dir: Path,
    summary: Dict,
    readme_files: List[Path],
) -> Path:
    """
    Write the summary to README_SUMMARY.md in the repository directory.

    Args:
        repo_dir: Repository root directory
        summary: README summary (dict)
        readme_files: Original README file list
            (NOTE(review): currently unused by this function; kept for
            interface stability)

    Returns:
        Output file path

    Raises:
        Exception: re-raised after logging if the file cannot be written
    """
    output_file = repo_dir / SUMMARY_FILENAME

    # Assemble the simplified Markdown document: title plus the
    # essential summary sections that are present.
    parts = ["# Project Summary\n\n"]
    for key, heading in (
        ("project_overview", "Project Overview"),
        ("main_features", "Main Features"),
    ):
        if key in summary:
            parts.append(f"## {heading}\n\n{summary[key]}\n\n")

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("".join(parts))
        logger.info(f"Summary file written: {output_file}")
        return output_file
    except Exception as e:
        logger.error(f"Unable to write summary file {output_file}: {e}")
        raise
193
+
194
+
195
async def process_single_repo(
    repo_dir: Path,
    base_url: str,
    model: str,
    api_key: str,
    log_file: str,
    overwrite: bool = False,
) -> Dict[str, any]:
    """
    Process README summarization for a single repository.

    Args:
        repo_dir: Repository root directory
        base_url: LLM API base URL
        model: Model name
        api_key: API key
        log_file: Log file path
        overwrite: Whether to overwrite existing summary file

    Returns:
        Processing result dictionary with at least "repo" and "status"
    """
    repo_name = repo_dir.name
    summary_file = repo_dir / SUMMARY_FILENAME

    def _result(status: str, **extra) -> Dict[str, any]:
        # Build a uniform result record for this repository.
        return {"repo": repo_name, "status": status, **extra}

    # Skip repositories that were already summarized (unless overwriting).
    if summary_file.exists() and not overwrite:
        return _result("skipped", reason="Summary file already exists")

    # Locate README files.
    readme_files = find_readme_files(repo_dir)
    if not readme_files:
        return _result("no_readme", reason="README file not found")

    # Merge README content; bail out when nothing readable remains.
    readme_content = read_readme_content(readme_files)
    if not readme_content:
        return _result("empty_readme", reason="README file is empty")

    # Ask the LLM for a summary.
    summary = await summarize_readme(
        readme_content=readme_content,
        base_url=base_url,
        model=model,
        api_key=api_key,
        log_file=log_file,
    )
    if summary is None:
        return _result("llm_failed", reason="LLM call failed")

    # Persist the summary next to the repository's READMEs.
    try:
        write_summary_file(repo_dir, summary, readme_files)
    except Exception as e:
        return _result("write_failed", reason=str(e))

    return _result(
        "success",
        summary_file=str(summary_file),
        readme_count=len(readme_files),
    )
277
+
278
+
279
async def process_all_repos(
    repos_dir: Path,
    base_url: str,
    model: str,
    api_key: str,
    log_file: str,
    max_concurrency: int = 8,
    overwrite: bool = False,
) -> List[Dict]:
    """
    Process README summarization for all repositories.

    Args:
        repos_dir: Root directory containing one subdirectory per repository
        base_url: LLM API base URL
        model: Model name
        api_key: API key
        log_file: Log file path
        max_concurrency: Maximum number of repositories processed at once
        overwrite: Whether to overwrite existing summary files

    Returns:
        List of per-repository processing result dicts (completion order)
    """
    # Every visible subdirectory (hidden ones like .git are skipped).
    repo_dirs = sorted(
        d for d in repos_dir.iterdir()
        if d.is_dir() and not d.name.startswith(".")
    )

    logger.info(f"Found {len(repo_dirs)} repositories, starting processing...")

    # Semaphore caps how many repos are in flight simultaneously.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _bounded(repo_dir: Path):
        async with semaphore:
            return await process_single_repo(
                repo_dir=repo_dir,
                base_url=base_url,
                model=model,
                api_key=api_key,
                log_file=log_file,
                overwrite=overwrite,
            )

    # Launch everything and collect results as they finish, with progress.
    pending = [_bounded(d) for d in repo_dirs]
    results: List[Dict] = []
    for finished in tqdm(asyncio.as_completed(pending), total=len(pending), desc="Processing repos"):
        results.append(await finished)

    return results
335
+
336
+
337
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="README/Markdown Summarization Tool")
    parser.add_argument(
        "--repos_dir",
        type=str,
        default="/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered",
        help="Repository root directory path",
    )
    parser.add_argument(
        "--base_url",
        type=str,
        default=os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1"),
        help="LLM API base URL (default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen3",
        help="Model name (default: Qwen3)",
    )
    parser.add_argument(
        "--api_key_env",
        type=str,
        default="OPENAI_API_KEY",
        help="API key environment variable name (default: OPENAI_API_KEY)",
    )
    parser.add_argument(
        "--max_concurrency",
        type=int,
        default=8,
        help="Maximum concurrency (default: 8)",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing summary files",
    )
    parser.add_argument(
        "--log_file",
        type=str,
        default="instruction_generation/workdir/logs/summarize.log",
        help="Log file path",
    )

    args = parser.parse_args()

    # Create the log directory BEFORE initializing the logger: init_logger
    # opens the log file, which fails when the parent directory is missing.
    # (Previously the mkdir happened after init_logger, breaking first runs.)
    log_file_path = Path(args.log_file)
    log_file_path.parent.mkdir(parents=True, exist_ok=True)

    # Initialize logger
    init_logger(args.log_file, level="INFO")

    # Get API key from the configured environment variable
    api_key = os.getenv(args.api_key_env, "none")

    # Validate the repository root before doing any work
    repos_dir = Path(args.repos_dir)
    if not repos_dir.exists():
        logger.error(f"Repository directory does not exist: {repos_dir}")
        sys.exit(1)

    # Run main logic
    results = asyncio.run(
        process_all_repos(
            repos_dir=repos_dir,
            base_url=args.base_url,
            model=args.model,
            api_key=api_key,
            log_file=str(log_file_path),
            max_concurrency=args.max_concurrency,
            overwrite=args.overwrite,
        )
    )

    # Tally outcomes by status for the final report
    status_counts: Dict[str, int] = {}
    for result in results:
        status = result["status"]
        status_counts[status] = status_counts.get(status, 0) + 1

    logger.info("\n" + "=" * 80)
    logger.info("Processing complete!")
    logger.info("=" * 80)
    logger.info(f"Total: {len(results)} repositories")
    for status, count in status_counts.items():
        logger.info(f"  {status}: {count}")
    logger.info("=" * 80)