|
|
""" |
|
|
Stage C: 代码文件级统计(优化版 - 大幅提速) |
|
|
|
|
|
优化策略: |
|
|
1. 使用简化的统计方法替代复杂正则匹配 |
|
|
2. 对大文件使用粗略估计 |
|
|
3. 断点续传支持 |
|
|
4. 批量处理减少IPC开销 |
|
|
5. 跳过详细函数参数分析,使用快速计数 |
|
|
""" |
|
|
import os |
|
|
import json |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from collections import defaultdict, Counter |
|
|
from tqdm import tqdm |
|
|
import statistics |
|
|
import math |
|
|
from multiprocessing import Pool, cpu_count |
|
|
import pandas as pd |
|
|
import pickle |
|
|
import hashlib |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Per-language byte substrings whose occurrence count is used as a rough
# estimate of how many functions a file defines. These are plain substring
# matches (no parsing), so the estimate over-counts for languages whose
# keywords also appear outside function definitions.
FUNC_KEYWORDS = {
    'python': [b'def '],
    'jupyter': [b'def '],
    'java': [
        b'public ',
        b'private ',
        b'protected ',
        b'void ',
        b'static ',
    ],
    'c/c++': [
        b'void ',
        b'int ',
        b'float ',
        b'double ',
        b'char ',
        b'bool ',
    ],
    'go': [b'func '],
    'rust': [b'fn '],
    'r': [
        b'function(',
        b'function (',
    ],
    'matlab': [b'function '],
    'shell': [
        b'function ',
        b'() {',
    ],
    'fortran': [
        b'subroutine ',
        b'function ',
        b'SUBROUTINE ',
        b'FUNCTION ',
    ],
}
|
|
|
|
|
|
|
|
# Per-language byte substrings that introduce a comment. Each occurrence is
# counted as one comment line by the fast analyzer (a deliberate approximation).
# The marker tuples are immutable, so languages with the same syntax share one.
_PYTHON_MARKERS = (b'#', b'"""', b"'''")
_C_STYLE_MARKERS = (b'//', b'/*')
_HASH_MARKERS = (b'#',)

COMMENT_MARKERS = {
    'python': _PYTHON_MARKERS,
    'jupyter': _PYTHON_MARKERS,
    'java': _C_STYLE_MARKERS,
    'c/c++': _C_STYLE_MARKERS,
    'go': _C_STYLE_MARKERS,
    'rust': _C_STYLE_MARKERS,
    'r': _HASH_MARKERS,
    'matlab': (b'%', b'%{'),
    'shell': _HASH_MARKERS,
    'fortran': (b'!',),
}
|
|
|
|
|
|
|
|
# Language label -> file extensions, listed in the order the extensions
# should appear as keys of EXT_MAP.
_LANGUAGE_EXTENSIONS = (
    ('python', ('.py',)),
    ('java', ('.java',)),
    ('c/c++', ('.c', '.h', '.hh', '.hpp', '.cpp', '.cc', '.cxx', '.c++')),
    ('fortran', ('.f', '.f90', '.f95', '.F')),
    ('r', ('.r',)),
    ('matlab', ('.m',)),
    ('shell', ('.sh', '.bash')),
    ('rust', ('.rs',)),
    ('go', ('.go',)),
    ('jupyter', ('.ipynb',)),
)

# File extension (including the leading dot) -> language label.
EXT_MAP = {
    extension: language
    for language, extensions in _LANGUAGE_EXTENSIONS
    for extension in extensions
}
|
|
|
|
|
|
|
|
def detect_language_fast(file_path: str) -> str:
    """Return the language label for ``file_path`` based only on its extension.

    The extension is lower-cased before it is looked up in ``EXT_MAP``;
    extensions that are not in the table yield ``'unknown'``.
    """
    _, suffix = os.path.splitext(file_path)
    try:
        return EXT_MAP[suffix.lower()]
    except KeyError:
        return 'unknown'
|
|
|
|
|
|
|
|
def fast_analyze_file(file_path: Path, repo_name: str, max_file_size_bytes: int = 2*1024*1024) -> dict:
    """Compute rough, byte-level statistics for a single source file.

    Nothing is parsed: every figure comes from ``bytes.count`` / ``bytes.split``
    on the raw file content, so the numbers are estimates, not exact metrics.

    Args:
        file_path: File to analyze.
        repo_name: Repository label copied into the result.
        max_file_size_bytes: Files larger than this are skipped.

    Returns:
        A dict with the keys ``repo_name``, ``file_path`` (base name only),
        ``file_size_bytes``, ``language``, ``total_lines``, ``comment_lines``,
        ``code_lines``, ``tokens``, ``functions`` and ``parameters``; or
        ``None`` when the file is over the size limit or cannot be analyzed.
        ``.ipynb`` files are delegated to ``fast_analyze_notebook``.
    """
    try:
        file_size = file_path.stat().st_size
        if file_size > max_file_size_bytes:
            return None

        if file_path.suffix.lower() == '.ipynb':
            return fast_analyze_notebook(file_path, repo_name, file_size)

        # Read errors are handled by the ``except Exception`` below. The
        # previous bare ``except:`` here also swallowed KeyboardInterrupt and
        # SystemExit, which made workers impossible to interrupt mid-read.
        with open(file_path, 'rb') as f:
            content = f.read()

        lang = detect_language_fast(str(file_path))

        # Newline count + 1: an empty file counts as one line.
        lines = content.count(b'\n') + 1

        # Each comment-marker occurrence counts as one comment line; the total
        # is capped at half of the file so marker-heavy content cannot dominate.
        comment_lines = sum(content.count(marker) for marker in COMMENT_MARKERS.get(lang, ()))
        comment_lines = min(comment_lines, lines // 2)

        # Function estimate: occurrences of the language's definition keywords.
        functions = sum(content.count(keyword) for keyword in FUNC_KEYWORDS.get(lang, ()))

        # Whitespace-separated chunks stand in for tokens.
        tokens = len(content.split())

        # Blank-line estimate: consecutive line terminators (LF or CRLF).
        empty_lines = content.count(b'\n\n') + content.count(b'\r\n\r\n')

        code_lines = max(0, lines - empty_lines - comment_lines)

        return {
            'repo_name': repo_name,
            'file_path': str(file_path.name),
            'file_size_bytes': file_size,
            'language': lang,
            'total_lines': lines,
            'comment_lines': comment_lines,
            'code_lines': code_lines,
            'tokens': tokens,
            'functions': functions,
            # Parameters are not analyzed; two per function is assumed.
            'parameters': functions * 2,
        }
    except Exception:
        # Deliberate best-effort boundary: one unreadable or odd file must not
        # abort the scan of a whole repository.
        return None
|
|
|
|
|
|
|
|
def fast_analyze_notebook(file_path: Path, repo_name: str, file_size: int) -> dict:
    """Compute rough statistics for a Jupyter notebook without parsing its JSON.

    The raw bytes are scanned for the two common spellings of a code-cell
    marker (with and without a space after the colon). All derived figures are
    estimates.

    Args:
        file_path: Notebook file to analyze.
        repo_name: Repository label copied into the result.
        file_size: Size in bytes, as already determined by the caller.

    Returns:
        A dict with the same keys as ``fast_analyze_file`` produces
        (``language`` is always ``'jupyter'``), or ``None`` when the file
        cannot be read.
    """
    try:
        with open(file_path, 'rb') as f:
            content = f.read()
    except (OSError, ValueError):
        # Unreadable file or invalid path: skip it. KeyboardInterrupt and
        # SystemExit are intentionally NOT caught (the old bare ``except:``
        # swallowed them).
        return None

    code_cell_count = (
        content.count(b'"cell_type": "code"')
        + content.count(b'"cell_type":"code"')
    )
    function_count = content.count(b'def ')

    return {
        'repo_name': repo_name,
        'file_path': str(file_path.name),
        'file_size_bytes': file_size,
        'language': 'jupyter',
        # Lines of the raw .ipynb JSON, not of the code inside the cells.
        'total_lines': content.count(b'\n') + 1,
        # NOTE(review): the code-cell count is reported as the comment-line
        # figure; presumably a one-comment-per-cell estimate -- confirm.
        'comment_lines': code_cell_count,
        # Flat estimate of ten code lines per code cell.
        'code_lines': code_cell_count * 10,
        'tokens': len(content.split()),
        'functions': function_count,
        # Parameters are not analyzed; two per function is assumed.
        'parameters': function_count * 2,
    }
|
|
|
|
|
|
|
|
def _default_repo_stats(): |
|
|
"""Factory function for defaultdict""" |
|
|
return { |
|
|
'total_files': 0, |
|
|
'total_lines': 0, |
|
|
'total_code_lines': 0, |
|
|
'total_comment_lines': 0, |
|
|
'total_tokens': 0, |
|
|
'total_functions': 0, |
|
|
'total_parameters': 0, |
|
|
'languages': Counter(), |
|
|
'file_sizes': [], |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
# Directory names that are pruned from the repository walk.
SKIP_DIRS = {
    # version control / editor metadata
    '.git',
    '.idea',
    '.vscode',
    # caches and tool state
    '__pycache__',
    '.pytest_cache',
    '.ipynb_checkpoints',
    '.mypy_cache',
    '.tox',
    '.eggs',
    # virtual environments
    'venv',
    'env',
    '.venv',
    'site-packages',
    # build output
    'dist',
    'build',
    'target',
    # bundled / third-party code
    'node_modules',
    'vendor',
    'lib',
    'libs',
    'third_party',
    'external',
}
|
|
|
|
|
|
|
|
# File extensions (with leading dot) that are treated as source code.
CODE_EXTENSIONS = {
    # Python / notebooks
    '.py',
    '.ipynb',
    # Java
    '.java',
    # C / C++
    '.c',
    '.h',
    '.hh',
    '.hpp',
    '.cpp',
    '.cc',
    '.cxx',
    '.c++',
    # Fortran
    '.f',
    '.f90',
    '.f95',
    '.F',
    # R / MATLAB
    '.r',
    '.m',
    # shell
    '.sh',
    '.bash',
    # Rust / Go
    '.rs',
    '.go',
}
|
|
|
|
|
|
|
|
def scan_repo_fast(args):
    """Collect per-file statistics for one repository (worker entry point).

    Args:
        args: A ``(repo_path, max_file_size_bytes, max_files_per_repo)`` tuple,
            packed into one argument so the function can be mapped over a pool.

    Returns:
        A list of the truthy results of ``fast_analyze_file`` for files whose
        lower-cased extension is in ``CODE_EXTENSIONS``. Directories named in
        ``SKIP_DIRS`` are not descended into, and collection stops once
        ``max_files_per_repo`` results have been gathered. Any exception raised
        during the walk is swallowed and whatever was collected so far is
        returned.
    """
    repo_path, max_file_size_bytes, max_files_per_repo = args
    repo_name = repo_path.name
    collected = []

    try:
        for current_dir, subdirs, filenames in os.walk(repo_path):
            # Prune in place so os.walk does not descend into skipped folders.
            subdirs[:] = [name for name in subdirs if name not in SKIP_DIRS]

            for filename in filenames:
                if len(collected) >= max_files_per_repo:
                    break

                candidate = Path(current_dir) / filename
                if candidate.suffix.lower() not in CODE_EXTENSIONS:
                    continue

                file_info = fast_analyze_file(candidate, repo_name, max_file_size_bytes)
                if file_info:
                    collected.append(file_info)

            # Leave the walk as soon as the per-repo quota is filled.
            if len(collected) >= max_files_per_repo:
                break
    except Exception:
        pass

    return collected
|
|
|
|
|
|
|
|
class CodeFileStatsFast:
    """Scan repositories in parallel and export file- and repo-level code metrics.

    Each immediate sub-directory of ``repos_dir`` is treated as one repository.
    Progress is checkpointed to ``<output_dir>/checkpoint.pkl`` so an
    interrupted run can resume; the checkpoint is removed once results are
    saved.
    """

    def __init__(self, repos_dir, output_dir, top_n=None, max_file_size_mb=2, max_files_per_repo=500):
        """Configure the scan and create ``output_dir`` if needed.

        Args:
            repos_dir: Directory whose sub-directories are the repositories.
            output_dir: Directory that receives the CSV/JSON output and the
                checkpoint file.
            top_n: If not ``None``, only the first ``top_n`` repositories (by
                sorted path) are processed.
            max_file_size_mb: Files larger than this are skipped.
            max_files_per_repo: Maximum number of analyzed files per repository.
        """
        self.repos_dir = Path(repos_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.top_n = top_n
        self.max_file_size_bytes = max_file_size_mb * 1024 * 1024
        self.max_files_per_repo = max_files_per_repo

        self.file_stats = []
        self.repo_stats = defaultdict(_default_repo_stats)

        self.checkpoint_file = self.output_dir / 'checkpoint.pkl'
        self.processed_repos = set()

    def load_checkpoint(self):
        """Restore ``processed_repos`` and ``file_stats`` from the checkpoint.

        Returns:
            ``True`` if a checkpoint was loaded, ``False`` if none exists or it
            could not be read (a warning is printed in the latter case).
        """
        if not self.checkpoint_file.exists():
            return False

        try:
            with open(self.checkpoint_file, 'rb') as f:
                # NOTE(security): pickle.load can execute arbitrary code. This is
                # only acceptable because the checkpoint is written by this class
                # into its own output directory; never point it at untrusted files.
                data = pickle.load(f)
            processed_repos = data.get('processed_repos', set())
            file_stats = data.get('file_stats', [])
        except Exception as exc:
            # A corrupt pickle can raise almost anything; treat the checkpoint
            # as absent, but say so instead of failing silently.
            print(f"Warning: ignoring unreadable checkpoint {self.checkpoint_file}: {exc}")
            return False

        # Assign only after both values were read, so a bad checkpoint can
        # never leave the instance half-restored.
        self.processed_repos = processed_repos
        self.file_stats = file_stats
        print(f"Loaded checkpoint: {len(self.processed_repos)} repos already processed")
        return True

    def save_checkpoint(self):
        """Persist ``processed_repos`` and ``file_stats`` (best effort).

        The data is written to a temporary sibling file and then moved over the
        real checkpoint, so an interrupted write cannot corrupt an existing
        checkpoint. Failures are reported with a warning and do not abort the
        scan.
        """
        tmp_file = self.checkpoint_file.with_name(self.checkpoint_file.name + '.tmp')
        try:
            with open(tmp_file, 'wb') as f:
                pickle.dump({
                    'processed_repos': self.processed_repos,
                    'file_stats': self.file_stats,
                }, f)
            os.replace(tmp_file, self.checkpoint_file)
        except Exception as exc:
            print(f"Warning: failed to save checkpoint to {self.checkpoint_file}: {exc}")

    def scan_all_repos(self, num_workers=None):
        """Scan every not-yet-processed repository with a worker pool.

        Args:
            num_workers: Pool size; defaults to ``min(cpu_count(), 48)``.

        Results are appended to ``self.file_stats`` and every scanned
        repository is recorded in ``self.processed_repos``. A checkpoint is
        written every 500 repositories and once more at the end.
        """
        if num_workers is None:
            num_workers = min(cpu_count(), 48)

        self.load_checkpoint()

        all_repos = sorted([d for d in self.repos_dir.iterdir() if d.is_dir()])
        if self.top_n is None:
            selected_repos = all_repos
        else:
            selected_repos = all_repos[:self.top_n]

        repos_to_process = [r for r in selected_repos if r.name not in self.processed_repos]

        print(f"Total repos: {len(selected_repos)} ({'all' if self.top_n is None else f'top {self.top_n}'}), Already processed: {len(self.processed_repos)}, To process: {len(repos_to_process)}")
        print(f"Using {num_workers} workers...")

        if not repos_to_process:
            print("All repos already processed!")
            return

        args_list = [(repo, self.max_file_size_bytes, self.max_files_per_repo) for repo in repos_to_process]

        # Larger chunks mean fewer round trips between the parent and workers.
        chunksize = max(1, len(repos_to_process) // (num_workers * 10))
        checkpoint_interval = 500

        with Pool(processes=num_workers) as pool:
            # Ordered imap lets each result be paired with the repository it
            # belongs to. The old code recovered the name from the first file
            # record, so repositories without any code files were never marked
            # as processed and were rescanned on every resume.
            results = pool.imap(scan_repo_fast, args_list, chunksize=chunksize)
            progress = tqdm(
                zip(repos_to_process, results),
                total=len(repos_to_process),
                desc="Scanning repos"
            )
            for processed_count, (repo, repo_files) in enumerate(progress, start=1):
                if repo_files:
                    self.file_stats.extend(repo_files)
                self.processed_repos.add(repo.name)

                if processed_count % checkpoint_interval == 0:
                    self.save_checkpoint()
                    print(f"\nCheckpoint saved: {len(self.processed_repos)} repos processed, {len(self.file_stats)} files found")

        self.save_checkpoint()
        print(f"Found {len(self.file_stats)} code files from {len(self.processed_repos)} repos")

    def aggregate_repo_stats(self):
        """Roll ``self.file_stats`` up into one metrics dict per repository.

        ``self.repo_stats`` is rebuilt from scratch on every call, so calling
        this method repeatedly yields the same result.

        Returns:
            A list of dicts (one per repository with at least one file) holding
            totals, the primary language, comment ratio, average function
            length, language entropy and file-size figures.
        """
        # Start from a clean slate; accumulating into leftovers from an
        # earlier call would double every total.
        self.repo_stats.clear()

        for file_stat in self.file_stats:
            stats = self.repo_stats[file_stat['repo_name']]
            stats['total_files'] += 1
            stats['total_lines'] += file_stat['total_lines']
            stats['total_code_lines'] += file_stat['code_lines']
            stats['total_comment_lines'] += file_stat['comment_lines']
            stats['total_tokens'] += file_stat['tokens']
            stats['total_functions'] += file_stat['functions']
            stats['total_parameters'] += file_stat['parameters']
            stats['languages'][file_stat['language']] += 1
            stats['file_sizes'].append(file_stat['file_size_bytes'])

        repo_stats_list = []
        for repo, stats in self.repo_stats.items():
            total_files = stats['total_files']
            if total_files == 0:
                continue

            languages = stats['languages']
            if languages:
                primary_language, primary_language_files = languages.most_common(1)[0]
            else:
                primary_language, primary_language_files = 'unknown', 0

            stats_dict = {
                'repo_name': repo,
                # Directory names encode "owner/name" with a triple underscore.
                'full_name': repo.replace('___', '/'),
                'total_files': total_files,
                'total_lines': stats['total_lines'],
                'total_code_lines': stats['total_code_lines'],
                'total_comment_lines': stats['total_comment_lines'],
                'total_tokens': stats['total_tokens'],
                'total_functions': stats['total_functions'],
                'total_parameters': stats['total_parameters'],
                'language_count': len(languages),
                'primary_language': primary_language,
                'primary_language_files': primary_language_files,
            }

            if stats['total_lines'] > 0:
                stats_dict['comment_ratio'] = stats['total_comment_lines'] / stats['total_lines']
            else:
                stats_dict['comment_ratio'] = 0

            if stats['total_functions'] > 0:
                stats_dict['avg_func_length'] = stats['total_code_lines'] / stats['total_functions']
                stats_dict['avg_params_per_func'] = stats['total_parameters'] / stats['total_functions']
            else:
                stats_dict['avg_func_length'] = 0
                stats_dict['avg_params_per_func'] = 0

            # Shannon entropy (base 2) of the per-language file distribution.
            if languages:
                total_lang_files = sum(languages.values())
                entropy = 0
                for count in languages.values():
                    p = count / total_lang_files
                    if p > 0:
                        entropy -= p * math.log2(p)
                stats_dict['language_entropy'] = entropy
            else:
                stats_dict['language_entropy'] = 0

            if stats['file_sizes']:
                stats_dict['avg_file_size_kb'] = statistics.mean(stats['file_sizes']) / 1024
                stats_dict['max_file_size_mb'] = max(stats['file_sizes']) / (1024 * 1024)
            else:
                stats_dict['avg_file_size_kb'] = 0
                stats_dict['max_file_size_mb'] = 0

            if languages:
                stats_dict['primary_language_ratio'] = primary_language_files / total_files
            else:
                stats_dict['primary_language_ratio'] = 0

            repo_stats_list.append(stats_dict)

        return repo_stats_list

    def save_results(self):
        """Write the CSV/JSON outputs and remove the checkpoint.

        Outputs (all inside ``output_dir``):
            * ``file_level_metrics_sampled.csv`` -- at most 10000 file rows
              (random sample with a fixed seed when there are more).
            * ``repo_level_metrics[_top<N>].csv`` -- aggregated repo metrics.
            * ``code_stats_summary.json`` -- overall counts and the 20 most
              common languages by file count.
        """
        file_df = pd.DataFrame(self.file_stats)
        if len(file_df) > 10000:
            file_df_sample = file_df.sample(n=10000, random_state=42)
        else:
            file_df_sample = file_df

        file_df_sample.to_csv(self.output_dir / 'file_level_metrics_sampled.csv', index=False)

        repo_stats_list = self.aggregate_repo_stats()
        repo_df = pd.DataFrame(repo_stats_list)
        top_n_suffix = f"_top{self.top_n}" if self.top_n else ""
        repo_df.to_csv(self.output_dir / f'repo_level_metrics{top_n_suffix}.csv', index=False)

        summary = {
            'total_files': len(self.file_stats),
            'total_repos': len(self.repo_stats),
            'avg_files_per_repo': len(self.file_stats) / len(self.repo_stats) if self.repo_stats else 0,
        }

        lang_counter = Counter(f['language'] for f in self.file_stats)
        summary['files_by_language'] = dict(lang_counter.most_common(20))

        with open(self.output_dir / 'code_stats_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        # The run completed, so the resume data is no longer needed.
        if self.checkpoint_file.exists():
            self.checkpoint_file.unlink()

    def run(self, num_workers=None):
        """Run the full pipeline: scan, aggregate and save.

        Aggregation happens inside ``save_results``.
        """
        print("Stage C (Fast): Analyzing code files...")
        self.scan_all_repos(num_workers=num_workers)
        print("Aggregating repo-level stats...")
        print("Saving results...")
        self.save_results()
        print(f"Code file stats complete! Results saved to {self.output_dir}")
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
repos_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered" |
|
|
output_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/code_stats" |
|
|
|
|
|
|
|
|
stats = CodeFileStatsFast( |
|
|
repos_dir, |
|
|
output_dir, |
|
|
top_n=15000, |
|
|
max_file_size_mb=2, |
|
|
max_files_per_repo=500 |
|
|
) |
|
|
stats.run(num_workers=48) |
|
|
|
|
|
|