| | import os |
| | import pandas as pd |
| | import hashlib |
| | from pathlib import Path |
| | from tqdm import tqdm |
| |
|
| | |
# Load the per-repository metadata and index it by full repository name so
# each crawled file can be tagged with its repo's keyword and license.
repos_df = pd.read_csv("workdir/repos_checked.csv")
# to_dict("index") raises ValueError when the index is not unique, so keep
# only the first row for any repository that is listed more than once.
repo_meta = (
    repos_df.drop_duplicates(subset="full_name", keep="first")
    .set_index("full_name")[["keyword", "license"]]
    .to_dict("index")
)
| |
|
| | |
# Walk every filtered repository checkout and collect one row per file.
print("Processing crawled repos...")
crawl_rows = []
filtered_dir = Path("workdir/repos_filtered")

# iterdir()/rglob() yield entries in arbitrary, filesystem-dependent order and
# the later keep="first" deduplication depends on row order, so both levels
# are sorted to make the output reproducible between runs.
for repo_dir in tqdm(sorted(filtered_dir.iterdir()), desc="Reading filtered repos"):
    if not repo_dir.is_dir() or repo_dir.name.startswith("."):
        continue

    # The first "___" in the directory name stands in for the "/" of the
    # repository's full name (the key used in repo_meta).
    full_name = repo_dir.name.replace("___", "/", 1)
    # Repositories missing from the metadata table get empty keyword/license.
    meta = repo_meta.get(full_name, {"keyword": "", "license": ""})

    for file_path in sorted(repo_dir.rglob("*")):
        if not file_path.is_file():
            continue

        # Only the read itself is guarded: undecodable bytes are dropped by
        # errors="ignore", so I/O failures are the expected error here. An
        # unreadable file is reported and skipped (best effort).
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except OSError as e:
            print(f"Error reading {file_path}: {e}")
            continue

        crawl_rows.append(
            {
                "text": text,
                "repo_name": full_name,
                "path": str(file_path.relative_to(repo_dir)),
                # File extension without the dot; "unknown" when there is none.
                "language": file_path.suffix.lstrip(".") or "unknown",
                "license": meta["license"],
                # Length in characters of the decoded text, not bytes on disk.
                "size": len(text),
                "keyword": meta["keyword"],
                # SHA-1 to match the hash the merge step computes over all
                # rows (that step overwrites this column with sha1 anyway).
                "text_hash": hashlib.sha1(text.encode()).hexdigest(),
                "config": "",
                "split": "",
                "repo_path": "",
                "ds_source": "crawl",
            }
        )

crawl_df = pd.DataFrame(crawl_rows)
| |
|
| | |
# Read every chempile CSV shard, in sorted filename order, into one frame.
print("\nLoading chempile data...")
chempile_dir = Path("./datasets/all_chempile_code")
chempile_files = sorted(chempile_dir.glob("chempile_code_complete_*.csv"))
chempile_frames = list(map(pd.read_csv, tqdm(chempile_files)))
chempile_df = pd.concat(chempile_frames, ignore_index=True)
# Tag the rows so they can be told apart from crawled rows after merging.
chempile_df["ds_source"] = "chempile"
| |
|
| | |
def _sha1_hex(value):
    """Return the SHA-1 hex digest of ``str(value)`` encoded as UTF-8."""
    return hashlib.sha1(str(value).encode()).hexdigest()


# Stack both sources; chempile rows come first, so they are the ones kept
# when duplicates are dropped with keep="first" below.
print("\nMerging datasets...")
merged_df = pd.concat([chempile_df, crawl_df], ignore_index=True)
original_count = len(merged_df)

# Recompute the hash for every row so both sources use the same digest.
print("Computing unified text_hash for all rows...")
merged_df["text_hash"] = merged_df["text"].apply(_sha1_hex)

# Keep only the first row seen for each distinct text hash.
print("Deduplicating by text_hash...")
is_repeat = merged_df.duplicated(subset=["text_hash"], keep="first")
merged_df = merged_df[~is_repeat]
| |
|
| | |
# Write the merged dataset once as a single file, then again as numbered
# chunk files that each aim to stay under MAX_SIZE_MB on disk.
print("\nSaving in 500MB chunks...")
merged_data_dir = "./datasets/data_merged"
os.makedirs(merged_data_dir, exist_ok=True)
# NOTE(review): unlike the chunk files below, this writes the DataFrame index
# as an unnamed first column. Left unchanged in case consumers rely on it.
merged_df.to_csv(f"{merged_data_dir}/dataset_all.csv")
MAX_SIZE_MB = 500
chunk_num = 1
rows_per_chunk = 50000  # initial guess; re-estimated after every write
start_idx = 0

while start_idx < len(merged_df):
    end_idx = min(start_idx + rows_per_chunk, len(merged_df))
    chunk_df = merged_df.iloc[start_idx:end_idx]

    output_path = f"{merged_data_dir}/{chunk_num:03d}.csv"
    chunk_df.to_csv(output_path, index=False)
    size_mb = os.path.getsize(output_path) / (1024 * 1024)

    if size_mb > 0:
        # Re-estimate from the rows actually written (the final chunk may be
        # shorter than rows_per_chunk), with a 5% safety margin. Clamp to at
        # least one row: a value of 0 would make end_idx == start_idx and the
        # loop would never advance.
        rows_per_chunk = max(
            1, int(len(chunk_df) * (MAX_SIZE_MB / size_mb) * 0.95)
        )

    # The chunk just written is over the limit: write it again (same file
    # name) with the smaller estimate instead of accepting it. The estimate is
    # strictly smaller than len(chunk_df) here, so this always terminates. A
    # single row that is itself over the limit cannot be split and is kept.
    if size_mb > MAX_SIZE_MB and len(chunk_df) > 1:
        print(
            f"{output_path} is {size_mb:.1f}MB (> {MAX_SIZE_MB}MB); "
            f"rewriting with {rows_per_chunk:,} rows"
        )
        continue

    print(f"Saved {output_path}: {size_mb:.1f}MB, {len(chunk_df):,} rows")
    start_idx = end_idx
    chunk_num += 1
| |
|
# Final report. The total is the row count after deduplication, while the
# crawl/chempile figures are the sizes of the two inputs before it.
crawl_count = len(crawl_df)
chempile_count = len(chempile_df)
final_count = len(merged_df)
removed_count = chempile_count + crawl_count - final_count
print(f"\nTotal: {final_count:,} rows ({crawl_count:,} crawl + {chempile_count:,} chempile)")
print(f"Deduplicated: {removed_count:,} rows removed")
| |
|