# dataset-builder / data1/rename.py
# Uploaded by DouDou
# Commit message: Upload data1/rename.py with huggingface_hub
# Commit 218123c (verified)
import os
import pandas as pd
from pathlib import Path
def rename_repos(csv_path: str, repos_dir: str):
    """Rename repository directories under ``repos_dir`` to ``owner___repo``.

    The CSV at ``csv_path`` must have a ``full_name`` column holding
    ``owner/repo`` values. Each non-hidden directory directly inside
    ``repos_dir`` is matched against the naming patterns ``repo``,
    ``owner_repo``, ``owner__repo`` and ``owner___repo`` and, when it
    resolves to exactly one CSV entry, renamed to ``owner___repo``.

    A directory is left untouched (and counted as skipped) when it matches
    no CSV entry, when its name could belong to more than one CSV entry, or
    when the target directory already exists. Directories that already carry
    a canonical name are left alone and not counted.

    Args:
        csv_path: Path to the CSV file listing the repositories.
        repos_dir: Directory whose immediate sub-directories are renamed.

    Raises:
        KeyError: If the CSV has no ``full_name`` column.
        FileNotFoundError: If ``repos_dir`` does not exist.
    """
    repos_dir = Path(repos_dir)
    df = pd.read_csv(csv_path)

    # canonical: every valid target name (owner___repo, three underscores).
    # aliases:   legacy name -> set of canonical names it could stand for.
    # Keeping a set per alias lets us detect collisions instead of letting
    # the last CSV row silently win.
    canonical = set()
    aliases = {}
    for full_name in df["full_name"]:
        # Empty cells are read as NaN (a float), so guard before splitting.
        parts = full_name.split("/") if isinstance(full_name, str) else []
        if len(parts) != 2 or not all(parts):
            print(f"[WARN] Bad full_name in CSV: {full_name!r}")
            continue
        owner, repo = parts
        target = f"{owner}___{repo}"
        canonical.add(target)
        legacy_names = (
            repo,                  # just the repo name
            f"{owner}_{repo}",     # owner_repo (single underscore)
            f"{owner}__{repo}",    # owner__repo (double underscore)
        )
        for legacy in legacy_names:
            aliases.setdefault(legacy, set()).add(target)

    # Snapshot the directory listing before renaming anything; sort it so the
    # outcome does not depend on filesystem enumeration order.
    existing = sorted(
        d for d in repos_dir.iterdir()
        if d.is_dir() and not d.name.startswith(".")
    )

    renamed, skipped = 0, 0
    for d in existing:
        name = d.name
        # An exact canonical match always wins over any alias interpretation.
        if name in canonical:
            continue

        targets = aliases.get(name)
        if not targets:
            print(f"[WARN] Not in CSV: {name}")
            skipped += 1
            continue
        if len(targets) > 1:
            # Several CSV rows could own this directory; guessing would
            # mislabel the repo, so leave it for manual resolution.
            candidates = ", ".join(sorted(targets))
            print(f"[SKIP] Ambiguous: {name} -> {candidates}")
            skipped += 1
            continue

        (new_name,) = targets
        new_path = repos_dir / new_name
        if new_path.exists():
            print(f"[SKIP] Target exists: {name} -> {new_name}")
            skipped += 1
            continue

        d.rename(new_path)
        print(f"[OK] {name} -> {new_name}")
        renamed += 1

    print(f"\nDone: {renamed} renamed, {skipped} skipped")
if __name__ == "__main__":
    import argparse

    # Command-line entry point: both options are optional and fall back to
    # the default workdir layout.
    cli = argparse.ArgumentParser()
    option_defaults = (
        ("--csv", "./workdir/repos_checked.csv"),
        ("--repos", "./workdir/repos_raw"),
    )
    for flag, fallback in option_defaults:
        cli.add_argument(flag, default=fallback)
    opts = cli.parse_args()
    rename_repos(csv_path=opts.csv, repos_dir=opts.repos)