Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| def _repo_title(repo_slug: str) -> str: | |
| name = repo_slug.split("/", 1)[-1] | |
| return name.replace("-", " ").replace("_", " ").title() | |
# (config name, parquet file) pairs that every card declares, in card order.
_CARD_CONFIGS = (
    ("issues", "issues.parquet"),
    ("prs", "pull_requests.parquet"),
    ("issue_comments", "issue_comments.parquet"),
    ("pr_comments", "pr_comments.parquet"),
    ("pr_reviews", "reviews.parquet"),
    ("pr_files", "pr_files.parquet"),
    ("pr_diffs", "pr_diffs.parquet"),
    ("review_comments", "review_comments.parquet"),
    ("links", "links.parquet"),
    ("events", "events.parquet"),
)

# Config that is marked ``default: true`` in the front matter.
_CARD_DEFAULT_CONFIG = "issues"

# Bullets of the "Files:" section that are always present.
_CARD_FILE_LINES = (
    "- `issues.parquet`",
    "- `pull_requests.parquet`",
    "- `comments.parquet`",
    "- `issue_comments.parquet` (derived view of issue discussion comments)",
    "- `pr_comments.parquet` (derived view of pull request discussion comments)",
    "- `reviews.parquet`",
    "- `pr_files.parquet`",
    "- `pr_diffs.parquet`",
    "- `review_comments.parquet`",
    "- `links.parquet`",
    "- `events.parquet`",
)

# Extra config and file bullets added when new-contributor data is included.
_CARD_NEW_CONTRIBUTOR_CONFIG = ("new_contributors", "new_contributors.parquet")
_CARD_NEW_CONTRIBUTOR_FILE_LINES = (
    "- `new_contributors.parquet`",
    "- `new-contributors-report.json`",
    "- `new-contributors-report.md`",
)


def _card_config_stanza(config_name: str, parquet_path: str, *, default: bool = False) -> str:
    """Render one entry of the front-matter ``configs:`` list, newline-terminated."""
    stanza = [
        f"- config_name: {config_name}",
        "  data_files:",
        "  - split: train",
        f"    path: {parquet_path}",
    ]
    if default:
        stanza.append("  default: true")
    return "\n".join(stanza) + "\n"


def _card_note_bullets(notes: list[str] | None) -> list[str]:
    """Turn caller-supplied notes into single-line Markdown bullets, skipping blanks."""
    bullets = []
    for note in notes or ():
        # Fold embedded line breaks into one line so a note cannot break out
        # of its bullet; single-line notes pass through unchanged.
        pieces = [piece.strip() for piece in str(note).splitlines()]
        text = " ".join(piece for piece in pieces if piece)
        if text:
            bullets.append(f"- {text}")
    return bullets


def build_hf_dataset_card(
    repo_slug: str,
    snapshot_id: str,
    *,
    include_new_contributors: bool = False,
    notes: list[str] | None = None,
) -> str:
    """Build the README text (YAML front matter + Markdown) for the PR dataset.

    Args:
        repo_slug: Repository slug; shown verbatim in the description and used
            to derive the card title via ``_repo_title``.
        snapshot_id: Identifier reported as the latest snapshot in "Notes:".
        include_new_contributors: When true, add the ``new_contributors``
            config and list the new-contributor parquet/report files.
        notes: Optional extra bullets appended to the "Notes:" section. Blank
            entries are skipped and multi-line entries are folded to one line.

    Returns:
        The complete card text, ending with a single newline.
    """
    dataset_title = f"{_repo_title(repo_slug)} PR Dataset"

    configs = list(_CARD_CONFIGS)
    file_lines = list(_CARD_FILE_LINES)
    if include_new_contributors:
        configs.append(_CARD_NEW_CONTRIBUTOR_CONFIG)
        file_lines.extend(_CARD_NEW_CONTRIBUTOR_FILE_LINES)

    front_matter = "".join(
        [
            "---\n",
            f"pretty_name: {dataset_title}\n",
            "configs:\n",
            *(
                _card_config_stanza(name, path, default=(name == _CARD_DEFAULT_CONFIG))
                for name, path in configs
            ),
            "---\n",
        ]
    )

    note_lines = [
        f"- latest snapshot: `{snapshot_id}`",
        "- raw data only; no labels or moderation decisions",
        "- PR metadata, file-level patch hunks, and full unified diffs are included",
        "- full file contents for changed files are not included",
        *_card_note_bullets(notes),
    ]
    use_lines = [
        "- duplicate PR and issue analysis",
        "- triage and ranking experiments",
        "- eval set creation",
    ]

    sections = [
        # NOTE(review): the original template emits a second `---` right after
        # the closing front-matter fence; it renders as a horizontal rule above
        # the title. Kept so generated cards do not change - confirm intent.
        "---",
        f"# {dataset_title}",
        "Normalized snapshots of issues, pull requests, comments, reviews, "
        f"and linkage data from `{repo_slug}`.",
        "\n".join(["Files:", *file_lines]),
        "\n".join(["Use:", *use_lines]),
        "\n".join(["Notes:", *note_lines]),
    ]
    # Blank lines between sections keep Markdown from folding a section label
    # into the preceding paragraph or list item.
    return front_matter + "\n" + "\n\n".join(sections) + "\n"