from __future__ import annotations
def _repo_title(repo_slug: str) -> str:
name = repo_slug.split("/", 1)[-1]
return name.replace("-", " ").replace("_", " ").title()
def build_hf_dataset_card(
    repo_slug: str,
    snapshot_id: str,
    *,
    include_new_contributors: bool = False,
    notes: list[str] | None = None,
) -> str:
    """Render a Hugging Face dataset card (README) for a PR dataset snapshot.

    Args:
        repo_slug: GitHub-style ``owner/name`` slug; the name part becomes
            the card title.
        snapshot_id: Identifier of the latest snapshot, echoed in the Notes
            section.
        include_new_contributors: When True, add the ``new_contributors``
            viewer config and its report files to the card.
        notes: Optional extra bullet lines appended to the Notes section.

    Returns:
        The full card text: YAML front matter (``configs`` entries for the
        Hugging Face dataset viewer) followed by a Markdown body.
    """
    repo_title = _repo_title(repo_slug)
    dataset_title = f"{repo_title} PR Dataset"

    new_contributor_config = ""
    new_contributor_files = ""
    if include_new_contributors:
        # Extra viewer config + file listing for the opt-in contributor data.
        # Both snippets end with a newline so they splice cleanly into the
        # template below.
        new_contributor_config = """- config_name: new_contributors
  data_files:
  - split: train
    path: new_contributors.parquet
"""
        new_contributor_files = """- `new_contributors.parquet`
- `new-contributors-report.json`
- `new-contributors-report.md`
"""

    note_lines = "\n".join(f"- {note}" for note in (notes or []))
    if note_lines:
        # Trailing newline so the closing triple-quote doesn't glue the last
        # note onto the end of the document.
        note_lines = f"{note_lines}\n"

    # BUGFIX: the front matter previously emitted two closing "---" delimiters
    # back to back; exactly one must follow the last config entry.
    return f"""---
pretty_name: {dataset_title}
configs:
- config_name: issues
  data_files:
  - split: train
    path: issues.parquet
  default: true
- config_name: prs
  data_files:
  - split: train
    path: pull_requests.parquet
- config_name: issue_comments
  data_files:
  - split: train
    path: issue_comments.parquet
- config_name: pr_comments
  data_files:
  - split: train
    path: pr_comments.parquet
- config_name: pr_reviews
  data_files:
  - split: train
    path: reviews.parquet
- config_name: pr_files
  data_files:
  - split: train
    path: pr_files.parquet
- config_name: pr_diffs
  data_files:
  - split: train
    path: pr_diffs.parquet
- config_name: review_comments
  data_files:
  - split: train
    path: review_comments.parquet
- config_name: links
  data_files:
  - split: train
    path: links.parquet
- config_name: events
  data_files:
  - split: train
    path: events.parquet
{new_contributor_config}---

# {dataset_title}

Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo_slug}`.

Files:

- `issues.parquet`
- `pull_requests.parquet`
- `comments.parquet`
- `issue_comments.parquet` (derived view of issue discussion comments)
- `pr_comments.parquet` (derived view of pull request discussion comments)
- `reviews.parquet`
- `pr_files.parquet`
- `pr_diffs.parquet`
- `review_comments.parquet`
- `links.parquet`
- `events.parquet`
{new_contributor_files}
Use:

- duplicate PR and issue analysis
- triage and ranking experiments
- eval set creation

Notes:

- latest snapshot: `{snapshot_id}`
- raw data only; no labels or moderation decisions
- PR metadata, file-level patch hunks, and full unified diffs are included
- full file contents for changed files are not included
{note_lines}"""