File size: 2,692 Bytes
dbf7313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from __future__ import annotations


def _repo_title(repo_slug: str) -> str:
    name = repo_slug.split("/", 1)[-1]
    return name.replace("-", " ").replace("_", " ").title()


def build_hf_dataset_card(
    repo_slug: str,
    snapshot_id: str,
    *,
    include_new_contributors: bool = False,
    notes: list[str] | None = None,
) -> str:
    """Render the Hugging Face dataset card (README) for a repo snapshot.

    The card consists of a YAML front-matter block declaring one config per
    parquet file, followed by a markdown body listing the files, suggested
    uses, and notes.

    Args:
        repo_slug: GitHub-style ``owner/name`` slug; the name part becomes
            the card's pretty title.
        snapshot_id: Identifier of the latest snapshot, echoed in the Notes
            section.
        include_new_contributors: When True, also declare the
            ``new_contributors`` config and list its files.
        notes: Optional extra bullet lines appended to the Notes section.

    Returns:
        The complete card text (front matter + markdown).
    """
    repo_title = _repo_title(repo_slug)
    dataset_title = f"{repo_title} PR Dataset"
    new_contributor_config = ""
    new_contributor_files = ""
    if include_new_contributors:
        # Both snippets end with a newline so they splice cleanly into the
        # template whether or not they are empty.
        new_contributor_config = """- config_name: new_contributors
  data_files:
  - split: train
    path: new_contributors.parquet
"""
        new_contributor_files = """- `new_contributors.parquet`
- `new-contributors-report.json`
- `new-contributors-report.md`
"""
    note_lines = "\n".join(f"- {note}" for note in (notes or []))
    if note_lines:
        note_lines = f"{note_lines}\n"
    # NOTE: the front matter must be closed by exactly one `---` line; a
    # previous version emitted it twice, leaking a stray `---` into the
    # markdown body.
    return f"""---
pretty_name: {dataset_title}
configs:
- config_name: issues
  data_files:
  - split: train
    path: issues.parquet
  default: true
- config_name: prs
  data_files:
  - split: train
    path: pull_requests.parquet
- config_name: issue_comments
  data_files:
  - split: train
    path: issue_comments.parquet
- config_name: pr_comments
  data_files:
  - split: train
    path: pr_comments.parquet
- config_name: pr_reviews
  data_files:
  - split: train
    path: reviews.parquet
- config_name: pr_files
  data_files:
  - split: train
    path: pr_files.parquet
- config_name: pr_diffs
  data_files:
  - split: train
    path: pr_diffs.parquet
- config_name: review_comments
  data_files:
  - split: train
    path: review_comments.parquet
- config_name: links
  data_files:
  - split: train
    path: links.parquet
- config_name: events
  data_files:
  - split: train
    path: events.parquet
{new_contributor_config}---

# {dataset_title}

Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo_slug}`.

Files:
- `issues.parquet`
- `pull_requests.parquet`
- `comments.parquet`
- `issue_comments.parquet` (derived view of issue discussion comments)
- `pr_comments.parquet` (derived view of pull request discussion comments)
- `reviews.parquet`
- `pr_files.parquet`
- `pr_diffs.parquet`
- `review_comments.parquet`
- `links.parquet`
- `events.parquet`
{new_contributor_files}
Use:
- duplicate PR and issue analysis
- triage and ranking experiments
- eval set creation

Notes:
- latest snapshot: `{snapshot_id}`
- raw data only; no labels or moderation decisions
- PR metadata, file-level patch hunks, and full unified diffs are included
- full file contents for changed files are not included
{note_lines}"""