cgeorgiaw HF Staff committed on
Commit
78c0906
·
verified ·
1 Parent(s): bf2aed6

Materialize triplets as real files in working dir

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +19 -12
.gitignore CHANGED
@@ -2,3 +2,4 @@ __pycache__/
2
  *.pyc
3
  .gradio/
4
  flagged/
 
 
2
  *.pyc
3
  .gradio/
4
  flagged/
5
+ triplets_local/
app.py CHANGED
@@ -13,6 +13,7 @@ import json
13
  import os
14
  import random
15
  import re
 
16
  import uuid
17
  from datetime import datetime, timezone
18
  from pathlib import Path
@@ -33,19 +34,29 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
33
 
34
 
35
  def load_triplets() -> tuple[Path, list[dict]]:
36
- local = snapshot_download(
37
- repo_id=TRIPLETS_REPO,
38
- repo_type="dataset",
39
- token=HF_TOKEN,
 
 
 
 
 
 
40
  )
41
- root = Path(local)
 
 
 
 
42
  rows: list[dict] = []
43
- with open(root / "metadata.jsonl") as f:
44
  for line in f:
45
  line = line.strip()
46
  if line:
47
  rows.append(json.loads(line))
48
- return root, rows
49
 
50
 
51
  def email_slug(email: str) -> str:
@@ -77,10 +88,6 @@ TRIPLETS_ROOT, TRIPLETS = load_triplets()
77
  TRIPLET_BY_ID = {r["triplet_id"]: r for r in TRIPLETS}
78
  print(f"loaded {len(TRIPLETS)} triplets from {TRIPLETS_ROOT}")
79
 
80
- # Triplets live in the HF snapshot cache (outside the gradio working dir),
81
- # so Gradio refuses to serve them by default. Whitelist the snapshot dir.
82
- gr.set_static_paths([str(TRIPLETS_ROOT)])
83
-
84
 
85
  # ---------- session helpers ---------------------------------------------------
86
 
@@ -344,4 +351,4 @@ with gr.Blocks(title="Denoising A/B Judging", theme=gr.themes.Soft()) as demo:
344
 
345
 
346
  if __name__ == "__main__":
347
- demo.launch(allowed_paths=[str(TRIPLETS_ROOT)])
 
13
  import os
14
  import random
15
  import re
16
+ import shutil
17
  import uuid
18
  from datetime import datetime, timezone
19
  from pathlib import Path
 
34
 
35
 
36
  def load_triplets() -> tuple[Path, list[dict]]:
37
+ """Download the triplets dataset, then materialize as real files under the
38
+ working directory so Gradio will serve them. The HF cache uses symlinks
39
+ into a `blobs/` sibling, which Gradio's symlink-resolving allowlist check
40
+ rejects even when the snapshot dir is whitelisted."""
41
+ snapshot = Path(
42
+ snapshot_download(
43
+ repo_id=TRIPLETS_REPO,
44
+ repo_type="dataset",
45
+ token=HF_TOKEN,
46
+ )
47
  )
48
+ local_root = Path(__file__).resolve().parent / "triplets_local"
49
+ if local_root.exists():
50
+ shutil.rmtree(local_root)
51
+ # follow_symlinks=True (default) copies file contents from the blobs sibling.
52
+ shutil.copytree(snapshot, local_root)
53
  rows: list[dict] = []
54
+ with open(local_root / "metadata.jsonl") as f:
55
  for line in f:
56
  line = line.strip()
57
  if line:
58
  rows.append(json.loads(line))
59
+ return local_root, rows
60
 
61
 
62
  def email_slug(email: str) -> str:
 
88
  TRIPLET_BY_ID = {r["triplet_id"]: r for r in TRIPLETS}
89
  print(f"loaded {len(TRIPLETS)} triplets from {TRIPLETS_ROOT}")
90
 
 
 
 
 
91
 
92
  # ---------- session helpers ---------------------------------------------------
93
 
 
351
 
352
 
353
  if __name__ == "__main__":
354
+ demo.launch()