Spaces:
Sleeping
Sleeping
Materialize triplets as real files in working dir
Browse files
- .gitignore +1 -0
- app.py +19 -12
.gitignore
CHANGED
|
@@ -2,3 +2,4 @@ __pycache__/
|
|
| 2 |
*.pyc
|
| 3 |
.gradio/
|
| 4 |
flagged/
|
|
|
|
|
|
| 2 |
*.pyc
|
| 3 |
.gradio/
|
| 4 |
flagged/
|
| 5 |
+
triplets_local/
|
app.py
CHANGED
|
@@ -13,6 +13,7 @@ import json
|
|
| 13 |
import os
|
| 14 |
import random
|
| 15 |
import re
|
|
|
|
| 16 |
import uuid
|
| 17 |
from datetime import datetime, timezone
|
| 18 |
from pathlib import Path
|
|
@@ -33,19 +34,29 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|
| 33 |
|
| 34 |
|
| 35 |
def load_triplets() -> tuple[Path, list[dict]]:
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
)
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
rows: list[dict] = []
|
| 43 |
-
with open(
|
| 44 |
for line in f:
|
| 45 |
line = line.strip()
|
| 46 |
if line:
|
| 47 |
rows.append(json.loads(line))
|
| 48 |
-
return
|
| 49 |
|
| 50 |
|
| 51 |
def email_slug(email: str) -> str:
|
|
@@ -77,10 +88,6 @@ TRIPLETS_ROOT, TRIPLETS = load_triplets()
|
|
| 77 |
TRIPLET_BY_ID = {r["triplet_id"]: r for r in TRIPLETS}
|
| 78 |
print(f"loaded {len(TRIPLETS)} triplets from {TRIPLETS_ROOT}")
|
| 79 |
|
| 80 |
-
# Triplets live in the HF snapshot cache (outside the gradio working dir),
|
| 81 |
-
# so Gradio refuses to serve them by default. Whitelist the snapshot dir.
|
| 82 |
-
gr.set_static_paths([str(TRIPLETS_ROOT)])
|
| 83 |
-
|
| 84 |
|
| 85 |
# ---------- session helpers ---------------------------------------------------
|
| 86 |
|
|
@@ -344,4 +351,4 @@ with gr.Blocks(title="Denoising A/B Judging", theme=gr.themes.Soft()) as demo:
|
|
| 344 |
|
| 345 |
|
| 346 |
if __name__ == "__main__":
|
| 347 |
-
demo.launch(
|
|
|
|
| 13 |
import os
|
| 14 |
import random
|
| 15 |
import re
|
| 16 |
+
import shutil
|
| 17 |
import uuid
|
| 18 |
from datetime import datetime, timezone
|
| 19 |
from pathlib import Path
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def load_triplets() -> tuple[Path, list[dict]]:
|
| 37 |
+
"""Download the triplets dataset, then materialize as real files under the
|
| 38 |
+
working directory so Gradio will serve them. The HF cache uses symlinks
|
| 39 |
+
into a `blobs/` sibling, which Gradio's symlink-resolving allowlist check
|
| 40 |
+
rejects even when the snapshot dir is whitelisted."""
|
| 41 |
+
snapshot = Path(
|
| 42 |
+
snapshot_download(
|
| 43 |
+
repo_id=TRIPLETS_REPO,
|
| 44 |
+
repo_type="dataset",
|
| 45 |
+
token=HF_TOKEN,
|
| 46 |
+
)
|
| 47 |
)
|
| 48 |
+
local_root = Path(__file__).resolve().parent / "triplets_local"
|
| 49 |
+
if local_root.exists():
|
| 50 |
+
shutil.rmtree(local_root)
|
| 51 |
+
# symlinks=False (copytree's default) follows symlinks, so file contents are copied from the blobs sibling.
|
| 52 |
+
shutil.copytree(snapshot, local_root)
|
| 53 |
rows: list[dict] = []
|
| 54 |
+
with open(local_root / "metadata.jsonl") as f:
|
| 55 |
for line in f:
|
| 56 |
line = line.strip()
|
| 57 |
if line:
|
| 58 |
rows.append(json.loads(line))
|
| 59 |
+
return local_root, rows
|
| 60 |
|
| 61 |
|
| 62 |
def email_slug(email: str) -> str:
|
|
|
|
| 88 |
TRIPLET_BY_ID = {r["triplet_id"]: r for r in TRIPLETS}
|
| 89 |
print(f"loaded {len(TRIPLETS)} triplets from {TRIPLETS_ROOT}")
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# ---------- session helpers ---------------------------------------------------
|
| 93 |
|
|
|
|
| 351 |
|
| 352 |
|
| 353 |
if __name__ == "__main__":
|
| 354 |
+
demo.launch()
|