File size: 5,907 Bytes
a301de7
 
 
 
 
 
 
 
 
 
 
f718aea
 
 
a301de7
f718aea
a301de7
f718aea
 
 
 
 
 
 
a301de7
 
 
f718aea
a301de7
 
f718aea
 
 
 
a301de7
 
f718aea
a301de7
e265a14
 
f718aea
a301de7
f718aea
 
a301de7
 
 
 
 
 
 
 
 
 
 
 
 
f718aea
a301de7
 
 
 
f718aea
 
a301de7
 
 
 
 
 
f718aea
 
 
a301de7
f718aea
a301de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f718aea
a301de7
 
 
e3c4a23
a301de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f718aea
 
 
a301de7
f718aea
 
a301de7
 
 
f718aea
 
a301de7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""Harbor Visualiser β€” FastAPI backend + Harbor Hub UI.

Serves a single-page "Harbor Hub" themed UI (static/) plus a JSON API that
reuses the existing loader/parser:

    GET /                       → the SPA (static/index.html)
    GET /api/hub/datasets       → live list of Harbor-tagged HF datasets
    GET /api/hub/count?id=      → task count for one Hub dataset (memoised)
    GET /api/dataset?uri=       → fetch a dataset, return its task ids + meta
    GET /api/task?uri=&task=    → one task's parsed spec (files + metadata)
    GET /api/config             → runtime config for the UI (Space host / id)
    GET /healthz

Run locally:
    pip install -r requirements.txt
    uvicorn app:app --reload --port 7860      # → http://127.0.0.1:7860

On a Hugging Face Docker Space it runs via the Dockerfile (uvicorn :7860).
"""

from __future__ import annotations

import logging
from pathlib import Path

from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles

from viewer import fetch_dataset, fetch_hf_task, list_tasks, load_task, parse_dataset_uri
from viewer.hub import count_tasks, list_harbor_datasets, list_hf_tasks

# Root logging configuration: INFO and above, each record prefixed with
# timestamp, level and logger name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
# Named logger for this application module.
logger = logging.getLogger("harbor-visualiser")

# HERE is the directory containing this file; STATIC is its "static"
# subdirectory, which holds index.html and is mounted at /static below.
HERE = Path(__file__).resolve().parent
STATIC = HERE / "static"

# The ASGI application object (run with `uvicorn app:app`). The interactive
# API docs are served under /api/docs.
app = FastAPI(title="Harbor Visualiser", docs_url="/api/docs")


# ---------------------------------------------------------------------------
# API
# ---------------------------------------------------------------------------

@app.get("/api/hub/datasets")
def api_hub_datasets(
    q: str | None = Query(None, description="substring filter on dataset id"),
    sort: str = Query("downloads"),
    limit: int = Query(500, ge=1, le=2000),
) -> JSONResponse:
    """List Harbor-tagged datasets from the HF Hub.

    The three query parameters are forwarded unchanged to
    ``list_harbor_datasets`` (as ``query``, ``sort`` and ``limit``).

    Returns:
        JSON object with ``datasets`` (each entry's ``as_dict()``) and
        ``count`` (number of entries returned by the listing).

    Raises:
        HTTPException: 502 if the listing call raises for any reason.
    """
    try:
        found = list_harbor_datasets(query=q, sort=sort, limit=limit)
    except Exception as err:  # noqa: BLE001
        # Any failure in the listing call is reported as a bad gateway.
        raise HTTPException(502, f"HF Hub listing failed: {err}") from err

    rows = [entry.as_dict() for entry in found]
    payload = {"datasets": rows, "count": len(found)}
    return JSONResponse(payload)


@app.get("/api/hub/count")
def api_hub_count(id: str = Query(..., description="dataset id, e.g. owner/name")) -> JSONResponse:  # noqa: A002
    """Task count for a single Hub dataset (one cheap list_repo_files call).

    Args:
        id: Dataset id, e.g. ``owner/name``. The parameter name shadows the
            ``id`` builtin, but it is the public query-string key and so is
            kept as-is.

    Returns:
        JSON object ``{"id": <id>, "tasks": <count_tasks(id)>}``.

    Raises:
        HTTPException: 502 if ``count_tasks`` raises, matching how the other
            endpoints in this module report upstream failures (instead of
            surfacing an unhandled 500).
    """
    try:
        tasks = count_tasks(id)
    except Exception as exc:  # noqa: BLE001
        raise HTTPException(502, f"HF Hub count failed: {exc}") from exc
    return JSONResponse({"id": id, "tasks": tasks})


@app.get("/api/dataset")
def api_dataset(
    uri: str = Query(..., description="owner/name | hf:// | gh:// | harbor:// | local path"),
    refresh: int = Query(0, description="1 = force re-fetch (bypass cache)"),
) -> JSONResponse:
    """Resolve a dataset URI and return its task ids plus source metadata.

    For ``hf`` sources the task ids come from ``list_hf_tasks`` and
    ``refresh`` is not consulted; for every other kind the dataset is
    fetched (``force=bool(refresh)``) and its tasks are listed from the
    fetched root.

    Raises:
        HTTPException: 400 if the URI cannot be parsed (``ValueError``),
            502 if listing or fetching raises for any reason.
    """
    try:
        src = parse_dataset_uri(uri)
    except ValueError as err:
        raise HTTPException(400, str(err)) from err

    try:
        if src.kind != "hf":
            checkout = fetch_dataset(src, force=bool(refresh))
            task_ids = list_tasks(checkout)
        else:
            # Hub datasets: list task ids via the Hub API, with no download.
            # Critical for large datasets (2k+ tasks), which would otherwise
            # snapshot the whole repo.
            task_ids = list_hf_tasks(src.ident, src.revision)
    except Exception as err:  # noqa: BLE001
        raise HTTPException(502, f"fetch failed: {err}") from err

    body = dict(
        uri=uri,
        display=src.display,
        kind=src.kind,
        ident=src.ident,
        revision=src.revision,
        tasks=task_ids,
        count=len(task_ids),
    )
    return JSONResponse(body)


@app.get("/api/task")
def api_task(
    uri: str = Query(...),
    task: str = Query(..., description="task id (directory name)"),
    refresh: int = Query(0),
) -> JSONResponse:
    """Return the parsed spec of a single task: its metadata and its files.

    For ``hf`` sources only the requested task is fetched
    (``fetch_hf_task``); other kinds fetch the whole dataset. Both honour
    ``refresh`` as ``force=bool(refresh)``. The task is then loaded from the
    fetched root with ``load_task``.

    Raises:
        HTTPException: 400 if the URI cannot be parsed (``ValueError``),
            404 on ``FileNotFoundError`` while fetching/loading,
            502 on any other failure while fetching/loading.
    """
    try:
        src = parse_dataset_uri(uri)
    except ValueError as err:
        raise HTTPException(400, str(err)) from err

    force = bool(refresh)
    try:
        # Hub sources pull just this one task's files, not the entire dataset.
        root = (
            fetch_hf_task(src, task, force=force)
            if src.kind == "hf"
            else fetch_dataset(src, force=force)
        )
        spec = load_task(root, task)
    except FileNotFoundError as err:
        raise HTTPException(404, str(err)) from err
    except Exception as err:  # noqa: BLE001
        raise HTTPException(502, f"load failed: {err}") from err

    # Each response key is the attribute of the same name on the loaded task;
    # the tuple order is the key order of the JSON object.
    exposed = (
        "id",
        "name",
        "org",
        "version",
        "description",
        "instruction_inline",
        "difficulty",
        "category",
        "keywords",
        "agent_timeout_sec",
        "verifier_timeout_sec",
        "repo2env",
        "task_toml_raw",
        "files",
    )
    return JSONResponse({field: getattr(spec, field) for field in exposed})


@app.get("/api/config")
def api_config() -> JSONResponse:
    """Expose runtime configuration for the UI.

    Reads ``$SPACE_HOST`` and ``$SPACE_ID`` from the environment. On a
    Hugging Face Space, ``$SPACE_HOST`` is the public app host (e.g.
    owner-name.hf.space); it is surfaced so the deep-link / badge examples
    show the real Space URL instead of localhost.

    Returns:
        JSON object with ``space_host`` and ``space_id``; a variable that is
        unset or empty is reported as ``null``.
    """
    import os

    env = os.environ
    # Response key -> environment variable it is read from.
    sources = {"space_host": "SPACE_HOST", "space_id": "SPACE_ID"}
    settings = {key: env.get(var) or None for key, var in sources.items()}
    return JSONResponse(settings)


@app.get("/healthz")
def healthz() -> dict:
    """Health-check endpoint; always answers with ``{"ok": True}``."""
    status = {"ok": True}
    return status


# ---------------------------------------------------------------------------
# UI (static SPA)
# ---------------------------------------------------------------------------

@app.get("/")
def index() -> FileResponse:
    """Serve the SPA entry page (``index.html`` from STATIC) as ``text/html``."""
    spa_entry = STATIC / "index.html"
    return FileResponse(spa_entry, media_type="text/html")


# Serve the contents of the STATIC directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory=str(STATIC)), name="static")