Spaces:
Running
Running
Recognise registry.json as a positive Harbor-dataset signal
Browse files
Per Harbor docs (harborframework.com/docs/datasets), a registry.json at the
dataset root registers it with the Harbor CLI (--registry-path). It's NOT
required — Terminal-Bench, DABstep, TitanBench all ship without one — but
when present it's a definitive Harbor-dataset signal.
- list_hf_tasks now treats registry.json at root as a positive marker that
skips the task.toml subdir sampling (cheaper, more correct).
- Empty-state message lists all three recognised markers: registry.json,
tasks/ (nested), or top-level dirs with task.toml (flat).
- Verified: Repo2RLEnv (registry+nested) 127, DABstep (flat) 450,
Terminal-Bench (flat) 89, TitanBench (nested) 2, TaskTrove (not Harbor) 0.
- static/app.js +1 -1
- viewer/hub.py +22 -13
static/app.js
CHANGED
|
@@ -373,7 +373,7 @@ async function renderWorkspace(params) {
|
|
| 373 |
<p>Select a task from the list to view its spec, files & run command.</p></div>`
|
| 374 |
: `<div class="emptysel"><div class="ic">${ICON.info}</div>
|
| 375 |
<p><strong style="color:var(--text)">No Harbor tasks found in this dataset.</strong><br>
|
| 376 |
-
The visualiser
|
| 377 |
}
|
| 378 |
|
| 379 |
// ── load one task's detail into the tree + content (no full re-render) ──
|
|
|
|
| 373 |
<p>Select a task from the list to view its spec, files & run command.</p></div>`
|
| 374 |
: `<div class="emptysel"><div class="ic">${ICON.info}</div>
|
| 375 |
<p><strong style="color:var(--text)">No Harbor tasks found in this dataset.</strong><br>
|
| 376 |
+
The visualiser recognises Harbor datasets by either a <code>registry.json</code> at the root, a <code>tasks/</code> folder (nested layout), or top-level dirs containing <code>task.toml</code> (flat layout). This dataset doesn't follow any of those.</p></div>`;
|
| 377 |
}
|
| 378 |
|
| 379 |
// ── load one task's detail into the tree + content (no full re-render) ──
|
viewer/hub.py
CHANGED
|
@@ -101,25 +101,34 @@ def list_hf_tasks(dataset_id: str, revision: str | None = None, *, ttl: float =
|
|
| 101 |
root = list(api.list_repo_tree(dataset_id, repo_type="dataset", revision=revision, recursive=False))
|
| 102 |
names = {e.path: e for e in root}
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
if "tasks" in names and _is_dir(names["tasks"]):
|
| 105 |
sub = api.list_repo_tree(dataset_id, "tasks", repo_type="dataset", revision=revision, recursive=False)
|
| 106 |
ids = sorted(e.path.split("/")[-1] for e in sub if _is_dir(e))
|
| 107 |
else:
|
| 108 |
# Flat layout: top-level folders MAY be tasks (skip dotfiles/README/etc.).
|
| 109 |
-
#
|
| 110 |
-
# tasks — they hold `tasks.parquet` or similar. Verify
|
| 111 |
-
#
|
| 112 |
-
#
|
| 113 |
candidates = sorted(e.path for e in root if _is_dir(e) and not e.path.startswith("."))
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
_TASKS_CACHE[key] = (ids, now)
|
| 125 |
return ids
|
|
|
|
| 101 |
root = list(api.list_repo_tree(dataset_id, repo_type="dataset", revision=revision, recursive=False))
|
| 102 |
names = {e.path: e for e in root}
|
| 103 |
|
| 104 |
+
# `registry.json` at the root is a positive signal that this is a Harbor
|
| 105 |
+
# dataset (Repo2RLEnv pushes it; harbor's --registry-path consumes it).
|
| 106 |
+
# It's *not* required — terminal-bench-2.0, dabstep-harbor, titanbench all
|
| 107 |
+
# ship without one — but its presence skips the task.toml sampling below.
|
| 108 |
+
has_registry = "registry.json" in names
|
| 109 |
+
|
| 110 |
if "tasks" in names and _is_dir(names["tasks"]):
|
| 111 |
sub = api.list_repo_tree(dataset_id, "tasks", repo_type="dataset", revision=revision, recursive=False)
|
| 112 |
ids = sorted(e.path.split("/")[-1] for e in sub if _is_dir(e))
|
| 113 |
else:
|
| 114 |
# Flat layout: top-level folders MAY be tasks (skip dotfiles/README/etc.).
|
| 115 |
+
# Some datasets (e.g. TaskTrove) have top-level dirs that aren't Harbor
|
| 116 |
+
# tasks — they hold `tasks.parquet` or similar. Verify by sampling the
|
| 117 |
+
# first few candidates for a `task.toml`. If `registry.json` is at the
|
| 118 |
+
# root we already know this is a Harbor dataset and skip the check.
|
| 119 |
candidates = sorted(e.path for e in root if _is_dir(e) and not e.path.startswith("."))
|
| 120 |
+
if has_registry:
|
| 121 |
+
ids = candidates
|
| 122 |
+
else:
|
| 123 |
+
ids = []
|
| 124 |
+
for sample in candidates[:3]:
|
| 125 |
+
try:
|
| 126 |
+
sub = list(api.list_repo_tree(dataset_id, sample, repo_type="dataset", revision=revision, recursive=False))
|
| 127 |
+
except Exception: # noqa: BLE001
|
| 128 |
+
continue
|
| 129 |
+
if any(getattr(e, "path", "").endswith("task.toml") for e in sub):
|
| 130 |
+
ids = candidates
|
| 131 |
+
break
|
| 132 |
|
| 133 |
_TASKS_CACHE[key] = (ids, now)
|
| 134 |
return ids
|