AdithyaSK HF Staff committed on
Commit
e587765
·
1 Parent(s): ff3a8e7

Recognise registry.json as a positive Harbor-dataset signal

Browse files

Per Harbor docs (harborframework.com/docs/datasets), a registry.json at the
dataset root registers it with the Harbor CLI (--registry-path). It's NOT
required — Terminal-Bench, DABstep, TitanBench all ship without one — but
when present it's a definitive Harbor-dataset signal.

- list_hf_tasks now treats registry.json at root as a positive marker that
skips the task.toml subdir sampling (cheaper, more correct).
- Empty-state message lists all three recognised markers: registry.json,
tasks/ (nested), or top-level dirs with task.toml (flat).
- Verified: Repo2RLEnv (registry+nested) 127, DABstep (flat) 450,
Terminal-Bench (flat) 89, TitanBench (nested) 2, TaskTrove (not Harbor) 0.

Files changed (2) hide show
  1. static/app.js +1 -1
  2. viewer/hub.py +22 -13
static/app.js CHANGED
@@ -373,7 +373,7 @@ async function renderWorkspace(params) {
373
  <p>Select a task from the list to view its spec, files & run command.</p></div>`
374
  : `<div class="emptysel"><div class="ic">${ICON.info}</div>
375
  <p><strong style="color:var(--text)">No Harbor tasks found in this dataset.</strong><br>
376
- The visualiser looks for <code>task.toml</code> files (either at the root or under <code>tasks/</code>). This dataset doesn't seem to follow the Harbor task-spec format.</p></div>`;
377
  }
378
 
379
  // ── load one task's detail into the tree + content (no full re-render) ──
 
373
  <p>Select a task from the list to view its spec, files & run command.</p></div>`
374
  : `<div class="emptysel"><div class="ic">${ICON.info}</div>
375
  <p><strong style="color:var(--text)">No Harbor tasks found in this dataset.</strong><br>
376
+ The visualiser recognises Harbor datasets by either a <code>registry.json</code> at the root, a <code>tasks/</code> folder (nested layout), or top-level dirs containing <code>task.toml</code> (flat layout). This dataset doesn't follow any of those.</p></div>`;
377
  }
378
 
379
  // ── load one task's detail into the tree + content (no full re-render) ──
viewer/hub.py CHANGED
@@ -101,25 +101,34 @@ def list_hf_tasks(dataset_id: str, revision: str | None = None, *, ttl: float =
101
  root = list(api.list_repo_tree(dataset_id, repo_type="dataset", revision=revision, recursive=False))
102
  names = {e.path: e for e in root}
103
 
 
 
 
 
 
 
104
  if "tasks" in names and _is_dir(names["tasks"]):
105
  sub = api.list_repo_tree(dataset_id, "tasks", repo_type="dataset", revision=revision, recursive=False)
106
  ids = sorted(e.path.split("/")[-1] for e in sub if _is_dir(e))
107
  else:
108
  # Flat layout: top-level folders MAY be tasks (skip dotfiles/README/etc.).
109
- # But some datasets (e.g. TaskTrove) have top-level dirs that aren't Harbor
110
- # tasks — they hold `tasks.parquet` or similar. Verify the layout by sampling
111
- # the first few candidates for a `task.toml`; if none have one, this isn't a
112
- # Harbor task-spec dataset and we return [] rather than listing random folders.
113
  candidates = sorted(e.path for e in root if _is_dir(e) and not e.path.startswith("."))
114
- ids = []
115
- for sample in candidates[:3]:
116
- try:
117
- sub = list(api.list_repo_tree(dataset_id, sample, repo_type="dataset", revision=revision, recursive=False))
118
- except Exception: # noqa: BLE001
119
- continue
120
- if any(getattr(e, "path", "").endswith("task.toml") for e in sub):
121
- ids = candidates
122
- break
 
 
 
123
 
124
  _TASKS_CACHE[key] = (ids, now)
125
  return ids
 
101
  root = list(api.list_repo_tree(dataset_id, repo_type="dataset", revision=revision, recursive=False))
102
  names = {e.path: e for e in root}
103
 
104
+ # `registry.json` at the root is a positive signal that this is a Harbor
105
+ # dataset (Repo2RLEnv pushes it; harbor's --registry-path consumes it).
106
+ # It's *not* required — terminal-bench-2.0, dabstep-harbor, titanbench all
107
+ # ship without one β€” but its presence skips the task.toml sampling below.
108
+ has_registry = "registry.json" in names
109
+
110
  if "tasks" in names and _is_dir(names["tasks"]):
111
  sub = api.list_repo_tree(dataset_id, "tasks", repo_type="dataset", revision=revision, recursive=False)
112
  ids = sorted(e.path.split("/")[-1] for e in sub if _is_dir(e))
113
  else:
114
  # Flat layout: top-level folders MAY be tasks (skip dotfiles/README/etc.).
115
+ # Some datasets (e.g. TaskTrove) have top-level dirs that aren't Harbor
116
+ # tasks — they hold `tasks.parquet` or similar. Verify by sampling the
117
+ # first few candidates for a `task.toml`. If `registry.json` is at the
118
+ # root we already know this is a Harbor dataset and skip the check.
119
  candidates = sorted(e.path for e in root if _is_dir(e) and not e.path.startswith("."))
120
+ if has_registry:
121
+ ids = candidates
122
+ else:
123
+ ids = []
124
+ for sample in candidates[:3]:
125
+ try:
126
+ sub = list(api.list_repo_tree(dataset_id, sample, repo_type="dataset", revision=revision, recursive=False))
127
+ except Exception: # noqa: BLE001
128
+ continue
129
+ if any(getattr(e, "path", "").endswith("task.toml") for e in sub):
130
+ ids = candidates
131
+ break
132
 
133
  _TASKS_CACHE[key] = (ids, now)
134
  return ids