Spaces:

HuggingFaceH4
/

harbor-visualiser

Running

App Files Files Community

harbor-visualiser / viewer /parse.py

AdithyaSK HF Staff

v0.4: surface ALL task files, not just an allowlist

3949fb1 9 days ago

raw

history blame contribute delete

7.74 kB

	"""Walk a Harbor dataset directory + load each task's spec files.

	A "Harbor task" is a directory containing `task.toml` at its root, plus
	any of: `instruction.md`, `solution/patch.diff`, `solution/solve.sh`,
	`tests/test.sh`, `environment/Dockerfile`. We tolerate missing files —
	not every task ships every artifact.

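	Two dataset layouts are accepted:

	    flat:   <root>/<task-id>/task.toml
	    nested: <root>/tasks/<task-id>/task.toml

	We discover both transparently. A `<root>` that itself contains `task.toml`
	is also accepted and treated as a dataset holding that single task.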
	"""

	from __future__ import annotations

	import logging
	import tomllib
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Any

	# Module-level logger, named after this module; used to warn about files
	# that exist but could not be read.
	logger = logging.getLogger(__name__)


	@dataclass(slots=True)
	class HarborTask:
	"""The full parsed view of a single Harbor task directory."""

	# Identity
	id: str # the directory name, e.g. "pallets__click-3373"
	root: Path # absolute path to the task dir

	# task.toml top-level fields
	name: str \| None = None
	org: str \| None = None
	version: str \| None = None

	# [task]
	description: str \| None = None
	instruction_inline: str \| None = None # if task.toml has `instruction =`

	# [metadata]
	difficulty: str \| None = None
	category: str \| None = None
	keywords: list[str] = field(default_factory=list)

	# [agent] / [verifier] timeouts (if set)
	agent_timeout_sec: float \| None = None
	verifier_timeout_sec: float \| None = None

	# Repo2RLEnv extension — [metadata.repo2env] block, opaque dict
	repo2env: dict[str, Any] \| None = None

	# File contents (None = file not present)
	instruction_md: str \| None = None
	oracle_patch: str \| None = None
	solve_sh: str \| None = None
	test_sh: str \| None = None
	dockerfile: str \| None = None
	task_toml_raw: str = ""

	# Generic discovery: every readable text file under the task dir,
	# keyed by path relative to the task root. e.g. "tests/grader.py",
	# "environment/pull_bucket.py", "solution/patch.diff". Populated by
	# load_task() walking the directory tree — the file viewer surfaces
	# everything in here so the dataset is shown faithfully, not via
	# a hardcoded allowlist.
	files: dict[str, str] = field(default_factory=dict)


	def _read_text(path: Path) -> str \| None:
	"""Read a text file, return None if it doesn't exist."""
	try:
	return path.read_text(encoding="utf-8")
	except FileNotFoundError:
	return None
	except Exception as exc:
	logger.warning("could not read %s: %s", path, exc)
	return None


	# Files we never surface in the file viewer (binaries, caches, secrets).
	_SKIP_DIRS: set[str] = {".git", "__pycache__", ".cache", "node_modules", ".venv", ".pytest_cache", ".mypy_cache"}
	_SKIP_NAME_PREFIXES: tuple[str, ...] = (".DS_Store",)
	# Hard cap on file size — anything bigger we treat as non-displayable.
	_MAX_FILE_BYTES = 512 * 1024 # 512 KiB; viewer's code panel chokes well below this


	def _discover_task_files(task_dir: Path) -> dict[str, str]:
	"""Walk `task_dir` recursively and return every readable text file as
	{relative_path: content}. Skips binaries, hidden noise, and oversized files.
	Paths use forward slashes (POSIX-style) for stable file_ids across OSes.
	"""
	out: dict[str, str] = {}
	for path in sorted(task_dir.rglob("*")):
	if not path.is_file():
	continue
	# Skip files inside excluded directories (any level)
	if any(part in _SKIP_DIRS for part in path.relative_to(task_dir).parts):
	continue
	if path.name.startswith(_SKIP_NAME_PREFIXES):
	continue
	try:
	size = path.stat().st_size
	except OSError:
	continue
	if size > _MAX_FILE_BYTES:
	continue
	try:
	content = path.read_text(encoding="utf-8")
	except (UnicodeDecodeError, OSError):
	continue
	rel = path.relative_to(task_dir).as_posix()
	out[rel] = content
	return out


	def _discover_task_roots(dataset_root: Path) -> list[Path]:
	"""Find every directory under `dataset_root` that contains a `task.toml`.

	Handles both flat (`<root>/<id>/task.toml`) and nested
	(`<root>/tasks/<id>/task.toml`) Harbor layouts. Prefers nested when
	`<root>/tasks/` exists (that's what `repo2rlenv push` stages on HF).
	Also handles the case where the dataset is itself a single task root.
	"""
	if (dataset_root / "task.toml").exists():
	return [dataset_root]

	tasks_dir = dataset_root / "tasks"
	if tasks_dir.is_dir():
	return sorted(
	[p for p in tasks_dir.iterdir() if p.is_dir() and (p / "task.toml").exists()]
	)

	# Flat layout — every immediate subdir that has task.toml is a task
	return sorted(
	[
	p
	for p in dataset_root.iterdir()
	if p.is_dir() and not p.name.startswith(".") and (p / "task.toml").exists()
	]
	)


	def list_tasks(dataset_root: Path) -> list[str]:
	    """List the task ids found under `dataset_root`.

	    A task id is simply the name of the task's directory; ids come back in
	    the same order as the discovered task directories.
	    """
	    task_ids: list[str] = []
	    for task_root in _discover_task_roots(dataset_root):
	        task_ids.append(task_root.name)
	    return task_ids


	def _resolve_task_dir(dataset_root: Path, task_id: str) -> Path:
	"""Find the on-disk directory for a given task id (handles flat + nested)."""
	flat = dataset_root / task_id
	if (flat / "task.toml").exists():
	return flat
	nested = dataset_root / "tasks" / task_id
	if (nested / "task.toml").exists():
	return nested
	if (dataset_root / "task.toml").exists() and dataset_root.name == task_id:
	return dataset_root
	raise FileNotFoundError(f"no task {task_id!r} under {dataset_root}")


	def _as_table(value: Any) -> dict[str, Any]:
	    """Return `value` if it is a TOML table (a dict), otherwise an empty dict."""
	    return value if isinstance(value, dict) else {}


	def load_task(dataset_root: Path, task_id: str) -> HarborTask:
	    """Load every spec file for a single Harbor task.

	    Parses `task.toml`, walks the task directory for displayable files and
	    bundles both into a `HarborTask`. Tolerant of missing pieces: absent
	    tables, keys and files simply leave the matching field at its default.
	    A `[task]`, `[metadata]`, `[agent]`, `[verifier]` or
	    `[metadata.repo2env]` entry that is present but is not a table is
	    ignored rather than crashing the load.

	    Args:
	        dataset_root: Root directory of the dataset.
	        task_id: Directory name of the task to load.

	    Returns:
	        The populated `HarborTask`.

	    Raises:
	        FileNotFoundError: if no task directory matches `task_id`.
	        tomllib.TOMLDecodeError: if `task.toml` is not valid TOML.
	    """
	    task_dir = _resolve_task_dir(dataset_root, task_id)
	    raw = (task_dir / "task.toml").read_text(encoding="utf-8")
	    data = tomllib.loads(raw)

	    task_block = _as_table(data.get("task"))
	    metadata_block = _as_table(data.get("metadata"))
	    agent_block = _as_table(data.get("agent"))
	    verifier_block = _as_table(data.get("verifier"))

	    # [metadata.repo2env] is the Repo2RLEnv extension — surfaced as a separate
	    # opaque dict so the UI can render it specially if present.
	    repo2env = metadata_block.get("repo2env")
	    if not isinstance(repo2env, dict):
	        repo2env = None

	    # `keywords` should be an array. A bare string is kept as one keyword
	    # instead of being split into characters; any other type is dropped.
	    raw_keywords = metadata_block.get("keywords")
	    if isinstance(raw_keywords, str):
	        keywords = [raw_keywords]
	    elif isinstance(raw_keywords, list):
	        keywords = list(raw_keywords)
	    else:
	        keywords = []

	    # Walk the whole task dir so any present file (grader.py, pull_bucket.py,
	    # helper modules, multiple test scripts, ...) ends up in the viewer.
	    files = _discover_task_files(task_dir)

	    return HarborTask(
	        id=task_id,
	        root=task_dir,
	        name=task_block.get("name"),
	        org=task_block.get("org"),
	        version=data.get("version"),
	        description=task_block.get("description"),
	        instruction_inline=task_block.get("instruction"),
	        difficulty=metadata_block.get("difficulty"),
	        category=metadata_block.get("category"),
	        keywords=keywords,
	        agent_timeout_sec=agent_block.get("timeout_sec"),
	        verifier_timeout_sec=verifier_block.get("timeout_sec"),
	        repo2env=repo2env,
	        # Convenience fields — kept populated for backwards compat with callers
	        # that reach in by name. Source of truth for the viewer is `files`.
	        instruction_md=files.get("instruction.md"),
	        oracle_patch=files.get("solution/patch.diff"),
	        solve_sh=files.get("solution/solve.sh"),
	        test_sh=files.get("tests/test.sh"),
	        dockerfile=files.get("environment/Dockerfile"),
	        task_toml_raw=raw,
	        files=files,
	    )