Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| from tasks import task_ids | |
# Line grammar of the structured log that inference.py prints on stdout.
# Every pattern is anchored at both ends, so a line must match in full.

# "[START] task=<id>" -- <id> is lowercase letters, digits and hyphens.
START_RE = re.compile(r"^\[START\] task=([a-z0-9-]+)$")

# "[STEP] step=<n> reward=<r>" -- <r> is an optionally negative decimal
# that must have at least one digit before any decimal point.
STEP_RE = re.compile(r"^\[STEP\] step=(\d+) reward=(-?\d+(?:\.\d+)?)$")

# "[END] task=<id> score=<s> steps=<n>" -- <s> is a decimal with no sign.
END_RE = re.compile(r"^\[END\] task=([a-z0-9-]+) score=(\d+(?:\.\d+)?) steps=(\d+)$")

# Parent of the directory that contains this file; the test below uses it as
# the working directory when launching inference.py.
ROOT = Path(__file__).resolve().parents[1]
def test_inference_emits_structured_stdout_for_all_tasks():
    """Run ``inference.py`` and validate the structured log it prints.

    The script is launched as a subprocess from ``ROOT`` with
    ``API_BASE_URL`` and ``HF_TOKEN`` removed from the environment and
    ``MODEL_NAME`` set to ``"mock-model"``.  Its stdout, ignoring blank
    lines, must consist solely of consecutive episodes of the form::

        [START] task=<id>
        [STEP] step=1 reward=<r>
        ...
        [END] task=<id> score=<s> steps=<n>

    Checks performed:

    * the process exits with status 0 and prints no markers on stderr;
    * step numbers count up from 1 and each reward lies in ``[-1, 1]``;
    * each episode has at least one step and an END line whose task id and
      step total agree with the episode, with a score in ``[0, 1]``;
    * the task ids appear in exactly the order returned by ``task_ids()``.
    """
    env = os.environ.copy()
    # Keep endpoint/credential settings from the developer's shell out of the
    # child process.  NOTE(review): presumably this makes inference.py take an
    # offline/mock path -- confirm against inference.py.
    env.pop("API_BASE_URL", None)
    env.pop("HF_TOKEN", None)
    env["MODEL_NAME"] = "mock-model"

    result = subprocess.run(
        [sys.executable, "inference.py"],
        cwd=ROOT,
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
        check=False,  # the exit status is asserted below, with stderr attached
    )
    assert result.returncode == 0, (
        f"inference.py exited with status {result.returncode}; "
        f"stderr:\n{result.stderr}"
    )

    # The structured markers belong on stdout only.
    for marker in ("[START]", "[STEP]", "[END]"):
        assert marker not in result.stderr, f"{marker} marker found on stderr"

    lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    expected_tasks = task_ids()
    seen_tasks = []
    line_index = 0

    # Each pass of the outer loop consumes one complete episode.
    while line_index < len(lines):
        start_match = START_RE.match(lines[line_index])
        assert start_match, f"Invalid START line: {lines[line_index]}"
        task_id = start_match.group(1)
        seen_tasks.append(task_id)
        line_index += 1

        # Consume the run of STEP lines; the first non-STEP line ends the run
        # and is then required to be the END line.
        step_count = 0
        while line_index < len(lines):
            step_match = STEP_RE.match(lines[line_index])
            if step_match is None:
                break
            step_count += 1
            assert int(step_match.group(1)) == step_count, (
                f"Out-of-sequence STEP line for task {task_id}: {lines[line_index]}"
            )
            reward = float(step_match.group(2))
            assert -1.0 <= reward <= 1.0, (
                f"Reward out of range for task {task_id}: {lines[line_index]}"
            )
            line_index += 1

        assert step_count >= 1, f"No STEP lines for task {task_id}"
        assert line_index < len(lines), "Missing END line"

        end_match = END_RE.match(lines[line_index])
        assert end_match, f"Invalid END line: {lines[line_index]}"
        assert end_match.group(1) == task_id, (
            f"END line names a different task than {task_id}: {lines[line_index]}"
        )
        assert 0.0 <= float(end_match.group(2)) <= 1.0, (
            f"Score out of range for task {task_id}: {lines[line_index]}"
        )
        assert int(end_match.group(3)) == step_count, (
            f"END line step total disagrees with {step_count} STEP lines: "
            f"{lines[line_index]}"
        )
        line_index += 1

    assert seen_tasks == expected_tasks