Spaces:
Sleeping
Sleeping
File size: 2,159 Bytes
3ec70de | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | import os
import re
import subprocess
import sys
from pathlib import Path
from tasks import task_ids
# Directory one level above the directory containing this file; used below as
# the working directory for the inference subprocess.
ROOT = Path(__file__).resolve().parents[1]
# Whole-line pattern for "[START] task=<id>", where <id> consists of lowercase
# letters, digits and hyphens. Group 1 is the task id.
START_RE = re.compile(r"^\[START\] task=([a-z0-9-]+)$")
# Whole-line pattern for "[STEP] step=<int> reward=<number>". The reward may be
# negative and may carry a decimal part. Groups: (step number, reward).
STEP_RE = re.compile(r"^\[STEP\] step=(\d+) reward=(-?\d+(?:\.\d+)?)$")
# Whole-line pattern for "[END] task=<id> score=<number> steps=<int>". The score
# has no sign and may carry a decimal part. Groups: (task id, score, steps).
END_RE = re.compile(r"^\[END\] task=([a-z0-9-]+) score=(\d+(?:\.\d+)?) steps=(\d+)$")
def test_inference_emits_structured_stdout_for_all_tasks():
    """Run ``inference.py`` as a subprocess and validate its stdout protocol.

    The script is launched with the current interpreter from ``ROOT``, with
    ``API_BASE_URL`` and ``HF_TOKEN`` removed from the environment and
    ``MODEL_NAME`` set to ``"mock-model"`` (presumably so the script runs
    without remote credentials -- confirm against ``inference.py``).

    The test then checks that:

    * the process exits with status 0;
    * none of the ``[START]`` / ``[STEP]`` / ``[END]`` markers appear on
      stderr;
    * the non-blank stdout lines form a sequence of episodes, each being one
      START line, one or more STEP lines numbered consecutively from 1 with
      rewards in ``[-1.0, 1.0]``, and one END line whose task id matches the
      START line, whose score lies in ``[0.0, 1.0]`` and whose step total
      equals the number of STEP lines seen;
    * the task ids appear in exactly the order returned by ``task_ids()``.
    """
    # Build the child environment without the two remote-access variables.
    child_env = {
        key: value
        for key, value in os.environ.items()
        if key not in ("API_BASE_URL", "HF_TOKEN")
    }
    child_env["MODEL_NAME"] = "mock-model"

    completed = subprocess.run(
        [sys.executable, "inference.py"],
        cwd=ROOT,
        env=child_env,
        capture_output=True,
        text=True,
        timeout=120,
        check=False,
    )

    assert completed.returncode == 0
    # Protocol markers belong on stdout only.
    for marker in ("[START]", "[STEP]", "[END]"):
        assert marker not in completed.stderr

    # Blank lines are ignored; surrounding whitespace is stripped.
    records = [
        text for text in map(str.strip, completed.stdout.splitlines()) if text
    ]
    total = len(records)

    expected_tasks = task_ids()
    seen_tasks = []
    cursor = 0
    while cursor < total:
        # Every episode must open with a START line.
        opened = START_RE.match(records[cursor])
        assert opened, f"Invalid START line: {records[cursor]}"
        current_task = opened.group(1)
        seen_tasks.append(current_task)
        cursor += 1

        # Consume the run of STEP lines that follows.
        steps_seen = 0
        while cursor < total:
            step = STEP_RE.match(records[cursor])
            if step is None:
                break
            steps_seen += 1
            assert int(step.group(1)) == steps_seen
            assert -1.0 <= float(step.group(2)) <= 1.0
            cursor += 1

        assert steps_seen >= 1
        assert cursor < total, "Missing END line"

        # The episode must close with an END line consistent with what was seen.
        closed = END_RE.match(records[cursor])
        assert closed, f"Invalid END line: {records[cursor]}"
        reported_task, reported_score, reported_steps = closed.groups()
        assert reported_task == current_task
        assert 0.0 <= float(reported_score) <= 1.0
        assert int(reported_steps) == steps_seen
        cursor += 1

    assert seen_tasks == expected_tasks
|