#!/usr/bin/env python3
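"""Local pre-submission validation script.

Verifies required repository artifacts, runs the OpenEnv CLI validation and the
unit-test suite, exercises the environment graders across all three
difficulties, and (when present) checks that baseline results parse as JSON.
"""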
from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))


def run(command: list[str]) -> tuple[int, str]:
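    """Run a command from the repository root and return (exit code, combined stdout/stderr)."""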
    proc = subprocess.run(
        command,
        cwd=ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    return proc.returncode, proc.stdout


def require(path: Path, label: str) -> bool:
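    """Check that a required file exists, printing a PASS/FAIL line for it."""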
    if path.exists():
        print(f"[PASS] {label}: {path}")
        return True
    print(f"[FAIL] {label}: missing {path}")
    return False


def check_tasks_and_scores() -> bool:
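    """Run a scripted episode per difficulty and verify each final score lies strictly in (0, 1)."""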
    from env.environment import IDPIExfilEnv
    from env.models import Action

    env = IDPIExfilEnv()
    scores = []
    for difficulty in ("easy", "medium", "hard"):
        env.reset(difficulty=difficulty, seed=42)
        env.step(Action(type="ReadDoc"))
        env.step(Action(type="FlagInjection", reason="suspicious instruction"))
        if difficulty == "hard":
            env.step(Action(type="WriteDB", content="clean internal summary"))
        result = env.get_result()
        scores.append((difficulty, result.final_score))

    ok = all(0.0 < score < 1.0 for _, score in scores)
    if ok:
        print(f"[PASS] 3+ tasks with graders: {scores}")
    else:
        print(f"[FAIL] score out of range: {scores}")
    return ok


def main() -> int:
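    """Run every validation check and return a process exit code (0 on success, 1 on failure)."""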
    checks = []
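    # Required repository artifacts.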
    checks.append(require(ROOT / "inference.py", "inference entrypoint"))
    checks.append(require(ROOT / "openenv.yaml", "OpenEnv spec file"))
    checks.append(require(ROOT / "Dockerfile", "Dockerfile"))
    checks.append(require(ROOT / "uv.lock", "uv lockfile"))
    checks.append(require(ROOT / ".env.example", "environment variable template"))
    checks.append(require(ROOT / "docs" / "person_a_benchmark_manifest.json", "task manifest"))

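    # Tool-driven checks: OpenEnv spec validation and the unit-test suite.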
    code, out = run(["openenv", "validate"])
    print(out.rstrip())
    checks.append(code == 0)

    code, out = run(["python", "-m", "unittest", "discover", "-s", "tests", "-p", "test_*.py"])
    print(out.rstrip())
    checks.append(code == 0)

    checks.append(check_tasks_and_scores())

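    # Baseline results are optional pre-key; if the file exists, its JSON must parse.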
    baseline_path = ROOT / "baseline" / "baseline_results.json"
    if baseline_path.exists():
        try:
            json.loads(baseline_path.read_text(encoding="utf-8"))
            print(f"[PASS] baseline output json parse: {baseline_path}")
            checks.append(True)
        except Exception as exc:
            print(f"[FAIL] baseline output invalid json: {exc}")
            checks.append(False)
    else:
        print(f"[WARN] baseline output missing (acceptable pre-key): {baseline_path}")

    if all(checks):
        print("\n[OK] Local pre-submission validation passed.")
        return 0
    print("\n[FAIL] Local pre-submission validation failed.")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())