payops_env

Paused

File size: 21,419 Bytes

#!/usr/bin/env python3
"""
validate.py — Pre-submission Validation Script
===============================================
Runs all checklist items before submitting to the competition.
All checks must pass or the submission will be disqualified.

Usage
-----
  # With server already running:
  python validate.py

  # Start server automatically:
  python validate.py --start-server

  # Custom server URL:
  python validate.py --url http://localhost:7860
"""

from __future__ import annotations

import argparse
import json
import os
import subprocess
import sys
import time
import ssl
import urllib.request
import urllib.error
from pathlib import Path

# ── ANSI colours ───────────────────────────────────────────────────────────
# Terminal escape sequences used to style the report; RESET switches styling
# back off and is appended after every coloured fragment.
GREEN  = "\033[32m"
RED    = "\033[31m"
YELLOW = "\033[33m"
CYAN   = "\033[36m"
BOLD   = "\033[1m"
RESET  = "\033[0m"

# Running tallies, bumped by ok() / fail() / warn() and reported by main().
# main() exits non-zero when FAIL_COUNT is greater than zero.
PASS_COUNT = 0
FAIL_COUNT = 0
WARN_COUNT = 0
# Handle of the uvicorn subprocess that main() spawns under --start-server;
# stays None when this script did not start a server itself.
_SERVER_PROC = None


def ok(msg: str):
    """Count one passed check and print *msg* behind a green tick."""
    global PASS_COUNT
    PASS_COUNT = PASS_COUNT + 1
    marker = f"{GREEN}✓{RESET}"
    print(f"  {marker}  {msg}")


def fail(msg: str):
    """Count one failed check and print *msg* behind a red cross."""
    global FAIL_COUNT
    FAIL_COUNT = FAIL_COUNT + 1
    marker = f"{RED}✗{RESET}"
    print(f"  {marker}  {msg}")


def warn(msg: str):
    """Count one warning and print *msg* behind a yellow exclamation mark."""
    global WARN_COUNT
    WARN_COUNT = WARN_COUNT + 1
    marker = f"{YELLOW}!{RESET}"
    print(f"  {marker}  {msg}")


def section(title: str):
    """Print a blank line followed by *title* as a bold cyan section heading."""
    heading = f"── {title} ──"
    print(f"\n{CYAN}{BOLD}{heading}{RESET}")


# Shared SSL context handed to urlopen() for every https:// URL (see
# http_get / http_post).
# NOTE: the two assignments below switch off hostname checking AND certificate
# verification, so *any* certificate is accepted — not only HF Space /
# Let's Encrypt ones. Presumably done to sidestep CA-bundle differences between
# Python installs; it also means HTTPS responses are not authenticated.
_SSL_CTX = ssl.create_default_context()
_SSL_CTX.check_hostname = False
_SSL_CTX.verify_mode    = ssl.CERT_NONE


def http_get(url: str, timeout: int = 10) -> dict:
    """Issue a GET to *url* and return the response body decoded from JSON.

    https:// URLs are opened with the module-level ``_SSL_CTX``; any other
    URL is opened without an explicit SSL context. *timeout* is passed
    straight to ``urlopen`` (seconds).
    """
    if url.startswith("https://"):
        ssl_context = _SSL_CTX
    else:
        ssl_context = None

    request = urllib.request.Request(url, method="GET")
    with urllib.request.urlopen(request, timeout=timeout, context=ssl_context) as response:
        raw_body = response.read()
    return json.loads(raw_body)


def http_post(url: str, body: dict | None = None, timeout: int = 30) -> dict:
    """POST *body* as JSON to *url* and return the JSON-decoded response.

    Args:
        url: Target URL. https:// URLs are opened with the module-level
            ``_SSL_CTX``; anything else without an explicit SSL context.
        body: Payload to serialise with ``json.dumps``. ``None`` (the
            default) sends a zero-length request body.
        timeout: Seconds passed to ``urlopen``.

    Returns:
        The response body parsed by ``json.loads``.

    Raises:
        urllib.error.URLError: Connection problems or (as ``HTTPError``)
            a non-success HTTP status, as raised by ``urlopen``.
        json.JSONDecodeError: The response body is not valid JSON.
    """
    # Compare against None rather than relying on truthiness: an explicitly
    # supplied empty dict is a real JSON payload ("{}") and must not be
    # silently turned into an empty request body.
    data = b"" if body is None else json.dumps(body).encode()
    req = urllib.request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    ctx = _SSL_CTX if url.startswith("https://") else None
    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as r:
        return json.loads(r.read())


# ═══════════════════════════════════════════════════════════════════════════
# CHECK FUNCTIONS
# ═══════════════════════════════════════════════════════════════════════════

def check_repo_structure():
    """Check 1: confirm every file the submission needs exists beside this script.

    Each expected path is resolved relative to the directory holding this
    file and reported through ok() / fail(). inference.py is then reported a
    second time with a dedicated "must be in root" message.
    """
    section("1. Repository Structure")
    base_dir = Path(__file__).parent

    # Relative path -> human-readable label (insertion order = report order).
    expected = {
        "inference.py":     "Inference script (root)",
        "openenv.yaml":     "OpenEnv spec",
        "Dockerfile":       "Docker build file",
        "requirements.txt": "Python dependencies",
        "environment.py":   "Environment implementation",
        "grader.py":        "Grader implementation",
        "models.py":        "Typed models",
        "tasks.py":         "Task bank",
        "server/app.py":    "FastAPI server",
    }

    for rel_path, description in expected.items():
        if (base_dir / rel_path).exists():
            ok(f"{description} found: {rel_path}")
            continue
        fail(f"{description} MISSING: {rel_path}")

    # inference.py must be in root
    inference_missing = not (base_dir / "inference.py").exists()
    if inference_missing:
        fail("inference.py must be in the ROOT directory")
    else:
        ok("inference.py is in root directory")


def check_openenv_yaml():
    """Check 2: look for the required fields in openenv.yaml.

    The file is NOT parsed as YAML; each requirement is a plain,
    case-sensitive substring search over the file's text, so spacing inside
    a pattern (e.g. the two spaces in "GET  /state") must match the file
    exactly. A missing or unreadable file is reported as a single failure.
    """
    section("2. openenv.yaml Spec Compliance")
    root = Path(__file__).parent
    yaml_path = root / "openenv.yaml"

    if not yaml_path.exists():
        fail("openenv.yaml not found — cannot validate")
        return

    try:
        # Decode explicitly as UTF-8 instead of the platform locale, and
        # replace undecodable bytes: the patterns below are ASCII, so a stray
        # byte must not abort the whole validator with UnicodeDecodeError.
        content = yaml_path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        fail(f"openenv.yaml could not be read: {e}")
        return

    # (substring that must appear, label shown in the report)
    checks = [
        ("spec_version",    "spec_version field"),
        ("name:",           "name field"),
        ("version:",        "version field"),
        ("type: space",     "type=space"),
        ("runtime: fastapi","runtime=fastapi"),
        ("port: 7860",      "port=7860 (HF Space)"),
        ("POST /reset",     "reset endpoint declared"),
        ("POST /step",      "step endpoint declared"),
        ("GET  /state",     "state endpoint declared"),
        ("GET  /health",    "health endpoint declared"),
        ("API_BASE_URL",    "API_BASE_URL env var declared"),
        ("MODEL_NAME",      "MODEL_NAME env var declared"),
        ("HF_TOKEN",        "HF_TOKEN env var declared"),
        ("count: 30",       "tasks count=30"),
        ("inference.py",    "inference script reference"),
        ("approve",         "approve action"),
        ("reject",          "reject action"),
        ("inspect",         "inspect action"),
        ("partial_credit: true", "partial credit enabled"),
    ]

    for pattern, label in checks:
        if pattern in content:
            ok(label)
        else:
            fail(f"Missing in openenv.yaml: {label} (pattern: '{pattern}')")


def check_typed_models():
    """Check 3: import the typed models and construct every required action.

    The parent of this script's directory is pushed onto ``sys.path`` so the
    ``payops_env`` package can be imported. Any problem importing the models
    or constructing a ``PayOpsAction`` is reported through fail() instead of
    propagating, so the remaining checks still run.
    """
    section("3. Typed Models")
    root = Path(__file__).parent
    sys.path.insert(0, str(root.parent))

    try:
        # PayOpsObservation / PayOpsState are imported only to prove that
        # they are importable; they are not used further in this check.
        from payops_env.models import PayOpsAction, PayOpsObservation, PayOpsState
    except Exception as e:
        # Broad on purpose: a broken models module can raise more than
        # ImportError (e.g. errors while the classes are being defined).
        fail(f"Cannot import models: {e}")
        return

    ok("PayOpsAction importable")
    ok("PayOpsObservation importable")
    ok("PayOpsState importable")

    # Check action types
    try:
        action = PayOpsAction(action_type="approve", transaction_id="TXN-TEST")
        ok(f"PayOpsAction instantiates (action_type={action.action_type})")
    except Exception as e:
        # A constructor/validation error is a failed check, not a crash.
        fail(f"PayOpsAction failed to instantiate: {e}")

    # Check all 10 action types are valid
    required_actions = [
        "approve", "reject", "flag", "escalate", "hold",
        "inspect", "request_docs", "verify_kyc", "contact_sender", "file_sar"
    ]
    for a in required_actions:
        try:
            PayOpsAction(action_type=a, transaction_id="TXN-TEST")
            ok(f"Action type '{a}' is valid")
        except Exception as e:
            fail(f"Action type '{a}' rejected: {e}")


def check_environment():
    """Check 4: drive PayOpsEnvironment in-process through reset, inspect, approve.

    Sequence exercised (all inside one ``asyncio.run``):
      1. ``reset_async()``   — budget_remaining must be 5.0 (failure otherwise).
      2. ``step_async(inspect)`` — reward 0.15 and budget 4.9 are expected
         (mismatch is only a warning); the task_id must not change.
      3. ``step_async(approve)`` — reward is reported as-is.
      4. ``env._state``      — step_count must be > 0 and
         investigation_actions_used must be a list.

    Numeric expectations are compared with a small absolute tolerance rather
    than ``==`` so float rounding in the environment's arithmetic does not
    produce spurious mismatches. Any exception is reported as a single failure.
    """
    import math

    section("4. Environment (step / reset / state)")
    sys.path.insert(0, str(Path(__file__).parent.parent))

    def _matches(value, expected: float) -> bool:
        """Return True when *value* is a number within 1e-9 of *expected*."""
        try:
            return math.isclose(value, expected, rel_tol=0.0, abs_tol=1e-9)
        except TypeError:
            # None / non-numeric values simply do not match (as with ==).
            return False

    try:
        import asyncio
        from payops_env.environment import PayOpsEnvironment
        from payops_env.models import PayOpsAction

        env = PayOpsEnvironment()

        async def run_env_checks():
            # reset()
            obs = await env.reset_async()
            ok(f"reset() returns observation (task={obs.task_id})")

            if _matches(obs.budget_remaining, 5.0):
                ok("reset() budget_remaining=5.0")
            else:
                fail(f"reset() budget_remaining expected 5.0, got {obs.budget_remaining}")

            # step() investigation
            obs2 = await env.step_async(
                PayOpsAction(action_type="inspect", transaction_id=obs.transaction_id)
            )
            if _matches(obs2.reward, 0.15):
                ok("step(inspect) reward=0.15")
            else:
                warn(f"step(inspect) reward={obs2.reward} (expected 0.15)")

            if _matches(obs2.budget_remaining, 4.9):
                ok("step(inspect) budget_remaining=4.9")
            else:
                warn(f"step(inspect) budget={obs2.budget_remaining}")

            if obs2.task_id == obs.task_id:
                ok("inspect does not advance task")
            else:
                fail("inspect advanced task (should not)")

            # step() terminal
            obs3 = await env.step_async(
                PayOpsAction(action_type="approve", transaction_id=obs.transaction_id)
            )
            ok(f"step(approve) reward={obs3.reward}")

            # state() — read from the private attribute, as no public
            # accessor is used here.
            state = env._state
            if state.step_count > 0:
                ok(f"state() step_count={state.step_count}")
            else:
                fail("state() step_count=0 after steps")

            if isinstance(state.investigation_actions_used, list):
                ok("state() investigation_actions_used is list")
            else:
                fail("state() investigation_actions_used is not a list")

        asyncio.run(run_env_checks())

    except Exception as e:
        fail(f"Environment check failed: {e}")


def check_grader():
    """Check 5: the task bank has >= 3 tasks and grading yields a score in [0, 1].

    Grades the first five tasks using each task's own ``correct_action`` and
    verifies ``normalised_score`` lies in [0.0, 1.0]. Every entry of
    ``per_task_rewards`` is then echoed for information. Any exception is
    reported as a single failure.
    """
    section("5. Grader — 3+ tasks, scores in [0.0, 1.0]")
    sys.path.insert(0, str(Path(__file__).parent.parent))

    try:
        from payops_env.grader import grade_episode
        from payops_env.tasks import TASKS

        if len(TASKS) >= 3:
            ok(f"Task bank has {len(TASKS)} tasks (>= 3 required)")
        else:
            fail(f"Task bank has only {len(TASKS)} tasks (need >= 3)")

        # Grade a minimal episode with the correct terminal action for each of
        # the first 5 tasks. Only actions, tasks and budget_limit are passed;
        # NOTE(review): any other grade_episode parameters are presumably
        # optional — confirm against grader.py.
        sample_tasks = list(TASKS[:5])
        sample_actions = [t.correct_action for t in sample_tasks]

        result = grade_episode(sample_actions, sample_tasks, budget_limit=5.0)

        if 0.0 <= result.normalised_score <= 1.0:
            ok(f"grade_episode() score in [0,1]: {result.normalised_score:.4f}")
        else:
            fail(f"grade_episode() score out of range: {result.normalised_score}")

        # Check each task graded
        for pt in result.per_task_rewards:
            score = pt.get("weighted_reward", pt.get("reward", 0))
            # str() first: applying the "12s" format spec to a missing
            # (None) or non-string task_id raises TypeError, which would turn
            # this purely informational line into a grader failure.
            task_label = str(pt.get("task_id", "?"))
            ok(f"  {task_label:12s} reward={score:+.3f}")

        ok("grade_episode() completes without error")

    except Exception as e:
        fail(f"Grader check failed: {e}")


def check_server(base_url: str):
    """Check 6: exercise the running server's HTTP endpoints.

    Endpoints are probed in order: GET /health, POST /reset, POST /step,
    GET /state, GET /tasks, GET /grader, GET /schema. If /health cannot be
    fetched the function stops; every later probe is independent and records
    its own pass/fail, so one broken endpoint does not hide the others.

    Args:
        base_url: Server root without a trailing slash, e.g.
            ``http://localhost:7860``.
    """
    section("6. Server Health & OpenEnv Endpoints")

    def _unwrap(payload: dict) -> dict:
        """Return the observation from a wrapped {"observation": {...}} reply,
        or the payload itself when it is already in the legacy flat format."""
        inner = payload.get("observation")
        return inner if isinstance(inner, dict) else payload

    # Health / reset
    try:
        health = http_get(f"{base_url}/health")
        if health.get("status") == "ok":
            ok(f"GET /health → status=ok (v{health.get('version','?')})")
        else:
            fail(f"GET /health → unexpected: {health}")
    except Exception as e:
        fail(f"GET /health failed: {e}")
        return  # Can't continue without server

    # Transaction used for the /step probe. Starts as the historical
    # hard-coded ID and is replaced by whatever /reset actually hands out.
    transaction_id = "TXN-E001"

    # reset() — must return 200 and a valid observation
    try:
        raw_reset = http_post(f"{base_url}/reset")
        obs = _unwrap(raw_reset)
        if "task_id" in obs:
            ok(f"POST /reset → 200, task_id={obs['task_id']}")
        else:
            fail("POST /reset → missing task_id in response")

        if obs.get("budget_remaining") == 5.0:
            ok("POST /reset → budget_remaining=5.0")
        else:
            fail(f"POST /reset → budget_remaining={obs.get('budget_remaining')}")

        # Step against the transaction the server just served, so the probe
        # stays valid whichever task the episode starts with.
        transaction_id = obs.get("transaction_id") or transaction_id
    except Exception as e:
        fail(f"POST /reset failed: {e}")

    # step()
    try:
        raw_step = http_post(
            f"{base_url}/step",
            {"action_type": "inspect", "transaction_id": transaction_id},
        )
        step = _unwrap(raw_step)
        # Prefer a top-level reward; fall back to one nested in the observation.
        reward_val = raw_step.get("reward", step.get("reward"))
        if reward_val is not None:
            ok(f"POST /step (inspect) → reward={reward_val}")
        else:
            fail("POST /step → missing reward in response")
    except Exception as e:
        fail(f"POST /step failed: {e}")

    # state()
    try:
        state = http_get(f"{base_url}/state")
        if "episode_id" in state:
            ok("GET /state → episode_id present")
        else:
            fail("GET /state → missing episode_id")
        if isinstance(state.get("investigation_actions_used"), list):
            ok("GET /state → investigation_actions_used is list")
        else:
            fail("GET /state → investigation_actions_used is not a list")
    except Exception as e:
        fail(f"GET /state failed: {e}")

    # tasks — accepts either {"count": N, ...} or a bare JSON list
    try:
        tasks = http_get(f"{base_url}/tasks")
        count = tasks.get("count", 0) if isinstance(tasks, dict) else len(tasks)
        if count >= 3:
            ok(f"GET /tasks → count={count} (>= 3 required)")
        else:
            fail(f"GET /tasks → count={count} (need >= 3)")
    except Exception as e:
        fail(f"GET /tasks failed: {e}")

    # grader — a missing score defaults to -1, which is out of range
    try:
        grader = http_get(f"{base_url}/grader")
        score = grader.get("normalised_score", -1)
        if 0.0 <= score <= 1.0:
            ok(f"GET /grader → normalised_score={score:.4f} (in [0,1])")
        else:
            fail(f"GET /grader → score={score} out of range")
    except Exception as e:
        fail(f"GET /grader failed: {e}")

    # schema — model names are searched in the re-serialised JSON text
    try:
        schema = http_get(f"{base_url}/schema")
        body = json.dumps(schema)
        for model in ["PayOpsAction", "PayOpsObservation"]:
            if model in body:
                ok(f"GET /schema → {model} present")
            else:
                fail(f"GET /schema → {model} missing")
    except Exception as e:
        fail(f"GET /schema failed: {e}")


def check_env_vars():
    """Check 7: report whether the inference-time environment variables are set.

    An unset (or empty) variable is only a warning, never a failure. Values
    of non-secret variables are echoed truncated to 10 characters; the value
    of HF_TOKEN is never printed, not even partially, because this output
    routinely ends up in terminals, CI logs and screenshots.
    """
    section("7. Required Environment Variables")
    required = {
        "API_BASE_URL": "LLM API endpoint",
        "MODEL_NAME":   "Model identifier",
        "HF_TOKEN":     "API / HF token",
    }
    # Variables whose value is a credential and must not be echoed.
    secret_vars = {"HF_TOKEN"}

    for var, desc in required.items():
        val = os.environ.get(var, "")
        if not val:
            warn(f"{var} is NOT set ({desc}) — required at inference time")
            continue

        if var in secret_vars:
            # Length alone is enough to confirm that something is set.
            display = f"<hidden, {len(val)} chars>"
        else:
            display = val[:10] + "..." if len(val) > 10 else val
        ok(f"{var} is set ({desc}): {display}")


def check_inference_script():
    """Check 8: look for the required building blocks in inference.py.

    The script is not imported or executed; each requirement is a plain,
    case-sensitive substring search over its source text. A missing or
    unreadable file is reported as a single failure.
    """
    section("8. inference.py Validation")
    root = Path(__file__).parent
    inf_path = root / "inference.py"

    if not inf_path.exists():
        fail("inference.py not found in root directory")
        return

    try:
        # Decode explicitly as UTF-8 (Python's default source encoding)
        # rather than the platform locale; undecodable bytes are replaced so
        # the ASCII pattern search below can never die on UnicodeDecodeError.
        content = inf_path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        fail(f"inference.py could not be read: {e}")
        return

    # (substring that must appear, label shown in the report)
    checks = [
        ("from openai import OpenAI", "Uses OpenAI client"),
        ("API_BASE_URL",              "Reads API_BASE_URL env var"),
        ("MODEL_NAME",                "Reads MODEL_NAME env var"),
        ("HF_TOKEN",                  "Reads HF_TOKEN env var"),
        ("chat.completions.create",   "Uses chat.completions.create"),
        ("/reset",                    "Calls /reset endpoint"),
        ("/step",                     "Calls /step endpoint"),
        ("/grader",                   "Retrieves grader results"),
        ("normalised_score",          "Reports normalised_score"),
    ]

    for pattern, label in checks:
        if pattern in content:
            ok(label)
        else:
            fail(f"inference.py missing: {label} (pattern: '{pattern}')")


def check_dockerfile():
    """Check 9: substring-check the Dockerfile, then try ``docker build --check``.

    The static part is a case-sensitive substring search over the Dockerfile
    text; a missing pattern is a failure. The docker invocation is
    best-effort: docker being absent, timing out (60 s), or exiting non-zero
    only ever produces a warning.
    """
    section("9. Dockerfile")
    root = Path(__file__).parent
    df_path = root / "Dockerfile"

    if not df_path.exists():
        fail("Dockerfile not found")
        return

    try:
        # Explicit UTF-8 with replacement instead of the platform locale, so
        # a non-ASCII byte in a comment cannot abort the validator.
        content = df_path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        fail(f"Dockerfile could not be read: {e}")
        return

    # (substring that must appear, label shown in the report)
    checks = [
        ("FROM python:",      "Uses Python base image"),
        ("EXPOSE 7860",       "Exposes port 7860 (HF Space)"),
        ("7860",              "References port 7860"),
        ("uvicorn",           "Starts uvicorn server"),
        ("HEALTHCHECK",       "Has HEALTHCHECK"),
        ("requirements.txt",  "Installs requirements.txt"),
        ("appuser",           "Non-root user (security)"),
    ]

    for pattern, label in checks:
        if pattern in content:
            ok(label)
        else:
            fail(f"Dockerfile missing: {label}")

    # Attempt docker build (dry-run, only if docker is available)
    try:
        result = subprocess.run(
            ["docker", "build", "--check", "-f", str(df_path), str(root)],
            capture_output=True, text=True, timeout=60
        )
        if result.returncode == 0:
            ok("docker build --check passed")
        else:
            detail = (result.stderr or result.stdout or "").strip()
            if "unknown flag" in detail.lower():
                # Older Docker CLIs reject the flag outright.
                warn("docker build --check not supported; run 'docker build .' manually")
            else:
                # Any other non-zero exit (daemon down, check violations, ...)
                # is reported with docker's own last output line instead of
                # being mislabelled as an unsupported flag.
                reason = detail.splitlines()[-1] if detail else f"exit code {result.returncode}"
                warn(f"docker build --check did not pass ({reason}); run 'docker build .' manually")
    except FileNotFoundError:
        warn("docker not installed — skipping build check")
    except subprocess.TimeoutExpired:
        warn("docker build check timed out")
    except Exception as e:
        warn(f"docker check skipped: {e}")


def check_runtime_constraint():
    """Check 10: heuristics for the < 20 min / 2 vCPU / 8 GB runtime limit.

    Hardware limits cannot be measured here, so inference.py's source text is
    scanned instead:
      * an ``import``/``from`` of a heavy ML package produces a warning per
        package; the "lightweight client" pass is recorded only when none
        was found;
      * the presence of the text ``MAX_STEPS`` is taken as evidence of a
        step guard (warning if absent).
    A missing inference.py is a warning and skips the check.
    """
    import re

    section("10. Runtime Constraint (< 20 min / 2vCPU / 8GB)")
    root = Path(__file__).parent
    inf_path = root / "inference.py"

    if not inf_path.exists():
        warn("inference.py not found, skipping runtime check")
        return

    try:
        # Explicit UTF-8 with replacement rather than the platform locale.
        content = inf_path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        warn(f"inference.py could not be read, skipping runtime check: {e}")
        return

    heavy_libs = ["torch", "transformers", "tensorflow", "keras", "jax"]

    detected = []
    for lib in heavy_libs:
        # \b after the name keeps "torch" from matching "torchvision" and
        # "jax" from matching "jaxtyping", while still matching submodule
        # imports such as "import torch.nn" / "from jax.numpy import ...".
        pattern = rf"\b(?:import|from)[ \t]+{re.escape(lib)}\b"
        if re.search(pattern, content):
            detected.append(lib)
            warn(f"inference.py imports '{lib}' — ensure it runs within 8GB RAM on 2vCPU")

    # Only claim "no local model loading" when nothing heavy was detected;
    # otherwise the pass would contradict the warnings printed just above.
    if not detected:
        ok("inference.py uses lightweight OpenAI client (no local model loading)")

    # Check no huge loops that would blow 20min
    if "MAX_STEPS" in content:
        ok("MAX_STEPS guard present")
    else:
        warn("Consider adding a MAX_STEPS guard in inference.py")


# ═══════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════

def _port_from_url(base_url: str, default: str = "7860") -> str:
    """Return the TCP port named in *base_url* as a string, else *default*."""
    from urllib.parse import urlsplit

    try:
        port = urlsplit(base_url).port
    except ValueError:
        # Port component present but not a valid port number.
        port = None
    if port is not None:
        return str(port)

    # Scheme-less input such as "localhost:8000" is not split into host and
    # port by urlsplit; accept a purely numeric tail after the last colon.
    tail = base_url.rsplit(":", 1)[-1]
    return tail if tail.isdigit() else default


def main():
    """Run every validation check, print a summary and exit.

    Flow:
      1. Parse ``--url`` / ``--start-server``.
      2. Run the static checks (no server required).
      3. Probe ``<url>/health``; if unreachable and ``--start-server`` was
         given, launch uvicorn as a subprocess and poll /health once per
         second for up to 15 attempts.
      4. Run the server checks when a server is reachable, otherwise record
         the section as skipped (warning).
      5. Stop any server this script started, print the summary and exit
         with status 0 when no check failed, 1 otherwise.
    """
    global _SERVER_PROC

    parser = argparse.ArgumentParser(description="PayOps pre-submission validator")
    parser.add_argument("--url", default="http://localhost:7860",
                        help="PayOps server base URL (default: http://localhost:7860)")
    parser.add_argument("--start-server", action="store_true",
                        help="Start the server automatically before validating")
    args = parser.parse_args()

    base_url = args.url.rstrip("/")

    print(f"\n{BOLD}{'='*60}{RESET}")
    print(f"{BOLD}  PayOps Pre-Submission Validator{RESET}")
    print(f"  Target server : {base_url}")
    print(f"{'='*60}{RESET}\n")

    # Static checks (no server needed)
    check_repo_structure()
    check_openenv_yaml()
    check_typed_models()
    check_environment()
    check_grader()
    check_env_vars()
    check_inference_script()
    check_dockerfile()
    check_runtime_constraint()

    # Server-dependent checks
    # Try to reach the server; if not up, attempt to start
    server_available = False
    try:
        try:
            http_get(f"{base_url}/health", timeout=3)
            server_available = True
        except Exception:
            if args.start_server:
                print(f"\n  Starting server at {base_url}...")
                root = Path(__file__).parent.parent
                env = os.environ.copy()
                env["PYTHONPATH"] = str(root)
                # Parse the port properly: taking the text after the last
                # ":" yields "//localhost" for a URL without a port and
                # "7860/api" for a URL with a path.
                port = _port_from_url(base_url)
                _SERVER_PROC = subprocess.Popen(
                    [sys.executable, "-m", "uvicorn",
                     "payops_env.server.app:app",
                     "--host", "0.0.0.0", "--port", port],
                    env=env, cwd=str(root)
                )
                # Poll once per second until the server answers /health.
                for _ in range(15):
                    time.sleep(1)
                    try:
                        http_get(f"{base_url}/health", timeout=2)
                        server_available = True
                        print("  Server started.")
                        break
                    except Exception:
                        # Not up yet — keep polling until attempts run out.
                        pass
                if not server_available:
                    warn("Could not start server automatically")
            else:
                warn(f"Server not reachable at {base_url}. Start it first, or pass --start-server")

        if server_available:
            check_server(base_url)
        else:
            section("6. Server Health & OpenEnv Endpoints")
            warn("Skipped — server not available")
    finally:
        # Always stop a server we spawned, even if a check raised, and reap
        # it so no orphaned uvicorn process keeps the port bound.
        if _SERVER_PROC is not None:
            _SERVER_PROC.terminate()
            try:
                _SERVER_PROC.wait(timeout=5)
            except subprocess.TimeoutExpired:
                _SERVER_PROC.kill()

    # ── Summary ────────────────────────────────────────────────────────────
    total = PASS_COUNT + FAIL_COUNT
    print(f"\n{BOLD}{'='*60}")
    if FAIL_COUNT == 0:
        print(f"{GREEN}  ALL CHECKS PASSED  ({PASS_COUNT}/{total} passed, {WARN_COUNT} warnings){RESET}")
        print(f"{BOLD}  Ready to submit! ✓{RESET}")
    else:
        print(f"{RED}  {FAIL_COUNT} CHECK(S) FAILED{RESET}  "
              f"({PASS_COUNT} passed, {FAIL_COUNT} failed, {WARN_COUNT} warnings)")
        print(f"{RED}{BOLD}  Fix failures before submitting.{RESET}")
    print(f"{BOLD}{'='*60}{RESET}\n")

    sys.exit(0 if FAIL_COUNT == 0 else 1)


# Run the validator only when this file is executed as a script, so importing
# the module (e.g. to reuse a single check) has no side effects.
if __name__ == "__main__":
    main()