payops_env

Paused

App Files Files Community

payops_env / validate.py

padmapriyagosakan

fix: add enabled:true to all graders, restore /tasks dict format, add per_task_rewards alias

8575841 about 1 month ago

raw

history blame contribute delete

21.4 kB

	#!/usr/bin/env python3
	"""
	validate.py — Pre-submission Validation Script
	===============================================
	Runs all checklist items before submitting to the competition.
	All checks must pass or the submission will be disqualified.

	Usage
	-----
	# With server already running:
	python validate.py

	# Start server automatically:
	python validate.py --start-server

	# Custom server URL:
	python validate.py --url http://localhost:7860
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import subprocess
	import sys
	import time
	import ssl
	import urllib.request
	import urllib.error
	from pathlib import Path

	# ── ANSI colours ───────────────────────────────────────────────────────────
	# Terminal escape sequences interpolated into the validator's printed output.
	# RESET ends whatever colour / weight was switched on before it.
	GREEN = "\033[32m"
	RED = "\033[31m"
	YELLOW = "\033[33m"
	CYAN = "\033[36m"
	BOLD = "\033[1m"
	RESET = "\033[0m"

	# Running tallies, incremented by ok(), fail() and warn() respectively and
	# read by main() to print the summary and choose the exit status.
	PASS_COUNT = 0
	FAIL_COUNT = 0
	WARN_COUNT = 0
	# Popen handle of the server started by main() with --start-server; stays
	# None when no server process was spawned.
	_SERVER_PROC = None


	def ok(msg: str):
	    """Count one passed check and print *msg* after a green tick."""
	    global PASS_COUNT
	    PASS_COUNT = PASS_COUNT + 1
	    marker = f"{GREEN}✓{RESET}"
	    print(f" {marker} {msg}")


	def fail(msg: str):
	    """Count one failed check and print *msg* after a red cross."""
	    global FAIL_COUNT
	    FAIL_COUNT = FAIL_COUNT + 1
	    marker = f"{RED}✗{RESET}"
	    print(f" {marker} {msg}")


	def warn(msg: str):
	    """Count one warning and print *msg* after a yellow exclamation mark."""
	    global WARN_COUNT
	    WARN_COUNT = WARN_COUNT + 1
	    marker = f"{YELLOW}!{RESET}"
	    print(f" {marker} {msg}")


	def section(title: str):
	    """Print a blank line, then *title* as a bold cyan section banner."""
	    banner = f"── {title} ──"
	    print(f"\n{CYAN}{BOLD}{banner}{RESET}")


	# SSL context passed to urlopen() by http_get / http_post for https:// URLs.
	# NOTE(review): hostname checking and certificate verification are both
	# switched off below, so this context accepts ANY certificate, not only
	# HF Space / Let's Encrypt ones. Presumably a workaround for Python installs
	# without a usable CA bundle; confirm that is acceptable, because it leaves
	# HTTPS requests open to man-in-the-middle interception.
	_SSL_CTX = ssl.create_default_context()
	_SSL_CTX.check_hostname = False
	_SSL_CTX.verify_mode = ssl.CERT_NONE


	def http_get(url: str, timeout: int = 10) -> dict:
	    """Send a GET request to *url* and return the JSON-decoded response body.

	    The module-level ``_SSL_CTX`` is supplied only for ``https://`` URLs.
	    Errors raised by ``urlopen`` or ``json.loads`` propagate to the caller.
	    """
	    ssl_context = None
	    if url.startswith("https://"):
	        ssl_context = _SSL_CTX
	    request = urllib.request.Request(url, method="GET")
	    with urllib.request.urlopen(request, timeout=timeout, context=ssl_context) as response:
	        payload = response.read()
	        return json.loads(payload)


	def http_post(url: str, body: dict | None = None, timeout: int = 30) -> dict:
	    """Send a JSON POST request to *url* and return the JSON-decoded response.

	    Args:
	        url: Target URL; ``_SSL_CTX`` is supplied only for ``https://`` URLs.
	        body: Payload to serialise as JSON. ``None`` sends an empty request
	            body; an empty dict is sent as ``{}`` rather than being dropped.
	        timeout: Socket timeout in seconds passed to ``urlopen``.

	    Errors raised by ``urlopen`` or ``json.loads`` propagate to the caller.
	    """
	    # Test against None so that an explicit (even empty) payload is serialised.
	    data = json.dumps(body).encode("utf-8") if body is not None else b""
	    req = urllib.request.Request(
	        url,
	        data=data,
	        headers={"Content-Type": "application/json"},
	        method="POST",
	    )
	    ctx = _SSL_CTX if url.startswith("https://") else None
	    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as r:
	        return json.loads(r.read())


	# ═══════════════════════════════════════════════════════════════════════════
	# CHECK FUNCTIONS
	# ═══════════════════════════════════════════════════════════════════════════

	def check_repo_structure():
	    """Check 1: report, per required file, whether it exists next to this script.

	    Paths are resolved relative to the directory containing this file.
	    inference.py is reported twice: once in the file list and once in the
	    dedicated root-location check.
	    """
	    section("1. Repository Structure")
	    repo_root = Path(__file__).parent

	    expected = (
	        ("inference.py", "Inference script (root)"),
	        ("openenv.yaml", "OpenEnv spec"),
	        ("Dockerfile", "Docker build file"),
	        ("requirements.txt", "Python dependencies"),
	        ("environment.py", "Environment implementation"),
	        ("grader.py", "Grader implementation"),
	        ("models.py", "Typed models"),
	        ("tasks.py", "Task bank"),
	        ("server/app.py", "FastAPI server"),
	    )

	    for rel_path, description in expected:
	        if not (repo_root / rel_path).exists():
	            fail(f"{description} MISSING: {rel_path}")
	            continue
	        ok(f"{description} found: {rel_path}")

	    # inference.py must be in root
	    if not (repo_root / "inference.py").exists():
	        fail("inference.py must be in the ROOT directory")
	    else:
	        ok("inference.py is in root directory")


	def check_openenv_yaml():
	    """Check 2: verify openenv.yaml contains every expected declaration.

	    The file is not parsed as YAML; each entry is a plain substring search
	    over the file text, so formatting differences (e.g. extra spaces after
	    a colon) will be reported as missing.
	    """
	    section("2. openenv.yaml Spec Compliance")
	    root = Path(__file__).parent
	    yaml_path = root / "openenv.yaml"

	    if not yaml_path.exists():
	        fail("openenv.yaml not found — cannot validate")
	        return

	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's locale encoding, and report unreadable files as a failed
	    # check instead of crashing the whole validator.
	    try:
	        content = yaml_path.read_text(encoding="utf-8")
	    except (OSError, UnicodeDecodeError) as e:
	        fail(f"openenv.yaml could not be read: {e}")
	        return

	    checks = [
	        ("spec_version", "spec_version field"),
	        ("name:", "name field"),
	        ("version:", "version field"),
	        ("type: space", "type=space"),
	        ("runtime: fastapi","runtime=fastapi"),
	        ("port: 7860", "port=7860 (HF Space)"),
	        ("POST /reset", "reset endpoint declared"),
	        ("POST /step", "step endpoint declared"),
	        ("GET /state", "state endpoint declared"),
	        ("GET /health", "health endpoint declared"),
	        ("API_BASE_URL", "API_BASE_URL env var declared"),
	        ("MODEL_NAME", "MODEL_NAME env var declared"),
	        ("HF_TOKEN", "HF_TOKEN env var declared"),
	        ("count: 30", "tasks count=30"),
	        ("inference.py", "inference script reference"),
	        ("approve", "approve action"),
	        ("reject", "reject action"),
	        ("inspect", "inspect action"),
	        ("partial_credit: true", "partial credit enabled"),
	    ]

	    for pattern, label in checks:
	        if pattern in content:
	            ok(label)
	        else:
	            fail(f"Missing in openenv.yaml: {label} (pattern: '{pattern}')")


	def check_typed_models():
	    """Check 3: import the typed models and construct a PayOpsAction per action type.

	    The grandparent of this file is put on ``sys.path`` so that the
	    ``payops_env`` package can be imported. Any exception raised while
	    importing or instantiating is reported through ``fail`` so that the
	    remaining validator sections still run.
	    """
	    section("3. Typed Models")
	    root = Path(__file__).parent
	    sys.path.insert(0, str(root.parent))

	    # Importing the models can fail with more than ImportError (for example
	    # an error raised while a model class is being defined), so catch broadly
	    # here, as the environment and grader checks do.
	    try:
	        from payops_env.models import PayOpsAction, PayOpsObservation, PayOpsState
	    except Exception as e:
	        fail(f"Cannot import models: {e}")
	        return

	    ok("PayOpsAction importable")
	    ok("PayOpsObservation importable")
	    ok("PayOpsState importable")

	    # Check action types
	    try:
	        action = PayOpsAction(action_type="approve", transaction_id="TXN-TEST")
	        ok(f"PayOpsAction instantiates (action_type={action.action_type})")
	    except Exception as e:
	        fail(f"PayOpsAction failed to instantiate: {e}")

	    # Check all 10 action types are valid
	    required_actions = [
	        "approve", "reject", "flag", "escalate", "hold",
	        "inspect", "request_docs", "verify_kyc", "contact_sender", "file_sar",
	    ]
	    for a in required_actions:
	        try:
	            PayOpsAction(action_type=a, transaction_id="TXN-TEST")
	            ok(f"Action type '{a}' is valid")
	        except Exception as e:
	            fail(f"Action type '{a}' rejected: {e}")


	def check_environment():
	    """Check 4: drive PayOpsEnvironment in-process through reset, two steps and state.

	    Expects a budget of 5.0 after reset, then a reward of 0.15 and a budget
	    of 4.9 after one ``inspect`` step on the same task, followed by an
	    ``approve`` step. Numeric expectations are compared with a small
	    absolute tolerance rather than exact float equality. Any exception is
	    reported as a single failed check.
	    """
	    section("4. Environment (step / reset / state)")
	    sys.path.insert(0, str(Path(__file__).parent.parent))

	    try:
	        import asyncio
	        import math
	        from payops_env.environment import PayOpsEnvironment
	        from payops_env.models import PayOpsAction

	        def approx_equal(actual, expected):
	            """Return True when *actual* is numerically within 1e-9 of *expected*."""
	            try:
	                return math.isclose(actual, expected, rel_tol=0.0, abs_tol=1e-9)
	            except TypeError:
	                # None or another non-numeric value can never match.
	                return False

	        env = PayOpsEnvironment()

	        async def run_env_checks():
	            # reset()
	            obs = await env.reset_async()
	            ok(f"reset() returns observation (task={obs.task_id})")

	            if approx_equal(obs.budget_remaining, 5.0):
	                ok("reset() budget_remaining=5.0")
	            else:
	                fail(f"reset() budget_remaining expected 5.0, got {obs.budget_remaining}")

	            # step() investigation
	            obs2 = await env.step_async(
	                PayOpsAction(action_type="inspect", transaction_id=obs.transaction_id)
	            )
	            if approx_equal(obs2.reward, 0.15):
	                ok("step(inspect) reward=0.15")
	            else:
	                warn(f"step(inspect) reward={obs2.reward} (expected 0.15)")

	            if approx_equal(obs2.budget_remaining, 4.9):
	                ok("step(inspect) budget_remaining=4.9")
	            else:
	                warn(f"step(inspect) budget={obs2.budget_remaining}")

	            if obs2.task_id == obs.task_id:
	                ok("inspect does not advance task")
	            else:
	                fail("inspect advanced task (should not)")

	            # step() terminal
	            obs3 = await env.step_async(
	                PayOpsAction(action_type="approve", transaction_id=obs.transaction_id)
	            )
	            ok(f"step(approve) reward={obs3.reward}")

	            # state() — read from the environment's private _state attribute
	            state = env._state
	            if state.step_count > 0:
	                ok(f"state() step_count={state.step_count}")
	            else:
	                fail("state() step_count=0 after steps")

	            if isinstance(state.investigation_actions_used, list):
	                ok("state() investigation_actions_used is list")
	            else:
	                fail("state() investigation_actions_used is not a list")

	        asyncio.run(run_env_checks())

	    except Exception as e:
	        fail(f"Environment check failed: {e}")


	def check_grader():
	    """Check 5: confirm the task bank has >= 3 tasks and grade a 5-task sample.

	    The sample episode answers the first five tasks with each task's own
	    ``correct_action`` and requires the resulting ``normalised_score`` to
	    lie in [0.0, 1.0]. Any exception is reported as a single failed check.
	    """
	    section("5. Grader — 3+ tasks, scores in [0.0, 1.0]")
	    sys.path.insert(0, str(Path(__file__).parent.parent))

	    try:
	        from payops_env.grader import grade_episode
	        from payops_env.tasks import TASKS

	        if len(TASKS) >= 3:
	            ok(f"Task bank has {len(TASKS)} tasks (>= 3 required)")
	        else:
	            fail(f"Task bank has only {len(TASKS)} tasks (need >= 3)")

	        # Grade a minimal episode using the real grade_episode signature:
	        # grade_episode(actions, tasks, confidences, budget_limit)
	        # Use correct terminal actions for first 5 tasks.
	        sample_tasks = list(TASKS[:5])
	        sample_actions = [t.correct_action for t in sample_tasks]

	        result = grade_episode(sample_actions, sample_tasks, budget_limit=5.0)

	        if 0.0 <= result.normalised_score <= 1.0:
	            ok(f"grade_episode() score in [0,1]: {result.normalised_score:.4f}")
	        else:
	            fail(f"grade_episode() score out of range: {result.normalised_score}")

	        # Check each task graded
	        for pt in result.per_task_rewards:
	            score = pt.get("weighted_reward", pt.get("reward", 0))
	            # Coerce to str first: the ":12s" format spec raises TypeError on
	            # None, which would abort the rest of this section.
	            task_label = str(pt.get("task_id", "?"))
	            ok(f" {task_label:12s} reward={score:+.3f}")

	        ok("grade_episode() completes without error")

	    except Exception as e:
	        fail(f"Grader check failed: {e}")


	def _unwrap_observation(payload: dict) -> dict:
	    """Return ``payload["observation"]`` when it is a dict, otherwise *payload*."""
	    inner = payload.get("observation")
	    return inner if isinstance(inner, dict) else payload


	def check_server(base_url: str):
	    """Check 6: call the running server's HTTP endpoints and validate the replies.

	    Probes /health, /reset, /step, /state, /tasks, /grader and /schema in
	    that order. A failing /health aborts the section; every other endpoint
	    is checked independently and reports its own failure.

	    Args:
	        base_url: Server base URL without a trailing slash.
	    """
	    section("6. Server Health & OpenEnv Endpoints")

	    # Health
	    try:
	        health = http_get(f"{base_url}/health")
	        if health.get("status") == "ok":
	            ok(f"GET /health → status=ok (v{health.get('version','?')})")
	        else:
	            fail(f"GET /health → unexpected: {health}")
	    except Exception as e:
	        fail(f"GET /health failed: {e}")
	        return  # Can't continue without server

	    # Transaction id used for the /step probe. Prefer the one returned by
	    # /reset (as the in-process environment check does) and fall back to the
	    # historical hard-coded id when the reset response does not carry one.
	    transaction_id = "TXN-E001"

	    # reset() — must return 200 and a valid observation
	    try:
	        raw_reset = http_post(f"{base_url}/reset")
	        # Support both wrapped {"observation":{...}} and legacy flat format
	        obs = _unwrap_observation(raw_reset)
	        if obs.get("transaction_id"):
	            transaction_id = obs["transaction_id"]
	        if "task_id" in obs:
	            ok(f"POST /reset → 200, task_id={obs['task_id']}")
	        else:
	            fail("POST /reset → missing task_id in response")

	        if obs.get("budget_remaining") == 5.0:
	            ok("POST /reset → budget_remaining=5.0")
	        else:
	            fail(f"POST /reset → budget_remaining={obs.get('budget_remaining')}")
	    except Exception as e:
	        fail(f"POST /reset failed: {e}")

	    # step()
	    try:
	        raw_step = http_post(
	            f"{base_url}/step",
	            {"action_type": "inspect", "transaction_id": transaction_id},
	        )
	        step = _unwrap_observation(raw_step)
	        # Look for reward at the top level first, then inside the observation
	        reward_val = raw_step.get("reward", step.get("reward"))
	        if reward_val is not None:
	            ok(f"POST /step (inspect) → reward={reward_val}")
	        else:
	            fail("POST /step → missing reward in response")
	    except Exception as e:
	        fail(f"POST /step failed: {e}")

	    # state()
	    try:
	        state = http_get(f"{base_url}/state")
	        if "episode_id" in state:
	            ok("GET /state → episode_id present")
	        else:
	            fail("GET /state → missing episode_id")
	        if isinstance(state.get("investigation_actions_used"), list):
	            ok("GET /state → investigation_actions_used is list")
	        else:
	            fail("GET /state → investigation_actions_used is not a list")
	    except Exception as e:
	        fail(f"GET /state failed: {e}")

	    # tasks — accepts either a dict carrying "count" or a bare list
	    try:
	        tasks = http_get(f"{base_url}/tasks")
	        count = tasks.get("count", 0) if isinstance(tasks, dict) else len(tasks)
	        if count >= 3:
	            ok(f"GET /tasks → count={count} (>= 3 required)")
	        else:
	            fail(f"GET /tasks → count={count} (need >= 3)")
	    except Exception as e:
	        fail(f"GET /tasks failed: {e}")

	    # grader — a missing score defaults to -1, which fails the range test
	    try:
	        grader = http_get(f"{base_url}/grader")
	        score = grader.get("normalised_score", -1)
	        if 0.0 <= score <= 1.0:
	            ok(f"GET /grader → normalised_score={score:.4f} (in [0,1])")
	        else:
	            fail(f"GET /grader → score={score} out of range")
	    except Exception as e:
	        fail(f"GET /grader failed: {e}")

	    # schema — model names are searched for in the re-serialised JSON text
	    try:
	        schema = http_get(f"{base_url}/schema")
	        body = json.dumps(schema)
	        for model in ["PayOpsAction", "PayOpsObservation"]:
	            if model in body:
	                ok(f"GET /schema → {model} present")
	            else:
	                fail(f"GET /schema → {model} missing")
	    except Exception as e:
	        fail(f"GET /schema failed: {e}")


	def check_env_vars():
	    """Check 7: report whether each required environment variable is set.

	    A missing variable is only a warning because it is needed at inference
	    time rather than for validation. Values of secret variables are never
	    echoed, not even partially; other values are truncated to 10 characters.
	    """
	    section("7. Required Environment Variables")
	    required = {
	        "API_BASE_URL": "LLM API endpoint",
	        "MODEL_NAME": "Model identifier",
	        "HF_TOKEN": "API / HF token",
	    }
	    # Variables whose value must not reach the terminal or CI logs.
	    secret_vars = {"HF_TOKEN"}

	    for var, desc in required.items():
	        val = os.environ.get(var, "")
	        if not val:
	            warn(f"{var} is NOT set ({desc}) — required at inference time")
	            continue

	        if var in secret_vars:
	            # Printing a prefix would leak part of the token (and all of a
	            # short one), so show only its length.
	            display = f"<redacted, {len(val)} chars>"
	        else:
	            display = val[:10] + "..." if len(val) > 10 else val
	        ok(f"{var} is set ({desc}): {display}")


	def check_inference_script():
	    """Check 8: verify inference.py mentions every required client call and endpoint.

	    Each entry is a plain substring search over the script's text; the
	    script is neither imported nor executed.
	    """
	    section("8. inference.py Validation")
	    root = Path(__file__).parent
	    inf_path = root / "inference.py"

	    if not inf_path.exists():
	        fail("inference.py not found in root directory")
	        return

	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's locale encoding, and report unreadable files as a failed
	    # check instead of crashing the whole validator.
	    try:
	        content = inf_path.read_text(encoding="utf-8")
	    except (OSError, UnicodeDecodeError) as e:
	        fail(f"inference.py could not be read: {e}")
	        return

	    checks = [
	        ("from openai import OpenAI", "Uses OpenAI client"),
	        ("API_BASE_URL", "Reads API_BASE_URL env var"),
	        ("MODEL_NAME", "Reads MODEL_NAME env var"),
	        ("HF_TOKEN", "Reads HF_TOKEN env var"),
	        ("chat.completions.create", "Uses chat.completions.create"),
	        ("/reset", "Calls /reset endpoint"),
	        ("/step", "Calls /step endpoint"),
	        ("/grader", "Retrieves grader results"),
	        ("normalised_score", "Reports normalised_score"),
	    ]

	    for pattern, label in checks:
	        if pattern in content:
	            ok(label)
	        else:
	            fail(f"inference.py missing: {label} (pattern: '{pattern}')")


	def check_dockerfile():
	    """Check 9: scan the Dockerfile for required directives, then try a build check.

	    The directive checks are plain substring searches over the file text.
	    The ``docker build --check`` step is best effort: every outcome other
	    than success is reported as a warning, never as a failure.
	    """
	    section("9. Dockerfile")
	    root = Path(__file__).parent
	    df_path = root / "Dockerfile"

	    if not df_path.exists():
	        fail("Dockerfile not found")
	        return

	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's locale encoding, and report unreadable files as a failed
	    # check instead of crashing the whole validator.
	    try:
	        content = df_path.read_text(encoding="utf-8")
	    except (OSError, UnicodeDecodeError) as e:
	        fail(f"Dockerfile could not be read: {e}")
	        return

	    checks = [
	        ("FROM python:", "Uses Python base image"),
	        ("EXPOSE 7860", "Exposes port 7860 (HF Space)"),
	        ("7860", "References port 7860"),
	        ("uvicorn", "Starts uvicorn server"),
	        ("HEALTHCHECK", "Has HEALTHCHECK"),
	        ("requirements.txt", "Installs requirements.txt"),
	        ("appuser", "Non-root user (security)"),
	    ]

	    for pattern, label in checks:
	        if pattern in content:
	            ok(label)
	        else:
	            fail(f"Dockerfile missing: {label}")

	    # Attempt docker build (dry-run, only if docker is available)
	    try:
	        result = subprocess.run(
	            ["docker", "build", "--check", "-f", str(df_path), str(root)],
	            capture_output=True, text=True, timeout=60
	        )
	        if result.returncode == 0:
	            ok("docker build --check passed")
	        elif "unknown flag" in (result.stderr or ""):
	            # --check flag not available in older Docker
	            warn("docker build --check not supported; run 'docker build .' manually")
	        else:
	            # The flag was accepted but the command still failed, so do not
	            # misreport that as "not supported".
	            warn(
	                f"docker build --check exited with code {result.returncode}; "
	                "run 'docker build .' manually to see details"
	            )
	    except FileNotFoundError:
	        warn("docker not installed — skipping build check")
	    except subprocess.TimeoutExpired:
	        warn("docker build check timed out")
	    except Exception as e:
	        warn(f"docker check skipped: {e}")


	def check_runtime_constraint():
	    """Check 10: heuristically flag inference.py code likely to exceed the runtime limits.

	    Hardware limits cannot be measured here, so this only looks at the
	    script's text: it warns for each heavy ML library that appears to be
	    imported and checks that a ``MAX_STEPS`` guard is mentioned. The
	    "lightweight client" pass is reported only when no heavy library
	    import was found.
	    """
	    section("10. Runtime Constraint (< 20 min / 2vCPU / 8GB)")
	    # We can't fully test hardware constraints here, but we validate
	    # that the inference script doesn't import heavy GPU libs
	    root = Path(__file__).parent
	    inf_path = root / "inference.py"

	    if not inf_path.exists():
	        warn("inference.py not found, skipping runtime check")
	        return

	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's locale encoding; this section is advisory, so an unreadable
	    # file is a warning rather than a crash.
	    try:
	        content = inf_path.read_text(encoding="utf-8")
	    except (OSError, UnicodeDecodeError) as e:
	        warn(f"inference.py could not be read, skipping runtime check: {e}")
	        return

	    heavy_libs = ["torch", "transformers", "tensorflow", "keras", "jax"]
	    detected = [
	        lib for lib in heavy_libs
	        if f"import {lib}" in content or f"from {lib}" in content
	    ]

	    for lib in detected:
	        warn(f"inference.py imports '{lib}' — ensure it runs within 8GB RAM on 2vCPU")

	    # Only claim "no local model loading" when nothing heavy was detected.
	    if not detected:
	        ok("inference.py uses lightweight OpenAI client (no local model loading)")

	    # Check no huge loops that would blow 20min
	    if "MAX_STEPS" in content:
	        ok("MAX_STEPS guard present")
	    else:
	        warn("Consider adding a MAX_STEPS guard in inference.py")


	# ═══════════════════════════════════════════════════════════════════════════
	# MAIN
	# ═══════════════════════════════════════════════════════════════════════════

	def _server_is_up(base_url: str, timeout: int) -> bool:
	    """Return True when GET {base_url}/health completes without raising."""
	    try:
	        http_get(f"{base_url}/health", timeout=timeout)
	    except Exception:
	        return False
	    return True


	def _start_local_server(base_url: str) -> bool:
	    """Spawn uvicorn for payops_env.server.app and wait up to ~15 s for /health.

	    The process handle is stored in the module-level ``_SERVER_PROC``.
	    Returns True once the server answers, False otherwise.
	    """
	    global _SERVER_PROC
	    from urllib.parse import urlsplit

	    print(f"\n Starting server at {base_url}...")
	    root = Path(__file__).parent.parent
	    env = os.environ.copy()
	    env["PYTHONPATH"] = str(root)

	    # Parse the port properly: splitting on the last ":" yields garbage for
	    # URLs without an explicit port ("http://localhost") or with a path.
	    try:
	        parsed_port = urlsplit(base_url).port
	    except ValueError:
	        parsed_port = None
	    port = str(parsed_port or 7860)

	    _SERVER_PROC = subprocess.Popen(
	        [sys.executable, "-m", "uvicorn",
	         "payops_env.server.app:app",
	         "--host", "0.0.0.0", "--port", port],
	        env=env, cwd=str(root)
	    )
	    for _ in range(15):
	        time.sleep(1)
	        if _server_is_up(base_url, timeout=2):
	            print(" Server started.")
	            return True
	        if _SERVER_PROC.poll() is not None:
	            # The child already exited; further polling cannot succeed.
	            break
	    warn("Could not start server automatically")
	    return False


	def _stop_local_server():
	    """Terminate the server spawned by this script, if any, and reap it."""
	    global _SERVER_PROC
	    if _SERVER_PROC is None:
	        return
	    _SERVER_PROC.terminate()
	    try:
	        _SERVER_PROC.wait(timeout=5)
	    except subprocess.TimeoutExpired:
	        _SERVER_PROC.kill()
	    _SERVER_PROC = None


	def main():
	    """Run every validation section, print a summary and exit.

	    Command-line options:
	        --url           Server base URL (default http://localhost:7860).
	        --start-server  Spawn a local uvicorn server if none is reachable.

	    Exits with status 0 when no check failed and 1 otherwise. A server
	    started by this function is stopped before exiting, even if a
	    server-dependent check raises.
	    """
	    parser = argparse.ArgumentParser(description="PayOps pre-submission validator")
	    parser.add_argument("--url", default="http://localhost:7860",
	                        help="PayOps server base URL (default: http://localhost:7860)")
	    parser.add_argument("--start-server", action="store_true",
	                        help="Start the server automatically before validating")
	    args = parser.parse_args()

	    base_url = args.url.rstrip("/")

	    print(f"\n{BOLD}{'='*60}{RESET}")
	    print(f"{BOLD} PayOps Pre-Submission Validator{RESET}")
	    print(f" Target server : {base_url}")
	    print(f"{'='*60}{RESET}\n")

	    # Static checks (no server needed)
	    check_repo_structure()
	    check_openenv_yaml()
	    check_typed_models()
	    check_environment()
	    check_grader()
	    check_env_vars()
	    check_inference_script()
	    check_dockerfile()
	    check_runtime_constraint()

	    # Server-dependent checks
	    # Try to reach the server; if not up, attempt to start
	    try:
	        server_available = _server_is_up(base_url, timeout=3)
	        if not server_available:
	            if args.start_server:
	                server_available = _start_local_server(base_url)
	            else:
	                warn(f"Server not reachable at {base_url}. Start it first, or pass --start-server")

	        if server_available:
	            check_server(base_url)
	        else:
	            section("6. Server Health & OpenEnv Endpoints")
	            warn("Skipped — server not available")
	    finally:
	        # Never leave a spawned server running, whatever happened above.
	        _stop_local_server()

	    # ── Summary ────────────────────────────────────────────────────────────
	    total = PASS_COUNT + FAIL_COUNT
	    print(f"\n{BOLD}{'='*60}")
	    if FAIL_COUNT == 0:
	        print(f"{GREEN} ALL CHECKS PASSED ({PASS_COUNT}/{total} passed, {WARN_COUNT} warnings){RESET}")
	        print(f"{BOLD} Ready to submit! ✓{RESET}")
	    else:
	        print(f"{RED} {FAIL_COUNT} CHECK(S) FAILED{RESET} "
	              f"({PASS_COUNT} passed, {FAIL_COUNT} failed, {WARN_COUNT} warnings)")
	        print(f"{RED}{BOLD} Fix failures before submitting.{RESET}")
	    print(f"{BOLD}{'='*60}{RESET}\n")

	    sys.exit(0 if FAIL_COUNT == 0 else 1)


	# Run the validator only when this file is executed as a script; importing
	# the module does not call main().
	if __name__ == "__main__":
	main()