"""Deterministic grading for optimization tasks."""

from __future__ import annotations

import ast
import json
import subprocess
import sys
import tempfile
from pathlib import Path

from .common import clamp_score, compile_tree, nested_loop_depth, style_score
from .pytest_runner import run_pytest_suite
from ..models import TaskGrade
from ..tasks.task_bank import TaskSpec


def _benchmark_script(task: TaskSpec) -> str:
    """Render the timing script executed inside the benchmark sandbox.

    Assumes ``task.benchmark_builder`` defines ``build_benchmark_events()``.
    """
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}
{task.benchmark_builder}
events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""


def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Time the candidate against the starter; return (score, timed_out, output)."""
    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            # Stage both implementations plus a runner script for each.
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")
            try:
                starter_run = subprocess.run(
                    [sys.executable, "starter_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                # Both runners overwrite benchmark.json, so read the starter
                # payload before launching the candidate run.
                starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
                candidate_run = subprocess.run(
                    [sys.executable, "candidate_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
            except subprocess.TimeoutExpired as exc:
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:
                return 0.0, False, str(exc)
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            # Linear mapping: parity (1.0x) scores 0.0; a 4x speedup saturates at 1.0.
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:
        return 0.0, False, str(exc)
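

# Worked example of the mapping above (numbers hypothetical): a starter at
# 0.90s against a candidate at 0.30s gives speedup = 3.0x, so the raw score
# is (3.0 - 1.0) / 3.0, about 0.67. clamp_score is assumed to clamp to
# [0, 1], so a candidate slower than the starter bottoms out at 0.0.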


def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Award points for a docstring, flat loops, and expected quality markers."""
    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0
    function_node = next(
        (node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))),
        None,
    )
    docstring_points = 0.2 if function_node and ast.get_docstring(function_node, clean=False) else 0.0
    nested_points = 0.4 if nested_loop_depth(tree) <= 1 else 0.0
    marker_points = 0.0
    for marker in task.expected_quality_markers:
        if marker in code:
            marker_points += 0.2
    return clamp_score(docstring_points + nested_points + marker_points)
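

# Point budget, for reference: 0.2 for a docstring on the first function,
# 0.4 for keeping loop nesting depth <= 1, and 0.2 per expected marker found
# in the source. With two hypothetical markers ("heapq", "key=") present in a
# docstringed single-loop solution, the total is 0.2 + 0.4 + 2 * 0.2 = 1.0.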


def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Blend test, runtime, AST-quality, and style signals into a single grade."""
    execution = run_pytest_suite(
        candidate_code,
        [*task.visible_tests, *task.hidden_tests],
        timeout_s=task.benchmark_timeout_s,
    )
    test_fraction = execution.passed / execution.total if execution.total else 0.0
    if execution.timed_out:
        return TaskGrade(
            score=0.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"tests": execution.output},
        )
    runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if timed_out:
        return TaskGrade(
            score=0.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"tests": execution.output, "benchmark": benchmark_output},
        )
    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)
    # Weights: tests 50%, runtime 30%, AST quality 15%, style 5%.
    score = clamp_score(
        (0.5 * test_fraction)
        + (0.3 * runtime_score)
        + (0.15 * quality_score)
        + (0.05 * pep8_score)
    )
    return TaskGrade(
        score=score,
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details={
            "tests": execution.output,
            "benchmark": benchmark_output,
            "test_fraction": round(test_fraction, 4),
            "runtime_score": round(runtime_score, 4),
            "style_score": round(pep8_score, 4),
        },
    )
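

# Usage sketch (the TASKS registry name is an assumption; the real lookup
# lives in tasks/task_bank.py):
#
#     from ..tasks.task_bank import TASKS  # hypothetical registry
#     grade = grade_optimization_task(candidate_source, TASKS["dedupe_events"])
#     print(grade.score, grade.details["benchmark"])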