"""Optimization task grader."""
from __future__ import annotations
try:
from ..models import TaskGrade
from ..tasks.catalog import ReviewTask
except ImportError:
from models import TaskGrade
from tasks.catalog import ReviewTask
from .shared import (
base_grade,
benchmark_candidate,
compile_code,
composite_grade_score,
component_score,
execute_cases,
quality_metrics,
similarity_score,
summarize_results,
)
# Sentinel recorded (and later compared) when no benchmark was run; keeping it
# in one constant prevents a silent mismatch between producer and consumer.
_BENCHMARK_DEFERRED = "Benchmark deferred until hidden evaluation."


def _compile_failure_grade(
    task: ReviewTask,
    details: dict,
    similarity: float,
    include_hidden: bool,
) -> TaskGrade:
    """Return the near-zero grade used when the candidate code does not compile."""
    details["test_results"] = []
    details["test_summary"] = "Code does not compile."
    return base_grade(
        score=composite_grade_score(
            correctness=0.0,
            quality=0.05,
            runtime=0.0,
            syntax=0.0,
            similarity=similarity,
            baseline=0.04,
            penalty=0.06,
        ),
        syntax_score=component_score(0.01),
        tests_passed=0,
        tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
        quality_score=component_score(0.01),
        runtime_score=component_score(0.01),
        timed_out=False,
        details=details,
    )


def _execution_failure_grade(
    details: dict,
    quality: dict,
    similarity: float,
    tests_total: int,
    error: str,
    *,
    correctness: float,
    penalty: float,
    timed_out: bool,
) -> TaskGrade:
    """Return the grade for code that compiled but timed out or errored while running the cases.

    The timeout and runtime-error paths share everything except the small
    correctness credit, the penalty, and the ``timed_out`` flag, which the
    caller supplies.
    """
    details["test_results"] = []
    details["test_summary"] = error
    return base_grade(
        score=composite_grade_score(
            correctness=correctness,
            quality=quality["score"],
            runtime=0.0,
            syntax=0.95,
            similarity=similarity,
            baseline=0.05,
            penalty=penalty,
        ),
        syntax_score=component_score(0.95),
        tests_passed=0,
        tests_total=tests_total,
        quality_score=quality["score"],
        runtime_score=component_score(0.01),
        timed_out=timed_out,
        details=details,
    )


def grade_optimization_task(
    task: ReviewTask,
    code: str,
    *,
    include_hidden: bool,
    timeout_s: float = 3.0,
) -> TaskGrade:
    """Grade an optimization/refactor task with correctness, quality, and runtime.

    Args:
        task: Review task providing the target function name, reference code,
            and public/hidden test cases.
        code: Candidate source code to grade.
        include_hidden: When True, hidden cases are executed as well and — on
            a perfect pass rate — a runtime benchmark is taken; otherwise only
            public cases run and the benchmark is deferred.
        timeout_s: Wall-clock limit, in seconds, for test execution and the
            benchmark.

    Returns:
        A ``TaskGrade`` combining correctness, code quality, runtime, syntax,
        and similarity-to-reference components, with diagnostic ``details``.
    """
    compiled, compile_error = compile_code(code)
    quality = quality_metrics(code, task.function_name)
    similarity = similarity_score(code, task.reference_code)
    details = {
        "compile_error": compile_error,
        "quality_notes": quality["quality_notes"],
        "style_score": quality["style_score"],
        "visibility": "full" if include_hidden else "public",
    }
    if not compiled:
        return _compile_failure_grade(task, details, similarity, include_hidden)
    cases = task.public_cases + (task.hidden_cases if include_hidden else [])
    result = execute_cases(code, task.function_name, cases, timeout_s=timeout_s)
    if result.get("timed_out"):
        # Timeouts earn slightly less correctness credit and a harsher
        # penalty than ordinary runtime errors.
        return _execution_failure_grade(
            details,
            quality,
            similarity,
            len(cases),
            result["error"],
            correctness=0.08,
            penalty=0.14,
            timed_out=True,
        )
    if "error" in result:
        return _execution_failure_grade(
            details,
            quality,
            similarity,
            len(cases),
            result["error"],
            correctness=0.10,
            penalty=0.08,
            timed_out=False,
        )
    data = result["data"]
    pass_rate = data["passed"] / max(data["total"], 1)
    runtime_score = component_score(0.01)
    benchmark_summary = _BENCHMARK_DEFERRED
    timed_out = False
    # Only fully-correct submissions are benchmarked, and only during the
    # hidden-evaluation pass.
    if include_hidden and pass_rate == 1.0:
        benchmark = benchmark_candidate(task, code, timeout_s=timeout_s)
        runtime_score = benchmark["runtime_score"]
        timed_out = benchmark.get("timed_out", False)
        benchmark_summary = benchmark["details"]
        if timed_out:
            # A benchmark timeout forfeits the measured runtime credit.
            runtime_score = component_score(0.01)
    details["test_results"] = data["results"]
    details["test_summary"] = summarize_results("Test results", data["results"])
    details["benchmark"] = benchmark_summary
    # A deferred benchmark contributes no runtime credit; a completed one does.
    runtime_progress = 0.0 if benchmark_summary == _BENCHMARK_DEFERRED else runtime_score
    return base_grade(
        score=composite_grade_score(
            correctness=pass_rate,
            quality=quality["score"],
            runtime=runtime_progress if include_hidden else 0.10,
            syntax=0.95,
            similarity=similarity,
            baseline=0.08 if include_hidden else 0.07,
            penalty=0.10 if timed_out else 0.0,
        ),
        syntax_score=component_score(0.95),
        tests_passed=data["passed"],
        tests_total=data["total"],
        quality_score=quality["score"],
        runtime_score=runtime_score,
        timed_out=timed_out,
        details=details,
    )