File size: 4,943 Bytes
3d44779
 
 
 
 
989722c
3d44779
 
989722c
3d44779
 
989722c
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
 
 
 
 
 
 
989722c
 
 
 
 
 
 
 
 
 
3d44779
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
 
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
 
 
 
4451363
3d44779
 
 
 
 
 
 
 
 
4451363
3d44779
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""Optimization task grader."""

from __future__ import annotations

try:
    from ..models import TaskGrade
    from ..tasks.catalog import ReviewTask
except ImportError:
    from models import TaskGrade
    from tasks.catalog import ReviewTask

from .shared import (
    base_grade,
    benchmark_candidate,
    compile_code,
    composite_grade_score,
    component_score,
    execute_cases,
    quality_metrics,
    similarity_score,
    summarize_results,
)


def grade_optimization_task(
    task: ReviewTask,
    code: str,
    *,
    include_hidden: bool,
    timeout_s: float = 3.0,
) -> TaskGrade:
    """Grade an optimization/refactor task with correctness, quality, and runtime.

    Args:
        task: Review task supplying the reference code, public/hidden test
            cases, and benchmark configuration.
        code: Candidate source code to grade.
        include_hidden: When True, hidden test cases are graded as well, and
            the runtime benchmark is run for a fully-passing candidate.
        timeout_s: Per-run timeout (seconds) for test execution and benchmarking.

    Returns:
        A TaskGrade whose composite score blends correctness, code quality,
        runtime, syntax, and similarity-to-reference components.
    """
    compiled, compile_error = compile_code(code)
    quality = quality_metrics(code, task.function_name)
    similarity = similarity_score(code, task.reference_code)
    details = {
        "compile_error": compile_error,
        "quality_notes": quality["quality_notes"],
        "style_score": quality["style_score"],
        "visibility": "full" if include_hidden else "public",
    }

    # Early exit: code that does not compile gets floor scores everywhere.
    if not compiled:
        details["test_results"] = []
        details["test_summary"] = "Code does not compile."
        return base_grade(
            score=composite_grade_score(
                correctness=0.0,
                quality=0.05,
                runtime=0.0,
                syntax=0.0,
                similarity=similarity,
                baseline=0.04,
                penalty=0.06,
            ),
            syntax_score=component_score(0.01),
            tests_passed=0,
            tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
            quality_score=component_score(0.01),
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )

    cases = task.public_cases + (task.hidden_cases if include_hidden else [])
    result = execute_cases(code, task.function_name, cases, timeout_s=timeout_s)

    # Timeout during test execution: heavier penalty than a plain error.
    if result.get("timed_out"):
        details["test_results"] = []
        details["test_summary"] = result["error"]
        return base_grade(
            score=composite_grade_score(
                correctness=0.08,
                quality=quality["score"],
                runtime=0.0,
                syntax=0.95,
                similarity=similarity,
                baseline=0.05,
                penalty=0.14,
            ),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=True,
            details=details,
        )

    # Runtime error while executing the cases (compiled, but crashed).
    if "error" in result:
        details["test_results"] = []
        details["test_summary"] = result["error"]
        return base_grade(
            score=composite_grade_score(
                correctness=0.10,
                quality=quality["score"],
                runtime=0.0,
                syntax=0.95,
                similarity=similarity,
                baseline=0.05,
                penalty=0.08,
            ),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )

    data = result["data"]
    pass_rate = data["passed"] / max(data["total"], 1)
    runtime_score = component_score(0.01)
    benchmark_summary = "Benchmark deferred until hidden evaluation."
    # Track whether the benchmark actually ran with an explicit flag instead
    # of comparing benchmark_summary against the sentinel string above, which
    # would misfire if a benchmark's own details ever matched that text.
    benchmark_ran = False
    timed_out = False

    # Benchmark only fully-correct candidates, and only on hidden evaluation.
    if include_hidden and pass_rate == 1.0:
        benchmark = benchmark_candidate(task, code, timeout_s=timeout_s)
        runtime_score = benchmark["runtime_score"]
        timed_out = benchmark.get("timed_out", False)
        benchmark_summary = benchmark["details"]
        benchmark_ran = True
        if timed_out:
            # A timed-out benchmark earns no runtime credit.
            runtime_score = component_score(0.01)

    details["test_results"] = data["results"]
    details["test_summary"] = summarize_results("Test results", data["results"])
    details["benchmark"] = benchmark_summary

    # No runtime credit in the composite until the benchmark has run.
    runtime_progress = runtime_score if benchmark_ran else 0.0
    return base_grade(
        score=composite_grade_score(
            correctness=pass_rate,
            quality=quality["score"],
            runtime=runtime_progress if include_hidden else 0.10,
            syntax=0.95,
            similarity=similarity,
            baseline=0.08 if include_hidden else 0.07,
            penalty=0.10 if timed_out else 0.0,
        ),
        syntax_score=component_score(0.95),
        tests_passed=data["passed"],
        tests_total=data["total"],
        quality_score=quality["score"],
        runtime_score=runtime_score,
        timed_out=timed_out,
        details=details,
    )