"""Bug-fix task grader."""

from __future__ import annotations

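# Prefer package-relative imports; fall back to top-level imports so the
# module also resolves when the grader modules sit directly on sys.path.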
try:
    from ..models import TaskGrade
    from ..tasks.catalog import ReviewTask
except ImportError:
    from models import TaskGrade
    from tasks.catalog import ReviewTask

from .shared import (
    base_grade,
    compile_code,
    component_score,
    execute_cases,
    quality_metrics,
    shaped_score,
    similarity_score,
    summarize_results,
)


def grade_bug_fix_task(
    task: ReviewTask,
    code: str,
    *,
    include_hidden: bool,
    timeout_s: float = 2.0,
) -> TaskGrade:
    """Grade a bug-fix task against public or full test suites."""

    compiled, compile_error = compile_code(code)
    quality = quality_metrics(code, task.function_name)
    details = {
        "compile_error": compile_error,
        "quality_notes": quality["quality_notes"],
        "style_score": quality["style_score"],
        "visibility": "full" if include_hidden else "public",
    }

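    # Tier 1: non-compiling code earns only a small credit shaped by its
    # textual similarity to the reference solution.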
    if not compiled:
        progress = 0.02 + 0.12 * similarity_score(code, task.reference_code)
        details["test_results"] = []
        details["test_summary"] = "Code does not compile."
        return base_grade(
            score=shaped_score(progress),
            syntax_score=component_score(0.01),
            tests_passed=0,
            tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
            quality_score=component_score(0.01),
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )

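    # Tier 2: compiled code runs against the public suite, plus the hidden
    # suite when grading with full visibility. Timeouts and harness errors
    # fall back to a low, quality-weighted score.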
    cases = task.public_cases + (task.hidden_cases if include_hidden else [])
    result = execute_cases(code, task.function_name, cases, timeout_s=timeout_s)
    if result.get("timed_out"):
        details["test_results"] = []
        details["test_summary"] = result["error"]
        progress = 0.12 + 0.18 * quality["score"]
        return base_grade(
            score=shaped_score(progress),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=True,
            details=details,
        )
    if "error" in result:
        details["test_results"] = []
        details["test_summary"] = result["error"]
        progress = 0.1 + 0.2 * quality["score"]
        return base_grade(
            score=shaped_score(progress),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )

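    # Tier 3: tests executed; blend the pass rate with the quality score,
    # capped at 1.0 before shaping.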
    data = result["data"]
    pass_rate = data["passed"] / max(data["total"], 1)
    details["test_results"] = data["results"]
    details["test_summary"] = summarize_results("Test results", data["results"])
    progress = min(1.0, 0.05 + 0.8 * pass_rate + 0.15 * quality["score"])
    return base_grade(
        score=shaped_score(progress),
        syntax_score=component_score(0.95),
        tests_passed=data["passed"],
        tests_total=data["total"],
        quality_score=quality["score"],
        runtime_score=component_score(0.01),
        timed_out=False,
        details=details,
    )
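

# Example usage (a minimal sketch; the ReviewTask construction below is
# hypothetical, inferred only from the attributes this grader accesses):
#
#     task = ReviewTask(
#         function_name="target_fn",
#         reference_code=reference_src,
#         public_cases=public_cases,
#         hidden_cases=hidden_cases,
#     )
#     grade = grade_bug_fix_task(task, candidate_code, include_hidden=False)
#     print(grade)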