File size: 5,047 Bytes
3bb15c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Data models for PatchJudge."""

from dataclasses import dataclass, field, asdict
from typing import Optional
import json


@dataclass
class PatchExample:
    """One patch-evaluation example in the unified PatchJudge format."""
    instance_id: str
    repo: str
    problem_statement: str
    gold_patch: str              # reference patch written by a human
    agent_patch: str             # patch produced by the AI agent
    agent_name: str              # identifier of the agent that produced the patch
    test_passed: bool            # whether the agent's patch passed the test suite
    base_commit: str
    repo_context: dict = field(default_factory=dict)  # maps filename -> file content
    difficulty: str = ""

    def to_dict(self) -> dict:
        """Serialize this example to a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "PatchExample":
        """Reconstruct an example from a plain dict."""
        return cls(**d)

    def to_json(self) -> str:
        """Serialize this example to a pretty-printed JSON string."""
        return json.dumps(asdict(self), indent=2)

    @classmethod
    def from_json(cls, s: str) -> "PatchExample":
        """Reconstruct an example from its JSON representation."""
        return cls(**json.loads(s))


@dataclass
class PatchFeatures:
    """Structured, machine-extracted features describing a single patch."""
    # -- diff statistics --------------------------------------------------
    num_files_changed: int = 0
    num_lines_added: int = 0
    num_lines_removed: int = 0
    num_hunks: int = 0

    # -- code structure ---------------------------------------------------
    added_functions: list = field(default_factory=list)
    modified_functions: list = field(default_factory=list)
    has_error_handling: bool = False
    has_edge_case_handling: bool = False

    # -- issue/patch alignment --------------------------------------------
    issue_keywords_addressed: list = field(default_factory=list)
    issue_components_mentioned: list = field(default_factory=list)
    keyword_coverage_ratio: float = 0.0

    # -- code quality signals ---------------------------------------------
    has_todos: bool = False
    has_hardcoded_values: bool = False
    has_debug_statements: bool = False
    follows_project_style: bool = True
    style_violations: list = field(default_factory=list)

    # -- risk signals -----------------------------------------------------
    modifies_core_files: bool = False
    change_scope: str = "minimal"  # one of: "minimal", "moderate", "extensive"
    has_imports_added: bool = False
    new_imports: list = field(default_factory=list)
    touches_tests: bool = False

    # -- complexity -------------------------------------------------------
    cyclomatic_complexity_delta: int = 0
    nesting_depth_max: int = 0

    def to_dict(self) -> dict:
        """Serialize every feature to a plain dict."""
        return asdict(self)


@dataclass
class DimensionScore:
    """Judge score for one evaluation dimension."""
    score: int  # integer rating on a 0-10 scale
    reasoning: str
    flags: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize this score to a plain dict."""
        return asdict(self)


@dataclass
class JudgeResult:
    """Complete judge evaluation output.

    ``dimension_scores`` maps a dimension name to either a plain dict of the
    form ``{"score": ..., "reasoning": ..., "flags": [...]}`` or a
    DimensionScore dataclass instance.  All accessors normalize both forms
    via ``_dim_entry`` (the original code assumed plain dicts and raised
    AttributeError on dataclass entries, despite the field comment).
    """
    merge_score: float  # 0-100 weighted score
    dimension_scores: dict = field(default_factory=dict)  # dim_name -> DimensionScore or dict
    raw_output: str = ""
    # Forward reference: PatchFeatures is declared earlier in this module.
    features: Optional["PatchFeatures"] = None
    model_used: str = ""

    def _dim_entry(self, name: str) -> dict:
        """Return the entry for dimension *name* as a plain dict ({} if absent).

        Dataclass entries (e.g. DimensionScore) are converted with asdict so
        callers can uniformly use dict access.
        """
        entry = self.dimension_scores.get(name)
        if entry is None:
            return {}
        if isinstance(entry, dict):
            return entry
        return asdict(entry)

    @property
    def correctness(self) -> int:
        return self._dim_entry("correctness").get("score", 0)

    @property
    def completeness(self) -> int:
        return self._dim_entry("completeness").get("score", 0)

    @property
    def code_quality(self) -> int:
        return self._dim_entry("code_quality").get("score", 0)

    @property
    def non_regression_risk(self) -> int:
        return self._dim_entry("non_regression_risk").get("score", 0)

    @property
    def merge_readiness(self) -> int:
        return self._dim_entry("merge_readiness").get("score", 0)

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (dataclass entries are converted)."""
        d = {
            "merge_score": self.merge_score,
            "dimension_scores": {
                name: self._dim_entry(name) for name in self.dimension_scores
            },
            "raw_output": self.raw_output,
            "model_used": self.model_used,
        }
        if self.features:
            d["features"] = self.features.to_dict()
        return d

    def summary(self) -> str:
        """Return a human-readable multi-line summary of scores and flags."""
        lines = [f"MergeScore: {self.merge_score:.1f}/100"]
        for dim in self.dimension_scores:
            data = self._dim_entry(dim)
            score = data.get("score", "?")
            lines.append(f"  {dim}: {score}/10")
            # Only flagged dimensions get the warning sub-lines.
            for flag in data.get("flags") or []:
                lines.append(f"    ⚠ {flag}")
        return "\n".join(lines)


@dataclass
class ValidationResult:
    """Aggregate metrics from validating PatchJudge against ground truth."""
    total_examples: int = 0
    # METR alignment: fraction of test-passing patches that score below 50.
    test_passing_below_50_pct: float = 0.0
    # Correlation between judge scores and resolved/unresolved outcomes.
    score_resolved_correlation: float = 0.0
    mean_score_resolved: float = 0.0
    mean_score_unresolved: float = 0.0
    # How many known-bad patches the judge caught.
    known_bad_detected: int = 0
    known_bad_total: int = 0
    known_bad_detection_rate: float = 0.0
    # Overall score distribution.
    score_mean: float = 0.0
    score_std: float = 0.0
    score_median: float = 0.0
    # Per-dimension aggregate statistics.
    dimension_stats: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize all metrics to a plain dict."""
        return asdict(self)