#!/usr/bin/env python3 """PatchJudge — Main runner script. Runs the full PatchJudge pipeline: 1. Load SWE-bench Verified + agent patches 2. Extract features 3. Judge patches with LLM 4. Validate results 5. Save everything """ import argparse import json import logging import os import sys import time from pathlib import Path from collections import defaultdict # Setup logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", ) logger = logging.getLogger("patchjudge") def run_data_loading(args): """Task 1: Load and prepare the dataset.""" from patchjudge.data_loader import SWEBenchLoader, get_diff_stats print("\n" + "=" * 70) print(" Task 1: Data Loading & SWE-bench Setup") print("=" * 70) loader = SWEBenchLoader(cache_dir=args.data_dir) # Load gold data gold = loader.load_gold_data() print(f"\nāœ… Loaded {len(gold)} SWE-bench Verified instances") # Load agent patches from HF datasets sources = args.sources.split(",") if args.sources else ["coderforge", "o1"] examples = loader.build_dataset(sources=sources) # Print stats passed = sum(1 for e in examples if e.test_passed) failed = len(examples) - passed repos = set(e.repo for e in examples) agents = set(e.agent_name for e in examples) instances = set(e.instance_id for e in examples) print(f"\nšŸ“Š Dataset Summary:") print(f" Total examples: {len(examples)}") print(f" Test passed: {passed} ({passed/len(examples):.1%})") print(f" Test failed: {failed} ({failed/len(examples):.1%})") print(f" Unique instances: {len(instances)}") print(f" Unique repos: {len(repos)}") print(f" Agent sources: {agents}") # Difficulty distribution diff_counts = defaultdict(int) for e in examples: diff_counts[e.difficulty or "unknown"] += 1 print(f"\n Difficulty:") for d, c in sorted(diff_counts.items()): print(f" {d}: {c}") # Repo distribution (top 10) repo_counts = defaultdict(int) for e in examples: repo_counts[e.repo] += 1 print(f"\n Top repos:") for repo, c in sorted(repo_counts.items(), key=lambda x: -x[1])[:10]: print(f" {repo}: {c}") # Diff stats summary print(f"\n Patch size stats (agent patches):") all_stats = [get_diff_stats(e.agent_patch) for e in examples] for key in ["lines_added", "lines_removed", "files_changed", "hunks"]: values = [s[key] for s in all_stats] if values: import statistics print(f" {key}: mean={statistics.mean(values):.1f}, " f"median={statistics.median(values):.0f}, " f"max={max(values)}") # Save path = loader.save_dataset(examples) print(f"\nšŸ’¾ Saved to: {path}") return examples, gold def run_feature_extraction(examples, args): """Task 2: Extract features from all patches.""" from patchjudge.feature_extractor import FeatureExtractor, extract_features_batch print("\n" + "=" * 70) print(" Task 2: Feature Extraction") print("=" * 70) results = extract_features_batch(examples, show_progress=True) features_list = [f for _, f in results] # Aggregate feature stats print(f"\nšŸ“ Feature Summary ({len(features_list)} patches):") bool_features = [ 'has_error_handling', 'has_edge_case_handling', 'has_todos', 'has_hardcoded_values', 'has_debug_statements', 'modifies_core_files', 'has_imports_added', 'touches_tests', ] for feat in bool_features: count = sum(1 for f in features_list if getattr(f, feat)) print(f" {feat:>30}: {count}/{len(features_list)} ({count/len(features_list):.1%})") # Scope distribution scope_counts = defaultdict(int) for f in features_list: scope_counts[f.change_scope] += 1 print(f"\n Change scope:") for scope, c in sorted(scope_counts.items()): print(f" {scope}: {c}") # Keyword 
    coverages = [f.keyword_coverage_ratio for f in features_list]
    if coverages:
        print(f"\n  Keyword coverage: "
              f"mean={statistics.mean(coverages):.2f}, "
              f"median={statistics.median(coverages):.2f}")

    # Save features
    features_path = Path(args.data_dir) / "features.jsonl"
    with open(features_path, 'w') as f:
        for ex, feat in results:
            f.write(json.dumps({
                "instance_id": ex.instance_id,
                "agent_name": ex.agent_name,
                "features": feat.to_dict(),
            }) + "\n")
    print(f"\nšŸ’¾ Features saved to: {features_path}")

    return features_list


def run_judging(examples, features_list, args):
    """Task 3: LLM Judge evaluation."""
    from patchjudge.judge import PatchJudge

    print("\n" + "=" * 70)
    print("  Task 3: LLM Judge Evaluation")
    print("=" * 70)

    # Select subset for judging
    n = min(args.judge_count, len(examples))

    # Ensure mix of passed/failed
    passed = [i for i, e in enumerate(examples) if e.test_passed]
    failed = [i for i, e in enumerate(examples) if not e.test_passed]

    # Take proportional split
    n_passed = min(len(passed), int(n * 0.6))
    n_failed = min(len(failed), n - n_passed)
    n_passed = n - n_failed  # Adjust if not enough failed

    selected_idx = passed[:n_passed] + failed[:n_failed]
    selected_examples = [examples[i] for i in selected_idx]
    selected_features = [features_list[i] for i in selected_idx] if features_list else None

    print(f"\nšŸ” Judging {len(selected_examples)} patches "
          f"({n_passed} passed, {n_failed} failed)")
    print(f"  Model: {args.model_id}")

    judge = PatchJudge(
        model_id=args.model_id,
        temperature=0.1,
        max_tokens=2000,
    )

    start = time.time()
    results = judge.judge_batch(
        selected_examples,
        selected_features,
        show_progress=True,
    )
    elapsed = time.time() - start

    print(f"\nā±ļø Judging complete in {elapsed:.1f}s "
          f"({elapsed/len(selected_examples):.1f}s per patch)")

    # Save results
    results_path = Path(args.data_dir) / "judge_results.jsonl"
    with open(results_path, 'w') as f:
        for ex, r in zip(selected_examples, results):
            f.write(json.dumps({
                "instance_id": ex.instance_id,
                "agent_name": ex.agent_name,
                "test_passed": ex.test_passed,
                "merge_score": r.merge_score,
                "dimension_scores": r.dimension_scores,
                "model_used": r.model_used,
            }) + "\n")
    print(f"šŸ’¾ Results saved to: {results_path}")

    return selected_examples, results, judge


def run_validation(examples, results, gold_data, judge, args):
    """Task 4: Validate PatchJudge against ground truth."""
    from patchjudge.validation import run_full_validation

    print("\n" + "=" * 70)
    print("  Task 4: Validation")
    print("=" * 70)

    gold_list = list(gold_data.values())[:50] if gold_data else None

    vr, report = run_full_validation(
        examples=examples,
        results=results,
        gold_data=gold_list,
        judge=judge if args.validate_known_bad else None,
    )

    print(report)

    # Save validation results
    val_path = Path(args.data_dir) / "validation_results.json"
    with open(val_path, 'w') as f:
        json.dump(vr.to_dict(), f, indent=2)
    print(f"\nšŸ’¾ Validation results saved to: {val_path}")

    # Save full report
    report_path = Path(args.data_dir) / "validation_report.txt"
    with open(report_path, 'w') as f:
        f.write(report)
    print(f"šŸ’¾ Report saved to: {report_path}")

    return vr


def main():
    parser = argparse.ArgumentParser(description="PatchJudge - Post-Test Code Quality Scorer")
    parser.add_argument("--data-dir", default="data", help="Data directory")
    parser.add_argument("--sources", default="coderforge,o1",
                        help="Comma-separated data sources: coderforge,o1,s3")
    parser.add_argument("--model-id", default="Qwen/Qwen2.5-Coder-32B-Instruct",
                        help="LLM model for judging")
parser.add_argument("--judge-count", type=int, default=50, help="Number of patches to judge") parser.add_argument("--validate-known-bad", action="store_true", help="Also generate and judge known-bad patches for validation") parser.add_argument("--tasks", default="1,2,3,4", help="Comma-separated task numbers to run (1=load, 2=features, 3=judge, 4=validate)") parser.add_argument("--load-cached", action="store_true", help="Load previously saved dataset instead of re-downloading") args = parser.parse_args() tasks = [int(t) for t in args.tasks.split(",")] os.makedirs(args.data_dir, exist_ok=True) examples = None features_list = None results = None gold_data = None judge = None # Task 1: Data Loading if 1 in tasks: if args.load_cached: from patchjudge.data_loader import SWEBenchLoader loader = SWEBenchLoader(cache_dir=args.data_dir) examples = loader.load_saved_dataset() gold_data = loader.load_gold_data() else: examples, gold_data = run_data_loading(args) # Task 2: Feature Extraction if 2 in tasks and examples: features_list = run_feature_extraction(examples, args) # Task 3: LLM Judging if 3 in tasks and examples: if features_list is None: # Extract features first features_list = run_feature_extraction(examples, args) examples, results, judge = run_judging(examples, features_list, args) # Task 4: Validation if 4 in tasks and results: run_validation(examples, results, gold_data, judge, args) print("\nāœ… PatchJudge pipeline complete!") if __name__ == "__main__": main()