#!/usr/bin/env python3
"""PatchJudge — Main runner script.
Runs the full PatchJudge pipeline:
1. Load SWE-bench Verified + agent patches
2. Extract features
3. Judge patches with LLM
4. Validate results
5. Save everything
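
Example usage (flags are defined in main(); the values shown are illustrative):
    python run_patchjudge.py --tasks 1,2
    python run_patchjudge.py --tasks 3,4 --judge-count 50 --validate-known-bad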
"""
import argparse
import json
import logging
import os
import statistics
import sys
import time
from pathlib import Path
from collections import defaultdict
# Setup
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger("patchjudge")
def run_data_loading(args):
"""Task 1: Load and prepare the dataset."""
from patchjudge.data_loader import SWEBenchLoader, get_diff_stats
print("\n" + "=" * 70)
print(" Task 1: Data Loading & SWE-bench Setup")
print("=" * 70)
loader = SWEBenchLoader(cache_dir=args.data_dir)
# Load gold data
gold = loader.load_gold_data()
print(f"\n✅ Loaded {len(gold)} SWE-bench Verified instances")
# Load agent patches from HF datasets
sources = args.sources.split(",") if args.sources else ["coderforge", "o1"]
examples = loader.build_dataset(sources=sources)
# Print stats
passed = sum(1 for e in examples if e.test_passed)
failed = len(examples) - passed
repos = set(e.repo for e in examples)
agents = set(e.agent_name for e in examples)
instances = set(e.instance_id for e in examples)
print(f"\n📊 Dataset Summary:")
print(f" Total examples: {len(examples)}")
print(f" Test passed: {passed} ({passed/len(examples):.1%})")
print(f" Test failed: {failed} ({failed/len(examples):.1%})")
print(f" Unique instances: {len(instances)}")
print(f" Unique repos: {len(repos)}")
print(f" Agent sources: {agents}")
# Difficulty distribution
diff_counts = defaultdict(int)
for e in examples:
diff_counts[e.difficulty or "unknown"] += 1
print(f"\n Difficulty:")
for d, c in sorted(diff_counts.items()):
print(f" {d}: {c}")
# Repo distribution (top 10)
repo_counts = defaultdict(int)
for e in examples:
repo_counts[e.repo] += 1
print(f"\n Top repos:")
for repo, c in sorted(repo_counts.items(), key=lambda x: -x[1])[:10]:
print(f" {repo}: {c}")
# Diff stats summary
print(f"\n Patch size stats (agent patches):")
all_stats = [get_diff_stats(e.agent_patch) for e in examples]
for key in ["lines_added", "lines_removed", "files_changed", "hunks"]:
values = [s[key] for s in all_stats]
if values:
print(f" {key}: mean={statistics.mean(values):.1f}, "
f"median={statistics.median(values):.0f}, "
f"max={max(values)}")
# Save
path = loader.save_dataset(examples)
print(f"\n💾 Saved to: {path}")
return examples, gold
def run_feature_extraction(examples, args):
"""Task 2: Extract features from all patches."""
from patchjudge.feature_extractor import FeatureExtractor, extract_features_batch
print("\n" + "=" * 70)
print(" Task 2: Feature Extraction")
print("=" * 70)
results = extract_features_batch(examples, show_progress=True)
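    # Each entry is an (example, features) pair; keep just the feature objects here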
features_list = [f for _, f in results]
# Aggregate feature stats
print(f"\n📐 Feature Summary ({len(features_list)} patches):")
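    # Boolean flags reported below; each name must match an attribute on the
    # extracted feature objects, since they are read via getattr().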
bool_features = [
'has_error_handling', 'has_edge_case_handling', 'has_todos',
'has_hardcoded_values', 'has_debug_statements', 'modifies_core_files',
'has_imports_added', 'touches_tests',
]
for feat in bool_features:
count = sum(1 for f in features_list if getattr(f, feat))
print(f" {feat:>30}: {count}/{len(features_list)} ({count/len(features_list):.1%})")
# Scope distribution
scope_counts = defaultdict(int)
for f in features_list:
scope_counts[f.change_scope] += 1
print(f"\n Change scope:")
for scope, c in sorted(scope_counts.items()):
print(f" {scope}: {c}")
# Keyword coverage
coverages = [f.keyword_coverage_ratio for f in features_list]
if coverages:
print(f"\n Keyword coverage: "
f"mean={statistics.mean(coverages):.2f}, "
f"median={statistics.median(coverages):.2f}")
# Save features
features_path = Path(args.data_dir) / "features.jsonl"
with open(features_path, 'w') as f:
for ex, feat in results:
f.write(json.dumps({
"instance_id": ex.instance_id,
"agent_name": ex.agent_name,
"features": feat.to_dict(),
}) + "\n")
print(f"\n💾 Features saved to: {features_path}")
return features_list
def run_judging(examples, features_list, args):
"""Task 3: LLM Judge evaluation."""
from patchjudge.judge import PatchJudge
print("\n" + "=" * 70)
print(" Task 3: LLM Judge Evaluation")
print("=" * 70)
# Select subset for judging
n = min(args.judge_count, len(examples))
# Ensure mix of passed/failed
passed = [i for i, e in enumerate(examples) if e.test_passed]
failed = [i for i, e in enumerate(examples) if not e.test_passed]
# Take proportional split
    n_passed = min(len(passed), int(n * 0.6))
    n_failed = min(len(failed), n - n_passed)
    n_passed = min(len(passed), n - n_failed)  # Re-fill from passed if there aren't enough failed
selected_idx = passed[:n_passed] + failed[:n_failed]
selected_examples = [examples[i] for i in selected_idx]
selected_features = [features_list[i] for i in selected_idx] if features_list else None
print(f"\n🔍 Judging {len(selected_examples)} patches "
f"({n_passed} passed, {n_failed} failed)")
print(f" Model: {args.model_id}")
judge = PatchJudge(
model_id=args.model_id,
temperature=0.1,
max_tokens=2000,
)
start = time.time()
results = judge.judge_batch(
selected_examples,
selected_features,
show_progress=True,
)
elapsed = time.time() - start
print(f"\n⏱️ Judging complete in {elapsed:.1f}s "
f"({elapsed/len(selected_examples):.1f}s per patch)")
# Save results
results_path = Path(args.data_dir) / "judge_results.jsonl"
with open(results_path, 'w') as f:
for ex, r in zip(selected_examples, results):
f.write(json.dumps({
"instance_id": ex.instance_id,
"agent_name": ex.agent_name,
"test_passed": ex.test_passed,
"merge_score": r.merge_score,
"dimension_scores": r.dimension_scores,
"model_used": r.model_used,
}) + "\n")
print(f"💾 Results saved to: {results_path}")
return selected_examples, results, judge
def run_validation(examples, results, gold_data, judge, args):
"""Task 4: Validate PatchJudge against ground truth."""
from patchjudge.validation import run_full_validation
print("\n" + "=" * 70)
print(" Task 4: Validation")
print("=" * 70)
gold_list = list(gold_data.values())[:50] if gold_data else None
vr, report = run_full_validation(
examples=examples,
results=results,
gold_data=gold_list,
judge=judge if args.validate_known_bad else None,
)
print(report)
# Save validation results
val_path = Path(args.data_dir) / "validation_results.json"
with open(val_path, 'w') as f:
json.dump(vr.to_dict(), f, indent=2)
print(f"\n💾 Validation results saved to: {val_path}")
# Save full report
report_path = Path(args.data_dir) / "validation_report.txt"
with open(report_path, 'w') as f:
f.write(report)
print(f"💾 Report saved to: {report_path}")
return vr
def main():
parser = argparse.ArgumentParser(description="PatchJudge - Post-Test Code Quality Scorer")
parser.add_argument("--data-dir", default="data", help="Data directory")
parser.add_argument("--sources", default="coderforge,o1",
help="Comma-separated data sources: coderforge,o1,s3")
parser.add_argument("--model-id", default="Qwen/Qwen2.5-Coder-32B-Instruct",
help="LLM model for judging")
parser.add_argument("--judge-count", type=int, default=50,
help="Number of patches to judge")
parser.add_argument("--validate-known-bad", action="store_true",
help="Also generate and judge known-bad patches for validation")
parser.add_argument("--tasks", default="1,2,3,4",
help="Comma-separated task numbers to run (1=load, 2=features, 3=judge, 4=validate)")
parser.add_argument("--load-cached", action="store_true",
help="Load previously saved dataset instead of re-downloading")
args = parser.parse_args()
tasks = [int(t) for t in args.tasks.split(",")]
os.makedirs(args.data_dir, exist_ok=True)
examples = None
features_list = None
results = None
gold_data = None
judge = None
# Task 1: Data Loading
if 1 in tasks:
if args.load_cached:
from patchjudge.data_loader import SWEBenchLoader
loader = SWEBenchLoader(cache_dir=args.data_dir)
examples = loader.load_saved_dataset()
gold_data = loader.load_gold_data()
else:
examples, gold_data = run_data_loading(args)
# Task 2: Feature Extraction
if 2 in tasks and examples:
features_list = run_feature_extraction(examples, args)
# Task 3: LLM Judging
if 3 in tasks and examples:
if features_list is None:
# Extract features first
features_list = run_feature_extraction(examples, args)
examples, results, judge = run_judging(examples, features_list, args)
# Task 4: Validation
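    # Runs only if Task 3 produced judge results in this same invocation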
if 4 in tasks and results:
run_validation(examples, results, gold_data, judge, args)
print("\n✅ PatchJudge pipeline complete!")
if __name__ == "__main__":
main()