#!/usr/bin/env python3
"""PatchJudge — Main runner script.
Runs the full PatchJudge pipeline:
1. Load SWE-bench Verified + agent patches
2. Extract features
3. Judge patches with LLM
4. Validate results
5. Save everything
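
Example usage (flags are defined in main(); the values shown are illustrative):
    python run_patchjudge.py --tasks 1,2
    python run_patchjudge.py --tasks 3,4 --judge-count 50 --validate-known-bad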
"""
import argparse
import json
import logging
import os
import statistics
import sys
import time
from pathlib import Path
from collections import defaultdict
# Setup
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger("patchjudge")
def run_data_loading(args):
"""Task 1: Load and prepare the dataset."""
from patchjudge.data_loader import SWEBenchLoader, get_diff_stats
print("\n" + "=" * 70)
print(" Task 1: Data Loading & SWE-bench Setup")
print("=" * 70)
loader = SWEBenchLoader(cache_dir=args.data_dir)
# Load gold data
gold = loader.load_gold_data()
print(f"\n✅ Loaded {len(gold)} SWE-bench Verified instances")
# Load agent patches from HF datasets
sources = args.sources.split(",") if args.sources else ["coderforge", "o1"]
examples = loader.build_dataset(sources=sources)
# Print stats
passed = sum(1 for e in examples if e.test_passed)
failed = len(examples) - passed
repos = set(e.repo for e in examples)
agents = set(e.agent_name for e in examples)
instances = set(e.instance_id for e in examples)
print(f"\n📊 Dataset Summary:")
print(f" Total examples: {len(examples)}")
print(f" Test passed: {passed} ({passed/len(examples):.1%})")
print(f" Test failed: {failed} ({failed/len(examples):.1%})")
print(f" Unique instances: {len(instances)}")
print(f" Unique repos: {len(repos)}")
print(f" Agent sources: {agents}")
# Difficulty distribution
diff_counts = defaultdict(int)
for e in examples:
diff_counts[e.difficulty or "unknown"] += 1
print(f"\n Difficulty:")
for d, c in sorted(diff_counts.items()):
print(f" {d}: {c}")
# Repo distribution (top 10)
repo_counts = defaultdict(int)
for e in examples:
repo_counts[e.repo] += 1
print(f"\n Top repos:")
for repo, c in sorted(repo_counts.items(), key=lambda x: -x[1])[:10]:
print(f" {repo}: {c}")
# Diff stats summary
print(f"\n Patch size stats (agent patches):")
all_stats = [get_diff_stats(e.agent_patch) for e in examples]
for key in ["lines_added", "lines_removed", "files_changed", "hunks"]:
values = [s[key] for s in all_stats]
if values:
print(f" {key}: mean={statistics.mean(values):.1f}, "
f"median={statistics.median(values):.0f}, "
f"max={max(values)}")
# Save
path = loader.save_dataset(examples)
print(f"\n💾 Saved to: {path}")
return examples, gold
def run_feature_extraction(examples, args):
"""Task 2: Extract features from all patches."""
from patchjudge.feature_extractor import FeatureExtractor, extract_features_batch
print("\n" + "=" * 70)
print(" Task 2: Feature Extraction")
print("=" * 70)
results = extract_features_batch(examples, show_progress=True)
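    # Each entry is an (example, features) pair; keep just the feature objects here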
features_list = [f for _, f in results]
# Aggregate feature stats
print(f"\n📐 Feature Summary ({len(features_list)} patches):")
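    # Boolean flags reported below; each name must match an attribute on the
    # extracted feature objects, since they are read via getattr().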
bool_features = [
'has_error_handling', 'has_edge_case_handling', 'has_todos',
'has_hardcoded_values', 'has_debug_statements', 'modifies_core_files',
'has_imports_added', 'touches_tests',
]
for feat in bool_features:
count = sum(1 for f in features_list if getattr(f, feat))
print(f" {feat:>30}: {count}/{len(features_list)} ({count/len(features_list):.1%})")
# Scope distribution
scope_counts = defaultdict(int)
for f in features_list:
scope_counts[f.change_scope] += 1
print(f"\n Change scope:")
for scope, c in sorted(scope_counts.items()):
print(f" {scope}: {c}")
# Keyword coverage
coverages = [f.keyword_coverage_ratio for f in features_list]
if coverages:
print(f"\n Keyword coverage: "
f"mean={statistics.mean(coverages):.2f}, "
f"median={statistics.median(coverages):.2f}")
# Save features
features_path = Path(args.data_dir) / "features.jsonl"
with open(features_path, 'w') as f:
for ex, feat in results:
f.write(json.dumps({
"instance_id": ex.instance_id,
"agent_name": ex.agent_name,
"features": feat.to_dict(),
}) + "\n")
print(f"\n💾 Features saved to: {features_path}")
return features_list
def run_judging(examples, features_list, args):
"""Task 3: LLM Judge evaluation."""
from patchjudge.judge import PatchJudge
print("\n" + "=" * 70)
print(" Task 3: LLM Judge Evaluation")
print("=" * 70)
# Select subset for judging
n = min(args.judge_count, len(examples))
# Ensure mix of passed/failed
passed = [i for i, e in enumerate(examples) if e.test_passed]
failed = [i for i, e in enumerate(examples) if not e.test_passed]
# Take proportional split
    n_passed = min(len(passed), int(n * 0.6))
    n_failed = min(len(failed), n - n_passed)
    n_passed = min(len(passed), n - n_failed)  # Re-fill from passed if there aren't enough failed
selected_idx = passed[:n_passed] + failed[:n_failed]
selected_examples = [examples[i] for i in selected_idx]
selected_features = [features_list[i] for i in selected_idx] if features_list else None
print(f"\n🔍 Judging {len(selected_examples)} patches "
f"({n_passed} passed, {n_failed} failed)")
print(f" Model: {args.model_id}")
judge = PatchJudge(
model_id=args.model_id,
temperature=0.1,
max_tokens=2000,
)
start = time.time()
results = judge.judge_batch(
selected_examples,
selected_features,
show_progress=True,
)
elapsed = time.time() - start
print(f"\n⏱️ Judging complete in {elapsed:.1f}s "
f"({elapsed/len(selected_examples):.1f}s per patch)")
# Save results
results_path = Path(args.data_dir) / "judge_results.jsonl"
with open(results_path, 'w') as f:
for ex, r in zip(selected_examples, results):
f.write(json.dumps({
"instance_id": ex.instance_id,
"agent_name": ex.agent_name,
"test_passed": ex.test_passed,
"merge_score": r.merge_score,
"dimension_scores": r.dimension_scores,
"model_used": r.model_used,
}) + "\n")
print(f"💾 Results saved to: {results_path}")
return selected_examples, results, judge
def run_validation(examples, results, gold_data, judge, args):
"""Task 4: Validate PatchJudge against ground truth."""
from patchjudge.validation import run_full_validation
print("\n" + "=" * 70)
print(" Task 4: Validation")
print("=" * 70)
gold_list = list(gold_data.values())[:50] if gold_data else None
vr, report = run_full_validation(
examples=examples,
results=results,
gold_data=gold_list,
judge=judge if args.validate_known_bad else None,
)
print(report)
# Save validation results
val_path = Path(args.data_dir) / "validation_results.json"
with open(val_path, 'w') as f:
json.dump(vr.to_dict(), f, indent=2)
print(f"\n💾 Validation results saved to: {val_path}")
# Save full report
report_path = Path(args.data_dir) / "validation_report.txt"
with open(report_path, 'w') as f:
f.write(report)
print(f"💾 Report saved to: {report_path}")
return vr
def main():
parser = argparse.ArgumentParser(description="PatchJudge - Post-Test Code Quality Scorer")
parser.add_argument("--data-dir", default="data", help="Data directory")
parser.add_argument("--sources", default="coderforge,o1",
help="Comma-separated data sources: coderforge,o1,s3")
parser.add_argument("--model-id", default="Qwen/Qwen2.5-Coder-32B-Instruct",
help="LLM model for judging")
parser.add_argument("--judge-count", type=int, default=50,
help="Number of patches to judge")
parser.add_argument("--validate-known-bad", action="store_true",
help="Also generate and judge known-bad patches for validation")
parser.add_argument("--tasks", default="1,2,3,4",
help="Comma-separated task numbers to run (1=load, 2=features, 3=judge, 4=validate)")
parser.add_argument("--load-cached", action="store_true",
help="Load previously saved dataset instead of re-downloading")
args = parser.parse_args()
tasks = [int(t) for t in args.tasks.split(",")]
os.makedirs(args.data_dir, exist_ok=True)
examples = None
features_list = None
results = None
gold_data = None
judge = None
# Task 1: Data Loading
if 1 in tasks:
if args.load_cached:
from patchjudge.data_loader import SWEBenchLoader
loader = SWEBenchLoader(cache_dir=args.data_dir)
examples = loader.load_saved_dataset()
gold_data = loader.load_gold_data()
else:
examples, gold_data = run_data_loading(args)
# Task 2: Feature Extraction
if 2 in tasks and examples:
features_list = run_feature_extraction(examples, args)
# Task 3: LLM Judging
if 3 in tasks and examples:
if features_list is None:
# Extract features first
features_list = run_feature_extraction(examples, args)
examples, results, judge = run_judging(examples, features_list, args)
# Task 4: Validation
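    # Runs only if Task 3 produced judge results in this same invocation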
if 4 in tasks and results:
run_validation(examples, results, gold_data, judge, args)
print("\n✅ PatchJudge pipeline complete!")
if __name__ == "__main__":
main()