# syscred_duplicate/syscred/run_benchmark.py
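"""Run the SysCRED credibility benchmark.

Loads test cases from benchmark_data.json, scores each URL with
CredibilityVerificationSystem, binarizes the score into a High/Low
verdict, compares it with the expected label, and reports accuracy,
precision, recall, and F1. Per-case results are written to
benchmark_results.csv.
"""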
import json
import sys
import time
from pathlib import Path

import pandas as pd

# Add the project root to the path (one level up from this script)
sys.path.append(str(Path(__file__).parent.parent))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config
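

# The inline comments in run_benchmark() describe an alternative ternary
# mapping (High >= 0.7, Medium 0.4-0.7, Low < 0.4). A minimal sketch of
# that mapping is given here for reference; the benchmark itself stays
# binary and never calls this helper. The thresholds come from those
# comments, not from syscred.config, so treat them as an assumption.
def map_score_to_verdict(score: float) -> str:
    """Illustrative ternary verdict mapping (not used by the benchmark)."""
    if score >= 0.7:
        return "High"
    if score >= 0.4:
        return "Medium"
    return "Low"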

def run_benchmark():
    print("=" * 60)
    print(" SysCRED v2.1 - Scientific Evaluation Benchmark ")
    print("=" * 60)

    # Load benchmark data
    data_path = Path(__file__).parent / "benchmark_data.json"
    if not data_path.exists():
        print(f"❌ Error: {data_path} not found.")
        return
    with open(data_path, "r") as f:
        dataset = json.load(f)
    print(f"Loaded {len(dataset)} test cases.\n")

    # Initialize the system with full capabilities
    print("Initializing SysCRED (ML Models + Google API)...")
    system = CredibilityVerificationSystem(
        ontology_base_path=str(config.ONTOLOGY_BASE_PATH),
        ontology_data_path=str(config.ONTOLOGY_DATA_PATH),
        load_ml_models=True,  # Use full ML for the benchmark
        google_api_key=config.GOOGLE_FACT_CHECK_API_KEY,
    )
    print("System ready.\n")
    results = []

    # Run evaluation
    for i, item in enumerate(dataset):
        url = item["url"]
        label = item["label"]
        print(f"[{i+1}/{len(dataset)}] Analyzing: {url} (Expected: {label})...")
        start_time = time.time()
        try:
            # Run the analysis; empty-text fallbacks are treated as a
            # valid logic path rather than an error.
            report = system.verify_information(url)
            score = report.get("score_credibilite", 0.5)

            # Determine the system verdict. The benchmark is binary, so
            # scores are simply thresholded at 0.55; the ternary
            # High/Medium/Low mapping sketched in map_score_to_verdict()
            # above is an alternative, but is not used here.
            sys_verdict = "High" if score >= 0.55 else "Low"

            # Compare the system verdict with the expected label
            match = sys_verdict == label

            results.append({
                "url": url,
                "expected": label,
                "score": score,
                "system_verdict": sys_verdict,
                "match": match,
                "time": time.time() - start_time,
                "error": None,
            })
            print(f" -> Score: {score:.2f} | Verdict: {sys_verdict} | Match: {'✅' if match else '❌'}")
        except Exception as e:
            print(f" -> ❌ Error: {e}")
            results.append({
                "url": url,
                "expected": label,
                "score": 0,
                "system_verdict": "Error",
                "match": False,
                "time": time.time() - start_time,
                "error": str(e),
            })
    # Calculate metrics
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    df = pd.DataFrame(results)

    # Binary metrics with "High" credibility as the positive class:
    #   TP: system=High, expected=High
    #   FP: system=High, expected=Low
    #   TN: system=Low,  expected=Low
    #   FN: system=Low,  expected=High
    tp = len(df[(df["system_verdict"] == "High") & (df["expected"] == "High")])
    fp = len(df[(df["system_verdict"] == "High") & (df["expected"] == "Low")])
    tn = len(df[(df["system_verdict"] == "Low") & (df["expected"] == "Low")])
    fn = len(df[(df["system_verdict"] == "Low") & (df["expected"] == "High")])
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    accuracy = (tp + tn) / len(df) if len(df) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
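
    # Worked example (hypothetical counts, for intuition only): with
    # tp=8, fp=2, tn=5, fn=1 (16 cases), precision = 8/10 = 0.80,
    # recall = 8/9 ≈ 0.89, accuracy = 13/16 ≈ 0.81, and
    # F1 = 2 * 0.80 * 0.89 / (0.80 + 0.89) ≈ 0.84.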
print(f"Total Cases: {len(df)}")
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score: {f1:.2f}")
print("\nConfusion Matrix:")
print(f" | Pred High | Pred Low")
print(f"True High | {tp} | {fn}")
print(f"True Low | {fp} | {tn}")
    # Save the detailed per-case report
    report_path = Path(__file__).parent / "benchmark_results.csv"
    df.to_csv(report_path, index=False)
    print(f"\nDetailed CSV saved to: {report_path}")


if __name__ == "__main__":
    run_benchmark()
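
# Usage sketch (path assumed from the header comment above):
#   python syscred/run_benchmark.py
# The sys.path tweak at the top makes the syscred package importable even
# when the script is run directly; benchmark_data.json must sit next to
# this script.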