"""SysCRED v2.1 evaluation benchmark.

Runs the full verification pipeline against the labelled URLs in
benchmark_data.json and reports accuracy, precision, recall, F1-score,
and a confusion matrix, then writes per-case results to a CSV file.
"""

import json
import sys
import time
from pathlib import Path

import pandas as pd

# Add project root to path (one level up from this script)
sys.path.append(str(Path(__file__).parent.parent))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config


def run_benchmark():
    print("=" * 60)
    print(" SysCRED v2.1 - Scientific Evaluation Benchmark ")
    print("=" * 60)

    # Load benchmark data: a JSON list of {"url": ..., "label": "High"|"Low"} entries.
    data_path = Path(__file__).parent / "benchmark_data.json"
    if not data_path.exists():
        print(f"❌ Error: {data_path} not found.")
        return

    with open(data_path, 'r') as f:
        dataset = json.load(f)

    print(f"Loaded {len(dataset)} test cases.\n")

    # Initialize the system with full capabilities
    print("Initializing SysCRED (ML Models + Google API)...")
    system = CredibilityVerificationSystem(
        ontology_base_path=str(config.ONTOLOGY_BASE_PATH),
        ontology_data_path=str(config.ONTOLOGY_DATA_PATH),
        load_ml_models=True,  # Use the full ML stack for the benchmark
        google_api_key=config.GOOGLE_FACT_CHECK_API_KEY,
    )
    print("System ready.\n")

    results = []

    # Run evaluation
    for i, item in enumerate(dataset):
        url = item['url']
        label = item['label']

        print(f"[{i + 1}/{len(dataset)}] Analyzing: {url} (Expected: {label})...")

        start_time = time.time()
        try:
            # Run the analysis; empty-text fallbacks are treated as a valid logic path.
            report = system.verify_information(url)
            score = report.get('score_credibilite', 0.5)

            # Determine the system verdict.
            # The benchmark is binary (High vs. Low), so a single 0.55 threshold is used.
            # A three-way mapping (High >= 0.7, Medium 0.4-0.7, Low < 0.4) would require
            # "Medium" labels in the dataset. For precision/recall, the positive class
            # is "High" credibility.
            sys_verdict = "High" if score >= 0.55 else "Low"
            match = (sys_verdict == label)

            results.append({
                "url": url,
                "expected": label,
                "score": score,
                "system_verdict": sys_verdict,
                "match": match,
                "time": time.time() - start_time,
                "error": None,
            })
            print(f"  -> Score: {score:.2f} | Verdict: {sys_verdict} | Match: {'✅' if match else '❌'}")

        except Exception as e:
            print(f"  -> ❌ Error: {e}")
            results.append({
                "url": url,
                "expected": label,
                "score": 0,
                "system_verdict": "Error",
                "match": False,
                "time": time.time() - start_time,
                "error": str(e),
            })

    # Calculate metrics
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)

    df = pd.DataFrame(results)

    # TP: System=High, Expected=High
    # FP: System=High, Expected=Low
    # TN: System=Low,  Expected=Low
    # FN: System=Low,  Expected=High
    tp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'High')])
    fp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'Low')])
    tn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'Low')])
    fn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'High')])

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    accuracy = (tp + tn) / len(df) if len(df) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Total Cases: {len(df)}")
    print(f"Accuracy:    {accuracy:.2%}")
    print(f"Precision:   {precision:.2%}")
    print(f"Recall:      {recall:.2%}")
    print(f"F1-Score:    {f1:.2f}")

    print("\nConfusion Matrix:")
    print(f"{'':<10}| Pred High | Pred Low")
    print(f"{'True High':<10}| {tp:<9} | {fn:<8}")
    print(f"{'True Low':<10}| {fp:<9} | {tn:<8}")

    # Save the detailed per-case report
    report_path = Path(__file__).parent / "benchmark_results.csv"
    df.to_csv(report_path, index=False)
    print(f"\nDetailed CSV saved to: {report_path}")


if __name__ == "__main__":
    run_benchmark()