# NOTE: removed Hugging Face Spaces UI residue ("Spaces: Running") that was
# captured with this file — it is not part of the script.
import json
import time
import os
import sys
from pathlib import Path
from typing import Dict, List
import pandas as pd
from datetime import datetime

# Add project root to path (one level up from this script) so the local
# `syscred` package resolves when this file is run directly as a script.
sys.path.append(str(Path(__file__).parent.parent))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config
def compute_metrics(df: pd.DataFrame) -> Dict[str, float]:
    """Compute binary classification metrics over the benchmark results.

    Positive class = "High" credibility:
      TP: System=High, Expected=High
      FP: System=High, Expected=Low
      TN: System=Low,  Expected=Low
      FN: System=Low,  Expected=High
    Rows with system_verdict == "Error" count against accuracy (they are in
    the denominator) but fall outside the High/Low confusion cells.

    Args:
        df: DataFrame with at least 'system_verdict' and 'expected' columns.

    Returns:
        Dict with keys tp, fp, tn, fn, precision, recall, accuracy, f1.
        Ratio metrics fall back to 0 when their denominator is 0.
    """
    tp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'High')])
    fp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'Low')])
    tn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'Low')])
    fn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'High')])
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    accuracy = (tp + tn) / len(df) if len(df) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return {
        "tp": tp, "fp": fp, "tn": tn, "fn": fn,
        "precision": precision, "recall": recall,
        "accuracy": accuracy, "f1": f1,
    }


def run_benchmark(threshold: float = 0.55):
    """Run the SysCRED evaluation benchmark end to end.

    Loads test cases from benchmark_data.json (next to this script), analyzes
    each URL with the fully-featured system (ML models + Google Fact Check
    API), compares the binary verdict against the expected label, prints a
    metrics summary, and saves a detailed CSV report.

    Args:
        threshold: score at or above which a result is labeled "High"
            credibility. Defaults to the original hard-coded 0.55.
    """
    print("="*60)
    print(" SysCRED v2.1 - Scientific Evaluation Benchmark ")
    print("="*60)

    # Load benchmark data
    data_path = Path(__file__).parent / "benchmark_data.json"
    if not data_path.exists():
        print(f"❌ Error: {data_path} not found.")
        return
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    print(f"Loaded {len(dataset)} test cases.\n")

    # Initialize system with full capabilities
    print("Initializing SysCRED (ML Models + Google API)...")
    system = CredibilityVerificationSystem(
        ontology_base_path=str(config.ONTOLOGY_BASE_PATH),
        ontology_data_path=str(config.ONTOLOGY_DATA_PATH),
        load_ml_models=True,  # use full ML for benchmark
        google_api_key=config.GOOGLE_FACT_CHECK_API_KEY
    )
    print("System ready.\n")

    results = []
    for i, item in enumerate(dataset):
        url = item['url']
        label = item['label']
        print(f"[{i+1}/{len(dataset)}] Analyzing: {url} (Expected: {label})...")
        start_time = time.time()
        try:
            # Run analysis; empty-text fallbacks inside the system are
            # treated as a valid logic path (score defaults to 0.5).
            report = system.verify_information(url)
            score = report.get('score_credibilite', 0.5)
            # Binary verdict for the simplified High/Low benchmark.
            sys_verdict = "High" if score >= threshold else "Low"
            # Direct label comparison (the original expression's extra
            # clauses were redundant — subsumed by this equality).
            match = sys_verdict == label
            results.append({
                "url": url,
                "expected": label,
                "score": score,
                "system_verdict": sys_verdict,
                "match": match,
                "time": time.time() - start_time,
                "error": None
            })
            print(f" -> Score: {score:.2f} | Verdict: {sys_verdict} | match: {'✅' if match else '❌'}")
        except Exception as e:
            # Best-effort benchmark: record the failure and keep going so a
            # single bad URL does not abort the whole run.
            print(f" -> ❌ Error: {e}")
            results.append({
                "url": url,
                "expected": label,
                "score": 0,
                "system_verdict": "Error",
                "match": False,
                "time": time.time() - start_time,
                "error": str(e)
            })

    # Summarize and report
    print("\n" + "="*60)
    print("RESULTS SUMMARY")
    print("="*60)
    df = pd.DataFrame(results)
    m = compute_metrics(df)
    print(f"Total Cases: {len(df)}")
    print(f"Accuracy: {m['accuracy']:.2%}")
    print(f"Precision: {m['precision']:.2%}")
    print(f"Recall: {m['recall']:.2%}")
    print(f"F1-Score: {m['f1']:.2f}")
    print("\nConfusion Matrix:")
    print(f" | Pred High | Pred Low")
    print(f"True High | {m['tp']} | {m['fn']}")
    print(f"True Low | {m['fp']} | {m['tn']}")

    # Save detailed per-case report alongside this script
    report_path = Path(__file__).parent / "benchmark_results.csv"
    df.to_csv(report_path, index=False)
    print(f"\nDetailed CSV Saved to: {report_path}")
| if __name__ == "__main__": | |
| run_benchmark() | |