# SysCRED benchmark runner — evaluates the credibility verification system
# against a labeled dataset and reports binary classification metrics.
# (Removed non-Python web-viewer residue that preceded this header.)
import json
import time
import os
import sys
from pathlib import Path
from typing import Dict, List
import pandas as pd
from datetime import datetime
# Add project root to path (one level up from this script)
sys.path.append(str(Path(__file__).parent.parent))
from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config
def run_benchmark():
    """Run the SysCRED evaluation benchmark and print/save the results.

    Loads test cases from ``benchmark_data.json`` (next to this script),
    analyzes each URL with the fully-featured system (ML models + Google
    Fact Check API), computes accuracy/precision/recall/F1 treating
    "High" credibility as the positive class, and writes a per-case CSV
    report to ``benchmark_results.csv``.

    Returns:
        None. All output goes to stdout and the CSV file.
    """
    print("=" * 60)
    print(" SysCRED v2.1 - Scientific Evaluation Benchmark ")
    print("=" * 60)

    # Load benchmark data from alongside this script.
    data_path = Path(__file__).parent / "benchmark_data.json"
    if not data_path.exists():
        print(f"❌ Error: {data_path} not found.")
        return
    # Explicit encoding so the benchmark behaves the same on every platform.
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    print(f"Loaded {len(dataset)} test cases.\n")

    # Initialize system with full capabilities.
    print("Initializing SysCRED (ML Models + Google API)...")
    system = CredibilityVerificationSystem(
        ontology_base_path=str(config.ONTOLOGY_BASE_PATH),
        ontology_data_path=str(config.ONTOLOGY_DATA_PATH),
        load_ml_models=True,  # Use full ML for benchmark
        google_api_key=config.GOOGLE_FACT_CHECK_API_KEY
    )
    print("System ready.\n")

    results = []
    # Run evaluation: one analysis per labeled URL.
    for i, item in enumerate(dataset):
        url = item['url']
        label = item['label']
        print(f"[{i+1}/{len(dataset)}] Analyzing: {url} (Expected: {label})...")
        start_time = time.time()
        try:
            # Empty-text fallbacks inside the system are treated as a
            # valid logic path; a missing score defaults to neutral 0.5.
            report = system.verify_information(url)
            score = report.get('score_credibilite', 0.5)
            # Binary verdict with a simplified 0.55 threshold
            # (positive class = "High" credibility).
            sys_verdict = "High" if score >= 0.55 else "Low"
            # FIX: the original expression OR-ed the same equality three
            # times (the extra clauses were strict subsets of the first);
            # a single comparison is equivalent.
            match = sys_verdict == label
            results.append({
                "url": url,
                "expected": label,
                "score": score,
                "system_verdict": sys_verdict,
                "match": match,
                "time": time.time() - start_time,
                "error": None
            })
            print(f" -> Score: {score:.2f} | Verdict: {sys_verdict} | match: {'✅' if match else '❌'}")
        except Exception as e:
            # Record failures instead of aborting: an errored case counts
            # against accuracy (verdict "Error" matches neither label).
            print(f" -> ❌ Error: {e}")
            results.append({
                "url": url,
                "expected": label,
                "score": 0,
                "system_verdict": "Error",
                "match": False,
                "time": time.time() - start_time,
                "error": str(e)
            })

    # Calculate metrics.
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    df = pd.DataFrame(results)
    # Confusion-matrix counts ("High" = positive class). "Error" verdicts
    # fall outside every bucket and therefore only lower accuracy.
    # TP: System=High, Expected=High / FP: System=High, Expected=Low
    # TN: System=Low,  Expected=Low  / FN: System=Low,  Expected=High
    tp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'High')])
    fp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'Low')])
    tn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'Low')])
    fn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'High')])
    # Guard every ratio against empty denominators (e.g. zero predictions
    # of the positive class, or an empty dataset).
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    accuracy = (tp + tn) / len(df) if len(df) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Total Cases: {len(df)}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1-Score: {f1:.2f}")
    print("\nConfusion Matrix:")
    # FIX: plain string — the original used an f-string with no placeholders.
    print(" | Pred High | Pred Low")
    print(f"True High | {tp} | {fn}")
    print(f"True Low | {fp} | {tn}")

    # Save detailed per-case report next to this script.
    report_path = Path(__file__).parent / "benchmark_results.csv"
    df.to_csv(report_path, index=False)
    print(f"\nDetailed CSV Saved to: {report_path}")
# Script entry point: execute the benchmark when run directly,
# but not when this module is imported.
if __name__ == "__main__":
    run_benchmark()