# syscred_duplicate/syscred/run_benchmark.py
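"""Run the SysCRED credibility benchmark.

Loads test cases from benchmark_data.json, scores each URL with
CredibilityVerificationSystem, binarizes the score into a High/Low
verdict, compares it with the expected label, and reports accuracy,
precision, recall, and F1. Per-case results are written to
benchmark_results.csv.
"""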
import json
import sys
import time
from pathlib import Path

import pandas as pd

# Add the project root to the path (one level up from this script)
sys.path.append(str(Path(__file__).parent.parent))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config
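

# The inline comments in run_benchmark() describe an alternative ternary
# mapping (High >= 0.7, Medium 0.4-0.7, Low < 0.4). A minimal sketch of
# that mapping is given here for reference; the benchmark itself stays
# binary and never calls this helper. The thresholds come from those
# comments, not from syscred.config, so treat them as an assumption.
def map_score_to_verdict(score: float) -> str:
    """Illustrative ternary verdict mapping (not used by the benchmark)."""
    if score >= 0.7:
        return "High"
    if score >= 0.4:
        return "Medium"
    return "Low"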

def run_benchmark():
    print("=" * 60)
    print(" SysCRED v2.1 - Scientific Evaluation Benchmark ")
    print("=" * 60)

    # Load benchmark data
    data_path = Path(__file__).parent / "benchmark_data.json"
    if not data_path.exists():
        print(f"❌ Error: {data_path} not found.")
        return
    with open(data_path, "r") as f:
        dataset = json.load(f)
    print(f"Loaded {len(dataset)} test cases.\n")

    # Initialize the system with full capabilities
    print("Initializing SysCRED (ML Models + Google API)...")
    system = CredibilityVerificationSystem(
        ontology_base_path=str(config.ONTOLOGY_BASE_PATH),
        ontology_data_path=str(config.ONTOLOGY_DATA_PATH),
        load_ml_models=True,  # Use full ML for the benchmark
        google_api_key=config.GOOGLE_FACT_CHECK_API_KEY,
    )
    print("System ready.\n")
    results = []

    # Run evaluation
    for i, item in enumerate(dataset):
        url = item["url"]
        label = item["label"]
        print(f"[{i+1}/{len(dataset)}] Analyzing: {url} (Expected: {label})...")
        start_time = time.time()
        try:
            # Run the analysis; empty-text fallbacks are treated as a
            # valid logic path rather than an error.
            report = system.verify_information(url)
            score = report.get("score_credibilite", 0.5)

            # Determine the system verdict. The benchmark is binary, so
            # scores are simply thresholded at 0.55; the ternary
            # High/Medium/Low mapping sketched in map_score_to_verdict()
            # above is an alternative, but is not used here.
            sys_verdict = "High" if score >= 0.55 else "Low"

            # Compare the system verdict with the expected label
            match = sys_verdict == label

            results.append({
                "url": url,
                "expected": label,
                "score": score,
                "system_verdict": sys_verdict,
                "match": match,
                "time": time.time() - start_time,
                "error": None,
            })
            print(f" -> Score: {score:.2f} | Verdict: {sys_verdict} | Match: {'✅' if match else '❌'}")
        except Exception as e:
            print(f" -> ❌ Error: {e}")
            results.append({
                "url": url,
                "expected": label,
                "score": 0,
                "system_verdict": "Error",
                "match": False,
                "time": time.time() - start_time,
                "error": str(e),
            })
    # Calculate metrics
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    df = pd.DataFrame(results)

    # Binary metrics with "High" credibility as the positive class:
    #   TP: system=High, expected=High
    #   FP: system=High, expected=Low
    #   TN: system=Low,  expected=Low
    #   FN: system=Low,  expected=High
    tp = len(df[(df["system_verdict"] == "High") & (df["expected"] == "High")])
    fp = len(df[(df["system_verdict"] == "High") & (df["expected"] == "Low")])
    tn = len(df[(df["system_verdict"] == "Low") & (df["expected"] == "Low")])
    fn = len(df[(df["system_verdict"] == "Low") & (df["expected"] == "High")])
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    accuracy = (tp + tn) / len(df) if len(df) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
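
    # Worked example (hypothetical counts, for intuition only): with
    # tp=8, fp=2, tn=5, fn=1 (16 cases), precision = 8/10 = 0.80,
    # recall = 8/9 ≈ 0.89, accuracy = 13/16 ≈ 0.81, and
    # F1 = 2 * 0.80 * 0.89 / (0.80 + 0.89) ≈ 0.84.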
print(f"Total Cases: {len(df)}")
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score: {f1:.2f}")
print("\nConfusion Matrix:")
print(f" | Pred High | Pred Low")
print(f"True High | {tp} | {fn}")
print(f"True Low | {fp} | {tn}")
    # Save the detailed per-case report
    report_path = Path(__file__).parent / "benchmark_results.csv"
    df.to_csv(report_path, index=False)
    print(f"\nDetailed CSV saved to: {report_path}")


if __name__ == "__main__":
    run_benchmark()
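
# Usage sketch (path assumed from the header comment above):
#   python syscred/run_benchmark.py
# The sys.path tweak at the top makes the syscred package importable even
# when the script is run directly; benchmark_data.json must sit next to
# this script.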