# syscred_duplicate / syscred / backend_app.py
# DomLoyer's picture
# Sync: TREC IR metrics in verify, DB fallback, NER/EEAT fix, all API keys
# ea9303b verified
# -*- coding: utf-8 -*-
"""
SysCRED Backend API - Flask Server
===================================
REST API for the credibility verification system.
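Endpoints:
- POST /api/verify - Verify URL or text credibility
- POST /api/seo - Get SEO analysis only
- GET /api/ontology/graph - Get ontology graph data (nodes and links)
- GET /api/ontology/stats - Get ontology statistics
- GET /api/health - Health check
- GET /api/config - View current configuration (no matching route is visible in this file; TODO confirm)
- POST /api/trec/search - Retrieve evidence from the TREC demo corpus
- GET /api/trec/corpus - Describe the TREC demo corpus
- POST /api/trec/metrics - Compute IR evaluation metrics
- GET /api/trec/health - TREC module health check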
(c) Dominique S. Loyer - PhD Thesis Prototype
"""
import sys
import os
import traceback
# Pull settings from a .env file when python-dotenv is installed; otherwise
# rely on whatever is already present in the process environment.
from pathlib import Path
try:
    from dotenv import load_dotenv
    # Preferred location: the project root (the parent of syscred/).
    env_path = Path(__file__).resolve().parent.parent / '.env'
    if not env_path.exists():
        # Otherwise look next to this module, inside syscred/.
        env_path = Path(__file__).parent / '.env'
    if not env_path.exists():
        print(f"[SysCRED Backend] No .env file found, using system env vars")
    else:
        load_dotenv(env_path)
        print(f"[SysCRED Backend] Loaded .env from {env_path}")
except ImportError:
    print("[SysCRED Backend] python-dotenv not installed, using system env vars")
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
# Put the directory that contains the syscred/ package on the import path so
# the `syscred.*` imports below can resolve.
sys.path.insert(
    0,
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
)

# Availability flags; each one is flipped to True only after the matching
# group of imports has succeeded.
SYSCRED_AVAILABLE = False
TREC_AVAILABLE = False
DB_AVAILABLE = False

# Core modules (required)
try:
    from syscred.verification_system import CredibilityVerificationSystem
    from syscred.seo_analyzer import SEOAnalyzer
    from syscred.ontology_manager import OntologyManager
    from syscred.config import config, Config
    SYSCRED_AVAILABLE = True
    print("[SysCRED Backend] Core modules imported successfully")
except ImportError as core_import_error:
    print(f"[SysCRED Backend] Warning: Core modules failed: {core_import_error}")

    class Config:
        """Stand-in settings used when ``syscred.config`` cannot be imported."""
        HOST = "0.0.0.0"
        PORT = 5000
        DEBUG = True
        ONTOLOGY_BASE_PATH = None
        ONTOLOGY_DATA_PATH = None
        LOAD_ML_MODELS = True
        GOOGLE_FACT_CHECK_API_KEY = None

    config = Config()
# Database layer (optional): when the import fails, init_db becomes a no-op.
try:
    from syscred.database import init_db, db, AnalysisResult
    DB_AVAILABLE = True
    print("[SysCRED Backend] Database module loaded")
except ImportError as db_import_error:
    print(f"[SysCRED Backend] Database disabled: {db_import_error}")

    def init_db(app):
        """Do nothing; placeholder used while the database module is unavailable."""
        return None
# TREC retrieval and evaluation modules (optional); TREC_AVAILABLE records
# whether they could be imported.
try:
    from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult
    from syscred.eval_metrics import EvaluationMetrics
    TREC_AVAILABLE = True
    print("[SysCRED Backend] TREC modules loaded")
except ImportError as trec_import_error:
    print(f"[SysCRED Backend] TREC modules disabled: {trec_import_error}")
# --- Initialize Flask App ---
# Application object on which every route and hook in this module is registered.
app = Flask(__name__)
# CORS is applied to the whole app with no options passed.
CORS(app) # Enable CORS for frontend
# Allow iframe embedding on UQAM domains (for the syscred.uqam.ca mirror)
@app.after_request
def add_security_headers(response):
    """Stamp framing-related headers on every outgoing response.

    Overwrites ``X-Frame-Options`` and ``Content-Security-Policy`` on
    *response* and returns that same response object.

    NOTE(review): the ``ALLOW-FROM`` directive is documented as obsolete;
    current browsers are expected to honour ``frame-ancestors`` instead -
    confirm whether the legacy header is still needed.
    """
    framing_headers = {
        'X-Frame-Options': 'ALLOW-FROM https://syscred.uqam.ca',
        'Content-Security-Policy': (
            "frame-ancestors 'self' https://syscred.uqam.ca https://*.uqam.ca"
        ),
    }
    for header_name, header_value in framing_headers.items():
        response.headers[header_name] = header_value
    return response
# Bind the database to the Flask app. A failure here is only logged so the
# module can still finish loading.
try:
    init_db(app)  # [NEW] Setup DB connection
except Exception as db_init_error:
    print(f"[SysCRED Backend] Warning: DB init failed: {db_init_error}")
# --- Initialize SysCRED System ---
# Module-level singletons. They start as None and are assigned later by
# initialize_system() and by the endpoints that need them.
credibility_system = None
seo_analyzer = None
trec_retriever = None
eval_metrics = None
# Demo corpus for TREC (AP88-90 style documents)
# Maps a document ID to a dict holding the document's 'text' and 'title'.
TREC_DEMO_CORPUS = {
    "AP880101-0001": {
        "text": "Climate change is primarily caused by human activities, particularly the burning of fossil fuels which release greenhouse gases into the atmosphere.",
        "title": "Climate Science Report"
    },
    "AP880101-0002": {
        "text": "The Earth's temperature has risen significantly over the past century due to greenhouse gas emissions from industrial activities and deforestation.",
        "title": "Global Warming Study"
    },
    "AP880102-0001": {
        "text": "Scientists warn that sea levels could rise dramatically if current warming trends continue, threatening coastal cities worldwide.",
        "title": "Sea Level Warning"
    },
    "AP890215-0001": {
        "text": "The presidential election campaign focused on economic policies, healthcare reform, and national security issues.",
        "title": "Election Coverage"
    },
    "AP890216-0001": {
        "text": "Stock markets rose sharply after positive economic indicators were released by the Federal Reserve, signaling economic recovery.",
        "title": "Financial News"
    },
    "AP880201-0001": {
        "text": "Renewable energy sources like solar and wind power are becoming more cost-effective alternatives to fossil fuels.",
        "title": "Green Energy Report"
    },
    "AP890301-0001": {
        "text": "The technology industry continues to grow rapidly, with artificial intelligence and machine learning driving innovation.",
        "title": "Tech Industry Update"
    },
}
def initialize_system():
    """Create the SEO analyzer and the credibility system singletons.

    Returns:
        True when both objects were created; False when the core modules are
        unavailable or when construction raised (the error is printed with
        its traceback).
    """
    global credibility_system, seo_analyzer

    if not SYSCRED_AVAILABLE:
        print("[SysCRED Backend] Cannot initialize - modules not available")
        return False

    try:
        # The SEO analyzer comes first because it is the lightweight part.
        seo_analyzer = SEOAnalyzer()
        print("[SysCRED Backend] SEO Analyzer initialized")

        # The full system may take a while (ML models are loaded here).
        print("[SysCRED Backend] Initializing credibility system (loading ML models)...")
        base_path = str(config.ONTOLOGY_BASE_PATH) if config.ONTOLOGY_BASE_PATH else None
        data_path = str(config.ONTOLOGY_DATA_PATH) if config.ONTOLOGY_DATA_PATH else None
        # Only hand over a base ontology path that actually exists on disk.
        if not (base_path and os.path.exists(base_path)):
            base_path = None

        system_kwargs = {
            'ontology_base_path': base_path,
            'ontology_data_path': data_path,
            'load_ml_models': config.LOAD_ML_MODELS,
            'google_api_key': config.GOOGLE_FACT_CHECK_API_KEY,
        }
        credibility_system = CredibilityVerificationSystem(**system_kwargs)
        print("[SysCRED Backend] System initialized successfully!")
        return True
    except Exception as init_error:
        print(f"[SysCRED Backend] Error initializing system: {init_error}")
        traceback.print_exc()
        return False
# --- API Routes ---
@app.route('/')
def index():
    """Return ``index.html`` from the ``static`` directory (the frontend page)."""
    frontend_dir, entry_page = 'static', 'index.html'
    return send_from_directory(frontend_dir, entry_page)
@app.route('/api/health', methods=['GET'])
def health_check():
    """Report whether the core modules loaded and which singletons exist."""
    system_ready = credibility_system is not None
    analyzer_ready = seo_analyzer is not None
    status_payload = {
        'status': 'healthy',
        'syscred_available': SYSCRED_AVAILABLE,
        'system_initialized': system_ready,
        'seo_analyzer_ready': analyzer_ready,
    }
    return jsonify(status_payload)
def _attach_seo_analysis(result, url):
    """Add an on-page SEO summary for *url* to *result* under ``seoAnalysis``.

    Best-effort: a failure is logged and stored as
    ``result['seoAnalysis'] = {'error': ...}`` instead of aborting the request.
    Nothing is added when the page fetch reports a falsy ``success``.
    """
    try:
        web_content = credibility_system.api_clients.fetch_web_content(url)
        if web_content.success:
            seo_result = seo_analyzer.analyze_seo(
                url=url,
                title=web_content.title,
                meta_description=web_content.meta_description,
                text_content=web_content.text_content,
            )
            result['seoAnalysis'] = {
                'titleLength': seo_result.title_length,
                'titleHasKeywords': seo_result.title_has_keywords,
                'metaDescriptionLength': seo_result.meta_description_length,
                'wordCount': seo_result.word_count,
                'readabilityScore': round(seo_result.readability_score, 2),
                'seoScore': round(seo_result.seo_score, 2),
                'topKeywords': list(seo_result.keyword_density.keys()),
            }
    except Exception as e:
        print(f"[SysCRED Backend] SEO analysis error: {e}")
        result['seoAnalysis'] = {'error': str(e)}


def _attach_pagerank(result, url):
    """Add a PageRank estimate for *url* to *result* under ``pageRankEstimation``.

    Best-effort: a failure is logged and stored as an ``{'error': ...}`` entry.
    """
    try:
        external_data = credibility_system.api_clients.fetch_external_data(url)
        pr_result = seo_analyzer.estimate_pagerank(
            url=url,
            domain_age_days=external_data.domain_age_days,
            source_reputation=external_data.source_reputation,
        )
        result['pageRankEstimation'] = {
            'estimatedPR': round(pr_result.estimated_pr, 3),
            'confidence': round(pr_result.confidence, 2),
            'factors': pr_result.factors,
            'explanation': pr_result.explanation_text,
        }
    except Exception as e:
        print(f"[SysCRED Backend] PageRank estimation error: {e}")
        result['pageRankEstimation'] = {'error': str(e)}


def _attach_trec_metrics(result, input_data, input_is_url):
    """Run a BM25 search over the demo corpus and store IR metrics in *result*.

    The retriever and metrics objects are created on first use when the TREC
    modules are available. When they are not, *result* is left untouched.
    Best-effort: a failure is logged and stored as
    ``result['trec_metrics'] = {'error': ...}``.
    """
    global trec_retriever, eval_metrics
    try:
        if trec_retriever is None and TREC_AVAILABLE:
            # Build both objects before publishing either one, so a failure
            # cannot leave a retriever behind without its metrics companion.
            retriever = TRECRetriever(use_stemming=True, enable_prf=False)
            retriever.corpus = TREC_DEMO_CORPUS
            metrics = EvaluationMetrics()
            trec_retriever, eval_metrics = retriever, metrics
            print("[SysCRED Backend] TREC Retriever initialized with demo corpus")

        if not (trec_retriever and eval_metrics):
            return

        import time

        # Plain text is queried directly; for a URL the query is the
        # 'informationEntree' field of the verification result (falling back
        # to the URL itself). Either way only the first 200 characters are used.
        if input_is_url:
            query_text = result.get('informationEntree', input_data)[:200]
        else:
            query_text = input_data[:200]

        start_time = time.time()
        trec_result = trec_retriever.retrieve_evidence(query_text, k=5, model='bm25')
        search_time = (time.time() - start_time) * 1000

        evidences = trec_result.evidences
        retrieved_ids = [evidence.doc_id for evidence in evidences]

        # Demo evaluation only: every corpus document is placed in the
        # relevant pool, so these figures are illustrative. A real setup
        # would read relevance judgments from qrels files.
        relevant_ids = set(TREC_DEMO_CORPUS)

        if retrieved_ids:
            k = len(retrieved_ids)
            relevance_dict = {doc: 1 for doc in relevant_ids}
            precision = eval_metrics.precision_at_k(retrieved_ids, relevant_ids, k)
            recall = eval_metrics.recall_at_k(retrieved_ids, relevant_ids, k)
            ap = eval_metrics.average_precision(retrieved_ids, relevant_ids)
            mrr = eval_metrics.mrr(retrieved_ids, relevant_ids)
            ndcg = eval_metrics.ndcg_at_k(retrieved_ids, relevance_dict, k)
        else:
            precision = recall = ap = mrr = ndcg = 0

        # Score of the top-ranked hit; the response key is 'tfidf_score'
        # although the search above requests the bm25 model.
        tfidf_score = evidences[0].score if evidences else 0

        result['trec_metrics'] = {
            'precision': round(precision, 4),
            'recall': round(recall, 4),
            'map': round(ap, 4),
            'ndcg': round(ndcg, 4),
            'tfidf_score': round(tfidf_score, 4),
            'mrr': round(mrr, 4),
            'retrieved_count': len(retrieved_ids),
            'corpus_size': len(TREC_DEMO_CORPUS),
            'search_time_ms': round(search_time, 2),
        }
        print(f"[SysCRED Backend] TREC: P={precision:.3f} R={recall:.3f} MAP={ap:.3f} NDCG={ndcg:.3f} MRR={mrr:.3f}")
    except Exception as e:
        print(f"[SysCRED Backend] TREC metrics error: {e}")
        result['trec_metrics'] = {'error': str(e)}


def _persist_analysis(result, input_data):
    """Save a summary row for this analysis; never raises.

    Skipped entirely when the database module could not be imported. When
    the save fails, the session is rolled back so it is not left in a failed
    transaction state.
    """
    if not DB_AVAILABLE:
        return
    try:
        # Tolerate a missing, None or empty 'factors' list instead of
        # failing on factors[0].
        details = result.get('detailsScore') or {}
        factors = details.get('factors') or [{}]
        first_factor = factors[0]
        source_reputation = (
            first_factor.get('value') if isinstance(first_factor, dict) else None
        )
        new_analysis = AnalysisResult(
            url=input_data[:500],
            credibility_score=result.get('scoreCredibilite', 0.5),
            summary=result.get('resumeAnalyse', ''),
            source_reputation=source_reputation,
        )
        db.session.add(new_analysis)
        db.session.commit()
        print(f"[SysCRED-DB] Result saved. ID: {new_analysis.id}")
    except Exception as e:
        print(f"[SysCRED-DB] Save failed: {e}")
        try:
            db.session.rollback()
        except Exception as rollback_error:
            print(f"[SysCRED-DB] Rollback failed: {rollback_error}")


@app.route('/api/verify', methods=['POST'])
def verify_endpoint():
    """
    Main verification endpoint.

    Request JSON:
    {
        "input_data": "URL or text to verify",
        "include_seo": true/false (optional, default true),
        "include_pagerank": true/false (optional, default true)
    }

    Responses:
        200 - verification result, extended with 'seoAnalysis' and
              'pageRankEstimation' (URL inputs only, when requested) and
              'trec_metrics' (when the TREC modules are available)
        400 - body is not a JSON object, 'input_data' is missing, empty or
              not a string, or the verification result contains 'error'
        500 - unexpected failure during verification
        503 - the credibility system could not be initialized
    """
    # Lazy initialization
    if credibility_system is None:
        if not initialize_system():
            return jsonify({
                'error': 'System initialization failed. Check server logs.'
            }), 503

    # Validate request
    if not request.is_json:
        return jsonify({'error': 'Request must be JSON'}), 400

    # silent=True turns a malformed body into None, so every rejection below
    # is a JSON error response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    raw_input = data.get('input_data', '')
    # A non-string value (number, null, list...) is treated as missing
    # rather than crashing on .strip().
    input_data = raw_input.strip() if isinstance(raw_input, str) else ''
    if not input_data:
        return jsonify({'error': "'input_data' is required"}), 400

    include_seo = data.get('include_seo', True)
    include_pagerank = data.get('include_pagerank', True)

    print(f"[SysCRED Backend] Verifying: {input_data[:100]}...")

    try:
        # Run main verification
        result = credibility_system.verify_information(input_data)
        if 'error' in result:
            return jsonify(result), 400

        input_is_url = credibility_system.is_url(input_data)

        # SEO analysis and PageRank estimation only make sense for URLs.
        if include_seo and input_is_url:
            _attach_seo_analysis(result, input_data)
        if include_pagerank and input_is_url:
            _attach_pagerank(result, input_data)

        print(f"[SysCRED Backend] Score: {result.get('scoreCredibilite', 'N/A')}")

        # [NEW] TREC Evidence Search + IR Metrics
        _attach_trec_metrics(result, input_data, input_is_url)

        # [NEW] Persist to Database
        _persist_analysis(result, input_data)

        return jsonify(result), 200
    except Exception as e:
        print(f"[SysCRED Backend] Error: {e}")
        traceback.print_exc()
        return jsonify({'error': f'Internal error: {str(e)}'}), 500
@app.route('/api/seo', methods=['POST'])
def seo_endpoint():
    """
    SEO-only analysis endpoint (faster, no ML models needed).

    Request JSON:
    {
        "url": "URL to analyze"
    }

    Responses:
        200 - 'url', 'title', 'seo', 'irMetrics', 'pageRank' and 'domain' sections
        400 - body is not a JSON object, the URL is missing or does not start
              with http:// or https://, or the page could not be fetched
        500 - unexpected failure during analysis
        503 - the SEO analyzer could not be created
    """
    global seo_analyzer

    if seo_analyzer is None:
        try:
            seo_analyzer = SEOAnalyzer()
        except Exception as e:
            # Also covers the NameError raised when the core modules failed
            # to import and SEOAnalyzer was never defined.
            print(f"[SysCRED Backend] SEO analyzer unavailable: {e}")
            return jsonify({'error': f'SEO analyzer unavailable: {str(e)}'}), 503

    if not request.is_json:
        return jsonify({'error': 'Request must be JSON'}), 400

    # silent=True turns a malformed body into None, so the rejection below
    # is a JSON error response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    raw_url = data.get('url', '')
    # A non-string value is treated as missing rather than crashing on .strip().
    url = raw_url.strip() if isinstance(raw_url, str) else ''
    # Require a full scheme so strings such as "httpfoo" are rejected here.
    if not url.startswith(('http://', 'https://')):
        return jsonify({'error': 'Valid URL is required'}), 400

    try:
        # Fetch content
        from syscred.api_clients import ExternalAPIClients
        api_client = ExternalAPIClients()
        web_content = api_client.fetch_web_content(url)
        if not web_content.success:
            return jsonify({'error': f'Failed to fetch URL: {web_content.error}'}), 400

        # SEO analysis
        seo_result = seo_analyzer.analyze_seo(
            url=url,
            title=web_content.title,
            meta_description=web_content.meta_description,
            text_content=web_content.text_content,
        )

        # IR metrics
        ir_metrics = seo_analyzer.get_ir_metrics(web_content.text_content)

        # PageRank estimation
        external_data = api_client.fetch_external_data(url)
        pr_result = seo_analyzer.estimate_pagerank(
            url=url,
            domain_age_days=external_data.domain_age_days,
            source_reputation=external_data.source_reputation,
        )

        return jsonify({
            'url': url,
            'title': web_content.title,
            'seo': {
                'titleLength': seo_result.title_length,
                'metaDescriptionLength': seo_result.meta_description_length,
                'wordCount': seo_result.word_count,
                'readabilityScore': round(seo_result.readability_score, 2),
                'seoScore': round(seo_result.seo_score, 2),
                'keywordDensity': seo_result.keyword_density,
            },
            'irMetrics': {
                'documentLength': ir_metrics.document_length,
                'topTerms': ir_metrics.top_terms[:5],
                'avgTermFrequency': round(ir_metrics.avg_term_frequency, 4),
            },
            'pageRank': {
                'estimated': round(pr_result.estimated_pr, 3),
                'confidence': round(pr_result.confidence, 2),
                'factors': pr_result.factors,
            },
            'domain': {
                'reputation': external_data.source_reputation,
                'ageDays': external_data.domain_age_days,
            },
        }), 200
    except Exception as e:
        print(f"[SysCRED Backend] SEO endpoint error: {e}")
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/api/ontology/graph', methods=['GET'])
def ontology_graph():
    """Return the ontology as graph JSON for the D3.js frontend."""
    manager = credibility_system.ontology_manager if credibility_system else None
    if not manager:
        # An empty graph (not a 400) keeps the frontend from breaking.
        return jsonify({'nodes': [], 'links': []}), 200
    return jsonify(manager.get_graph_json()), 200
@app.route('/api/ontology/stats', methods=['GET'])
def ontology_stats():
    """Return ontology statistics, or a zeroed placeholder when none is loaded."""
    manager = credibility_system.ontology_manager if credibility_system else None
    if not manager:
        # Still answered with 200; the payload carries an 'error' field.
        placeholder = {
            'error': 'Ontology not loaded',
            'base_triples': 0,
            'data_triples': 0,
        }
        return jsonify(placeholder), 200
    return jsonify(manager.get_statistics()), 200
# --- TREC Endpoints ---
@app.route('/api/trec/search', methods=['POST'])
def trec_search():
    """
    Search for evidence using TREC retrieval methods.

    Request JSON:
    {
        "query": "Claim or query to search for",
        "k": 10,          # Number of results (optional, default 10)
        "model": "bm25"   # Retrieval model: bm25, tfidf, qld (optional)
    }

    Response:
    {
        "query": "original query",
        "results": [
            {"doc_id": "AP880101-0001", "score": 6.27, "rank": 1, "text": "...", "title": "..."},
            ...
        ],
        "total": 3,
        "model": "bm25",
        "search_time_ms": 12.5
    }

    Errors:
        400 - body is not a JSON object, 'query' is missing or not a string,
              'k' is not a positive integer, or 'model' is not a string
        500 - the retrieval itself failed
        503 - the TREC components could not be initialized
    """
    global trec_retriever, eval_metrics

    # Initialize TREC components if needed
    if trec_retriever is None:
        try:
            # Build both objects before publishing either one, so a failure
            # cannot leave a retriever behind without its metrics companion.
            retriever = TRECRetriever(use_stemming=True, enable_prf=False)
            retriever.corpus = TREC_DEMO_CORPUS
            metrics = EvaluationMetrics()
            trec_retriever, eval_metrics = retriever, metrics
            print("[SysCRED Backend] TREC Retriever initialized with demo corpus")
        except Exception as e:
            return jsonify({'error': f'TREC initialization failed: {str(e)}'}), 503

    if not request.is_json:
        return jsonify({'error': 'Request must be JSON'}), 400

    # silent=True turns a malformed body into None, so the rejection below
    # is a JSON error response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    raw_query = data.get('query', '')
    # A non-string value is treated as missing rather than crashing on .strip().
    query = raw_query.strip() if isinstance(raw_query, str) else ''
    if not query:
        return jsonify({'error': "'query' is required"}), 400

    raw_k = data.get('k', 10)
    try:
        k = int(raw_k)
    except (TypeError, ValueError):
        k = 0
    # bool is a subclass of int, so true/false must be rejected explicitly.
    if isinstance(raw_k, bool) or k < 1:
        return jsonify({'error': "'k' must be a positive integer"}), 400

    model = data.get('model', 'bm25')
    if not isinstance(model, str) or not model:
        return jsonify({'error': "'model' must be a string"}), 400

    try:
        import time
        start_time = time.time()

        # Retrieve evidence
        result = trec_retriever.retrieve_evidence(query, k=k, model=model)
        search_time_ms = (time.time() - start_time) * 1000

        # Format results; the title is looked up in the retriever's corpus.
        results = []
        for ev in result.evidences:
            doc_info = trec_retriever.corpus.get(ev.doc_id, {})
            results.append({
                'doc_id': ev.doc_id,
                'score': round(ev.score, 4),
                'rank': ev.rank,
                'text': ev.text,
                'title': doc_info.get('title', ''),
                'model': ev.retrieval_model,
            })

        return jsonify({
            'query': query,
            'results': results,
            'total': len(results),
            'model': model,
            'search_time_ms': round(search_time_ms, 2),
        }), 200
    except Exception as e:
        print(f"[SysCRED Backend] TREC search error: {e}")
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/api/trec/corpus', methods=['GET'])
def trec_corpus():
    """
    Describe the TREC demo corpus.

    Response:
    {
        "corpus_size": 7,
        "corpus_type": "AP88-90 Demo",
        "documents": [
            {"doc_id": "AP880101-0001", "title": "...", "text_preview": "..."},
            ...
        ]
    }
    """
    def _preview(text):
        # Texts longer than 150 characters are cut and marked with an ellipsis.
        if len(text) > 150:
            return text[:150] + '...'
        return text

    documents = [
        {
            'doc_id': doc_id,
            'title': entry.get('title', ''),
            'text_preview': _preview(entry['text']),
        }
        for doc_id, entry in TREC_DEMO_CORPUS.items()
    ]
    payload = {
        'corpus_size': len(TREC_DEMO_CORPUS),
        'corpus_type': 'AP88-90 Demo',
        'documents': documents,
    }
    return jsonify(payload), 200
@app.route('/api/trec/metrics', methods=['POST'])
def trec_metrics():
    """
    Calculate IR evaluation metrics for a retrieval result.

    Request JSON:
    {
        "retrieved": ["AP880101-0001", "AP890215-0001", "AP880101-0002"],
        "relevant": ["AP880101-0001", "AP880101-0002", "AP880102-0001"]
    }

    Response:
    {
        "precision_at_3": 0.67,
        "recall_at_3": 0.67,
        "average_precision": 0.81,
        "mrr": 1.0,
        "ndcg_at_3": 0.88
    }

    K is the length of the 'retrieved' list.

    Errors:
        400 - body is not a JSON object, 'retrieved' is missing, empty or not
              a list, or 'relevant' is not a list of hashable values
        500 - a metric computation failed
        503 - the evaluation module could not be initialized
    """
    global eval_metrics

    if eval_metrics is None:
        try:
            eval_metrics = EvaluationMetrics()
        except Exception as e:
            # Also covers the NameError raised when the TREC modules failed
            # to import and EvaluationMetrics was never defined.
            return jsonify({'error': f'TREC initialization failed: {str(e)}'}), 503

    if not request.is_json:
        return jsonify({'error': 'Request must be JSON'}), 400

    # silent=True turns a malformed body into None, so the rejection below
    # is a JSON error response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    retrieved = data.get('retrieved', [])
    # Must be a real list: a bare string would be scored character by character.
    if not isinstance(retrieved, list) or not retrieved:
        return jsonify({'error': "'retrieved' list is required"}), 400

    relevant_raw = data.get('relevant', [])
    if not isinstance(relevant_raw, list):
        return jsonify({'error': "'relevant' must be a list"}), 400
    try:
        relevant = set(relevant_raw)
    except TypeError:
        # Nested lists or objects cannot be members of a set.
        return jsonify({'error': "'relevant' must contain only document IDs"}), 400

    k = len(retrieved)

    try:
        # Calculate metrics
        p_at_k = eval_metrics.precision_at_k(retrieved, relevant, k)
        r_at_k = eval_metrics.recall_at_k(retrieved, relevant, k)
        ap = eval_metrics.average_precision(retrieved, relevant)
        mrr = eval_metrics.mrr(retrieved, relevant)

        # NDCG takes graded relevance; here it is binary (1 for each relevant
        # doc, anything else is absent from the dict).
        relevance_dict = {doc: 1 for doc in relevant}
        ndcg = eval_metrics.ndcg_at_k(retrieved, relevance_dict, k)

        return jsonify({
            f'precision_at_{k}': round(p_at_k, 4),
            f'recall_at_{k}': round(r_at_k, 4),
            'average_precision': round(ap, 4),
            'mrr': round(mrr, 4),
            f'ndcg_at_{k}': round(ndcg, 4),
            'metrics_explanation': {
                'P@K': 'Proportion de documents pertinents parmi les K premiers récupérés',
                'R@K': 'Proportion de documents pertinents récupérés parmi tous les pertinents',
                'AP': 'Moyenne des précisions à chaque document pertinent trouvé',
                'MRR': 'Rang réciproque du premier document pertinent',
                'NDCG': 'Gain cumulatif normalisé avec décroissance logarithmique'
            }
        }), 200
    except Exception as e:
        print(f"[SysCRED Backend] TREC metrics error: {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/trec/health', methods=['GET'])
def trec_health():
    """Health check for the TREC module.

    Reports whether the TREC modules were imported, whether the retriever
    singleton exists yet, the demo corpus size, and the model names this
    endpoint advertises.
    """
    return jsonify({
        'status': 'healthy',
        # Read the module-level flag directly. The previous
        # "'TREC_AVAILABLE' in dir()" test looked at this function's local
        # scope, never found the global, and so always reported True.
        'trec_available': TREC_AVAILABLE,
        'retriever_initialized': trec_retriever is not None,
        'corpus_size': len(TREC_DEMO_CORPUS),
        'models_available': ['bm25', 'tfidf', 'qld']
    }), 200
# --- Main ---
if __name__ == '__main__':
    # Startup banner
    banner_rule = "=" * 60
    print(banner_rule)
    print("SysCRED Backend API Server")
    print("(c) Dominique S. Loyer - PhD Thesis Prototype")
    print(banner_rule)
    print()

    # Initialize system at startup rather than on the first request.
    print("[SysCRED Backend] Pre-initializing system...")
    initialize_system()
    print()

    # Endpoint overview, printed line by line.
    startup_lines = (
        "[SysCRED Backend] Starting Flask server...",
        "[SysCRED Backend] Endpoints:",
        " - POST /api/verify - Full credibility verification",
        " - POST /api/seo - SEO analysis only (faster)",
        " - GET /api/ontology/stats - Ontology statistics",
        " - GET /api/health - Health check",
        " --- TREC Endpoints ---",
        " - POST /api/trec/search - Evidence retrieval (BM25/TF-IDF/QLD)",
        " - POST /api/trec/metrics - Calculate IR metrics (MAP, P@K, NDCG)",
        " - GET /api/trec/corpus - Demo corpus info",
        " - GET /api/trec/health - TREC module health",
    )
    for startup_line in startup_lines:
        print(startup_line)
    print()

    app.run(host='0.0.0.0', port=5001, debug=False, threaded=True)