# DomLoyer - commit ea9303b (verified): Sync: TREC IR metrics in verify, DB fallback, NER/EEAT fix, all API keys
# -*- coding: utf-8 -*-
"""
SysCRED Configuration
=====================
Centralized configuration for the credibility verification system.

Usage:
    from syscred.config import Config

    # Access the settings
    config = Config()
    port = config.PORT

    # Or with environment variables
    # export SYSCRED_GOOGLE_API_KEY=your_key
    # export SYSCRED_PORT=8080

(c) Dominique S. Loyer - PhD Thesis Prototype
"""
import os
from pathlib import Path
from typing import Dict, Optional
from dotenv import load_dotenv
# Load environment variables from the project-root .env file.
# This module lives at .../systemFactChecking/syscred/config.py, so the root
# .env is expected at .../systemFactChecking/.env (one level above syscred/).
current_path = Path(__file__).resolve()
env_path = current_path.parent.parent / '.env'

if not env_path.exists():
    print(f"[Config] WARNING: .env not found at {env_path}")
    # Fall back to the first candidate that exists next to (or one level above)
    # the working directory; keep the original path when none does.
    fallback_candidates = (Path.cwd() / '.env', Path.cwd().parent / '.env')
    env_path = next(
        (candidate for candidate in fallback_candidates if candidate.exists()),
        env_path,
    )
    del fallback_candidates  # do not leave a helper name in the module namespace

# load_dotenv is called even when no file was found at env_path.
load_dotenv(dotenv_path=env_path)
print(f"[Config] Loading .env from {env_path}")
print(f"[Config] SYSCRED_GOOGLE_API_KEY loaded: {'Yes' if os.environ.get('SYSCRED_GOOGLE_API_KEY') else 'No'}")
class Config:
    """
    Central configuration for SysCRED.

    Every setting is a class attribute evaluated once, when this class body
    runs. Settings read through ``os.getenv`` can be overridden with
    environment variables prefixed by ``SYSCRED_``.
    """

    # === Paths ===
    # BASE_DIR = project root (parent of syscred/)
    BASE_DIR = Path(__file__).parent.parent
    # NOTE(review): "avrtil" looks like a typo for "avril"; left untouched
    # because it must match the file name on disk - confirm.
    ONTOLOGY_BASE_PATH = BASE_DIR / "ontology" / "sysCRED_onto26avrtil.ttl"
    ONTOLOGY_DATA_PATH = BASE_DIR / "ontology" / "sysCRED_data.ttl"

    # === Flask server ===
    # NOTE(review): the defaults bind to all interfaces with debug enabled;
    # presumably meant for local development - confirm before deploying.
    HOST = os.getenv("SYSCRED_HOST", "0.0.0.0")
    PORT = int(os.getenv("SYSCRED_PORT", "5000"))
    DEBUG = os.getenv("SYSCRED_DEBUG", "true").lower() == "true"

    # === API keys ===
    GOOGLE_FACT_CHECK_API_KEY: Optional[str] = os.getenv("SYSCRED_GOOGLE_API_KEY")
    # SYSCRED_DATABASE_URL wins; the standard DATABASE_URL is the fallback.
    DATABASE_URL: Optional[str] = os.getenv("SYSCRED_DATABASE_URL", os.getenv("DATABASE_URL"))

    # === ML models ===
    # Support both SYSCRED_LOAD_ML and SYSCRED_LOAD_ML_MODELS (for Render);
    # SYSCRED_LOAD_ML_MODELS takes precedence when both are set.
    LOAD_ML_MODELS = os.getenv("SYSCRED_LOAD_ML_MODELS", os.getenv("SYSCRED_LOAD_ML", "true")).lower() == "true"
    SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
    NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # === Timeouts ===
    WEB_FETCH_TIMEOUT = int(os.getenv("SYSCRED_TIMEOUT", "10"))

    # === TREC IR Configuration (NEW - Feb 2026) ===
    TREC_INDEX_PATH: Optional[str] = os.getenv("SYSCRED_TREC_INDEX", None)  # Lucene/Pyserini index
    TREC_CORPUS_PATH: Optional[str] = os.getenv("SYSCRED_TREC_CORPUS", None)  # JSONL corpus
    TREC_TOPICS_PATH: Optional[str] = os.getenv("SYSCRED_TREC_TOPICS", None)  # Topics directory
    TREC_QRELS_PATH: Optional[str] = os.getenv("SYSCRED_TREC_QRELS", None)  # Qrels directory

    # BM25 Parameters (optimized on AP88-90)
    BM25_K1 = float(os.getenv("SYSCRED_BM25_K1", "0.9"))
    BM25_B = float(os.getenv("SYSCRED_BM25_B", "0.4"))

    # PRF (Pseudo-Relevance Feedback) settings
    ENABLE_PRF = os.getenv("SYSCRED_ENABLE_PRF", "true").lower() == "true"
    PRF_TOP_DOCS = int(os.getenv("SYSCRED_PRF_TOP_DOCS", "3"))
    PRF_EXPANSION_TERMS = int(os.getenv("SYSCRED_PRF_TERMS", "10"))

    # === Score weighting ===
    # Note: Weights should sum to 1.0 for proper normalization
    SCORE_WEIGHTS: Dict[str, float] = {
        'source_reputation': 0.22,     # Was 0.25, reduced for graph_context
        'domain_age': 0.08,            # Was 0.10
        'sentiment_neutrality': 0.13,  # Was 0.15
        'entity_presence': 0.13,       # Was 0.15
        'coherence': 0.12,             # Was 0.15
        'fact_check': 0.17,            # Was 0.20
        'graph_context': 0.15          # NEW - Historical knowledge from GraphRAG
    }

    # === Credibility thresholds ===
    CREDIBILITY_THRESHOLDS = {
        'HIGH': 0.7,
        'MEDIUM': 0.4,
        'LOW': 0.0
    }

    # === Reputation database ===
    # Sources can be extended or loaded from an external file
    # (see load_external_reputations).
    SOURCE_REPUTATIONS: Dict[str, str] = {
        # === HIGH CREDIBILITY ===
        # International media
        'lemonde.fr': 'High',
        'nytimes.com': 'High',
        'reuters.com': 'High',
        'bbc.com': 'High',
        'bbc.co.uk': 'High',
        'theguardian.com': 'High',
        'apnews.com': 'High',
        'afp.com': 'High',
        'france24.com': 'High',
        # Canadian media
        'cbc.ca': 'High',
        'radio-canada.ca': 'High',
        'lapresse.ca': 'High',
        'ledevoir.com': 'High',
        'theglobeandmail.com': 'High',
        # Academic sources
        'nature.com': 'High',
        'sciencedirect.com': 'High',
        'scholar.google.com': 'High',
        'pubmed.ncbi.nlm.nih.gov': 'High',
        'jstor.org': 'High',
        'springer.com': 'High',
        'ieee.org': 'High',
        'acm.org': 'High',
        'arxiv.org': 'High',
        # Fact-checkers
        'factcheck.org': 'High',
        'snopes.com': 'High',
        'politifact.com': 'High',
        'fullfact.org': 'High',
        'checknews.fr': 'High',
        # Institutions
        'who.int': 'High',
        'un.org': 'High',
        'europa.eu': 'High',
        'canada.ca': 'High',
        'gouv.fr': 'High',
        'gouv.qc.ca': 'High',
        # === MEDIUM CREDIBILITY ===
        'wikipedia.org': 'Medium',
        'medium.com': 'Medium',
        'huffpost.com': 'Medium',
        'buzzfeed.com': 'Medium',
        'vice.com': 'Medium',
        'slate.com': 'Medium',
        'theconversation.com': 'Medium',
        # === LOW CREDIBILITY ===
        'infowars.com': 'Low',
        'naturalnews.com': 'Low',
        'breitbart.com': 'Low',
        'dailystormer.su': 'Low',
        'beforeitsnews.com': 'Low',
        'worldtruth.tv': 'Low',
        'yournewswire.com': 'Low',
    }

    # === Misinformation patterns ===
    MISINFORMATION_KEYWORDS = [
        'conspiracy', 'hoax', 'fake news', 'miracle cure',
        "they don't want you to know", 'mainstream media lies',
        'deep state', 'plandemic', 'wake up sheeple',
        'big pharma cover-up', 'government conspiracy',
        'censored truth', 'what they hide'
    ]

    @classmethod
    def load_external_reputations(cls, filepath: str) -> None:
        """
        Merge additional source reputations from a JSON file.

        Best-effort: any failure is reported on stdout and
        SOURCE_REPUTATIONS is left unchanged. On success the dict is updated
        in place, so every class sharing it sees the new entries.

        Args:
            filepath: Path to a JSON file holding a single object, e.g.
                {"domain.com": "High", "autre.com": "Low"}
        """
        import json
        try:
            # Explicit UTF-8: the platform default encoding is not reliable
            # for JSON containing non-ASCII domain names.
            with open(filepath, 'r', encoding='utf-8') as f:
                external_reps = json.load(f)
            # Reject lists/scalars up front so a bad payload can never
            # partially update the reputation table.
            if not isinstance(external_reps, dict):
                raise TypeError(
                    f"expected a JSON object, got {type(external_reps).__name__}"
                )
            cls.SOURCE_REPUTATIONS.update(external_reps)
            print(f"[Config] Loaded {len(external_reps)} external reputations")
        except Exception as e:
            # Deliberately broad: loading extra reputations is optional and
            # must never take the application down.
            print(f"[Config] Could not load external reputations: {e}")

    @classmethod
    def update_weights(cls, new_weights: Dict[str, float]) -> None:
        """
        Update the score weights and renormalize them to sum to 1.

        The merge is done on a copy and the result is bound on ``cls``, so
        calling this on a subclass does not modify the dict owned by a parent
        class, and a rejected update leaves the current weights untouched.

        Args:
            new_weights: Mapping of weight name to new value; names not
                already present are added.

        Raises:
            ValueError: If the merged weights do not sum to a positive number
                (normalization would be impossible or meaningless).
        """
        merged = {**cls.SCORE_WEIGHTS, **new_weights}
        total = sum(merged.values())
        if total <= 0:
            raise ValueError(
                f"score weights must sum to a positive value, got {total}"
            )
        # Normalize so that the sum equals 1
        cls.SCORE_WEIGHTS = {k: v / total for k, v in merged.items()}
        print(f"[Config] Updated weights: {cls.SCORE_WEIGHTS}")

    @classmethod
    def to_dict(cls) -> Dict:
        """
        Export the current configuration as a dictionary.

        Returns:
            A new dict summarizing the settings. The API key itself is never
            included, only whether a non-empty key is set; ``score_weights``
            is a copy, so editing it does not change the configuration.
        """
        return {
            'host': cls.HOST,
            'port': cls.PORT,
            'debug': cls.DEBUG,
            # Truthiness, not "is not None": an empty-string key is unusable.
            'google_api_configured': bool(cls.GOOGLE_FACT_CHECK_API_KEY),
            'ml_models_enabled': cls.LOAD_ML_MODELS,
            'score_weights': dict(cls.SCORE_WEIGHTS),
            'known_sources_count': len(cls.SOURCE_REPUTATIONS),
            'ontology_base': str(cls.ONTOLOGY_BASE_PATH),
            'ontology_data': str(cls.ONTOLOGY_DATA_PATH),
        }

    @classmethod
    def print_config(cls) -> None:
        """Print the summary produced by to_dict() to stdout."""
        print("=" * 50)
        print("SysCRED Configuration")
        print("=" * 50)
        for key, value in cls.to_dict().items():
            print(f" {key}: {value}")
        print("=" * 50)
# === Per-environment configuration ===
class DevelopmentConfig(Config):
    """Settings for local development: ML models loaded, debug mode on."""

    # NOTE(review): both values are hard-coded and shadow the ones Config
    # reads from SYSCRED_LOAD_ML_MODELS / SYSCRED_DEBUG - confirm intended.
    LOAD_ML_MODELS = True
    DEBUG = True
class ProductionConfig(Config):
    """Settings for production: listen on all interfaces, debug mode off."""

    HOST = "0.0.0.0"
    DEBUG = False
    # NOTE(review): hard-coded, so SYSCRED_LOAD_ML_MODELS (read by Config) has
    # no effect when this class is selected - confirm that is intended.
    LOAD_ML_MODELS = True
class TestingConfig(Config):
    """Settings for test runs: debug on, ML models off, shorter web timeout."""

    WEB_FETCH_TIMEOUT = 5
    LOAD_ML_MODELS = False  # Faster for tests
    DEBUG = True
# Automatic selection of the configuration
def get_config() -> "type[Config]":
    """
    Return the configuration class matching the current environment.

    The environment name is read from the ``SYSCRED_ENV`` variable
    (``development``, ``production`` or ``testing``). Matching is
    case-insensitive and ignores surrounding whitespace. When the variable
    is unset or empty, ``DevelopmentConfig`` is returned; an unrecognized
    value also falls back to ``DevelopmentConfig`` but is reported on stdout,
    because that fallback enables debug mode.

    Returns:
        The selected Config subclass itself (a class, not an instance).
    """
    # "or" treats an empty SYSCRED_ENV like an unset one.
    env = (os.getenv("SYSCRED_ENV") or "development").strip().lower()
    configs = {
        'development': DevelopmentConfig,
        'production': ProductionConfig,
        'testing': TestingConfig,
    }
    selected = configs.get(env)
    if selected is None:
        # A typo such as "prod" must not silently run with development settings.
        print(f"[Config] WARNING: unknown SYSCRED_ENV {env!r}; falling back to development")
        selected = DevelopmentConfig
    return selected
# Default configuration: whatever get_config() selects for this environment
config = get_config()

if __name__ == "__main__":
    # Smoke test: dump the active settings, then the first ten known sources.
    config.print_config()
    print("\n=== Source Reputations Sample ===")
    sample = list(config.SOURCE_REPUTATIONS.items())[:10]
    for domain, rep in sample:
        print(f" {domain}: {rep}")