Spaces:
Running
Running
| # -*- coding: utf-8 -*- | |
| """ | |
| SysCRED Configuration | |
| ===================== | |
| Configuration centralisée pour le système de vérification de crédibilité. | |
| Usage: | |
| from syscred.config import Config | |
| # Accéder aux paramètres | |
| config = Config() | |
| port = config.PORT | |
| # Ou avec variables d'environnement | |
| # export SYSCRED_GOOGLE_API_KEY=your_key | |
| # export SYSCRED_PORT=8080 | |
| (c) Dominique S. Loyer - PhD Thesis Prototype | |
| """ | |
import os
from pathlib import Path
from typing import Dict, Optional, Type

from dotenv import load_dotenv
# Load environment variables from the project-root .env file.
# This module sits at .../systemFactChecking/syscred/config.py, so the root
# .env lives one level above the syscred/ directory.
current_path = Path(__file__).resolve()
env_path = current_path.parent.parent / '.env'
if not env_path.exists():
    print(f"[Config] WARNING: .env not found at {env_path}")
    # Fall back to the first .env found in the working directory or its
    # parent; keep the original path when neither candidate exists.
    _fallbacks = (Path.cwd() / '.env', Path.cwd().parent / '.env')
    env_path = next((p for p in _fallbacks if p.exists()), env_path)
load_dotenv(dotenv_path=env_path)
print(f"[Config] Loading .env from {env_path}")
# Report only whether the key is present, never its value.
_key_status = 'Yes' if os.environ.get('SYSCRED_GOOGLE_API_KEY') else 'No'
print(f"[Config] SYSCRED_GOOGLE_API_KEY loaded: {_key_status}")
class Config:
    """
    Centralized configuration for SysCRED.

    Values are computed once, when the class body is executed. Several of
    them can be overridden through environment variables prefixed with
    ``SYSCRED_``. All helpers are classmethods, so they can be called on the
    class (or a subclass) directly, without creating an instance.
    """

    # === Paths ===
    # BASE_DIR = project root (parent of syscred/). resolve() keeps the
    # result correct even when __file__ is a relative path.
    BASE_DIR = Path(__file__).resolve().parent.parent
    ONTOLOGY_BASE_PATH = BASE_DIR / "ontology" / "sysCRED_onto26avrtil.ttl"
    ONTOLOGY_DATA_PATH = BASE_DIR / "ontology" / "sysCRED_data.ttl"

    # === Flask server ===
    HOST = os.getenv("SYSCRED_HOST", "0.0.0.0")
    PORT = int(os.getenv("SYSCRED_PORT", "5000"))
    DEBUG = os.getenv("SYSCRED_DEBUG", "true").lower() == "true"

    # === API keys ===
    GOOGLE_FACT_CHECK_API_KEY = os.getenv("SYSCRED_GOOGLE_API_KEY")
    # SYSCRED_DATABASE_URL wins; DATABASE_URL is the fallback.
    DATABASE_URL = os.getenv("SYSCRED_DATABASE_URL", os.getenv("DATABASE_URL"))

    # === ML models ===
    # Support both SYSCRED_LOAD_ML and SYSCRED_LOAD_ML_MODELS (for Render);
    # SYSCRED_LOAD_ML_MODELS takes precedence when both are set.
    LOAD_ML_MODELS = os.getenv(
        "SYSCRED_LOAD_ML_MODELS", os.getenv("SYSCRED_LOAD_ML", "true")
    ).lower() == "true"
    SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
    NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # === Timeouts ===
    WEB_FETCH_TIMEOUT = int(os.getenv("SYSCRED_TIMEOUT", "10"))

    # === TREC IR Configuration (NEW - Feb 2026) ===
    TREC_INDEX_PATH = os.getenv("SYSCRED_TREC_INDEX", None)    # Lucene/Pyserini index
    TREC_CORPUS_PATH = os.getenv("SYSCRED_TREC_CORPUS", None)  # JSONL corpus
    TREC_TOPICS_PATH = os.getenv("SYSCRED_TREC_TOPICS", None)  # Topics directory
    TREC_QRELS_PATH = os.getenv("SYSCRED_TREC_QRELS", None)    # Qrels directory

    # BM25 Parameters (optimized on AP88-90)
    BM25_K1 = float(os.getenv("SYSCRED_BM25_K1", "0.9"))
    BM25_B = float(os.getenv("SYSCRED_BM25_B", "0.4"))

    # PRF (Pseudo-Relevance Feedback) settings
    ENABLE_PRF = os.getenv("SYSCRED_ENABLE_PRF", "true").lower() == "true"
    PRF_TOP_DOCS = int(os.getenv("SYSCRED_PRF_TOP_DOCS", "3"))
    PRF_EXPANSION_TERMS = int(os.getenv("SYSCRED_PRF_TERMS", "10"))

    # === Score weighting ===
    # Note: Weights should sum to 1.0 for proper normalization
    SCORE_WEIGHTS = {
        'source_reputation': 0.22,     # Was 0.25, reduced for graph_context
        'domain_age': 0.08,            # Was 0.10
        'sentiment_neutrality': 0.13,  # Was 0.15
        'entity_presence': 0.13,       # Was 0.15
        'coherence': 0.12,             # Was 0.15
        'fact_check': 0.17,            # Was 0.20
        'graph_context': 0.15          # NEW - Historical knowledge from GraphRAG
    }

    # === Credibility thresholds ===
    CREDIBILITY_THRESHOLDS = {
        'HIGH': 0.7,
        'MEDIUM': 0.4,
        'LOW': 0.0
    }

    # === Reputation database ===
    # Sources can be extended or loaded from an external file
    # (see load_external_reputations).
    SOURCE_REPUTATIONS: Dict[str, str] = {
        # === HIGH CREDIBILITY ===
        # International media
        'lemonde.fr': 'High',
        'nytimes.com': 'High',
        'reuters.com': 'High',
        'bbc.com': 'High',
        'bbc.co.uk': 'High',
        'theguardian.com': 'High',
        'apnews.com': 'High',
        'afp.com': 'High',
        'france24.com': 'High',
        # Canadian media
        'cbc.ca': 'High',
        'radio-canada.ca': 'High',
        'lapresse.ca': 'High',
        'ledevoir.com': 'High',
        'theglobeandmail.com': 'High',
        # Academic sources
        'nature.com': 'High',
        'sciencedirect.com': 'High',
        'scholar.google.com': 'High',
        'pubmed.ncbi.nlm.nih.gov': 'High',
        'jstor.org': 'High',
        'springer.com': 'High',
        'ieee.org': 'High',
        'acm.org': 'High',
        'arxiv.org': 'High',
        # Fact-checkers
        'factcheck.org': 'High',
        'snopes.com': 'High',
        'politifact.com': 'High',
        'fullfact.org': 'High',
        'checknews.fr': 'High',
        # Institutions
        'who.int': 'High',
        'un.org': 'High',
        'europa.eu': 'High',
        'canada.ca': 'High',
        'gouv.fr': 'High',
        'gouv.qc.ca': 'High',
        # === MEDIUM CREDIBILITY ===
        'wikipedia.org': 'Medium',
        'medium.com': 'Medium',
        'huffpost.com': 'Medium',
        'buzzfeed.com': 'Medium',
        'vice.com': 'Medium',
        'slate.com': 'Medium',
        'theconversation.com': 'Medium',
        # === LOW CREDIBILITY ===
        'infowars.com': 'Low',
        'naturalnews.com': 'Low',
        'breitbart.com': 'Low',
        'dailystormer.su': 'Low',
        'beforeitsnews.com': 'Low',
        'worldtruth.tv': 'Low',
        'yournewswire.com': 'Low',
    }

    # === Misinformation patterns ===
    MISINFORMATION_KEYWORDS = [
        'conspiracy', 'hoax', 'fake news', 'miracle cure',
        "they don't want you to know", 'mainstream media lies',
        'deep state', 'plandemic', 'wake up sheeple',
        'big pharma cover-up', 'government conspiracy',
        'censored truth', 'what they hide'
    ]

    @classmethod
    def load_external_reputations(cls, filepath: str) -> None:
        """
        Load additional source reputations from a JSON file.

        This is best-effort: a missing, unreadable or malformed file is
        reported on stdout and leaves ``SOURCE_REPUTATIONS`` unchanged.
        Entries from the file overwrite existing entries with the same key.

        Args:
            filepath: Path to a JSON file holding an object of the form
                {"domain.com": "High", "autre.com": "Low"}
        """
        import json
        try:
            # JSON text is UTF-8; do not depend on the platform default.
            with open(filepath, 'r', encoding='utf-8') as f:
                external_reps = json.load(f)
            if not isinstance(external_reps, dict):
                raise TypeError(
                    "expected a JSON object mapping domains to reputations"
                )
            cls.SOURCE_REPUTATIONS.update(external_reps)
            print(f"[Config] Loaded {len(external_reps)} external reputations")
        except (OSError, ValueError, TypeError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"[Config] Could not load external reputations: {e}")

    @classmethod
    def update_weights(cls, new_weights: Dict[str, float]) -> None:
        """
        Merge new score weights and renormalize them so they sum to 1.

        The result is assigned to ``cls.SCORE_WEIGHTS`` as a new dict. The
        previous dict is never mutated, so calling this on a subclass does
        not alter the weights seen through the parent class.

        Args:
            new_weights: Mapping of weight name to its new (un-normalized)
                value; names not present yet are added.

        Raises:
            ValueError: If the merged weights do not have a positive sum.
                The current weights are left unchanged in that case.
        """
        merged = {**cls.SCORE_WEIGHTS, **new_weights}
        total = sum(merged.values())
        if total <= 0:
            raise ValueError(
                f"score weights must have a positive sum, got {total}"
            )
        # Normalize so that the sum equals 1
        cls.SCORE_WEIGHTS = {k: v / total for k, v in merged.items()}
        print(f"[Config] Updated weights: {cls.SCORE_WEIGHTS}")

    @classmethod
    def to_dict(cls) -> Dict:
        """
        Export a summary of the current configuration as a dictionary.

        The API key itself is never included, only whether one is set.

        Returns:
            A new dict; ``score_weights`` is a copy, so editing the result
            does not change the configuration.
        """
        return {
            'host': cls.HOST,
            'port': cls.PORT,
            'debug': cls.DEBUG,
            # Truthiness, so an empty SYSCRED_GOOGLE_API_KEY counts as unset.
            'google_api_configured': bool(cls.GOOGLE_FACT_CHECK_API_KEY),
            'ml_models_enabled': cls.LOAD_ML_MODELS,
            'score_weights': dict(cls.SCORE_WEIGHTS),
            'known_sources_count': len(cls.SOURCE_REPUTATIONS),
            'ontology_base': str(cls.ONTOLOGY_BASE_PATH),
            'ontology_data': str(cls.ONTOLOGY_DATA_PATH),
        }

    @classmethod
    def print_config(cls) -> None:
        """Print the summary returned by ``to_dict`` to stdout."""
        print("=" * 50)
        print("SysCRED Configuration")
        print("=" * 50)
        for key, value in cls.to_dict().items():
            print(f" {key}: {value}")
        print("=" * 50)
| # === Configuration par environnement === | |
class DevelopmentConfig(Config):
    """Settings for local development: debug mode on, ML models loaded."""

    # NOTE(review): these literals shadow the values Config derives from
    # SYSCRED_DEBUG / SYSCRED_LOAD_ML_MODELS, so those environment variables
    # have no effect on this class -- confirm this is intended.
    LOAD_ML_MODELS = True
    DEBUG = True
class ProductionConfig(Config):
    """Settings for production: debug mode off, ML models loaded."""

    # Fixed bind address; overrides whatever SYSCRED_HOST supplied to Config.
    HOST = "0.0.0.0"
    # NOTE(review): LOAD_ML_MODELS is pinned to True here, so
    # SYSCRED_LOAD_ML_MODELS cannot disable the models for this class --
    # confirm this is intended.
    LOAD_ML_MODELS = True
    DEBUG = False
class TestingConfig(Config):
    """Settings for test runs: debug mode on, ML models skipped."""

    WEB_FETCH_TIMEOUT = 5
    LOAD_ML_MODELS = False  # Skipping model loading keeps tests fast
    DEBUG = True
| # Sélection automatique de la configuration | |
def get_config() -> Type[Config]:
    """
    Return the configuration class matching the current environment.

    The environment name is read from ``SYSCRED_ENV`` (``development``,
    ``production`` or ``testing``). Matching is case-insensitive and ignores
    surrounding whitespace. A missing or empty value selects
    ``DevelopmentConfig``; an unrecognized value does too, but is reported
    on stdout so that a typo does not silently enable the development
    settings.

    Returns:
        The selected ``Config`` subclass itself (a class, not an instance).
    """
    env = os.getenv("SYSCRED_ENV", "development").strip().lower() or "development"
    configs = {
        'development': DevelopmentConfig,
        'production': ProductionConfig,
        'testing': TestingConfig,
    }
    selected = configs.get(env)
    if selected is None:
        print(
            f"[Config] WARNING: unknown SYSCRED_ENV '{env}', "
            "falling back to development configuration"
        )
        selected = DevelopmentConfig
    return selected
# Default configuration, selected once when this module is imported
config = get_config()

if __name__ == "__main__":
    # Manual smoke test: dump the active configuration, then the first
    # ten known source reputations.
    config.print_config()
    print("\n=== Source Reputations Sample ===")
    sample = list(config.SOURCE_REPUTATIONS.items())[:10]
    for entry in sample:
        domain, rep = entry
        print(f" {domain}: {rep}")