syscred_duplicate

Running

App Files Files Community

syscred_duplicate / syscred /config.py

DomLoyer

Sync: TREC IR metrics in verify, DB fallback, NER/EEAT fix, all API keys

ea9303b verified 3 days ago

raw

history blame contribute delete

9.83 kB

	# -*- coding: utf-8 -*-
	"""
	SysCRED Configuration
	=====================
	Configuration centralisée pour le système de vérification de crédibilité.

	Usage:
	from syscred.config import Config

	# Accéder aux paramètres
	config = Config()
	port = config.PORT

	# Ou avec variables d'environnement
	# export SYSCRED_GOOGLE_API_KEY=your_key
	# export SYSCRED_PORT=8080

	(c) Dominique S. Loyer - PhD Thesis Prototype
	"""

	import os
	from pathlib import Path
	from typing import Dict, Optional, Type

	from dotenv import load_dotenv

	# Load environment variables from the project-root .env file.
	# This module lives at .../systemFactChecking/syscred/config.py, so the
	# root .env is expected one directory above the syscred/ package.
	current_path = Path(__file__).resolve()
	env_path = current_path.parent.parent / '.env'

	if not env_path.exists():
	    print(f"[Config] WARNING: .env not found at {env_path}")
	    # Fall back to the working directory, then its parent; first hit wins.
	    for alt in (Path.cwd() / '.env', Path.cwd().parent / '.env'):
	        if not alt.exists():
	            continue
	        env_path = alt
	        break

	# env_path is passed on and reported even when none of the candidates exists.
	load_dotenv(dotenv_path=env_path)
	print(f"[Config] Loading .env from {env_path}")
	# Only report whether the key is present; its value is never printed.
	print(f"[Config] SYSCRED_GOOGLE_API_KEY loaded: {'Yes' if os.environ.get('SYSCRED_GOOGLE_API_KEY') else 'No'}")



	class Config:
	    """
	    Centralized configuration for SysCRED.

	    Settings are class attributes computed once, when the class body runs.
	    Those read through ``os.getenv`` can be overridden with environment
	    variables prefixed by ``SYSCRED_``.
	    """

	    # === Paths ===
	    # BASE_DIR = project root (parent of syscred/). Resolved to an absolute
	    # path so it does not depend on the working directory when __file__ is
	    # relative.
	    BASE_DIR = Path(__file__).resolve().parent.parent
	    # NOTE(review): "avrtil" looks like a typo for "avril"; the name is left
	    # untouched because it has to match the file on disk - confirm.
	    ONTOLOGY_BASE_PATH = BASE_DIR / "ontology" / "sysCRED_onto26avrtil.ttl"
	    ONTOLOGY_DATA_PATH = BASE_DIR / "ontology" / "sysCRED_data.ttl"

	    # === Flask server ===
	    HOST = os.getenv("SYSCRED_HOST", "0.0.0.0")
	    PORT = int(os.getenv("SYSCRED_PORT", "5000"))
	    # NOTE(review): debug defaults to on while HOST defaults to every
	    # interface; confirm deployments turn it off.
	    DEBUG = os.getenv("SYSCRED_DEBUG", "true").lower() == "true"

	    # === API keys ===
	    GOOGLE_FACT_CHECK_API_KEY = os.getenv("SYSCRED_GOOGLE_API_KEY")
	    # SYSCRED_DATABASE_URL takes precedence over the plain DATABASE_URL.
	    DATABASE_URL = os.getenv("SYSCRED_DATABASE_URL", os.getenv("DATABASE_URL"))

	    # === ML models ===
	    # Support both SYSCRED_LOAD_ML and SYSCRED_LOAD_ML_MODELS (for Render);
	    # the longer name wins when both are set.
	    LOAD_ML_MODELS = os.getenv("SYSCRED_LOAD_ML_MODELS", os.getenv("SYSCRED_LOAD_ML", "true")).lower() == "true"
	    SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
	    NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

	    # === Timeouts ===
	    WEB_FETCH_TIMEOUT = int(os.getenv("SYSCRED_TIMEOUT", "10"))

	    # === TREC IR Configuration (NEW - Feb 2026) ===
	    TREC_INDEX_PATH = os.getenv("SYSCRED_TREC_INDEX", None)  # Lucene/Pyserini index
	    TREC_CORPUS_PATH = os.getenv("SYSCRED_TREC_CORPUS", None)  # JSONL corpus
	    TREC_TOPICS_PATH = os.getenv("SYSCRED_TREC_TOPICS", None)  # Topics directory
	    TREC_QRELS_PATH = os.getenv("SYSCRED_TREC_QRELS", None)  # Qrels directory

	    # BM25 Parameters (optimized on AP88-90)
	    BM25_K1 = float(os.getenv("SYSCRED_BM25_K1", "0.9"))
	    BM25_B = float(os.getenv("SYSCRED_BM25_B", "0.4"))

	    # PRF (Pseudo-Relevance Feedback) settings
	    ENABLE_PRF = os.getenv("SYSCRED_ENABLE_PRF", "true").lower() == "true"
	    PRF_TOP_DOCS = int(os.getenv("SYSCRED_PRF_TOP_DOCS", "3"))
	    PRF_EXPANSION_TERMS = int(os.getenv("SYSCRED_PRF_TERMS", "10"))

	    # === Score weighting ===
	    # Note: Weights should sum to 1.0 for proper normalization
	    SCORE_WEIGHTS: Dict[str, float] = {
	        'source_reputation': 0.22,     # Was 0.25, reduced for graph_context
	        'domain_age': 0.08,            # Was 0.10
	        'sentiment_neutrality': 0.13,  # Was 0.15
	        'entity_presence': 0.13,       # Was 0.15
	        'coherence': 0.12,             # Was 0.15
	        'fact_check': 0.17,            # Was 0.20
	        'graph_context': 0.15          # NEW - Historical knowledge from GraphRAG
	    }

	    # === Credibility thresholds ===
	    CREDIBILITY_THRESHOLDS = {
	        'HIGH': 0.7,
	        'MEDIUM': 0.4,
	        'LOW': 0.0
	    }

	    # === Reputation database ===
	    # Sources can be extended or loaded from an external file
	    # (see load_external_reputations).
	    SOURCE_REPUTATIONS: Dict[str, str] = {
	        # === HIGH CREDIBILITY ===
	        # International media
	        'lemonde.fr': 'High',
	        'nytimes.com': 'High',
	        'reuters.com': 'High',
	        'bbc.com': 'High',
	        'bbc.co.uk': 'High',
	        'theguardian.com': 'High',
	        'apnews.com': 'High',
	        'afp.com': 'High',
	        'france24.com': 'High',

	        # Canadian media
	        'cbc.ca': 'High',
	        'radio-canada.ca': 'High',
	        'lapresse.ca': 'High',
	        'ledevoir.com': 'High',
	        'theglobeandmail.com': 'High',

	        # Academic sources
	        'nature.com': 'High',
	        'sciencedirect.com': 'High',
	        'scholar.google.com': 'High',
	        'pubmed.ncbi.nlm.nih.gov': 'High',
	        'jstor.org': 'High',
	        'springer.com': 'High',
	        'ieee.org': 'High',
	        'acm.org': 'High',
	        'arxiv.org': 'High',

	        # Fact-checkers
	        'factcheck.org': 'High',
	        'snopes.com': 'High',
	        'politifact.com': 'High',
	        'fullfact.org': 'High',
	        'checknews.fr': 'High',

	        # Institutions
	        'who.int': 'High',
	        'un.org': 'High',
	        'europa.eu': 'High',
	        'canada.ca': 'High',
	        'gouv.fr': 'High',
	        'gouv.qc.ca': 'High',

	        # === MEDIUM CREDIBILITY ===
	        'wikipedia.org': 'Medium',
	        'medium.com': 'Medium',
	        'huffpost.com': 'Medium',
	        'buzzfeed.com': 'Medium',
	        'vice.com': 'Medium',
	        'slate.com': 'Medium',
	        'theconversation.com': 'Medium',

	        # === LOW CREDIBILITY ===
	        'infowars.com': 'Low',
	        'naturalnews.com': 'Low',
	        'breitbart.com': 'Low',
	        'dailystormer.su': 'Low',
	        'beforeitsnews.com': 'Low',
	        'worldtruth.tv': 'Low',
	        'yournewswire.com': 'Low',
	    }

	    # === Misinformation patterns ===
	    MISINFORMATION_KEYWORDS = [
	        'conspiracy', 'hoax', 'fake news', 'miracle cure',
	        "they don't want you to know", 'mainstream media lies',
	        'deep state', 'plandemic', 'wake up sheeple',
	        'big pharma cover-up', 'government conspiracy',
	        'censored truth', 'what they hide'
	    ]

	    @classmethod
	    def load_external_reputations(cls, filepath: str) -> None:
	        """
	        Merge extra source reputations from a JSON file into SOURCE_REPUTATIONS.

	        Best-effort: file, decoding and JSON errors, as well as a payload that
	        is not a JSON object, are reported with a printed message and leave
	        SOURCE_REPUTATIONS untouched.

	        Args:
	            filepath: Path to a UTF-8 JSON file holding a single object, e.g.
	                {"domain.com": "High", "other.com": "Low"}
	        """
	        import json

	        try:
	            # JSON is UTF-8 by specification; do not depend on the locale.
	            with open(filepath, 'r', encoding='utf-8') as f:
	                external_reps = json.load(f)
	        except (OSError, ValueError, TypeError) as e:
	            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
	            print(f"[Config] Could not load external reputations: {e}")
	            return

	        # dict.update() would also swallow lists of pairs or fail half-way
	        # through a malformed sequence, so insist on a real JSON object.
	        if not isinstance(external_reps, dict):
	            print(
	                "[Config] Could not load external reputations: "
	                f"expected a JSON object, got {type(external_reps).__name__}"
	            )
	            return

	        cls.SOURCE_REPUTATIONS.update(external_reps)
	        print(f"[Config] Loaded {len(external_reps)} external reputations")

	    @classmethod
	    def update_weights(cls, new_weights: Dict[str, float]) -> None:
	        """
	        Merge new score weights and renormalize so that they sum to 1.

	        The merged table is built as a copy and then bound on ``cls``, so
	        calling this on a subclass never alters the table its parent holds.

	        Args:
	            new_weights: Weights to add or replace, keyed by criterion name.

	        Raises:
	            ValueError: If the merged weights do not add up to a positive
	                total; the current weights are left unchanged.
	        """
	        merged = {**cls.SCORE_WEIGHTS, **new_weights}
	        total = sum(merged.values())
	        if total <= 0:
	            raise ValueError(
	                f"Score weights must sum to a positive value, got {total}"
	            )
	        # Normalize so the sum equals 1
	        cls.SCORE_WEIGHTS = {k: v / total for k, v in merged.items()}
	        print(f"[Config] Updated weights: {cls.SCORE_WEIGHTS}")

	    @classmethod
	    def to_dict(cls) -> Dict:
	        """
	        Export the current configuration as a dictionary.

	        The API key itself is never exported, only whether a non-empty one is
	        set. ``score_weights`` is a copy, so editing the export does not
	        change the configuration.
	        """
	        return {
	            'host': cls.HOST,
	            'port': cls.PORT,
	            'debug': cls.DEBUG,
	            # An empty SYSCRED_GOOGLE_API_KEY counts as "not configured".
	            'google_api_configured': bool(cls.GOOGLE_FACT_CHECK_API_KEY),
	            'ml_models_enabled': cls.LOAD_ML_MODELS,
	            'score_weights': dict(cls.SCORE_WEIGHTS),
	            'known_sources_count': len(cls.SOURCE_REPUTATIONS),
	            'ontology_base': str(cls.ONTOLOGY_BASE_PATH),
	            'ontology_data': str(cls.ONTOLOGY_DATA_PATH),
	        }

	    @classmethod
	    def print_config(cls) -> None:
	        """Print the current configuration, one ``key: value`` line per entry."""
	        print("=" * 50)
	        print("SysCRED Configuration")
	        print("=" * 50)
	        for key, value in cls.to_dict().items():
	            print(f" {key}: {value}")
	        print("=" * 50)


	# === Configuration par environnement ===

	class DevelopmentConfig(Config):
	    """Settings profile for local development."""

	    # Both flags are pinned here, so they take precedence over the values
	    # Config derived from the SYSCRED_* environment variables.
	    LOAD_ML_MODELS = True
	    DEBUG = True


	class ProductionConfig(Config):
	    """Settings profile for production."""

	    # NOTE(review): HOST and LOAD_ML_MODELS are hard-coded, so SYSCRED_HOST
	    # and SYSCRED_LOAD_ML_MODELS / SYSCRED_LOAD_ML have no effect on this
	    # profile - confirm that is intended.
	    HOST = "0.0.0.0"
	    LOAD_ML_MODELS = True
	    # Debug mode is always off in production, whatever SYSCRED_DEBUG says.
	    DEBUG = False


	class TestingConfig(Config):
	    """Settings profile for test runs."""

	    WEB_FETCH_TIMEOUT = 5
	    LOAD_ML_MODELS = False  # Faster for tests
	    DEBUG = True


	# Automatic selection of the configuration
	def get_config() -> Type[Config]:
	    """
	    Return the configuration class matching the current environment.

	    The environment name is read from SYSCRED_ENV (development, production,
	    testing); surrounding whitespace and letter case are ignored. A missing
	    or empty value selects DevelopmentConfig. An unrecognized value also
	    falls back to DevelopmentConfig, but a warning is printed first so that
	    a typo does not go unnoticed.

	    Returns:
	        The selected Config subclass itself (a class, not an instance).
	    """
	    env = os.getenv("SYSCRED_ENV", "").strip().lower() or "development"

	    configs = {
	        'development': DevelopmentConfig,
	        'production': ProductionConfig,
	        'testing': TestingConfig,
	    }

	    selected = configs.get(env)
	    if selected is None:
	        print(f"[Config] WARNING: unknown SYSCRED_ENV {env!r}, falling back to 'development'")
	        selected = DevelopmentConfig
	    return selected


	# Default configuration: whatever get_config() selects at import time
	config = get_config()


	if __name__ == "__main__":
	    # Manual smoke test: dump the active settings, then a few known sources
	    config.print_config()

	    print("\n=== Source Reputations Sample ===")
	    # Show at most the first ten entries, in insertion order.
	    for position, (domain, rep) in enumerate(config.SOURCE_REPUTATIONS.items()):
	        if position == 10:
	            break
	        print(f" {domain}: {rep}")