#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Named Entity Recognition (NER) Analyzer for SysCRED ==================================================== Extracts named entities from text using spaCy. Entities detected: - PER: Persons (Donald Trump, Emmanuel Macron) - ORG: Organizations (FBI, UN, Google) - LOC: Locations (Paris, Capitol) - DATE: Dates (January 6, 2021) - MONEY: Amounts ($10 million) - EVENT: Events (insurrection, election) """ from typing import Dict, List, Any, Optional import logging # Try to import spaCy try: import spacy from spacy.language import Language HAS_SPACY = True except ImportError: HAS_SPACY = False spacy = None logger = logging.getLogger(__name__) class NERAnalyzer: """ Named Entity Recognition analyzer using spaCy. Supports French (fr_core_news_md) and English (en_core_web_md). Falls back to heuristic extraction if spaCy is not available. """ # Entity type mappings for display ENTITY_LABELS = { 'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'}, 'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'}, 'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'}, 'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'}, 'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'}, 'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'}, 'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'}, 'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'}, 'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'}, 'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'}, 'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'}, 'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'}, 'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'}, 'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'}, } def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True): """ Initialize NER analyzer. Args: model_name: spaCy model to load (fr_core_news_md, en_core_web_md) fallback: If True, use heuristics when spaCy unavailable """ self.model_name = model_name self.fallback = fallback self.nlp = None self.use_heuristics = False if HAS_SPACY: try: self.nlp = spacy.load(model_name) logger.info(f"[NER] Loaded spaCy model: {model_name}") except OSError as e: logger.warning(f"[NER] Could not load model {model_name}: {e}") if fallback: self.use_heuristics = True logger.info("[NER] Using heuristic entity extraction") else: if fallback: self.use_heuristics = True logger.info("[NER] spaCy not installed. Using heuristic extraction") def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]: """ Extract named entities from text. Args: text: Input text to analyze Returns: Dictionary mapping entity types to lists of entities Each entity has: text, start, end, label, label_display, emoji, confidence """ if not text or len(text.strip()) == 0: return {} if self.nlp: return self._extract_with_spacy(text) elif self.use_heuristics: return self._extract_with_heuristics(text) else: return {} def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]: """Extract entities using spaCy NLP.""" doc = self.nlp(text) entities: Dict[str, List[Dict[str, Any]]] = {} for ent in doc.ents: label = ent.label_ # Get display info label_info = self.ENTITY_LABELS.get(label, { 'fr': label, 'en': label, 'emoji': '🔖' }) entity_data = { 'text': ent.text, 'start': ent.start_char, 'end': ent.end_char, 'label': label, 'label_display': label_info.get('fr', label), 'emoji': label_info.get('emoji', '🔖'), 'confidence': 0.85 # spaCy doesn't provide confidence by default } if label not in entities: entities[label] = [] # Avoid duplicates if not any(e['text'].lower() == entity_data['text'].lower() for e in entities[label]): entities[label].append(entity_data) return entities def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]: """ Fallback heuristic entity extraction. Uses pattern matching for common entities. """ import re entities: Dict[str, List[Dict[str, Any]]] = {} # Common patterns patterns = { 'PER': [ # Known political figures r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|' r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b', ], 'ORG': [ r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|' r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|' r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b', ], 'LOC': [ r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|' r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b', ], 'DATE': [ r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|' r'septembre|octobre|novembre|décembre)\s+\d{4})\b', r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b', r'\b(January|February|March|April|May|June|July|August|' r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', ], 'MONEY': [ r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?', r'[\d,]+(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)', r'[\d,]+\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)', ], 'PERCENT': [ r'\b\d+(?:\.\d+)?%', r'\b\d+(?:\.\d+)?\s*pour\s*cent', r'\b\d+(?:\.\d+)?\s*percent', ], } for label, pattern_list in patterns.items(): label_info = self.ENTITY_LABELS.get(label, {'fr': label, 'emoji': '🔖'}) for pattern in pattern_list: for match in re.finditer(pattern, text, re.IGNORECASE): entity_data = { 'text': match.group(), 'start': match.start(), 'end': match.end(), 'label': label, 'label_display': label_info.get('fr', label), 'emoji': label_info.get('emoji', '🔖'), 'confidence': 0.70 # Lower confidence for heuristics } if label not in entities: entities[label] = [] # Avoid duplicates if not any(e['text'].lower() == entity_data['text'].lower() for e in entities[label]): entities[label].append(entity_data) return entities def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str: """ Generate a human-readable summary of extracted entities. Args: entities: Dictionary of entities from extract_entities() Returns: Formatted string summary """ if not entities: return "Aucune entité nommée détectée." lines = [] for label, ent_list in entities.items(): label_info = self.ENTITY_LABELS.get(label, {'fr': label, 'emoji': '🔖'}) emoji = label_info.get('emoji', '🔖') label_display = label_info.get('fr', label) entity_texts = [e['text'] for e in ent_list[:5]] # Limit to 5 lines.append(f"{emoji} {label_display}: {', '.join(entity_texts)}") return "\n".join(lines) def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]: """ Convert entities to frontend-friendly format. Returns: List of entities with all info for display """ result = [] for label, ent_list in entities.items(): for ent in ent_list: result.append({ 'text': ent['text'], 'type': ent['label'], 'type_display': ent.get('label_display', ent['label']), 'emoji': ent.get('emoji', '🔖'), 'confidence': ent.get('confidence', 0.5), 'confidence_pct': f"{int(ent.get('confidence', 0.5) * 100)}%" }) # Sort by confidence result.sort(key=lambda x: x['confidence'], reverse=True) return result # Singleton instance for easy import _ner_analyzer: Optional[NERAnalyzer] = None def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer: """Get or create singleton NER analyzer instance.""" global _ner_analyzer if _ner_analyzer is None: _ner_analyzer = NERAnalyzer(model_name=model_name, fallback=True) return _ner_analyzer # Quick test if __name__ == "__main__": analyzer = NERAnalyzer(fallback=True) test_text = """ Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée. Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington. Les dégâts sont estimés à 30 millions de dollars. """ entities = analyzer.extract_entities(test_text) print("=== Entités détectées ===") print(analyzer.get_entity_summary(entities)) print("\n=== Format Frontend ===") for e in analyzer.to_frontend_format(entities): print(f" {e['emoji']} {e['text']} ({e['type_display']}, {e['confidence_pct']})")