# syscred_duplicate

# Running

# File size: 10,847 Bytes

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Named Entity Recognition (NER) Analyzer for SysCRED
====================================================
Extracts named entities from text using spaCy, with a regex-based
heuristic fallback when spaCy or the requested model is unavailable.

Entities detected:
- PER: Persons (Donald Trump, Emmanuel Macron)
- ORG: Organizations (FBI, UN, Google)
- LOC: Locations (Paris, Capitol)
- DATE: Dates (January 6, 2021)
- MONEY: Amounts ($10 million)
- EVENT: Events (insurrection, election)
"""

import logging
import re
from typing import Dict, List, Any, Optional

# Try to import spaCy
try:
    import spacy
    from spacy.language import Language
    HAS_SPACY = True
except ImportError:
    HAS_SPACY = False
    spacy = None

logger = logging.getLogger(__name__)


class NERAnalyzer:
    """
    Named Entity Recognition analyzer using spaCy.

    Supports French (fr_core_news_md) and English (en_core_web_md).
    Falls back to regex-based heuristic extraction when spaCy or the
    requested model cannot be loaded and ``fallback`` is enabled.

    Display labels are always taken from the French ('fr') entry of
    ENTITY_LABELS, whichever model is loaded.
    """

    # Entity type mappings for display
    ENTITY_LABELS = {
        'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
        'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
        'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
        'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
        'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
        'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
        'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
        'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
        'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
        'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
        'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
        'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
    }

    # Emoji used for labels that have no entry in ENTITY_LABELS.
    DEFAULT_EMOJI = '🔖'

    # Fixed scores attached to every entity. spaCy doesn't provide a
    # confidence by default, so a constant is used; heuristic matches get
    # a lower constant.
    SPACY_CONFIDENCE = 0.85
    HEURISTIC_CONFIDENCE = 0.70

    # Fallback regexes, compiled once at class creation. All patterns are
    # case-insensitive. Labels are tried in this order, and for each label
    # the patterns are tried in the order listed.
    _HEURISTIC_PATTERNS = {
        label: tuple(re.compile(source, re.IGNORECASE) for source in sources)
        for label, sources in {
            'PER': [
                # Known political figures
                r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
                r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
            ],
            'ORG': [
                r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
                r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
                r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
            ],
            'LOC': [
                r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
                r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
            ],
            'DATE': [
                r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
                r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
                r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
                r'\b(January|February|March|April|May|June|July|August|'
                r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
            ],
            'MONEY': [
                # Amounts must start with a digit so that stray punctuation
                # (", dollars", "$,") is never reported as money.
                r'\$\d[\d,]*(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
                r'\d[\d,]*(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
                r'\d[\d,]*\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
            ],
            'PERCENT': [
                # Accept both '.' and ',' as decimal separator ("12,5%") and
                # an optional space before the sign ("12 %").
                r'\b\d+(?:[.,]\d+)?\s?%',
                r'\b\d+(?:[.,]\d+)?\s*pour\s*cent',
                r'\b\d+(?:[.,]\d+)?\s*percent',
            ],
        }.items()
    }

    def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
        """
        Initialize NER analyzer.

        Args:
            model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
            fallback: If True, use heuristics when spaCy or the model is
                unavailable. If False and nothing can be loaded,
                extract_entities() always returns an empty dict.
        """
        self.model_name = model_name
        self.fallback = fallback
        self.nlp = None
        self.use_heuristics = False

        if HAS_SPACY:
            try:
                self.nlp = spacy.load(model_name)
            except OSError as exc:
                logger.warning("[NER] Could not load model %s: %s", model_name, exc)
                fallback_notice = "[NER] Using heuristic entity extraction"
            else:
                logger.info("[NER] Loaded spaCy model: %s", model_name)
                return
        else:
            fallback_notice = "[NER] spaCy not installed. Using heuristic extraction"

        if fallback:
            self.use_heuristics = True
            logger.info(fallback_notice)
        else:
            # Make the "nothing will ever be extracted" configuration visible.
            logger.warning(
                "[NER] No spaCy pipeline available and fallback disabled; "
                "no entities will be extracted"
            )

    @classmethod
    def _display_info(cls, label: str):
        """Return ``(display_name, emoji)`` for *label*.

        Labels missing from ENTITY_LABELS are displayed as-is with
        DEFAULT_EMOJI.
        """
        info = cls.ENTITY_LABELS.get(label, {})
        return info.get('fr', label), info.get('emoji', cls.DEFAULT_EMOJI)

    def _make_entity(self, text: str, start: int, end: int, label: str,
                     confidence: float) -> Dict[str, Any]:
        """Build one entity record in the shape returned by extract_entities()."""
        label_display, emoji = self._display_info(label)
        return {
            'text': text,
            'start': start,
            'end': end,
            'label': label,
            'label_display': label_display,
            'emoji': emoji,
            'confidence': confidence,
        }

    @staticmethod
    def _add_unique(entities: Dict[str, List[Dict[str, Any]]], seen: set,
                    entity: Dict[str, Any]) -> None:
        """Append *entity* under its label unless the same text was already kept.

        Texts are compared case-insensitively within a label; the first
        occurrence wins.
        """
        key = (entity['label'], entity['text'].lower())
        if key in seen:
            return
        seen.add(key)
        entities.setdefault(entity['label'], []).append(entity)

    def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract named entities from text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary mapping entity types to lists of entities
            Each entity has: text, start, end, label, label_display, emoji, confidence.
            Empty dict for empty/blank text or when no extractor is available.
        """
        if not text or not text.strip():
            return {}

        if self.nlp:
            return self._extract_with_spacy(text)
        if self.use_heuristics:
            return self._extract_with_heuristics(text)
        return {}

    def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """Extract entities using the loaded spaCy pipeline (``self.nlp``)."""
        doc = self.nlp(text)
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: set = set()

        for ent in doc.ents:
            self._add_unique(entities, seen, self._make_entity(
                ent.text, ent.start_char, ent.end_char, ent.label_,
                self.SPACY_CONFIDENCE,
            ))

        return entities

    def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fallback heuristic entity extraction.
        Uses the pre-compiled _HEURISTIC_PATTERNS for common entities.
        """
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: set = set()

        for label, compiled_patterns in self._HEURISTIC_PATTERNS.items():
            for regex in compiled_patterns:
                for match in regex.finditer(text):
                    self._add_unique(entities, seen, self._make_entity(
                        match.group(), match.start(), match.end(), label,
                        self.HEURISTIC_CONFIDENCE,
                    ))

        return entities

    def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
        """
        Generate a human-readable summary of extracted entities.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            Formatted string summary: one line per label, listing at most
            the first five entity texts.
        """
        if not entities:
            return "Aucune entité nommée détectée."

        lines = []
        for label, ent_list in entities.items():
            label_display, emoji = self._display_info(label)
            shown = ', '.join(e['text'] for e in ent_list[:5])  # Limit to 5
            lines.append(f"{emoji} {label_display}: {shown}")

        return "\n".join(lines)

    def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
        """
        Convert entities to frontend-friendly format.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            Flat list of entities with all info for display, sorted by
            descending confidence (ties keep their input order).
        """
        result = []
        for ent_list in entities.values():
            for ent in ent_list:
                confidence = ent.get('confidence', 0.5)
                result.append({
                    'text': ent['text'],
                    'type': ent['label'],
                    'type_display': ent.get('label_display', ent['label']),
                    'emoji': ent.get('emoji', self.DEFAULT_EMOJI),
                    'confidence': confidence,
                    # round(), not int(): 0.29 * 100 is 28.999... in binary
                    # floating point and would be truncated to "28%".
                    'confidence_pct': f"{round(confidence * 100)}%"
                })

        # Sort by confidence
        result.sort(key=lambda x: x['confidence'], reverse=True)
        return result


# Module-level cache for the shared analyzer; stays None until
# get_ner_analyzer() creates the instance on first use.
_ner_analyzer: Optional[NERAnalyzer] = None


def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
    """Return the shared NERAnalyzer, creating it on the first call.

    Args:
        model_name: spaCy model for the analyzer. Only used when the shared
            instance does not exist yet; later calls return the existing
            instance unchanged, whatever model_name they pass.

    Returns:
        The module-wide NERAnalyzer instance (created with fallback=True).
    """
    global _ner_analyzer
    if _ner_analyzer is not None:
        return _ner_analyzer
    _ner_analyzer = NERAnalyzer(model_name=model_name, fallback=True)
    return _ner_analyzer


# Manual smoke test: run this module directly to print a sample analysis.
if __name__ == "__main__":
    demo_analyzer = NERAnalyzer(fallback=True)

    sample_text = """
    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
    Les dégâts sont estimés à 30 millions de dollars.
    """

    found = demo_analyzer.extract_entities(sample_text)

    # Per-label summary first, then the flat frontend records.
    print("=== Entités détectées ===")
    print(demo_analyzer.get_entity_summary(found))
    print("\n=== Format Frontend ===")
    for item in demo_analyzer.to_frontend_format(found):
        print(f"  {item['emoji']} {item['text']} ({item['type_display']}, {item['confidence_pct']})")