File size: 10,847 Bytes
ea9303b
3700c55
 
ea9303b
 
 
3700c55
ea9303b
 
 
 
 
 
 
3700c55
 
ea9303b
 
3700c55
ea9303b
3700c55
 
ea9303b
3700c55
 
 
ea9303b
 
 
3700c55
 
 
 
ea9303b
3700c55
ea9303b
 
3700c55
 
ea9303b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3700c55
 
ea9303b
3700c55
 
 
 
ea9303b
 
3700c55
ea9303b
 
3700c55
ea9303b
3700c55
 
 
 
ea9303b
 
 
 
 
 
 
 
 
 
3700c55
ea9303b
3700c55
 
 
ea9303b
 
 
3700c55
ea9303b
 
3700c55
ea9303b
 
3700c55
ea9303b
 
 
 
 
 
 
 
 
3700c55
ea9303b
3700c55
 
ea9303b
3700c55
ea9303b
 
 
 
 
 
 
 
3700c55
 
ea9303b
 
 
 
 
3700c55
 
ea9303b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3700c55
ea9303b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3700c55
ea9303b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3700c55
ea9303b
3700c55
ea9303b
3700c55
ea9303b
 
 
 
 
3700c55
ea9303b
 
 
 
 
 
 
 
 
 
 
3700c55
ea9303b
 
 
 
 
3700c55
ea9303b
 
 
 
 
 
 
 
 
 
 
 
 
 
3700c55
ea9303b
 
3700c55
 
 
ea9303b
 
 
3700c55
ea9303b
 
 
 
 
 
3700c55
 
ea9303b
3700c55
ea9303b
3700c55
 
ea9303b
 
 
3700c55
 
ea9303b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Named Entity Recognition (NER) Analyzer for SysCRED
====================================================
Extracts named entities from text using spaCy.

Entities detected:
- PER: Persons (Donald Trump, Emmanuel Macron)
- ORG: Organizations (FBI, UN, Google)
- LOC: Locations (Paris, Capitol)
- DATE: Dates (January 6, 2021)
- MONEY: Amounts ($10 million)
- EVENT: Events (insurrection, election)
"""

from typing import Dict, List, Any, Optional
import logging

# Try to import spaCy
try:
    import spacy
    from spacy.language import Language
    HAS_SPACY = True
except ImportError:
    HAS_SPACY = False
    spacy = None

logger = logging.getLogger(__name__)


class NERAnalyzer:
    """
    Named Entity Recognition analyzer using spaCy.

    Supports French (fr_core_news_md) and English (en_core_web_md).
    Falls back to regex-based heuristic extraction when spaCy (or the
    requested model) is not available and ``fallback`` is enabled.

    Every extracted entity is a dict with the keys ``text``, ``start``,
    ``end``, ``label``, ``label_display`` (French display name), ``emoji``
    and ``confidence``.
    """

    # Entity type mappings for display
    ENTITY_LABELS = {
        'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
        'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
        'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
        'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
        'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
        'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
        'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
        'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
        'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
        'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
        'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
        'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
    }

    # Emoji used when a label has no entry in ENTITY_LABELS
    DEFAULT_EMOJI = '🔖'

    # Fixed confidence attached to entities, per extraction backend.
    SPACY_CONFIDENCE = 0.85      # spaCy doesn't provide confidence by default
    HEURISTIC_CONFIDENCE = 0.70  # Lower confidence for heuristics

    # Regex patterns used by the heuristic fallback, grouped by entity label.
    # They are applied case-insensitively, in the order listed here.
    _HEURISTIC_PATTERNS = {
        'PER': [
            # Known political figures
            r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
            r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
        ],
        'ORG': [
            r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
            r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
            r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
        ],
        'LOC': [
            r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
            r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
        ],
        'DATE': [
            r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
            r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
            r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
            r'\b(January|February|March|April|May|June|July|August|'
            r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        ],
        'MONEY': [
            # The amount must start and end with a digit, so a bare or
            # trailing comma ("euros, dollars", "$1,000,") is never captured.
            r'\$\d(?:[\d,]*\d)?(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
            r'\d(?:[\d,]*\d)?(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
            r'\d(?:[\d,]*\d)?\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
        ],
        'PERCENT': [
            # Accept both "." and "," as decimal separator (French "3,5%").
            r'\b\d+(?:[.,]\d+)?%',
            r'\b\d+(?:[.,]\d+)?\s*pour\s*cent',
            r'\b\d+(?:[.,]\d+)?\s*percent',
        ],
    }

    def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
        """
        Initialize NER analyzer.

        Tries to load the requested spaCy model. If spaCy is not installed,
        or loading the model raises ``OSError``, the analyzer switches to
        heuristic extraction when ``fallback`` is True; otherwise it stays
        without any backend and ``extract_entities`` returns ``{}``.

        Args:
            model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
            fallback: If True, use heuristics when spaCy unavailable
        """
        self.model_name = model_name
        self.fallback = fallback
        self.nlp = None
        self.use_heuristics = False

        if HAS_SPACY:
            try:
                self.nlp = spacy.load(model_name)
                logger.info(f"[NER] Loaded spaCy model: {model_name}")
            except OSError as e:
                logger.warning(f"[NER] Could not load model {model_name}: {e}")
                if fallback:
                    self.use_heuristics = True
                    logger.info("[NER] Using heuristic entity extraction")
        else:
            if fallback:
                self.use_heuristics = True
                logger.info("[NER] spaCy not installed. Using heuristic extraction")

    def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract named entities from text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary mapping entity types to lists of entities
            Each entity has: text, start, end, label, label_display, emoji, confidence.
            Empty dict for empty/whitespace-only text or when no backend
            (spaCy model or heuristics) is active.
        """
        if not text or not text.strip():
            return {}

        if self.nlp is not None:
            return self._extract_with_spacy(text)
        if self.use_heuristics:
            return self._extract_with_heuristics(text)
        return {}

    def _label_info(self, label: str) -> Dict[str, str]:
        """Return display metadata for *label*, falling back to the raw label."""
        return self.ENTITY_LABELS.get(
            label,
            {'fr': label, 'en': label, 'emoji': self.DEFAULT_EMOJI},
        )

    def _build_entity(self, text: str, start: int, end: int,
                      label: str, confidence: float) -> Dict[str, Any]:
        """Assemble the entity dict shared by both extraction backends."""
        info = self._label_info(label)
        return {
            'text': text,
            'start': start,
            'end': end,
            'label': label,
            'label_display': info.get('fr', label),
            'emoji': info.get('emoji', self.DEFAULT_EMOJI),
            'confidence': confidence,
        }

    @staticmethod
    def _add_unique(entities: Dict[str, List[Dict[str, Any]]],
                    seen: set, entity: Dict[str, Any]) -> None:
        """Append *entity* under its label unless the same text (case-insensitive) was already kept."""
        key = (entity['label'], entity['text'].lower())
        if key in seen:
            return
        seen.add(key)
        entities.setdefault(entity['label'], []).append(entity)

    def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract entities using spaCy NLP.

        Runs the loaded pipeline on *text* and groups ``doc.ents`` by label,
        keeping only the first occurrence of each (case-insensitive) text
        per label. Every entity gets the fixed ``SPACY_CONFIDENCE`` score.
        """
        doc = self.nlp(text)
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: set = set()

        for ent in doc.ents:
            entity = self._build_entity(
                ent.text, ent.start_char, ent.end_char,
                ent.label_, self.SPACY_CONFIDENCE,
            )
            self._add_unique(entities, seen, entity)

        return entities

    def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fallback heuristic entity extraction.
        Uses pattern matching for common entities.

        Each pattern in ``_HEURISTIC_PATTERNS`` is searched case-insensitively;
        matches are grouped by label, de-duplicated on lowercased text, and
        tagged with the fixed ``HEURISTIC_CONFIDENCE`` score.
        """
        import re  # local import: the module does not import ``re`` at top level

        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: set = set()

        for label, pattern_list in self._HEURISTIC_PATTERNS.items():
            for pattern in pattern_list:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    entity = self._build_entity(
                        match.group(), match.start(), match.end(),
                        label, self.HEURISTIC_CONFIDENCE,
                    )
                    self._add_unique(entities, seen, entity)

        return entities

    def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
        """
        Generate a human-readable summary of extracted entities.

        Produces one line per label in the form ``<emoji> <French label>: a, b, c``,
        listing at most the first five entities of each label.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            Formatted string summary (a fixed French message when empty)
        """
        if not entities:
            return "Aucune entité nommée détectée."

        lines = []
        for label, ent_list in entities.items():
            info = self._label_info(label)
            emoji = info.get('emoji', self.DEFAULT_EMOJI)
            label_display = info.get('fr', label)

            entity_texts = [e['text'] for e in ent_list[:5]]  # Limit to 5
            lines.append(f"{emoji} {label_display}: {', '.join(entity_texts)}")

        return "\n".join(lines)

    def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
        """
        Convert entities to frontend-friendly format.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            Flat list of entities with all info for display (text, type,
            type_display, emoji, confidence, confidence_pct), sorted by
            descending confidence.
        """
        result = []
        for ent_list in entities.values():
            for ent in ent_list:
                confidence = ent.get('confidence', 0.5)
                result.append({
                    'text': ent['text'],
                    'type': ent['label'],
                    'type_display': ent.get('label_display', ent['label']),
                    'emoji': ent.get('emoji', self.DEFAULT_EMOJI),
                    'confidence': confidence,
                    # round() rather than int(): 0.57 * 100 is 56.999... in
                    # floating point and would otherwise display as "56%".
                    'confidence_pct': f"{round(confidence * 100)}%"
                })

        # Sort by confidence
        result.sort(key=lambda x: x['confidence'], reverse=True)
        return result


# Shared analyzer instance, created lazily by get_ner_analyzer()
_ner_analyzer: Optional[NERAnalyzer] = None


def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
    """
    Return the shared NERAnalyzer, building it on the first call.

    Note: ``model_name`` is only used when the instance is first created;
    later calls return the existing analyzer whatever model they request.

    Args:
        model_name: spaCy model used to build the analyzer on first call

    Returns:
        The module-wide NERAnalyzer instance (heuristic fallback enabled)
    """
    global _ner_analyzer
    if _ner_analyzer is not None:
        return _ner_analyzer
    _ner_analyzer = NERAnalyzer(model_name=model_name, fallback=True)
    return _ner_analyzer


# Manual smoke test: run the analyzer on a sample French text and print the results
if __name__ == "__main__":
    sample_text = """
    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
    Les dégâts sont estimés à 30 millions de dollars.
    """

    ner = NERAnalyzer(fallback=True)
    found = ner.extract_entities(sample_text)

    # Per-label summary first, then the flattened frontend rows
    print("=== Entités détectées ===")
    print(ner.get_entity_summary(found))
    print("\n=== Format Frontend ===")
    for row in ner.to_frontend_format(found):
        print(f"  {row['emoji']} {row['text']} ({row['type_display']}, {row['confidence_pct']})")