File size: 20,106 Bytes
e70050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
# -*- coding: utf-8 -*-
"""
API Clients Module - SysCRED
============================
Handles all external API calls for the credibility verification system.

Integrated APIs:
- Web content fetching (requests + BeautifulSoup)
- WHOIS lookup for domain age
- Google Fact Check Tools API
- Backlink estimation (heuristic based on source reputation and domain age)

(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerModelingHybridSystem2025
"""

import requests
from urllib.parse import urlparse
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
from dataclasses import dataclass
import re
import json
from functools import lru_cache

# Optional imports with fallbacks
try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False
    print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")

try:
    import whois
    HAS_WHOIS = True
except ImportError:
    HAS_WHOIS = False
    print("Warning: python-whois not installed. Run: pip install python-whois")


# --- Data Classes for Structured Results ---

@dataclass
class WebContent:
    """Outcome of fetching and parsing one web page.

    Produced by ``ExternalAPIClients.fetch_web_content``. When the fetch or
    the parsing fails, ``success`` is False, ``error`` holds the reason and
    the content fields are empty.
    """
    url: str  # URL that was requested
    title: Optional[str]  # text of the page's <title>, or None when unavailable
    text_content: str  # page text with whitespace collapsed; "" on failure
    meta_description: Optional[str]  # content of <meta name="description">, if present
    meta_keywords: List[str]  # entries of the comma-separated <meta name="keywords">
    links: List[str]  # href values starting with "http" collected from the page
    fetch_timestamp: str  # ISO-format timestamp taken when the fetch was started
    success: bool  # True when the page was fetched and parsed
    error: Optional[str] = None  # failure reason; stays None on success


@dataclass
class DomainInfo:
    """WHOIS registration details for one domain.

    Produced by ``ExternalAPIClients.whois_lookup``. When the lookup fails,
    ``success`` is False, ``error`` holds the reason and the WHOIS fields
    are None.
    """
    domain: str  # domain that was looked up (leading "www." removed)
    creation_date: Optional[datetime]  # registration date reported by WHOIS
    expiration_date: Optional[datetime]  # expiry date reported by WHOIS
    registrar: Optional[str]  # registrar reported by WHOIS
    age_days: Optional[int]  # days between creation_date and the time of the lookup
    success: bool  # True when the WHOIS query completed without raising
    error: Optional[str] = None  # failure reason; stays None on success


@dataclass
class FactCheckResult:
    """One review of a claim, from the Fact Check API or the local heuristic."""
    claim: str  # text of the claim that was reviewed
    claimant: Optional[str]  # who made the claim, when known
    rating: str  # textual verdict given by the reviewer
    publisher: str  # name of the organisation that published the review
    url: str  # link to the review ("" when there is none)
    review_date: Optional[str]  # date of the review as a string, when known


@dataclass
class ExternalData:
    """Bundle of every external signal gathered for one credibility analysis.

    Produced by ``ExternalAPIClients.fetch_external_data``.
    """
    fact_checks: List[FactCheckResult]  # fact-check reviews found for the query
    source_reputation: str  # 'High', 'Medium', 'Low' or 'Unknown'
    domain_age_days: Optional[int]  # domain age in days; None when not available
    domain_info: Optional[DomainInfo]  # full WHOIS result; None for non-URL input
    related_articles: List[Dict[str, str]]  # related articles (currently always empty)
    backlinks_count: int  # estimated number of backlinks
    backlinks_sample: List[Dict[str, str]]  # sample of backlinks (currently always empty)


class ExternalAPIClients:
    """
    Central class for all external API integrations.
    Replaces simulated functions with real API calls.
    """
    
    def __init__(self, google_api_key: Optional[str] = None):
        """
        Prepare the shared HTTP session and the built-in reputation table.

        Args:
            google_api_key: Key for the Google Fact Check Tools API (optional).
                Without it, fact checks use the local keyword heuristic.
        """
        self.google_api_key = google_api_key

        # Browser-like headers attached to every request sent through the
        # shared session.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,fr;q=0.8',
            'Referer': 'https://www.google.com/',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        }
        self.session = requests.Session()
        self.session.headers.update(browser_headers)

        # Reputation database, declared per credibility level and flattened
        # into a domain -> level mapping (can be extended or loaded from file).
        domains_by_level = (
            ('High', (
                'lemonde.fr', 'nytimes.com', 'reuters.com', 'bbc.com',
                'theguardian.com', 'apnews.com', 'nature.com',
                'sciencedirect.com', 'scholar.google.com', 'factcheck.org',
                'snopes.com', 'politifact.com',
            )),
            ('Medium', ('wikipedia.org', 'medium.com', 'huffpost.com')),
            # Known misinformation spreaders
            ('Low', ('infowars.com', 'naturalnews.com')),
        )
        self.known_reputations = {
            domain: level
            for level, domains in domains_by_level
            for domain in domains
        }
    
    def fetch_web_content(self, url: str, timeout: int = 10) -> WebContent:
        """
        Fetch a page and extract its title, meta tags, visible text and links.

        Network and parsing problems never propagate to the caller: they are
        reported through ``success=False`` and ``error`` on the result.

        Args:
            url: The URL to fetch.
            timeout: Request timeout in seconds, applied to each attempt.

        Returns:
            WebContent with the extracted information. ``text_content`` is
            capped at 10000 characters and ``links`` only holds hrefs that
            start with "http", taken from the first 50 anchors left after the
            script/style/nav/footer/header/aside elements were removed.
        """
        timestamp = datetime.now().isoformat()

        def failure(message: str) -> WebContent:
            # Single place that builds the "nothing extracted" result.
            return WebContent(
                url=url, title=None, text_content="",
                meta_description=None, meta_keywords=[], links=[],
                fetch_timestamp=timestamp, success=False, error=message
            )

        if not HAS_BS4:
            return failure("BeautifulSoup not installed")

        try:
            try:
                response = self.session.get(url, timeout=timeout, allow_redirects=True)
                response.raise_for_status()
            except requests.exceptions.Timeout:
                # A connect timeout is also a ConnectionError; retrying it
                # without TLS verification would only double the wait.
                raise
            except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
                # NOTE(security): best-effort retry with certificate
                # verification disabled, so the returned content is not
                # authenticated. Kept because callers rely on still getting
                # content from sites with broken certificates.
                print(f"[SysCRED] SSL/Connection error for {url}. Retrying without verification...")
                # Suppress warnings for unverified HTTPS request
                import urllib3
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
                response = self.session.get(url, timeout=timeout, allow_redirects=True, verify=False)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title. ``soup.title.string`` is None for an empty title
            # or one with nested tags, so collect the text instead.
            title = None
            if soup.title is not None:
                title = soup.title.get_text(strip=True) or None

            # Extract meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            meta_description = meta_desc.get('content', '') if meta_desc else None

            # Extract meta keywords, dropping empty entries such as the one
            # produced by a trailing comma.
            meta_kw = soup.find('meta', attrs={'name': 'keywords'})
            meta_keywords = []
            if meta_kw and meta_kw.get('content'):
                meta_keywords = [
                    k.strip() for k in meta_kw.get('content', '').split(',') if k.strip()
                ]

            # Remove non-content elements before extracting text and links
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            # Extract main text content and clean up excessive whitespace
            text_content = soup.get_text(separator=' ', strip=True)
            text_content = re.sub(r'\s+', ' ', text_content)

            # Extract absolute links from the first 50 anchors
            links = [
                a_tag['href']
                for a_tag in soup.find_all('a', href=True)[:50]
                if a_tag['href'].startswith('http')
            ]

            return WebContent(
                url=url,
                title=title,
                text_content=text_content[:10000],  # Limit text size
                meta_description=meta_description,
                meta_keywords=meta_keywords,
                links=links,
                fetch_timestamp=timestamp,
                success=True
            )

        except requests.exceptions.Timeout:
            return failure(f"Timeout after {timeout}s")
        except requests.exceptions.RequestException as e:
            return failure(str(e))
        except Exception as e:
            return failure(f"Parsing error: {str(e)}")
    
    def whois_lookup(self, url_or_domain: str) -> DomainInfo:
        """
        Perform WHOIS lookup to get domain registration information.

        Successful lookups are remembered per instance, keyed by the
        normalised domain, so a URL and its bare domain share one entry.
        Failures are not remembered and are retried on the next call.

        Args:
            url_or_domain: URL or domain name.

        Returns:
            DomainInfo dataclass with domain details. Problems are reported
            through ``success=False`` and ``error``.
        """
        def failure(name: str, message: str) -> DomainInfo:
            return DomainInfo(
                domain=name,
                creation_date=None, expiration_date=None,
                registrar=None, age_days=None,
                success=False, error=message
            )

        def first(value):
            # WHOIS fields may be a single value or a list of values.
            if isinstance(value, list):
                return value[0] if value else None
            return value

        # Extract the host name. Testing for "://" rather than a "http"
        # prefix keeps bare domains such as "httpbin.org" intact.
        target = url_or_domain.strip()
        try:
            if '://' in target:
                host = urlparse(target).hostname or ''
            else:
                host = target.split('/', 1)[0].split(':', 1)[0]
        except ValueError as e:
            return failure(target, f"Invalid URL: {e}")

        domain = host.lower()
        # Remove 'www.' prefix
        if domain.startswith('www.'):
            domain = domain[4:]

        if not domain:
            return failure(domain, "No domain found in input")

        if not HAS_WHOIS:
            return failure(domain, "python-whois not installed")

        # Per-instance cache created on first use. A functools.lru_cache on
        # the method would keep every instance alive and cache failures.
        cache = vars(self).setdefault('_whois_cache', {})
        if domain in cache:
            return cache[domain]

        try:
            w = whois.whois(domain)

            creation_date = first(w.creation_date)
            expiration_date = first(w.expiration_date)

            # Calculate age in days. "now" takes the timezone of the WHOIS
            # date so that aware and naive datetimes are never mixed.
            age_days = None
            if isinstance(creation_date, datetime):
                now = datetime.now(creation_date.tzinfo)
                age_days = (now - creation_date).days

            info = DomainInfo(
                domain=domain,
                creation_date=creation_date,
                expiration_date=expiration_date,
                registrar=w.registrar,
                age_days=age_days,
                success=True
            )
        except Exception as e:
            return failure(domain, str(e))

        # Keep the cache bounded: drop the oldest entry once it holds 128.
        if len(cache) >= 128:
            cache.pop(next(iter(cache)))
        cache[domain] = info
        return info
    
    def google_fact_check(self, query: str, language: str = "fr") -> List[FactCheckResult]:
        """
        Query Google Fact Check Tools API.
        
        Args:
            query: The claim or text to check
            language: Language code (default: French)
            
        Returns:
            List of FactCheckResult objects
        """
        results = []
        
        if not self.google_api_key:
            print("[Info] Google Fact Check API key not configured. Using simulation.")
            return self._simulate_fact_check(query)
        
        try:
            api_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
            params = {
                'key': self.google_api_key,
                'query': query[:200],  # API has character limit
                # 'languageCode': language  # Removed to allow all languages (e.g. English queries)
            }
            
            response = self.session.get(api_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            
            claims = data.get('claims', [])
            for claim in claims[:5]:  # Limit to 5 results
                text = claim.get('text', '')
                claimant = claim.get('claimant')
                
                for review in claim.get('claimReview', []):
                    results.append(FactCheckResult(
                        claim=text,
                        claimant=claimant,
                        rating=review.get('textualRating', 'Unknown'),
                        publisher=review.get('publisher', {}).get('name', 'Unknown'),
                        url=review.get('url', ''),
                        review_date=review.get('reviewDate')
                    ))
            
            return results
            
        except Exception as e:
            print(f"[Warning] Google Fact Check API error: {e}")
            return self._simulate_fact_check(query)
    
    def _simulate_fact_check(self, query: str) -> List[FactCheckResult]:
        """Fallback simulation when API is not available."""
        # Check for known misinformation patterns
        misinformation_keywords = [
            'conspiracy', 'hoax', 'fake', 'miracle cure', 'they don\'t want you to know',
            'mainstream media lies', 'deep state', 'plandemic'
        ]
        
        query_lower = query.lower()
        for keyword in misinformation_keywords:
            if keyword in query_lower:
                return [FactCheckResult(
                    claim=f"Text contains potential misinformation marker: '{keyword}'",
                    claimant=None,
                    rating="Needs Verification",
                    publisher="SysCRED Heuristic",
                    url="",
                    review_date=datetime.now().isoformat()
                )]
        
        return []  # No fact checks found
    
    @lru_cache(maxsize=128)
    def get_source_reputation(self, url: str) -> str:
        """
        Get reputation score for a source/domain.
        
        Args:
            url: URL or domain to check
            
        Returns:
            Reputation level: 'High', 'Medium', 'Low', or 'Unknown'
        """
        if url.startswith('http'):
            domain = urlparse(url).netloc
        else:
            domain = url
        
        # Remove www prefix
        if domain.startswith('www.'):
            domain = domain[4:]
        
        # Check known reputations
        for known_domain, reputation in self.known_reputations.items():
            if domain.endswith(known_domain) or known_domain in domain:
                return reputation
        
        # Heuristics for unknown domains
        # Academic domains tend to be more credible
        if domain.endswith('.edu') or domain.endswith('.gov') or domain.endswith('.ac.uk'):
            return 'High'
        
        # Personal sites and free hosting are less credible
        if any(x in domain for x in ['.blogspot.', '.wordpress.', '.wix.', '.weebly.']):
            return 'Low'
        
        return 'Unknown'
    
    def estimate_backlinks(self, url: str) -> Dict[str, Any]:
        """
        Estimate relative authority/backlinks based on available signals.
        
        Since real backlink databases (Ahrefs, Moz) are paid/proprietary,
        we use a composite heuristic based on:
        1. Domain age (older domains tend to have more backlinks)
        2. Known reputation (High reputation sources imply high backlinks)
        3. Google Fact Check mentions (as a proxy for visibility in fact-checks)
        """
        domain = urlparse(url).netloc
        if domain.startswith('www.'):
            domain = domain[4:]
            
        # 1. Base Score from Reputation
        reputation = self.get_source_reputation(domain)
        base_count = 0
        if reputation == 'High':
            base_count = 10000  # High authority
        elif reputation == 'Medium':
            base_count = 1000   # Medium authority
        elif reputation == 'Low':
            base_count = 50     # Low authority
        else:
            base_count = 100    # Unknown
            
        # 2. Multiplier from Domain Age
        age_multiplier = 1.0
        domain_info = self.whois_lookup(domain)
        if domain_info.success and domain_info.age_days:
            # Add 10% for every year of age, max 5x
            years = domain_info.age_days / 365
            age_multiplier = min(5.0, 1.0 + (years * 0.1))
            
        estimated_count = int(base_count * age_multiplier)
        
        # 3. Adjust for specific TLDs
        if domain.endswith('.edu') or domain.endswith('.gov'):
            estimated_count *= 2
            
        return {
            'estimated_count': estimated_count,
            'sample_backlinks': [], # Real sample requires SERP API
            'method': 'heuristic_v2.1',
            'note': 'Estimated from domain age and reputation (Proxy)'
        }
    
    def fetch_external_data(self, input_data: str, fc_query: Optional[str] = None) -> ExternalData:
        """
        Main method to fetch all external data for credibility analysis.
        This replaces the simulated fetch_external_data function.

        WHOIS, reputation and backlink signals are only gathered when
        ``input_data`` parses as a URL with both a scheme and a host; the
        fact check is always performed.

        Args:
            input_data: URL or text to analyze.
            fc_query: Text to send to the fact check. When omitted or empty,
                ``input_data`` itself is used.

        Returns:
            ExternalData with all gathered information. For non-URL input the
            reputation is 'Unknown', the domain fields are None and the
            backlink count is 0.
        """
        # Determine if input is URL. urlparse raises ValueError on malformed
        # input; non-string input is treated as "not a URL" as well.
        try:
            parsed = urlparse(input_data)
            is_url = bool(parsed.scheme and parsed.netloc)
        except (ValueError, TypeError, AttributeError):
            is_url = False

        # Initialize results
        domain_age_days = None
        domain_info = None
        source_reputation = 'Unknown'
        backlinks_data = {'estimated_count': 0, 'sample_backlinks': []}

        if is_url:
            # Get domain information
            domain_info = self.whois_lookup(input_data)
            if domain_info.success:
                domain_age_days = domain_info.age_days

            # Get source reputation
            source_reputation = self.get_source_reputation(input_data)

            # Get backlink estimation
            backlinks_data = self.estimate_backlinks(input_data)

        # Perform fact check on the content/URL
        # Use provided query or fall back to input_data
        query_to_use = fc_query if fc_query else input_data
        fact_checks = self.google_fact_check(query_to_use)

        return ExternalData(
            fact_checks=fact_checks,
            source_reputation=source_reputation,
            domain_age_days=domain_age_days,
            domain_info=domain_info,
            related_articles=[],  # TODO: Implement related article search
            backlinks_count=backlinks_data.get('estimated_count', 0),
            backlinks_sample=backlinks_data.get('sample_backlinks', [])
        )


# --- Testing ---
if __name__ == "__main__":
    # Manual smoke test: performs live network calls and prints the results.
    print("=== Testing ExternalAPIClients ===\n")

    api = ExternalAPIClients()

    # Test 1: fetch and parse a real page
    print("Test 1: Fetching web content from Le Monde...")
    page = api.fetch_web_content("https://www.lemonde.fr")
    for line in (
        f"  Success: {page.success}",
        f"  Title: {page.title}",
        f"  Text length: {len(page.text_content)} chars",
        f"  Links found: {len(page.links)}",
    ):
        print(line)
    print()

    # Test 2: WHOIS lookup from a full URL
    print("Test 2: WHOIS lookup for lemonde.fr...")
    whois_info = api.whois_lookup("https://www.lemonde.fr")
    for line in (
        f"  Success: {whois_info.success}",
        f"  Domain: {whois_info.domain}",
        f"  Age: {whois_info.age_days} days",
        f"  Registrar: {whois_info.registrar}",
    ):
        print(line)
    print()

    # Test 3: reputation of a known-good, a known-bad and a free-hosting URL
    print("Test 3: Source reputation checks...")
    sample_urls = (
        "https://www.nytimes.com/article",
        "https://www.infowars.com/post",
        "https://random-blog.wordpress.com",
    )
    for sample in sample_urls:
        level = api.get_source_reputation(sample)
        print(f"  {sample}: {level}")
    print()

    # Test 4: the combined entry point
    print("Test 4: Full external data fetch...")
    bundle = api.fetch_external_data("https://www.bbc.com/news")
    for line in (
        f"  Source reputation: {bundle.source_reputation}",
        f"  Domain age: {bundle.domain_age_days} days",
        f"  Fact checks found: {len(bundle.fact_checks)}",
    ):
        print(line)

    print("\n=== Tests Complete ===")