# Spaces: alinabil21/geo-platform (Running)
# File size: 7,215 Bytes
# 5c429d4

"""Competitor Detection and Analysis."""
import re
from urllib.parse import urlparse
from typing import List, Dict, Set
from collections import defaultdict

def extract_domain(url: str) -> str:
    """Extract the normalized host name from a URL.

    The result is lower-cased and has any userinfo, port, path, query,
    fragment, leading ``www.`` and trailing dot removed.

    Args:
        url: Absolute URL (``https://www.example.com/x``), protocol-relative
            URL (``//example.com/x``) or bare host (``example.com/x``).

    Returns:
        The host name, or ``''`` when the input has no host: non-string
        input, empty string, relative paths such as ``/about``, fragments,
        host-less schemes such as ``mailto:`` / ``javascript:``, or a
        malformed URL.
    """
    if not isinstance(url, str):
        return ''

    candidate = url.strip()
    if not candidate:
        return ''

    try:
        parsed = urlparse(candidate)
        if not parsed.scheme and not parsed.netloc:
            # A bare "example.com/path" is parsed entirely as a path; a
            # leading "//" makes urlparse read the first segment as the
            # authority. Relative paths ("/about") still yield no host.
            parsed = urlparse('//' + candidate)
        # .hostname drops userinfo and port and is already lower-cased.
        host = parsed.hostname or ''
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unclosed "[").
        return ''

    host = host.rstrip('.')
    return re.sub(r'^www\.', '', host)

# Third-party hosts that are never treated as competitors. Each entry also
# covers its subdomains (e.g. "amazonaws.com" covers "s3.amazonaws.com").
_EXCLUDED_DOMAINS = (
    # Social media
    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
    'youtube.com', 'tiktok.com', 'pinterest.com', 'snapchat.com',
    # CDNs and services
    'cloudflare.com', 'amazonaws.com', 'googleusercontent.com',
    'cloudfront.net', 'akamai.net', 'fastly.net',
    # Analytics and ads
    'google-analytics.com', 'googletagmanager.com', 'doubleclick.net',
    'facebook.net', 'googlesyndication.com', 'googleadservices.com',
    # Payment and services
    'paypal.com', 'stripe.com', 'shopify.com',
    # Generic services
    'google.com', 'bing.com', 'yahoo.com', 'wikipedia.org',
    'w3.org', 'schema.org', 'creativecommons.org',
    # Fonts and assets
    'fonts.googleapis.com', 'fonts.gstatic.com',
    # Maps
    'maps.google.com', 'openstreetmap.org',
)


def _is_host_within(host: str, base: str) -> bool:
    """Return True if *host* equals *base* or is a subdomain of it."""
    return host == base or host.endswith('.' + base)


def is_valid_competitor(url: str, source_domain: str) -> bool:
    """Check if URL is a valid competitor (not internal, not social, not CDN).

    Matching is done on whole domain labels, so ``plumbing.com`` is not
    mistaken for ``bing.com`` and ``ample.com`` is not mistaken for
    ``example.com``.

    Args:
        url: Link to classify.
        source_domain: Domain of the site being analysed. Links to it or to
            its subdomains are internal. An empty value disables the
            internal-link check instead of rejecting every link.

    Returns:
        True if the link points at an external host that is not in the
        exclusion list, False otherwise (including when no host can be
        extracted from ``url``).
    """
    domain = extract_domain(url)
    if not domain:
        return False

    # Reduce to the bare host in case the extracted value still carries a
    # path, userinfo or port, so the label-based comparisons below hold.
    host = re.split(r'[/?#]', domain, maxsplit=1)[0]
    host = host.rsplit('@', 1)[-1].split(':', 1)[0]
    if not host:
        return False

    # Same site (or one of its subdomains) = not a competitor.
    source = (source_domain or '').strip().lower()
    if source and _is_host_within(host, source):
        return False

    # Filter out common non-competitor domains.
    return not any(_is_host_within(host, blocked) for blocked in _EXCLUDED_DOMAINS)

def detect_competitors(pages: List[Dict], source_url: str, min_mentions: int = 1) -> List[Dict]:
    """
    Detect competitor domains from crawled pages with contextual snippets.

    Every link accepted by ``is_valid_competitor`` counts as one mention of
    its domain. When the domain string also occurs in the page text, up to
    100 characters on either side of the first occurrence's start are
    recorded as a context snippet.

    Args:
        pages: List of page objects with 'links' and 'text' fields. Missing
            or ``None`` values for either field are treated as empty.
        source_url: Source domain URL
        min_mentions: Minimum number of mentions to be considered competitor

    Returns:
        List of competitor dicts sorted by mentions (descending), each with:
            'domain': competitor domain,
            'mentions': number of links pointing at the domain,
            'sample_urls': the first 3 distinct links seen,
            'contexts': the first 5 distinct snippets seen.
    """
    source_domain = extract_domain(source_url)

    mention_counts = defaultdict(int)
    # Dicts serve as insertion-ordered sets: they deduplicate while keeping
    # first-seen order, so the samples returned are the same on every run
    # (slicing a set of strings is not, because of hash randomization).
    urls_by_domain = defaultdict(dict)
    snippets_by_domain = defaultdict(dict)

    for page in pages or []:
        links = page.get('links') or []
        page_text = page.get('text') or ''

        for link in links:
            if not is_valid_competitor(link, source_domain):
                continue

            domain = extract_domain(link)
            mention_counts[domain] += 1
            urls_by_domain[domain][link] = None

            # Simple text heuristic: take the surroundings of the first
            # place the domain is written out in the page text.
            idx = page_text.find(domain)
            if idx == -1:
                continue
            snippet = page_text[max(0, idx - 100):idx + 100].strip().replace('\n', ' ')
            if snippet:
                snippets_by_domain[domain][snippet] = None

    # Filter by minimum mentions
    competitors = [
        {
            'domain': domain,
            'mentions': count,
            'sample_urls': list(urls_by_domain[domain])[:3],
            'contexts': list(snippets_by_domain.get(domain, {}))[:5],
        }
        for domain, count in mention_counts.items()
        if count >= min_mentions
    ]

    # Sort by mentions (descending); the sort is stable, so ties keep
    # first-seen order.
    competitors.sort(key=lambda x: x['mentions'], reverse=True)

    return competitors

def analyze_competitor_keywords(competitor_domain: str, pages: List[Dict]) -> Dict:
    """
    Summarize where a competitor is linked from.

    A page is relevant when at least one of its links contains
    ``competitor_domain`` as a substring. The titles of relevant pages are
    used as context. This is a simplified version - no keyword extraction
    is performed yet.

    Args:
        competitor_domain: Domain to look for inside each page's links.
            An empty value matches nothing.
        pages: Page dicts with optional 'links' and 'title' fields.

    Returns:
        Dict with:
            'pages_mentioned': number of pages linking to the competitor,
            'contexts': up to 5 non-empty titles of those pages.
        When no page matches, 'pages_mentioned' is 0, 'contexts' is empty,
        and the legacy 'keywords' / 'context' keys (both empty lists) are
        also present for callers that relied on the old empty result.
    """
    def empty_result() -> Dict:
        # Carries both the regular keys and the legacy ones so either
        # access pattern works on an empty result.
        return {'pages_mentioned': 0, 'contexts': [], 'keywords': [], 'context': []}

    # '' is a substring of every link and would match all pages.
    if not competitor_domain:
        return empty_result()

    # Find pages that mention this competitor
    relevant_pages = [
        page for page in pages or []
        if any(
            isinstance(link, str) and competitor_domain in link
            for link in (page.get('links') or [])
        )
    ]

    if not relevant_pages:
        return empty_result()

    # Page titles stand in for the text around the competitor mention.
    titles = [page.get('title', '') for page in relevant_pages]
    contexts = [title for title in titles if title]

    return {
        'pages_mentioned': len(relevant_pages),
        'contexts': contexts[:5]
    }

def format_competitor_report(competitors: List[Dict], source_url: str) -> str:
    """Render the competitor list as a plain-text report.

    Args:
        competitors: Competitor dicts with 'domain', 'mentions' and
            'sample_urls' keys; only the first 10 are listed.
        source_url: URL of the analysed site; its domain is shown in the
            report header.

    Returns:
        The report as a single newline-joined string. When ``competitors``
        is empty the report lists possible reasons instead of a table.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    report = [
        heavy_rule,
        "COMPETITOR ANALYSIS REPORT",
        heavy_rule,
        f"\n🎯 Source Domain: {extract_domain(source_url)}",
        f"📊 Competitors Found: {len(competitors)}",
    ]

    if not competitors:
        report.extend([
            "\n❌ No competitors detected",
            "\nPossible reasons:",
            "  • Page has no external links",
            "  • All external links are to social media/CDNs",
            "  • Minimum mention threshold not met",
        ])
        return "\n".join(report)

    report.extend([
        "\n🏆 TOP COMPETITORS",
        light_rule,
        f"{'Domain':<40} {'Mentions':<10} {'Sample URL'}",
        light_rule,
    ])

    for entry in competitors[:10]:
        urls = entry['sample_urls']
        shown = urls[0] if urls else 'N/A'
        # Keep the sample column narrow: long URLs are cut to 32 chars + "...".
        if len(shown) > 35:
            shown = shown[:32] + '...'
        report.append(f"{entry['domain']:<40} {entry['mentions']:<10} {shown}")

    report.append("\n" + heavy_rule)
    return "\n".join(report)

def get_competitor_summary(competitors: List[Dict]) -> Dict:
    """Get summary statistics for competitors.

    Args:
        competitors: Competitor dicts with 'domain' and 'mentions' keys.
            The list does not need to be sorted.

    Returns:
        Dict with the same four keys whether or not the list is empty:
            'total': number of competitors,
            'avg_mentions': mean mentions rounded to 1 decimal (0 if empty),
            'top_competitor': domain with the most mentions (None if empty),
            'top_mentions': that domain's mention count (0 if empty).
    """
    if not competitors:
        return {
            'total': 0,
            'avg_mentions': 0,
            'top_competitor': None,
            'top_mentions': 0
        }

    total = len(competitors)
    avg_mentions = sum(c['mentions'] for c in competitors) / total
    # max() returns the first entry on ties, so for a list already sorted
    # by mentions (descending) this is still competitors[0].
    top_competitor = max(competitors, key=lambda c: c['mentions'])

    return {
        'total': total,
        'avg_mentions': round(avg_mentions, 1),
        'top_competitor': top_competitor['domain'],
        'top_mentions': top_competitor['mentions']
    }