Spaces:
Build error
Build error
| import gradio as gr | |
| import asyncio | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter | |
| from langchain_openai import OpenAIEmbeddings, ChatOpenAI | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from typing import List, Dict, Tuple | |
| import pandas as pd | |
| from dataclasses import dataclass | |
| import json | |
| import time | |
| import warnings | |
| import os | |
| import re | |
| import tempfile | |
| # Trafilatura imports | |
| from trafilatura import fetch_url, extract, bare_extraction | |
| from trafilatura.downloads import fetch_url as trafilatura_fetch | |
# Suppress every warning category for the whole process.
warnings.filterwarnings(action='ignore')

# Path of the most recently exported vector-data CSV; stays None until
# prepare_download() has written a file.
latest_vector_data = None
def prepare_download(vector_df):
    """Write the vector data to a temporary CSV file and remember its path.

    Args:
        vector_df: DataFrame holding one row per content chunk, or None.

    Returns:
        The path of the CSV file that was written, or None when ``vector_df``
        is None or empty (nothing is written in that case).

    Side effects:
        Stores the returned path in the module-level ``latest_vector_data``.
        The file is created with ``delete=False``; removing it is left to the
        caller.
    """
    global latest_vector_data

    if vector_df is None or vector_df.empty:
        return None

    # Write through the open handle and let the context manager close it.
    # Reopening the file by name while the handle is still open fails on
    # Windows, and never closing the handle leaks a file descriptor per call.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, newline='', encoding='utf-8'
    ) as temp_file:
        vector_df.to_csv(temp_file, index=False)

    latest_vector_data = temp_file.name
    return temp_file.name
def download_vector_data():
    """Return the path of the last prepared vector-data file.

    Returns:
        The value of the module-level ``latest_vector_data`` when it is
        truthy, otherwise None.
    """
    # A read-only access needs no ``global`` declaration; any falsy value
    # (None, empty string) is reported as None.
    return latest_vector_data or None
@dataclass
class ContentChunk:
    """One piece of page text together with its relevance score.

    The ``@dataclass`` decorator supplies the keyword-argument constructor
    used when chunks are created; without it ``ContentChunk(content=...)``
    raises ``TypeError``.
    """

    content: str  # chunk text
    url: str  # page the chunk was taken from
    page_type: str  # 'client' or 'competitor'
    chunk_index: int  # position of the chunk within its page
    chunk_type: str  # 'header_section', 'paragraph', or 'header_subsection'
    header_info: Dict = None  # header level/text metadata; None when not supplied
    similarity_score: float = 0.0  # filled in once similarities are calculated
@dataclass
class PageAnalysis:
    """Aggregated similarity statistics for a single page.

    The ``@dataclass`` decorator supplies the keyword-argument constructor
    used when analyses are created; without it ``PageAnalysis(url=...)``
    raises ``TypeError``.
    """

    url: str  # page the statistics belong to
    page_type: str  # page type shared by the page's chunks
    total_chunks: int  # number of chunks found on the page
    avg_similarity: float  # mean similarity score over the page's chunks
    max_similarity: float  # highest similarity score on the page
    top_chunks: List["ContentChunk"]  # best-scoring chunks of the page
class SEOContentAnalyzer:
    """Crawl pages, chunk their content and score it against a keyword.

    Typical call order: ``crawl_all_urls`` -> ``chunk_content`` ->
    ``calculate_similarities`` -> ``analyze_pages`` -> ``generate_report``.
    """

    # A header section longer than this many characters is split further.
    _MAX_SECTION_CHARS = 800
    # Chunks whose stripped text is not longer than this are discarded.
    _MIN_CHUNK_CHARS = 50

    # <head rend="hN">...</head> elements that carry an explicit header level.
    _LEVELED_HEAD_RE = re.compile(
        r'<head\b[^>]*\brend="h([1-6])"[^>]*>(.*?)</head>', re.DOTALL
    )
    # Any remaining <head ...>...</head> element (no usable level).
    _PLAIN_HEAD_RE = re.compile(r'<head\b[^>]*>(.*?)</head>', re.DOTALL)

    # Static instruction part of the report prompt.
    _REPORT_INSTRUCTIONS = (
        "1. Top-performing page for this keyword: Identify the strongest-ranking page (ours or a competitor’s), including its URL and why it performs well.",
        "2. Best-performing sections of content: Highlight the specific sections or content chunks (with text snippets and scores) that perform best for the keyword.",
        "3. What our client’s page does well: Summarize the client page’s strengths compared to competitors.",
        "4. What our client’s page is missing: Identify gaps or underdeveloped areas in the client’s content compared to competitors.",
        "5. Specific, actionable recommendations:",
        "Break this section into clearly labeled subcategories, such as:",
        "• Content Expansion: Missing sections, new topics, or deeper explanations.",
        "• Content Enhancement: Improvements to clarity, examples, visuals, or formatting.",
        "For each recommendation, include:",
        "• A clear title.",
        "• A brief explanation of why it matters.",
        "• A reference to the competitor content that demonstrates the point, including:",
        "• URL",
        "• Score",
        "• Content chunk or snippet",
        "Output format:",
        "• Use clear section headings and bullet points for readability.",
        "• Include competitor references (URL, score, snippet) wherever applicable to support recommendations.",
        "• Focus only on content-related improvements, not general SEO optimizations or monitoring advice.",
        "The goal is to help the client improve content relevance, depth, and authority for the target keyword — grounded in the analysis of vector embeddings and competitive content.",
    )

    def __init__(self, api_key: str):
        """Create the embedding model, the chat model and both text splitters.

        Args:
            api_key: OpenAI API key passed to the embedding and chat models.
        """
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=api_key
        )
        self.llm = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0.3,
            openai_api_key=api_key
        )
        # Header-based splitter (first level)
        self.html_splitter = HTMLHeaderTextSplitter(
            headers_to_split_on=[
                ("h1", "Header 1"),
                ("h2", "Header 2"),
                ("h3", "Header 3"),
                ("h4", "Header 4"),
                ("h5", "Header 5"),
                ("h6", "Header 6"),
            ]
        )
        # Paragraph-based splitter (second level)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=600,
            chunk_overlap=100,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.all_chunks = []
        self.keyword_embedding = None

    @staticmethod
    def _extract_title(metadata_result) -> str:
        """Return the page title from an attribute- or mapping-style result.

        Returns an empty string when the result is falsy or has no title.
        """
        if not metadata_result:
            return ''
        title = getattr(metadata_result, 'title', None)
        if not title and hasattr(metadata_result, 'get'):
            title = metadata_result.get('title', '')
        return title or ''

    @classmethod
    def _convert_xml_headers(cls, xml_content: str) -> str:
        """Rewrite ``<head>`` elements as matching ``<hN>...</hN>`` HTML headers.

        Elements with ``rend="h1"``..``rend="h6"`` keep their level; every
        other ``<head>`` element becomes an ``<h2>``. Opening and closing
        tags are rewritten together so they always agree.
        """
        converted = cls._LEVELED_HEAD_RE.sub(r'<h\1>\2</h\1>', xml_content)
        return cls._PLAIN_HEAD_RE.sub(r'<h2>\1</h2>', converted)

    @staticmethod
    def _text_to_html(text_content: str) -> str:
        """Build a minimal HTML document from plain text.

        Each non-empty line becomes a ``<p>``, except lines that look like a
        header (6-99 characters, no trailing ``.``/``,``/``;`` and either
        title case, upper case or at most eight words), which become
        ``<h3>``. Line text is HTML-escaped so markup characters in the
        content cannot alter the generated structure.
        """
        from html import escape

        html_lines = []
        for raw_line in text_content.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            header_shaped = 5 < len(line) < 100 and not line.endswith(('.', ',', ';'))
            header_styled = line.istitle() or line.isupper() or len(line.split()) <= 8
            tag = 'h3' if header_shaped and header_styled else 'p'
            html_lines.append(f"<{tag}>{escape(line, quote=False)}</{tag}>")
        return f"<div>{''.join(html_lines)}</div>"

    async def fetch_and_clean_html(self, url: str) -> Dict:
        """Fetch a URL with Trafilatura and extract its main content.

        NOTE: the Trafilatura calls are made directly inside this coroutine,
        without awaiting.

        Args:
            url: Address of the page to download.

        Returns:
            On success a dict with ``url``, ``title``, ``text``, ``html``
            (markup prepared for header splitting), ``success=True`` and
            ``word_count``. On failure a dict with ``url``,
            ``success=False`` and ``error``. Exceptions are reported through
            the failure dict rather than raised.
        """
        try:
            downloaded = trafilatura_fetch(url)
            if not downloaded:
                return {'url': url, 'success': False, 'error': 'Failed to download'}

            text_content = extract(downloaded, include_comments=False, include_tables=True)
            if not text_content:
                return {'url': url, 'success': False, 'error': 'No content extracted'}

            # Second pass with metadata, used only for the title.
            metadata_result = bare_extraction(downloaded, include_comments=False, include_tables=True)
            title = self._extract_title(metadata_result)

            # Structured output keeps header markers needed for header splitting.
            xml_content = extract(downloaded, output_format='xml', include_comments=False, include_tables=True)
            if xml_content and len(xml_content) > 100:
                html_for_splitting = f"<div>{self._convert_xml_headers(xml_content)}</div>"
            else:
                # Too little structured output: guess headers from the plain text.
                html_for_splitting = self._text_to_html(text_content)

            return {
                'url': url,
                'title': title,
                'text': text_content,
                'html': html_for_splitting,
                'success': True,
                'word_count': len(text_content.split())
            }
        except Exception as e:
            # Boundary handler: a page that cannot be processed is reported
            # as a failed URL instead of aborting the whole crawl.
            return {'url': url, 'success': False, 'error': str(e)}

    async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
        """Fetch the client page and every competitor page, one after another.

        Args:
            client_url: URL of the client's page.
            competitor_urls: URLs of the competitor pages.

        Returns:
            Dict with ``client`` (fetch result or None), ``competitors``
            (list of successful fetch results, in input order) and
            ``failed_urls`` (URLs whose fetch did not succeed).
        """
        crawl_data = {
            'client': None,
            'competitors': [],
            'failed_urls': []
        }
        # The client URL goes first so position 0 identifies it.
        for position, url in enumerate([client_url, *competitor_urls]):
            result = await self.fetch_and_clean_html(url)
            if not result.get('success'):
                crawl_data['failed_urls'].append(result['url'])
            elif position == 0:
                crawl_data['client'] = result
            else:
                crawl_data['competitors'].append(result)
        return crawl_data

    def chunk_content(self, crawl_data: Dict) -> List["ContentChunk"]:
        """Chunk the client page and all competitor pages.

        Args:
            crawl_data: Result of ``crawl_all_urls``.

        Returns:
            All chunks, client chunks first. The list is also stored in
            ``self.all_chunks``.
        """
        all_chunks = []
        if crawl_data['client']:
            all_chunks.extend(self._chunk_single_page(crawl_data['client'], 'client'))
        for comp_data in crawl_data['competitors']:
            all_chunks.extend(self._chunk_single_page(comp_data, 'competitor'))
        self.all_chunks = all_chunks
        return all_chunks

    def _chunk_single_page(self, page_data: Dict, page_type: str) -> List["ContentChunk"]:
        """Chunk one page by headers, falling back to paragraph splitting.

        The paragraph fallback is used when the page has no ``html`` entry,
        when header splitting yields at most one section, when header
        splitting raises, or when it produces no usable chunk. The fallback
        always starts from an empty list so that header chunks collected
        before a failure are not mixed with paragraph chunks of the same
        text.
        """
        try:
            chunks = self._header_chunks(page_data, page_type)
        except Exception:
            # Best effort: any problem with header splitting degrades to
            # plain paragraph chunks for this page.
            chunks = []
        if not chunks:
            self._add_paragraph_chunks(page_data, page_type, chunks, 0)
        return chunks

    def _header_chunks(self, page_data: Dict, page_type: str) -> List["ContentChunk"]:
        """Return header-based chunks, or an empty list if headers are unusable."""
        if 'html' not in page_data:
            return []
        header_splits = self.html_splitter.split_text(page_data['html'])
        if not header_splits or len(header_splits) <= 1:
            return []

        chunks = []
        for split in header_splits:
            header_info = split.metadata if hasattr(split, 'metadata') else {}
            content = split.page_content if hasattr(split, 'page_content') else str(split)
            if len(content) > self._MAX_SECTION_CHARS:
                # Large header section: split it further by paragraphs.
                pieces = self.text_splitter.split_text(content)
                chunk_type = 'header_subsection'
            else:
                pieces = [content]
                chunk_type = 'header_section'
            for piece in pieces:
                text = piece.strip()
                if len(text) > self._MIN_CHUNK_CHARS:
                    chunks.append(ContentChunk(
                        content=text,
                        url=page_data['url'],
                        page_type=page_type,
                        chunk_index=len(chunks),
                        chunk_type=chunk_type,
                        header_info=header_info
                    ))
        return chunks

    def _add_paragraph_chunks(self, page_data: Dict, page_type: str, chunks: List, start_index: int):
        """Append paragraph-level chunks of ``page_data['text']`` to ``chunks``.

        Args:
            page_data: Page dict providing ``text`` and ``url``.
            page_type: Page type recorded on every chunk.
            chunks: List that is extended in place.
            start_index: ``chunk_index`` given to the first appended chunk.
        """
        chunk_index = start_index
        for chunk_text in self.text_splitter.split_text(page_data['text']):
            text = chunk_text.strip()
            if len(text) > self._MIN_CHUNK_CHARS:
                chunks.append(ContentChunk(
                    content=text,
                    url=page_data['url'],
                    page_type=page_type,
                    chunk_index=chunk_index,
                    chunk_type='paragraph',
                    header_info={}
                ))
                chunk_index += 1

    async def calculate_similarities(self, keyword: str) -> List["ContentChunk"]:
        """Score every stored chunk against the keyword.

        Args:
            keyword: Target keyword to embed and compare against.

        Returns:
            The chunks of ``self.all_chunks`` sorted by descending
            ``similarity_score``; each chunk's score is updated in place.

        Raises:
            ValueError: If ``self.all_chunks`` is empty.
        """
        if not self.all_chunks:
            raise ValueError("No chunks available. Run chunk_content first.")

        self.keyword_embedding = await self.embeddings.aembed_query(keyword)
        chunk_texts = [chunk.content for chunk in self.all_chunks]
        chunk_embeddings = await self.embeddings.aembed_documents(chunk_texts)

        # One keyword row against all chunk rows -> take the single result row.
        similarities = cosine_similarity([self.keyword_embedding], chunk_embeddings)[0]
        for chunk, score in zip(self.all_chunks, similarities):
            chunk.similarity_score = float(score)

        return sorted(self.all_chunks, key=lambda c: c.similarity_score, reverse=True)

    def analyze_pages(self, sorted_chunks: List["ContentChunk"]) -> Dict[str, "PageAnalysis"]:
        """Aggregate chunk scores per URL.

        Args:
            sorted_chunks: Scored chunks.

        Returns:
            Mapping of URL to its ``PageAnalysis``; the page type is taken
            from the first chunk seen for the URL and ``top_chunks`` holds
            the three best-scoring chunks of the page.
        """
        url_groups = {}
        for chunk in sorted_chunks:
            url_groups.setdefault(chunk.url, []).append(chunk)

        page_analyses = {}
        for url, chunks in url_groups.items():
            similarities = [chunk.similarity_score for chunk in chunks]
            page_analyses[url] = PageAnalysis(
                url=url,
                page_type=chunks[0].page_type,
                total_chunks=len(chunks),
                avg_similarity=float(np.mean(similarities)),
                max_similarity=float(np.max(similarities)),
                top_chunks=sorted(chunks, key=lambda c: c.similarity_score, reverse=True)[:3]
            )
        return page_analyses

    @classmethod
    def _build_report_prompt(cls, keyword, client_analysis, competitor_analyses,
                             top_chunks, client_top_chunks, competitor_top_chunks) -> str:
        """Assemble the LLM prompt text from the prepared analysis data."""
        if client_analysis:
            client_url = client_analysis.url
            client_chunks = client_analysis.total_chunks
            client_avg = f"{client_analysis.avg_similarity:.4f}"
            client_max = f"{client_analysis.max_similarity:.4f}"
        else:
            client_url, client_chunks = 'No client data', 0
            client_avg = client_max = "0.0000"

        # Each listing is joined first so an empty listing still occupies
        # one (empty) line in the prompt.
        client_sections = "\n".join(
            f"Score {c.similarity_score:.4f}: {c.content[:200]}..." for c in client_top_chunks
        )
        competitor_pages = "\n".join(
            f"URL: {p.url}, Avg: {p.avg_similarity:.4f}, Max: {p.max_similarity:.4f}"
            for p in competitor_analyses
        )
        competitor_sections = "\n".join(
            f"Score {c.similarity_score:.4f} ({c.url}): {c.content[:200]}..."
            for c in competitor_top_chunks
        )
        overall_sections = "\n".join(
            f"Score {c.similarity_score:.4f} ({c.page_type}): {c.content[:150]}..."
            for c in top_chunks
        )

        lines = [
            "",
            f'As an SEO expert, analyze this content relevance data for the keyword "{keyword}" and provide actionable insights.',
            "CLIENT PAGE PERFORMANCE:",
            f"URL: {client_url}",
            f"Total Chunks: {client_chunks}",
            f"Average Similarity: {client_avg}",
            f"Max Similarity: {client_max}",
            "TOP CLIENT CONTENT SECTIONS:",
            client_sections,
            "COMPETITOR PERFORMANCE:",
            competitor_pages,
            "TOP COMPETITOR CONTENT SECTIONS:",
            competitor_sections,
            "OVERALL TOP PERFORMING CONTENT:",
            overall_sections,
            *cls._REPORT_INSTRUCTIONS,
            "",
        ]
        return "\n".join(lines)

    async def generate_report(self, keyword: str, page_analyses: Dict[str, "PageAnalysis"],
                              sorted_chunks: List["ContentChunk"]) -> str:
        """Ask the chat model for an SEO report based on the analysis data.

        Args:
            keyword: Target keyword of the analysis.
            page_analyses: Per-URL analyses from ``analyze_pages``.
            sorted_chunks: Chunks sorted by descending similarity.

        Returns:
            The ``content`` of the chat model's response.
        """
        analyses = list(page_analyses.values())
        client_analysis = next((p for p in analyses if p.page_type == 'client'), None)
        competitor_analyses = [p for p in analyses if p.page_type == 'competitor']

        prompt = self._build_report_prompt(
            keyword,
            client_analysis,
            competitor_analyses,
            top_chunks=sorted_chunks[:5],
            client_top_chunks=[c for c in sorted_chunks if c.page_type == 'client'][:3],
            competitor_top_chunks=[c for c in sorted_chunks if c.page_type == 'competitor'][:3],
        )
        response = await self.llm.ainvoke(prompt)
        return response.content
| # Gradio Interface Functions | |
async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competitor_urls_text: str, progress=gr.Progress()):
    """Run the complete SEO analysis and shape the results for the UI.

    Args:
        api_key: OpenAI API key.
        keyword: Target keyword.
        client_url: URL of the client's page.
        competitor_urls_text: Competitor URLs, one per line.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A 4-tuple ``(report, summary_df, top_content_df, vector_df)``. On a
        validation or runtime failure the first element is an error message
        and the three DataFrames are empty.

    Note:
        This function does not write the download file; the caller passes
        the returned ``vector_df`` to ``prepare_download`` when needed, so
        only one temporary CSV is created per analysis.
    """
    # Empty frames returned alongside every error message.
    empty_summary_df = pd.DataFrame(columns=["URL", "Type", "Total Chunks", "Avg Similarity", "Max Similarity"])
    empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])

    def failure(message):
        """Build the 4-tuple returned for any failed analysis."""
        return message, empty_summary_df, empty_content_df, empty_summary_df

    def preview(text, limit):
        """Return ``text`` cut to ``limit`` characters, marked with an ellipsis."""
        return text[:limit] + "..." if len(text) > limit else text

    if not api_key:
        return failure("❌ Please provide your OpenAI API key")
    if not keyword or not client_url:
        return failure("❌ Please provide both keyword and client URL")

    # Tolerate a missing value as well as blank lines and stray whitespace.
    competitor_urls = [line.strip() for line in (competitor_urls_text or '').splitlines() if line.strip()]
    if not competitor_urls:
        return failure("❌ Please provide at least one competitor URL")

    try:
        progress(0.1, desc="Initializing analyzer with Trafilatura...")
        analyzer = SEOContentAnalyzer(api_key)

        progress(0.2, desc="Crawling websites with enhanced extraction...")
        crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)

        if not crawl_data['client'] and not crawl_data['competitors']:
            failed_urls = ', '.join(crawl_data['failed_urls'][:3])
            return failure(f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...")
        if not crawl_data['client']:
            return failure("❌ Failed to crawl client URL")
        if not crawl_data['competitors']:
            return failure("❌ Failed to crawl any competitor URLs")

        progress(0.4, desc="Processing content with intelligent chunking...")
        chunks = analyzer.chunk_content(crawl_data)
        if not chunks:
            return failure("❌ No content chunks were created from the crawled pages")

        progress(0.6, desc="Calculating semantic similarities...")
        sorted_chunks = await analyzer.calculate_similarities(keyword)

        progress(0.8, desc="Analyzing page performance...")
        page_analyses = analyzer.analyze_pages(sorted_chunks)

        progress(0.9, desc="Generating AI-powered SEO report...")
        report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)

        # One summary row per analysed page.
        summary_df = pd.DataFrame([
            {
                'URL': url,
                'Type': analysis.page_type.title(),
                'Total Chunks': analysis.total_chunks,
                'Avg Similarity': f"{analysis.avg_similarity:.4f}",
                'Max Similarity': f"{analysis.max_similarity:.4f}"
            }
            for url, analysis in page_analyses.items()
        ])

        # The ten best-scoring chunks across all pages.
        top_content_df = pd.DataFrame([
            {
                'Rank': rank,
                'Type': chunk.page_type.title(),
                'Score': f"{chunk.similarity_score:.4f}",
                'Content Preview': preview(chunk.content, 150),
                'URL': chunk.url
            }
            for rank, chunk in enumerate(sorted_chunks[:10], 1)
        ])

        # Every chunk with its full text, for the downloadable export.
        vector_df = pd.DataFrame([
            {
                'url': chunk.url,
                'page_type': chunk.page_type,
                'chunk_index': chunk.chunk_index,
                'chunk_type': chunk.chunk_type,
                'header_info': str(chunk.header_info) if chunk.header_info else '',
                'similarity_score': chunk.similarity_score,
                'content_preview': preview(chunk.content, 100),
                'content_length': len(chunk.content),
                'full_content': chunk.content
            }
            for chunk in sorted_chunks
        ])

        progress(1.0, desc="Analysis complete!")
        return report, summary_df, top_content_df, vector_df
    except Exception as e:
        # UI boundary: show the problem as a message instead of raising.
        return failure(f"❌ Error during analysis: {str(e)}")
def sync_run_seo_analysis(*args):
    """Run ``run_seo_analysis`` to completion and return its result.

    All positional arguments are forwarded unchanged to the coroutine
    function, which is executed with ``asyncio.run``.
    """
    analysis = run_seo_analysis(*args)
    return asyncio.run(analysis)
def handle_analysis_and_download(api_key, keyword, client_url, competitor_urls_text, progress=gr.Progress()):
    """Run the analysis and turn its vector data into a downloadable file.

    Returns:
        ``(report, summary_df, top_content_df, csv_path)`` where ``csv_path``
        is the file written by ``prepare_download``, or None when the
        analysis did not yield a non-empty vector DataFrame.
    """
    outcome = sync_run_seo_analysis(api_key, keyword, client_url, competitor_urls_text, progress)

    csv_path = None
    # Only a 4-element result whose last item is a non-empty DataFrame
    # carries data worth exporting.
    if len(outcome) == 4:
        vector_df = outcome[3]
        if isinstance(vector_df, pd.DataFrame) and not vector_df.empty:
            csv_path = prepare_download(vector_df)

    return outcome[0], outcome[1], outcome[2], csv_path
| # Create Gradio Interface with Glass Theme | |
def create_interface():
    """Build the Gradio Blocks application for the SEO analysis.

    The layout has a configuration column (API key, keyword, client URL,
    competitor URLs, run button) and a results column with four tabs:
    report, page summary, top content and vector-data download. The run
    button is wired to ``handle_analysis_and_download``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # Local import keeps this function self-contained; dedent lets the
    # Markdown literals follow the code's indentation without passing the
    # leading whitespace on to the component.
    from textwrap import dedent

    summary_columns = ["URL", "Type", "Total Chunks", "Avg Similarity", "Max Similarity"]
    content_columns = ["Rank", "Type", "Score", "Content Preview", "URL"]

    with gr.Blocks(
        title="SEO Content Gap Analysis",
        theme=gr.themes.Glass(
            primary_hue="blue",
            secondary_hue="slate",
            neutral_hue="zinc",
            font="Inter"
        )
    ) as demo:
        gr.Markdown(dedent("""
            # 🔍 SEO Content Relevance Analysis
            Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
            **Enhanced with Trafilatura** for superior content extraction and intelligent header-based chunking.
        """))

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📝 Configuration")
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                    info="Your OpenAI API key for embeddings and analysis"
                )
                keyword = gr.Textbox(
                    label="Target Keyword",
                    placeholder="e.g., python web scraping",
                    info="The keyword you want to optimize for"
                )
                client_url = gr.Textbox(
                    label="Your Page URL",
                    placeholder="https://yoursite.com/page",
                    info="The URL of your page to analyze"
                )
                competitor_urls = gr.Textbox(
                    label="Competitor URLs",
                    placeholder="https://competitor1.com/page\nhttps://competitor2.com/page",
                    lines=5,
                    info="One URL per line (2-5 competitors recommended)"
                )
                analyze_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")

            with gr.Column(scale=2):
                gr.Markdown("### 📊 Results")
                with gr.Tabs():
                    with gr.TabItem("📋 SEO Report"):
                        report_output = gr.Markdown(
                            label="AI-Generated SEO Analysis Report",
                            value="Click 'Run Analysis' to generate your comprehensive SEO report with actionable insights..."
                        )
                    with gr.TabItem("📈 Page Summary"):
                        summary_output = gr.Dataframe(
                            label="Page Performance Summary",
                            headers=summary_columns,
                            value=pd.DataFrame(columns=summary_columns)
                        )
                    with gr.TabItem("🎯 Top Content"):
                        top_content_output = gr.Dataframe(
                            label="Top Performing Content Sections",
                            headers=content_columns,
                            value=pd.DataFrame(columns=content_columns)
                        )
                    with gr.TabItem("📁 Vector Data"):
                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("### 📥 Download Complete Analysis Data")
                                gr.Markdown(dedent("""
                                    **Contains:**
                                    - All content chunks with similarity scores
                                    - Full content text for each chunk
                                    - Header information and chunk types
                                    - Perfect for further analysis in Excel/Python
                                """))
                                download_file = gr.File(
                                    label="Vector Data CSV (Generated after analysis)",
                                    interactive=False
                                )

        # Enhanced example section
        gr.Markdown(dedent("""
            ### 💡 Example Usage
            **Keyword:** `content marketing strategy`
            **Your URL:** `https://yoursite.com/content-marketing-guide`
            **Competitors:**
            ```
            https://hubspot.com/content-marketing
            https://contentmarketinginstitute.com/strategy
            https://neilpatel.com/blog/content-marketing-strategy
            ```
            ### ✨ What's New
            - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
            - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
            - **Improved Accuracy**: Better handling of complex page structures
            - **Glass Theme**: Modern, sleek interface design
        """))

        # Event handlers
        analyze_btn.click(
            fn=handle_analysis_and_download,
            inputs=[api_key, keyword, client_url, competitor_urls],
            outputs=[report_output, summary_output, top_content_output, download_file]
        )

        gr.Markdown(dedent("""
            ### ⚠️ Important Notes
            - Analysis may take 2-5 minutes depending on content size
            - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
            - Enhanced extraction works best with any type of web content
            - Trafilatura respects robots.txt and implements smart rate limiting
            - Glass theme provides modern, professional appearance
        """))

    return demo
| # Launch the app | |
if __name__ == "__main__":
    # Build the interface and start serving it only when this file is run
    # as a script, not when it is imported.
    demo = create_interface()
    demo.launch()