Spaces:

sammoftah
/

code-search-engine

Running

File size: 10,513 Bytes

a6356f4

import gradio as gr
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
from pygments import highlight
from pygments.lexers import get_lexer_by_name, guess_lexer
from pygments.formatters import HtmlFormatter
import re
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from shared.components import create_method_panel, create_premium_hero

# Load code-specific model
# Instantiated once at import time and reused for both corpus and query encoding.
# NOTE(review): 'microsoft/codebert-base' is presumably a plain transformer
# checkpoint rather than a packaged sentence-transformers model — confirm how
# SentenceTransformer pools its token outputs.
embedder = SentenceTransformer('microsoft/codebert-base')

# Global storage
# Written by load_code_dataset(); read by search_code() and perform_code_search().
dataset_sample = None  # dataset object returned by load_dataset(); None until loaded
embeddings = None  # result of embedder.encode() over code_samples, same order; None until loaded
code_samples = []  # list of dicts: code, language, size, max_stars_repo_name, max_stars_count, license

def load_code_dataset(progress=gr.Progress()):
    """Load a sample of The Stack dataset and embed it for search.

    Streams the Python subset of ``bigcode/the-stack-smol``, inspects at most
    the first 500 records, keeps up to 300 files whose length is between 50
    and 5000 characters, and encodes the first 512 characters of each file
    with the module-level ``embedder``.

    The module globals ``code_samples`` and ``embeddings`` are replaced
    together, and only after both have been built successfully, so a failed
    (re)load never leaves them out of sync with each other.

    Args:
        progress: Gradio progress tracker used to report loading stages.

    Returns:
        A human-readable status string: a success summary, or an error
        message if loading or embedding failed.
    """
    global dataset_sample, embeddings, code_samples

    progress(0, desc="Loading The Stack dataset...")
    try:
        # Load Python subset (smaller and more accessible)
        stream = load_dataset(
            "bigcode/the-stack-smol",
            data_dir="data/python",
            split="train",
            streaming=True
        )

        # Take first 500 samples
        samples = []
        progress(0.3, desc="Sampling code repositories...")
        for i, item in enumerate(stream):
            if i >= 500:
                break

            # `or` (rather than a .get default) also covers keys that are
            # present but null in the record.
            code = item.get('content') or ''
            if len(code) < 50 or len(code) > 5000:  # Filter very short/long
                continue

            # NOTE(review): some Stack variants appear to expose
            # `repository_name` / `licenses` instead of the `max_stars_*`
            # fields — confirm against the dataset card. The extra lookups
            # are harmless when those keys are absent.
            licenses = (
                item.get('max_stars_repo_licenses')
                or item.get('licenses')
                or ['unknown']
            )
            repo_name = (
                item.get('max_stars_repo_name')
                or item.get('repository_name')
                or 'unknown'
            )

            samples.append({
                'code': code,
                'language': 'python',
                'size': len(code),
                'max_stars_repo_name': repo_name,
                # A null star count would break both the average below and
                # the `>= min_stars` filter at search time.
                'max_stars_count': item.get('max_stars_count') or 0,
                'license': licenses[0],
            })

            if len(samples) >= 300:
                break

        if not samples:
            return "❌ Error: no code samples passed the size filter"

        progress(0.7, desc="Creating code embeddings...")
        code_texts = [c['code'][:512] for c in samples]  # Use first 512 chars
        new_embeddings = embedder.encode(code_texts, show_progress_bar=False)

        # Commit all globals at once, only after every step succeeded.
        dataset_sample = stream
        code_samples = samples
        embeddings = new_embeddings

        progress(1.0, desc="Ready!")
        avg_stars = np.mean([c['max_stars_count'] for c in samples])
        return f"✅ Loaded {len(samples)} code samples (avg stars: {avg_stars:.0f})"

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"❌ Error: {str(e)}\nNote: loading the dataset requires internet access"

# Matches a `def` / `async def` / `class` statement at the start of a line
# (after optional indentation). Group 1 is a function name, group 2 a class name.
_DEFINITION_RE = re.compile(
    r'^[ \t]*(?:(?:async[ \t]+)?def[ \t]+(\w+)[ \t]*\(|class[ \t]+(\w+)[ \t]*[:\(])',
    re.MULTILINE,
)


def extract_function_name(code):
    """Extract main function/class name from code.

    Returns the name of the first ``def``, ``async def`` or ``class``
    statement found at the start of a line. Anchoring to the line start keeps
    the search from matching inside comments or longer words (for example
    ``# undef foo(``), and taking whichever definition comes first means a
    class is reported by its own name rather than by its first method.

    Args:
        code: Source text to inspect. ``None`` is treated as empty.

    Returns:
        The definition name, or ``"code snippet"`` when none is found.
    """
    match = _DEFINITION_RE.search(code or "")
    if match is None:
        return "code snippet"
    # Exactly one of the two groups is populated by the alternation.
    return match.group(1) or match.group(2)

def syntax_highlight_code(code, language='python'):
    """Apply syntax highlighting to code.

    Args:
        code: Source text to render.
        language: Pygments lexer alias used to pick the lexer.

    Returns:
        An HTML fragment. On success this is the Pygments output with inline
        monokai styles; if highlighting fails for any reason (for example an
        unknown lexer alias) it is the HTML-escaped code wrapped in
        ``<pre><code>``.
    """
    from html import escape

    try:
        lexer = get_lexer_by_name(language)
        formatter = HtmlFormatter(style='monokai', noclasses=True)
        return highlight(code, lexer, formatter)
    except Exception:
        # Best-effort fallback: still show the code, but escape it so that
        # characters such as `<` and `&` cannot be interpreted as markup.
        return f"<pre><code>{escape(str(code))}</code></pre>"

def search_code(query, language='python', min_stars=0, top_k=5):
    """Search for code samples.

    Ranks the loaded samples against ``query`` by cosine similarity between
    the query embedding and each sample embedding.

    Args:
        query: Natural-language (or code) search text.
        language: Only samples whose ``'language'`` equals this are considered.
        min_stars: Minimum ``'max_stars_count'`` a sample must have. If no
            sample of the requested language meets it, the star filter is
            dropped (the language filter is kept).
        top_k: Maximum number of results; coerced to ``int``. Values below 1
            yield an empty list.

    Returns:
        A list of at most ``top_k`` copies of sample dicts, best match first,
        each with an added ``'similarity'`` float. Empty if nothing is loaded
        or no sample matches the language.
    """
    if embeddings is None or not code_samples:
        return []

    # Slider values may arrive as floats; slicing needs an int. Guarding
    # non-positive values also avoids `[-0:]`, which would select everything.
    top_k = int(top_k)
    if top_k <= 0:
        return []

    # Filter by language and stars
    language_indices = [
        i for i, sample in enumerate(code_samples)
        if sample['language'] == language
    ]
    indices = [
        i for i in language_indices
        # `or 0` tolerates a missing or null star count.
        if (code_samples[i].get('max_stars_count') or 0) >= min_stars
    ]

    if not indices:
        # Fallback: remove star filter
        indices = language_indices

    if not indices:
        return []

    candidate_vectors = np.asarray(embeddings)[indices]

    # Search
    query_vector = np.asarray(embedder.encode([query]))[0]

    # Cosine similarity: normalise by vector lengths so that long vectors do
    # not outrank better-aligned ones. Zero-length vectors score 0.
    dot_products = candidate_vectors @ query_vector
    norms = np.linalg.norm(candidate_vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = dot_products / np.where(norms > 0, norms, 1.0)

    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Map back to original samples
    results = []
    for idx in top_indices:
        sample = code_samples[indices[idx]].copy()
        sample['similarity'] = float(similarities[idx])
        results.append(sample)

    return results

def format_code_results(results, query):
    """Format code search results.

    Args:
        results: Sample dicts as returned by ``search_code`` (keys ``code``,
            ``language``, ``max_stars_repo_name``, ``max_stars_count``,
            ``license``, ``similarity``).
        query: The search text, echoed back in the header.

    Returns:
        An HTML string with one card per result (title, metadata, highlighted
        code truncated to 1000 characters, and a copy-to-clipboard button), or
        a short "no results" paragraph when ``results`` is empty.
    """
    import json
    from html import escape

    if not results:
        return "<p>No code samples found. Try adjusting filters or query.</p>"

    # Collect fragments and join once instead of repeated string concatenation.
    # All text that originates from the user or the dataset is HTML-escaped.
    parts = [
        "<h2>🔍 Code Search Results</h2>",
        f"<p><strong>Query:</strong> {escape(str(query))}</p>",
        f"<p><strong>Found:</strong> {len(results)} relevant code samples</p>",
        "<hr>",
    ]

    for i, result in enumerate(results, 1):
        full_code = result['code']
        title = escape(str(extract_function_name(full_code)))

        parts.append("<div style='margin: 20px 0; padding: 15px; background: #1e1e1e; border-radius: 8px;'>")
        parts.append(f"<h3 style='color: #fff;'>Result {i}: {title}</h3>")

        # Metadata
        parts.append("<p style='color: #888;'>")
        parts.append(f"<strong>Repo:</strong> {escape(str(result['max_stars_repo_name']))} | ")
        parts.append(f"<strong>Stars:</strong> ⭐ {escape(str(result['max_stars_count']))} | ")
        parts.append(f"<strong>License:</strong> {escape(str(result['license']))} | ")
        parts.append(f"<strong>Relevance:</strong> {result['similarity']:.3f}")
        parts.append("</p>")

        # Code
        parts.append(syntax_highlight_code(full_code[:1000], result['language']))  # Limit display length

        # Copy button (using JavaScript)
        # json.dumps yields a valid JS string literal (quotes, backslashes and
        # newlines escaped); escape(quote=True) then makes that literal safe
        # inside the double-quoted onclick attribute. The browser decodes the
        # entities before the script runs, so the full code is copied intact.
        js_literal = escape(json.dumps(full_code), quote=True)
        parts.append(f"""
        <button onclick="navigator.clipboard.writeText({js_literal});
                        this.innerText='Copied!';
                        setTimeout(() => this.innerText='Copy Code', 2000);"
                style="margin-top: 10px; padding: 8px 16px; background: #4CAF50; color: white;
                       border: none; border-radius: 4px; cursor: pointer;">
            Copy Code
        </button>
        """)

        parts.append("</div>")

    return "".join(parts)

def perform_code_search(query, language, min_stars, num_results, progress=gr.Progress()):
    """Perform code search.

    Gradio click handler: validates the inputs, runs ``search_code`` and
    renders the results.

    Args:
        query: Search text from the query box. Surrounding whitespace is
            ignored, so a blank or whitespace-only query is rejected.
        language: Selected language filter.
        min_stars: Minimum star count filter.
        num_results: Requested number of results; coerced to ``int``.
        progress: Gradio progress tracker used to report search stages.

    Returns:
        A ``(results_html, stats_markdown)`` tuple. When the query is blank or
        the dataset has not been loaded, the HTML is a short prompt and the
        stats string is empty.
    """
    query = (query or "").strip()
    if not query:
        return "<p>Please enter a search query</p>", ""

    if embeddings is None:
        return "<p>Please load the dataset first</p>", ""

    progress(0, desc="Searching code...")
    # Slider values may arrive as floats; the result count must be an int.
    results = search_code(query, language, min_stars, top_k=int(num_results))

    progress(0.7, desc="Formatting results...")
    formatted = format_code_results(results, query)

    progress(1.0, desc="Done!")

    # Stats
    stats = f"""
### 📊 Search Statistics

- **Total samples**: {len(code_samples)}
- **Results**: {len(results)}
- **Language**: {language}
- **Min stars**: {min_stars}
- **Model**: CodeBERT (Microsoft)

### 🧠 How CodeBERT Works

CodeBERT is trained on code and documentation:
- Understands programming patterns
- Maps code to natural language
- Trained on GitHub repos
- Supports multiple languages
"""

    return formatted, stats

# Gradio Interface
# Layout: hero and method panel on top, then a two-column row (controls on the
# left, results on the right), then explanatory Markdown. Event handlers are
# wired at the end of the block, once every component exists.
with gr.Blocks(title="Code Search Engine", theme=gr.themes.Soft()) as demo:
    # Header components come from the project's shared.components module.
    create_premium_hero(
        "Semantic Code Search Engine",
        "Search code with natural language using code embeddings, dataset sampling, and syntax-highlighted retrieval results.",
        "💻",
        badge="Code Intelligence",
        highlights=["CodeBERT", "The Stack sample", "Semantic retrieval"],
    )
    create_method_panel({
        "Technique": "Encode code snippets into vectors and rank them against natural-language queries.",
        "What it proves": "You can adapt embedding search beyond documents into developer tooling.",
        "HF capability": "Combines Hub datasets with transformer embeddings in an interactive Space.",
    })

    with gr.Row():
        # Left column: dataset loading and search controls.
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Load Dataset")
            load_btn = gr.Button("Load Code Dataset", variant="primary")
            load_status = gr.Textbox(label="Status", interactive=False)

            gr.Markdown("### Step 2: Search Code")
            query_input = gr.Textbox(
                label="What are you looking for?",
                placeholder="e.g., binary search implementation",
                lines=2
            )

            # Only 'python' is offered; load_code_dataset tags every sample as 'python'.
            language = gr.Dropdown(
                choices=['python'],
                value='python',
                label="Language (more coming soon)"
            )

            min_stars = gr.Slider(
                minimum=0,
                maximum=1000,
                value=0,
                step=10,
                label="Minimum GitHub Stars"
            )

            num_results = gr.Slider(
                minimum=3,
                maximum=10,
                value=5,
                step=1,
                label="Number of Results"
            )

            search_btn = gr.Button("Search Code", variant="primary")

            gr.Markdown("""
            ### 💡 Example Searches:
            - "binary search tree"
            - "web scraper with requests"
            - "recursive fibonacci"
            - "API client with authentication"
            - "data validation decorator"
            """)

        # Right column: rendered results plus a collapsible statistics panel.
        with gr.Column(scale=2):
            results_output = gr.HTML(label="Code Results")

            with gr.Accordion("📊 Statistics & Info", open=False):
                stats_output = gr.Markdown()

    gr.Markdown("""
    ### 🎯 Why Semantic Code Search?

    **Traditional search** (GitHub, Google):
    - Keyword matching only
    - Must know exact function names
    - Hard to find by functionality

    **Semantic search** (this tool):
    - Search by what code does, not what it's called
    - "sort a list" finds quicksort, mergesort, etc.
    - Understands programming concepts

    ### 🔧 Features:

    - **Syntax highlighting** with Pygments
    - **Copy to clipboard** button
    - **Filter by stars** (code quality proxy)
    - **License information** (know before you use)
    - **CodeBERT embeddings** (code + NL understanding)

    Perfect for developers learning, debugging, or finding code examples!
    """)

    # The load button takes no inputs; the returned status string fills the status box.
    load_btn.click(
        load_code_dataset,
        outputs=[load_status]
    )

    # The search handler returns (html, markdown), mapped to the two outputs in order.
    search_btn.click(
        perform_code_search,
        inputs=[query_input, language, min_stars, num_results],
        outputs=[results_output, stats_output]
    )

# Start the Gradio server only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()