# Hugging Face Space source file (page-header residue kept as a comment):
# uploaded by sammoftah — commit a6356f4 (verified), "Deploy Code Search Engine".
import html
import json
import os
import re
import sys

import gradio as gr
import numpy as np
from datasets import load_dataset
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name, guess_lexer
from sentence_transformers import SentenceTransformer

# Make the sibling ``shared`` package importable before importing from it.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from shared.components import create_method_panel, create_premium_hero
# Load code-specific model
# The same encoder is used for code snippets (load_code_dataset) and for
# natural-language queries (search_code), so both share one vector space.
embedder = SentenceTransformer('microsoft/codebert-base')

# Global storage
# Populated by load_code_dataset() and read by the search functions.
dataset_sample = None  # handle returned by load_dataset(); None until a load is attempted
embeddings = None  # embedder.encode() output for code_samples; None until loaded
code_samples = []  # one dict per kept snippet (code, language, size, repo metadata)
def load_code_dataset(progress=gr.Progress()):
    """Load a sample of The Stack dataset and embed it for searching.

    Streams the Python subset of ``bigcode/the-stack-smol``, looks at no more
    than the first 500 records, keeps at most 300 whose source length is
    between 50 and 5000 characters, and encodes the first 512 characters of
    each kept snippet with the module-level ``embedder``.

    The module globals ``dataset_sample``, ``code_samples`` and ``embeddings``
    are replaced together, and only after every step succeeded, so a failed
    (re)load never leaves the sample list and the embedding matrix out of
    sync with each other.

    Args:
        progress: Gradio progress tracker; called with a fraction and a
            description at each stage.

    Returns:
        A human-readable status string (success or error) for the UI.
    """
    global dataset_sample, embeddings, code_samples

    progress(0, desc="Loading The Stack dataset...")
    try:
        # Load Python subset (smaller and more accessible)
        stream = load_dataset(
            "bigcode/the-stack-smol",
            data_dir="data/python",
            split="train",
            streaming=True,
        )

        progress(0.3, desc="Sampling code repositories...")
        samples = []
        for i, item in enumerate(stream):
            # Inspect only the first 500 records of the stream.
            if i >= 500:
                break

            # ``or`` also covers keys that are present but hold None.
            code = item.get('content') or ''
            if not 50 <= len(code) <= 5000:  # Filter very short/long
                continue

            licenses = item.get('max_stars_repo_licenses') or ['unknown']
            samples.append({
                'code': code,
                'language': 'python',
                'size': len(code),
                'max_stars_repo_name': item.get('max_stars_repo_name', 'unknown'),
                # Normalise a missing/None star count to 0 so the numeric
                # star filter and the average below cannot fail on it.
                'max_stars_count': item.get('max_stars_count') or 0,
                'license': licenses[0],
            })
            if len(samples) >= 300:
                break

        if not samples:
            # Avoid encoding an empty batch and reporting "avg stars: nan".
            return "❌ Error: no code samples passed the length filter"

        progress(0.7, desc="Creating code embeddings...")
        code_texts = [s['code'][:512] for s in samples]  # Use first 512 chars
        vectors = embedder.encode(code_texts, show_progress_bar=False)

        # Commit all shared state at once, now that nothing else can fail.
        dataset_sample, code_samples, embeddings = stream, samples, vectors

        progress(1.0, desc="Ready!")
        avg_stars = np.mean([s['max_stars_count'] for s in samples])
        return f"✅ Loaded {len(samples)} code samples (avg stars: {avg_stars:.0f})"
    except Exception as e:
        # UI boundary: report any failure as a status message instead of
        # crashing the Gradio callback.
        return f"❌ Error: {str(e)}\nNote: Using fallback - dataset requires internet"
def extract_function_name(code):
    """Pick a display name for a code snippet.

    The first ``def`` name found anywhere in *code* wins; if there is no
    function definition, the first ``class`` name is used; otherwise the
    literal ``"code snippet"`` is returned.
    """
    # Ordered by priority: function definitions first, then class definitions.
    patterns = (
        r'def\s+(\w+)\s*\(',
        r'class\s+(\w+)\s*[:\(]',
    )
    for pattern in patterns:
        found = re.search(pattern, code)
        if found is not None:
            return found.group(1)
    return "code snippet"
def syntax_highlight_code(code, language='python'):
    """Apply syntax highlighting to code.

    Args:
        code: Source text to render.
        language: Pygments lexer alias (e.g. ``'python'``).

    Returns:
        An HTML fragment. Normally Pygments output with inline styles
        (monokai); if highlighting fails for any reason (e.g. unknown lexer
        alias), a plain ``<pre><code>`` block with the code HTML-escaped.
    """
    try:
        lexer = get_lexer_by_name(language)
        formatter = HtmlFormatter(style='monokai', noclasses=True)
        return highlight(code, lexer, formatter)
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt/SystemExit still propagate. The code is escaped
        # because it is arbitrary text that would otherwise be parsed as HTML.
        return f"<pre><code>{html.escape(code)}</code></pre>"
def search_code(query, language='python', min_stars=0, top_k=5):
    """Search the loaded code samples for the best matches to *query*.

    Args:
        query: Natural-language (or code) search text.
        language: Only samples whose ``'language'`` equals this are considered.
        min_stars: Only samples with at least this many stars are considered.
        top_k: Maximum number of results; coerced to ``int``. Values <= 0
            yield no results.

    Returns:
        A list of at most ``top_k`` sample dicts (copies), best match first,
        each with an added ``'similarity'`` float. Empty if nothing is loaded.
        If no sample passes the language/star filters, *all* samples are
        searched instead.
    """
    if embeddings is None or not code_samples:
        return []

    # Sliders can hand over floats, and ``[-0:]`` would select *everything*,
    # so normalise the count before it is used as a slice bound.
    top_k = int(top_k)
    if top_k <= 0:
        return []

    # Filter by language and stars (a missing star count is treated as 0).
    indices = [
        i for i, sample in enumerate(code_samples)
        if sample['language'] == language
        and (sample['max_stars_count'] or 0) >= min_stars
    ]
    if not indices:
        # Fallback: drop both filters rather than returning nothing.
        indices = list(range(len(code_samples)))

    filtered_embeddings = embeddings[indices]

    # Search
    # NOTE(review): this ranks by raw dot product, not cosine similarity;
    # confirm whether the embeddings should be L2-normalised first.
    query_embedding = embedder.encode([query])
    similarities = np.dot(filtered_embeddings, query_embedding.T).flatten()
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    # Map positions in the filtered view back to the original samples.
    results = []
    for idx in top_indices:
        sample = code_samples[indices[idx]].copy()
        sample['similarity'] = float(similarities[idx])
        results.append(sample)
    return results
def format_code_results(results, query):
    """Format code search results as an HTML fragment.

    Args:
        results: Sample dicts as returned by ``search_code`` (keys used:
            ``code``, ``language``, ``max_stars_repo_name``,
            ``max_stars_count``, ``license``, ``similarity``).
        query: The user's search text, echoed back in the header.

    Returns:
        HTML with one card per result: title, metadata line, highlighted code
        (first 1000 characters) and a "Copy Code" button that copies the full
        snippet. A short message is returned when *results* is empty.
    """
    if not results:
        return "<p>No code samples found. Try adjusting filters or query.</p>"

    # Query and dataset metadata are untrusted text: escape before embedding.
    parts = [
        "<h2>🔍 Code Search Results</h2>",
        f"<p><strong>Query:</strong> {html.escape(str(query))}</p>",
        f"<p><strong>Found:</strong> {len(results)} relevant code samples</p>",
        "<hr>",
    ]

    for i, result in enumerate(results, 1):
        parts.append(
            "<div style='margin: 20px 0; padding: 15px; background: #1e1e1e; border-radius: 8px;'>"
        )
        parts.append(
            f"<h3 style='color: #fff;'>Result {i}: {extract_function_name(result['code'])}</h3>"
        )

        # Metadata
        repo = html.escape(str(result['max_stars_repo_name']))
        stars = html.escape(str(result['max_stars_count']))
        license_name = html.escape(str(result['license']))
        parts.append(
            "<p style='color: #888;'>"
            f"<strong>Repo:</strong> {repo} | "
            f"<strong>Stars:</strong> ⭐ {stars} | "
            f"<strong>License:</strong> {license_name} | "
            f"<strong>Relevance:</strong> {result['similarity']:.3f}"
            "</p>"
        )

        # Code
        code = result['code'][:1000]  # Limit display length
        parts.append(syntax_highlight_code(code, result['language']))

        # Copy button (using JavaScript). json.dumps yields a valid JS string
        # literal for any code (quotes, backslashes, newlines, backticks);
        # html.escape then makes that literal safe inside the double-quoted
        # onclick attribute.
        js_literal = html.escape(json.dumps(result['code']), quote=True)
        parts.append(
            '<button onclick="navigator.clipboard.writeText(' + js_literal + '); '
            "this.innerText='Copied!'; "
            "setTimeout(() => this.innerText='Copy Code', 2000);\" "
            'style="margin-top: 10px; padding: 8px 16px; background: #4CAF50; '
            'color: white; border: none; border-radius: 4px; cursor: pointer;">'
            "Copy Code</button>"
        )
        parts.append("</div>")

    return "".join(parts)
def perform_code_search(query, language, min_stars, num_results, progress=gr.Progress()):
    """Run a search from the UI and return the rendered results and stats.

    Args:
        query: Search text from the query textbox.
        language: Selected language filter.
        min_stars: Minimum star count filter.
        num_results: Requested number of results.
        progress: Gradio progress tracker.

    Returns:
        A ``(results_html, stats_markdown)`` tuple. When the query is blank or
        the dataset has not been loaded, the first element is a prompt message
        and the second is an empty string.
    """
    # Treat whitespace-only input like an empty query.
    if not query or not query.strip():
        return "<p>Please enter a search query</p>", ""

    if embeddings is None:
        return "<p>Please load the dataset first</p>", ""

    progress(0, desc="Searching code...")
    # The slider value may arrive as a float; the result count must be an int.
    results = search_code(query, language, min_stars, top_k=int(num_results))

    progress(0.7, desc="Formatting results...")
    formatted = format_code_results(results, query)

    progress(1.0, desc="Done!")

    # Stats
    stats = "\n".join([
        "",
        "### 📊 Search Statistics",
        f"- **Total samples**: {len(code_samples)}",
        f"- **Results**: {len(results)}",
        f"- **Language**: {language}",
        f"- **Min stars**: {min_stars}",
        "- **Model**: CodeBERT (Microsoft)",
        "### 🧠 How CodeBERT Works",
        "CodeBERT is trained on code and documentation:",
        "- Understands programming patterns",
        "- Maps code to natural language",
        "- Trained on GitHub repos",
        "- Supports multiple languages",
        "",
    ])
    return formatted, stats
# Gradio Interface
# Layout: hero + method panel on top, then a two-column row (controls on the
# left, results on the right), followed by an explanatory Markdown section.
with gr.Blocks(title="Code Search Engine", theme=gr.themes.Soft()) as demo:
    create_premium_hero(
        "Semantic Code Search Engine",
        "Search code with natural language using code embeddings, dataset sampling, and syntax-highlighted retrieval results.",
        "💻",
        badge="Code Intelligence",
        highlights=["CodeBERT", "The Stack sample", "Semantic retrieval"],
    )
    create_method_panel({
        "Technique": "Encode code snippets into vectors and rank them against natural-language queries.",
        "What it proves": "You can adapt embedding search beyond documents into developer tooling.",
        "HF capability": "Combines Hub datasets with transformer embeddings in an interactive Space.",
    })

    with gr.Row():
        # Left column: dataset loading and search controls.
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Load Dataset")
            load_btn = gr.Button("Load Code Dataset", variant="primary")
            load_status = gr.Textbox(label="Status", interactive=False)

            gr.Markdown("### Step 2: Search Code")
            query_input = gr.Textbox(
                label="What are you looking for?",
                placeholder="e.g., binary search implementation",
                lines=2
            )
            language = gr.Dropdown(
                choices=['python'],
                value='python',
                label="Language (more coming soon)"
            )
            min_stars = gr.Slider(
                minimum=0,
                maximum=1000,
                value=0,
                step=10,
                label="Minimum GitHub Stars"
            )
            num_results = gr.Slider(
                minimum=3,
                maximum=10,
                value=5,
                step=1,
                label="Number of Results"
            )
            search_btn = gr.Button("Search Code", variant="primary")

            gr.Markdown("""
            ### 💡 Example Searches:

            - "binary search tree"
            - "web scraper with requests"
            - "recursive fibonacci"
            - "API client with authentication"
            - "data validation decorator"
            """)

        # Right column: rendered results and collapsible statistics.
        with gr.Column(scale=2):
            results_output = gr.HTML(label="Code Results")
            with gr.Accordion("📊 Statistics & Info", open=False):
                stats_output = gr.Markdown()

    # Blank lines between sections keep paragraphs from being absorbed into
    # the preceding list items when the Markdown is rendered.
    gr.Markdown("""
    ### 🎯 Why Semantic Code Search?

    **Traditional search** (GitHub, Google):

    - Keyword matching only
    - Must know exact function names
    - Hard to find by functionality

    **Semantic search** (this tool):

    - Search by what code does, not what it's called
    - "sort a list" finds quicksort, mergesort, etc.
    - Understands programming concepts

    ### 🔧 Features:

    - **Syntax highlighting** with Pygments
    - **Copy to clipboard** button
    - **Filter by stars** (code quality proxy)
    - **License information** (know before you use)
    - **CodeBERT embeddings** (code + NL understanding)

    Perfect for developers learning, debugging, or finding code examples!
    """)

    # Event wiring: the load button reports into the status box; the search
    # button fills the results HTML and the statistics Markdown.
    load_btn.click(
        load_code_dataset,
        outputs=[load_status]
    )
    search_btn.click(
        perform_code_search,
        inputs=[query_input, language, min_stars, num_results],
        outputs=[results_output, stats_output]
    )

if __name__ == "__main__":
    demo.launch()