Spaces:

sammoftah
/

code-search-engine

Running

App Files Files Community

code-search-engine / app.py

sammoftah

Deploy Code Search Engine

a6356f4 verified 5 days ago

raw

history blame contribute delete

10.5 kB

	import gradio as gr
	from datasets import load_dataset
	from sentence_transformers import SentenceTransformer
	import numpy as np
	from pygments import highlight
	from pygments.lexers import get_lexer_by_name, guess_lexer
	from pygments.formatters import HtmlFormatter
	import re
	import os
	import sys

	sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
	from shared.components import create_method_panel, create_premium_hero

	# Shared mutable state. All three names start out empty and are assigned
	# by load_code_dataset(); the search helpers read them afterwards.
	code_samples = []
	embeddings = None
	dataset_sample = None

	# Embedding model used both for indexing code and for encoding queries.
	# It is instantiated once, at import time, from the CodeBERT checkpoint.
	embedder = SentenceTransformer('microsoft/codebert-base')

	def load_code_dataset(progress=gr.Progress()):
	    """Stream a sample of "bigcode/the-stack-smol" (Python) and index it.

	    Looks at no more than the first 500 streamed records, keeps up to 300
	    whose source text is between 50 and 5000 characters long, embeds the
	    first 512 characters of each kept file with the module-level
	    ``embedder`` and then publishes the outcome through the
	    ``dataset_sample``, ``code_samples`` and ``embeddings`` globals.

	    The globals are only reassigned after embedding has succeeded, so a
	    failed reload never leaves ``code_samples`` and ``embeddings`` out of
	    step with each other.

	    Args:
	        progress: Gradio progress tracker; called with a completion
	            fraction and a ``desc`` label at each stage.

	    Returns:
	        A status string for display: a "✅ Loaded ..." summary on success,
	        or a "❌ Error: ..." message when loading fails or no record passes
	        the size filter. Failures are reported through the return value
	        rather than raised.
	    """
	    global dataset_sample, embeddings, code_samples

	    max_scanned = 500               # streamed records to inspect
	    max_kept = 300                  # samples to index
	    min_chars, max_chars = 50, 5000  # size filter on the source text
	    embed_chars = 512               # prefix of each file that is embedded

	    progress(0, desc="Loading The Stack dataset...")
	    try:
	        # Load Python subset (smaller and more accessible)
	        stream = load_dataset(
	            "bigcode/the-stack-smol",
	            data_dir="data/python",
	            split="train",
	            streaming=True,
	        )

	        progress(0.3, desc="Sampling code repositories...")
	        samples = []
	        for scanned, item in enumerate(stream):
	            if scanned >= max_scanned:
	                break

	            # `or ''` also covers a key that is present but None.
	            code = item.get('content') or ''
	            if not min_chars <= len(code) <= max_chars:
	                continue

	            licenses = item.get('max_stars_repo_licenses') or ['unknown']
	            samples.append({
	                'code': code,
	                'language': 'python',
	                'size': len(code),
	                'max_stars_repo_name': item.get('max_stars_repo_name', 'unknown'),
	                # A null star count is stored as 0 so that averaging and
	                # the min-stars comparison in search never see None.
	                'max_stars_count': item.get('max_stars_count') or 0,
	                'license': licenses[0],
	            })

	            if len(samples) >= max_kept:
	                break

	        if not samples:
	            return "❌ Error: no code samples passed the size filter"

	        progress(0.7, desc="Creating code embeddings...")
	        vectors = embedder.encode(
	            [sample['code'][:embed_chars] for sample in samples],
	            show_progress_bar=False,
	        )

	        # Publish everything together, only once indexing has succeeded.
	        dataset_sample, code_samples, embeddings = stream, samples, vectors

	        progress(1.0, desc="Ready!")
	        avg_stars = np.mean([sample['max_stars_count'] for sample in samples])
	        return f"✅ Loaded {len(samples)} code samples (avg stars: {avg_stars:.0f})"

	    except Exception as e:
	        # UI boundary: surface the failure in the status box instead of
	        # crashing the event handler.
	        return (
	            f"❌ Error: {e}\n"
	            "Note: the dataset is streamed from the Hugging Face Hub and needs internet access"
	        )

	# Matches a `def` / `async def` / `class` statement that starts a line
	# (after optional indentation). Group 1 holds a function name, group 2 a
	# class name; exactly one of them is set on a match.
	_DEFINITION_RE = re.compile(
	    r'^[ \t]*(?:(?:async[ \t]+)?def[ \t]+(\w+)[ \t]*\(|class[ \t]+(\w+)[ \t]*[:(])',
	    re.MULTILINE,
	)


	def extract_function_name(code):
	    """Return the name of the first function or class defined in ``code``.

	    Only statements that begin a line are considered, so a ``def`` that is
	    merely mentioned in a comment or in running text is ignored. The
	    definition that appears first in the source wins, which means a class
	    is reported by its own name rather than by its first method.

	    Args:
	        code: Source text to scan.

	    Returns:
	        The function or class name, or the placeholder "code snippet" when
	        ``code`` is not a string or contains no definition.
	    """
	    if not isinstance(code, str):
	        return "code snippet"

	    match = _DEFINITION_RE.search(code)
	    if match is None:
	        return "code snippet"

	    return match.group(1) or match.group(2)

	def syntax_highlight_code(code, language='python'):
	    """Render ``code`` as syntax-highlighted HTML.

	    Highlighting uses the Pygments lexer registered under ``language`` and
	    an ``HtmlFormatter`` with inline 'monokai' styles. Highlighting is
	    best-effort: if anything goes wrong (for example an unknown lexer
	    name), the code is returned as plain, HTML-escaped text instead.

	    Args:
	        code: Source text to render.
	        language: Pygments lexer alias to highlight with.

	    Returns:
	        An HTML fragment: Pygments output on success, otherwise
	        ``<pre><code>...</code></pre>`` wrapping the escaped source.
	    """
	    # Imported locally so this function does not depend on the file header.
	    from html import escape

	    try:
	        lexer = get_lexer_by_name(language)
	        formatter = HtmlFormatter(style='monokai', noclasses=True)
	        return highlight(code, lexer, formatter)
	    except Exception:
	        # Escape the fallback so `<`, `>` and `&` in the source are shown
	        # literally instead of being interpreted as markup.
	        return f"<pre><code>{escape(str(code), quote=False)}</code></pre>"

	def search_code(query, language='python', min_stars=0, top_k=5):
	    """Rank the loaded code samples against a natural-language query.

	    Samples are first filtered by ``language`` and ``min_stars``. If that
	    leaves nothing, every loaded sample is searched instead so the caller
	    still receives results. Candidates are scored by cosine similarity
	    between their stored embedding and the embedding of ``query``.

	    Args:
	        query: Free-text description of the code being looked for.
	        language: Value a sample's 'language' field must equal.
	        min_stars: Lowest 'max_stars_count' a sample may have; ``None`` is
	            treated as 0.
	        top_k: Maximum number of results. Coerced with ``int()`` so that a
	            float coming from a UI slider is accepted; values below 1
	            yield an empty list.

	    Returns:
	        A list of at most ``top_k`` dicts, best match first. Each one is a
	        copy of the matching ``code_samples`` entry with an added
	        'similarity' float. Empty when nothing has been loaded yet.
	    """
	    if embeddings is None or not code_samples:
	        return []

	    limit = int(top_k)
	    if limit < 1:
	        return []
	    star_floor = min_stars or 0

	    # A missing or None star count is treated as 0 so the comparison
	    # cannot raise TypeError.
	    candidate_ids = [
	        i for i, sample in enumerate(code_samples)
	        if sample['language'] == language
	        and (sample.get('max_stars_count') or 0) >= star_floor
	    ]
	    if not candidate_ids:
	        # Fallback: drop both filters rather than return nothing.
	        candidate_ids = list(range(len(code_samples)))

	    candidate_vectors = np.asarray(embeddings, dtype=float)[candidate_ids]
	    query_vector = np.asarray(embedder.encode([query]), dtype=float).reshape(-1)

	    # Cosine similarity: divide the dot product by both vector norms so
	    # that ranking reflects direction only, not embedding magnitude.
	    # Zero-length vectors score 0 instead of dividing by zero.
	    dot_products = candidate_vectors @ query_vector
	    scale = np.linalg.norm(candidate_vectors, axis=1) * np.linalg.norm(query_vector)
	    similarities = np.divide(
	        dot_products, scale, out=np.zeros_like(dot_products), where=scale > 0
	    )

	    best_first = np.argsort(similarities)[::-1][:limit]

	    # Map positions in the filtered view back to the original samples.
	    results = []
	    for position in best_first:
	        hit = dict(code_samples[candidate_ids[position]])
	        hit['similarity'] = float(similarities[position])
	        results.append(hit)

	    return results

	def format_code_results(results, query):
	    """Render search results as one HTML fragment.

	    Every value that originates from the user or from the dataset (query,
	    title, repository, licence, source text) is HTML-escaped before being
	    placed in the markup. The copy button carries the full source in a
	    ``data-code`` attribute and reads it back via ``this.dataset.code``,
	    so quotes, backticks or backslashes in the code cannot break the
	    inline handler.

	    Args:
	        results: List of result dicts with the keys 'code', 'language',
	            'max_stars_repo_name', 'max_stars_count', 'license' and
	            'similarity'.
	        query: The search text, echoed back in the header.

	    Returns:
	        An HTML string. When ``results`` is empty, a single paragraph
	        telling the user that nothing was found.
	    """
	    # Imported locally so this function does not depend on the file header.
	    from html import escape

	    if not results:
	        return "<p>No code samples found. Try adjusting filters or query.</p>"

	    # Collect fragments and join once instead of repeated string +=.
	    parts = [
	        "<h2>🔍 Code Search Results</h2>",
	        f"<p><strong>Query:</strong> {escape(str(query))}</p>",
	        f"<p><strong>Found:</strong> {len(results)} relevant code samples</p>",
	        "<hr>",
	    ]

	    for position, result in enumerate(results, 1):
	        source = result['code']
	        title = escape(str(extract_function_name(source)))

	        parts.append(
	            "<div style='margin: 20px 0; padding: 15px; "
	            "background: #1e1e1e; border-radius: 8px;'>"
	        )
	        parts.append(f"<h3 style='color: #fff;'>Result {position}: {title}</h3>")

	        # Metadata
	        parts.append(
	            "<p style='color: #888;'>"
	            f"<strong>Repo:</strong> {escape(str(result['max_stars_repo_name']))} | "
	            f"<strong>Stars:</strong> ⭐ {escape(str(result['max_stars_count']))} | "
	            f"<strong>License:</strong> {escape(str(result['license']))} | "
	            f"<strong>Relevance:</strong> {result['similarity']:.3f}"
	            "</p>"
	        )

	        # Code (display is limited to the first 1000 characters)
	        parts.append(syntax_highlight_code(source[:1000], result['language']))

	        # Copy button: copies the full source, not just the displayed part.
	        parts.append(
	            '<button data-code="' + escape(source, quote=True) + '" '
	            'onclick="navigator.clipboard.writeText(this.dataset.code); '
	            "this.innerText='Copied!'; "
	            "setTimeout(() => this.innerText='Copy Code', 2000);\" "
	            'style="margin-top: 10px; padding: 8px 16px; background: #4CAF50; '
	            'color: white; border: none; border-radius: 4px; cursor: pointer;">'
	            'Copy Code</button>'
	        )

	        parts.append("</div>")

	    return "".join(parts)

	def perform_code_search(query, language, min_stars, num_results, progress=gr.Progress()):
	    """Handle a search request from the UI.

	    Runs ``search_code`` with the given filters, renders the hits with
	    ``format_code_results`` and builds a Markdown statistics panel.

	    Args:
	        query: Search text entered by the user.
	        language: Language filter forwarded to ``search_code``.
	        min_stars: Star threshold forwarded to ``search_code``.
	        num_results: Forwarded to ``search_code`` as ``top_k``.
	        progress: Gradio progress tracker updated at each stage.

	    Returns:
	        A ``(results_html, stats_markdown)`` pair. When the query is empty
	        or no embeddings have been loaded, the first element is a short
	        notice and the second is an empty string.
	    """
	    # Work out whether the request can be served at all.
	    notice = None
	    if not query:
	        notice = "<p>Please enter a search query</p>"
	    elif embeddings is None:
	        notice = "<p>Please load the dataset first</p>"
	    if notice is not None:
	        return notice, ""

	    progress(0, desc="Searching code...")
	    matches = search_code(query, language, min_stars, top_k=num_results)

	    progress(0.7, desc="Formatting results...")
	    results_html = format_code_results(matches, query)

	    progress(1.0, desc="Done!")

	    # Markdown shown in the statistics accordion.
	    stats_markdown = f"""
	### 📊 Search Statistics

	- Total samples: {len(code_samples)}
	- Results: {len(matches)}
	- Language: {language}
	- Min stars: {min_stars}
	- Model: CodeBERT (Microsoft)

	### 🧠 How CodeBERT Works

	CodeBERT is trained on code and documentation:
	- Understands programming patterns
	- Maps code to natural language
	- Trained on GitHub repos
	- Supports multiple languages
	"""

	    return results_html, stats_markdown

	# Gradio interface: builds the `demo` Blocks app. Components appear on the
	# page in the order they are created here, so statement order is layout.
	with gr.Blocks(title="Code Search Engine", theme=gr.themes.Soft()) as demo:
	# Page header. create_premium_hero is imported from shared.components,
	# which is not visible here — presumably it renders a hero banner; verify
	# its signature there before changing these arguments.
	create_premium_hero(
	"Semantic Code Search Engine",
	"Search code with natural language using code embeddings, dataset sampling, and syntax-highlighted retrieval results.",
	"💻",
	badge="Code Intelligence",
	highlights=["CodeBERT", "The Stack sample", "Semantic retrieval"],
	)
	# Summary panel, also from shared.components; it receives a mapping of
	# heading -> description.
	create_method_panel({
	"Technique": "Encode code snippets into vectors and rank them against natural-language queries.",
	"What it proves": "You can adapt embedding search beyond documents into developer tooling.",
	"HF capability": "Combines Hub datasets with transformer embeddings in an interactive Space.",
	})

	# Main row: a narrower column (scale=1) with the controls, followed by a
	# wider column (scale=2) for the output.
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### Step 1: Load Dataset")
	load_btn = gr.Button("Load Code Dataset", variant="primary")
	# Read-only box that shows the string returned by load_code_dataset.
	load_status = gr.Textbox(label="Status", interactive=False)

	gr.Markdown("### Step 2: Search Code")
	query_input = gr.Textbox(
	label="What are you looking for?",
	placeholder="e.g., binary search implementation",
	lines=2
	)

	# Only 'python' is offered; load_code_dataset tags every sample with
	# 'language': 'python'.
	language = gr.Dropdown(
	choices=['python'],
	value='python',
	label="Language (more coming soon)"
	)

	# Passed to perform_code_search as min_stars.
	min_stars = gr.Slider(
	minimum=0,
	maximum=1000,
	value=0,
	step=10,
	label="Minimum GitHub Stars"
	)

	# Passed to perform_code_search as num_results (used there as top_k).
	num_results = gr.Slider(
	minimum=3,
	maximum=10,
	value=5,
	step=1,
	label="Number of Results"
	)

	search_btn = gr.Button("Search Code", variant="primary")

	gr.Markdown("""
	### 💡 Example Searches:
	- "binary search tree"
	- "web scraper with requests"
	- "recursive fibonacci"
	- "API client with authentication"
	- "data validation decorator"
	""")

	with gr.Column(scale=2):
	# Receives the HTML produced by format_code_results.
	results_output = gr.HTML(label="Code Results")

	# Collapsed by default; receives the Markdown stats string.
	with gr.Accordion("📊 Statistics & Info", open=False):
	stats_output = gr.Markdown()

	# Static explanatory text.
	gr.Markdown("""
	### 🎯 Why Semantic Code Search?

	Traditional search (GitHub, Google):
	- Keyword matching only
	- Must know exact function names
	- Hard to find by functionality

	Semantic search (this tool):
	- Search by what code does, not what it's called
	- "sort a list" finds quicksort, mergesort, etc.
	- Understands programming concepts

	### 🔧 Features:

	- Syntax highlighting with Pygments
	- Copy to clipboard button
	- Filter by stars (code quality proxy)
	- License information (know before you use)
	- CodeBERT embeddings (code + NL understanding)

	Perfect for developers learning, debugging, or finding code examples!
	""")

	# Event wiring. The load button takes no inputs and writes the returned
	# status string into load_status.
	load_btn.click(
	load_code_dataset,
	outputs=[load_status]
	)

	# The search button forwards the four controls, in this order, to
	# perform_code_search and routes its (html, stats) pair to the two outputs.
	search_btn.click(
	perform_code_search,
	inputs=[query_input, language, min_stars, num_results],
	outputs=[results_output, stats_output]
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()