# Source: Hugging Face Spaces file view (status: Running; file size: 10,513 bytes; revision a6356f4).
import gradio as gr
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
from pygments import highlight
from pygments.lexers import get_lexer_by_name, guess_lexer
from pygments.formatters import HtmlFormatter
import re
import os
import sys
# Put the parent directory of this file on the import path; the `shared`
# package imported on the next line is presumably located there - TODO confirm.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from shared.components import create_method_panel, create_premium_hero
# Load code-specific model
# NOTE: this runs at import time, so importing the module constructs the model.
embedder = SentenceTransformer('microsoft/codebert-base')
# Global storage, written by load_code_dataset() and read by the search functions.
# Streaming dataset handle returned by load_dataset(); None until a load has run.
dataset_sample = None
# Result of embedder.encode() over the kept samples, in the same order as
# code_samples; None until a load has run.
embeddings = None
# One dict per kept sample (keys: code, language, size, max_stars_repo_name,
# max_stars_count, license).
code_samples = []
def load_code_dataset(progress=gr.Progress()):
    """Load a sample of The Stack dataset and embed it for searching.

    Streams the Python subset of ``bigcode/the-stack-smol``, inspects at most
    the first 500 records, keeps up to 300 whose content is between 50 and
    5000 characters long, and encodes the first 512 characters of each kept
    sample with the module-level ``embedder``.

    The module globals ``dataset_sample``, ``code_samples`` and ``embeddings``
    are replaced together, and only after embedding has succeeded, so a
    failure part-way through never leaves samples and embeddings out of step.

    Args:
        progress: Gradio progress tracker; called with a fraction and a
            description at each stage.

    Returns:
        A status string: a success message with the number of samples and
        their average star count, or an error message if anything failed or
        no record passed the length filter.
    """
    global dataset_sample, embeddings, code_samples

    progress(0, desc="Loading The Stack dataset...")
    try:
        # Load Python subset (smaller and more accessible)
        stream = load_dataset(
            "bigcode/the-stack-smol",
            data_dir="data/python",
            split="train",
            streaming=True,
        )

        progress(0.3, desc="Sampling code repositories...")
        samples = []
        for i, item in enumerate(stream):
            # Look at the first 500 records only, whether or not they are kept.
            if i >= 500:
                break
            code = item.get('content') or ''
            if len(code) < 50 or len(code) > 5000:  # Filter very short/long
                continue
            licenses = item.get('max_stars_repo_licenses')
            samples.append({
                'code': code,
                'language': 'python',
                'size': len(code),
                'max_stars_repo_name': item.get('max_stars_repo_name') or 'unknown',
                # Missing or null star counts become 0 so that averaging and
                # the ">= min_stars" filter in search never see None.
                'max_stars_count': item.get('max_stars_count') or 0,
                'license': licenses[0] if licenses else 'unknown',
            })
            if len(samples) >= 300:
                break

        if not samples:
            # Nothing to embed; keep whatever was loaded previously.
            return "❌ Error: no usable code samples were found in the dataset"

        progress(0.7, desc="Creating code embeddings...")
        code_texts = [sample['code'][:512] for sample in samples]  # Use first 512 chars
        vectors = embedder.encode(code_texts, show_progress_bar=False)

        # Publish all three globals in one step, now that nothing can fail.
        dataset_sample, code_samples, embeddings = stream, samples, vectors

        progress(1.0, desc="Ready!")
        avg_stars = np.mean([sample['max_stars_count'] for sample in samples])
        return f"✅ Loaded {len(samples)} code samples (avg stars: {avg_stars:.0f})"
    except Exception as e:
        # Top-level UI boundary: report the failure in the status box.
        # NOTE(review): the message mentions a fallback, but none is implemented here.
        return f"❌ Error: {str(e)}\nNote: Using fallback - dataset requires internet"
# Definitions are recognised only at the start of a (possibly indented) line,
# so words such as "class" inside a comment or the tail of a longer identifier
# are not mistaken for a definition.
_DEF_NAME_RE = re.compile(r'^[ \t]*(?:async[ \t]+)?def\s+(\w+)\s*\(', re.MULTILINE)
_CLASS_NAME_RE = re.compile(r'^[ \t]*class\s+(\w+)\s*[:\(]', re.MULTILINE)


def extract_function_name(code):
    """Extract main function/class name from code.

    The first function definition (``def`` or ``async def``) wins; if the
    code has none, the first class definition is used.

    Args:
        code: Source text to inspect. ``None`` or an empty string is allowed.

    Returns:
        The function or class name, or ``"code snippet"`` when neither kind
        of definition is found.
    """
    if not code:
        return "code snippet"
    # Function definitions take priority over class definitions.
    for pattern in (_DEF_NAME_RE, _CLASS_NAME_RE):
        found = pattern.search(code)
        if found is not None:
            return found.group(1)
    return "code snippet"
def syntax_highlight_code(code, language='python'):
    """Apply syntax highlighting to code.

    Args:
        code: Source text to render.
        language: Pygments lexer name used to tokenise ``code``.

    Returns:
        An HTML fragment. Normally this is Pygments output with inline
        "monokai" styles; if highlighting fails for any reason (for example
        an unknown lexer name), a plain ``<pre><code>`` block containing the
        HTML-escaped code is returned instead.
    """
    from html import escape

    try:
        lexer = get_lexer_by_name(language)
        formatter = HtmlFormatter(style='monokai', noclasses=True)
        return highlight(code, lexer, formatter)
    except Exception:
        # Best-effort rendering: fall back to plain text rather than failing
        # the whole result list. The code must be escaped here, otherwise any
        # "<" or "&" in it would be interpreted as markup.
        return f"<pre><code>{escape(code)}</code></pre>"
def search_code(query, language='python', min_stars=0, top_k=5):
    """Search the loaded code samples for the best matches to a query.

    Candidates are the samples in ``language`` with at least ``min_stars``
    stars. If the star threshold leaves no candidates, it is dropped and all
    samples in ``language`` are considered. Candidates are ranked by cosine
    similarity between their stored embedding and the query embedding.

    Args:
        query: Natural-language (or code) search text.
        language: Value compared against each sample's ``'language'`` key.
        min_stars: Minimum ``'max_stars_count'`` a sample needs to qualify.
        top_k: Maximum number of results; non-integer values are truncated.

    Returns:
        A list of at most ``top_k`` sample dicts (copies), best match first,
        each with an added ``'similarity'`` float. Empty when nothing is
        loaded, ``top_k`` is not positive, or no sample is in ``language``.
    """
    if embeddings is None or not code_samples:
        return []

    # UI sliders may deliver floats; a zero/negative k must not fall through,
    # because the slice [-0:] would select every candidate.
    top_k = int(top_k)
    if top_k <= 0:
        return []

    # Filter by language and stars (a missing star count is treated as 0)
    indices = [
        i for i, sample in enumerate(code_samples)
        if sample['language'] == language
        and (sample['max_stars_count'] or 0) >= min_stars
    ]
    if not indices:
        # Fallback: remove star filter
        indices = [
            i for i, sample in enumerate(code_samples)
            if sample['language'] == language
        ]
    if not indices:
        return []

    candidate_vectors = np.asarray(embeddings)[indices]
    query_vector = np.asarray(embedder.encode([query]))[0]

    # Cosine similarity: the stored vectors are not normalised, so a raw dot
    # product would favour long vectors over closely aligned ones.
    dots = candidate_vectors @ query_vector
    norms = np.linalg.norm(candidate_vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = np.divide(
        dots, norms, out=np.zeros(len(indices), dtype=float), where=norms > 0
    )

    top_indices = np.argsort(similarities)[-top_k:][::-1]

    # Map back to original samples
    results = []
    for idx in top_indices:
        sample = code_samples[indices[idx]].copy()
        sample['similarity'] = float(similarities[idx])
        results.append(sample)
    return results
def format_code_results(results, query):
    """Format code search results as an HTML fragment.

    Args:
        results: Sample dicts as produced by the search step; each must have
            ``'code'``, ``'language'``, ``'max_stars_repo_name'``,
            ``'max_stars_count'``, ``'license'`` and ``'similarity'``.
        query: The search text, echoed back in the header.

    Returns:
        HTML containing a header and, per result, a card with metadata, the
        first 1000 characters of the code syntax-highlighted, and a button
        that copies the full code to the clipboard. When ``results`` is
        empty, a short "nothing found" paragraph is returned instead.
    """
    from html import escape
    import json

    if not results:
        return "<p>No code samples found. Try adjusting filters or query.</p>"

    # Every value that originates from the user or the dataset is escaped
    # before being placed into markup.
    parts = [
        "<h2>🔍 Code Search Results</h2>",
        f"<p><strong>Query:</strong> {escape(str(query))}</p>",
        f"<p><strong>Found:</strong> {len(results)} relevant code samples</p>",
        "<hr>",
    ]

    for i, result in enumerate(results, 1):
        code = result['code']
        name = escape(extract_function_name(code))
        repo = escape(str(result['max_stars_repo_name']))
        stars = escape(str(result['max_stars_count']))
        license_name = escape(str(result['license']))

        parts.append(
            "<div style='margin: 20px 0; padding: 15px; "
            "background: #1e1e1e; border-radius: 8px;'>"
        )
        parts.append(f"<h3 style='color: #fff;'>Result {i}: {name}</h3>")

        # Metadata
        parts.append(
            "<p style='color: #888;'>"
            f"<strong>Repo:</strong> {repo} | "
            f"<strong>Stars:</strong> ⭐ {stars} | "
            f"<strong>License:</strong> {license_name} | "
            f"<strong>Relevance:</strong> {result['similarity']:.3f}"
            "</p>"
        )

        # Code (display is limited to the first 1000 characters)
        parts.append(syntax_highlight_code(code[:1000], result['language']))

        # Copy button (using JavaScript). json.dumps produces a valid
        # JavaScript string literal for the full code; escaping that literal
        # for HTML keeps quotes, backslashes and angle brackets in the code
        # from terminating the onclick attribute or the script.
        js_literal = escape(json.dumps(code), quote=True)
        parts.append(
            "\n"
            f"<button onclick=\"navigator.clipboard.writeText({js_literal});\n"
            "this.innerText='Copied!';\n"
            "setTimeout(() => this.innerText='Copy Code', 2000);\"\n"
            "style=\"margin-top: 10px; padding: 8px 16px; background: #4CAF50; color: white;\n"
            "border: none; border-radius: 4px; cursor: pointer;\">\n"
            "Copy Code\n"
            "</button>\n"
        )
        parts.append("</div>")

    return "".join(parts)
def perform_code_search(query, language, min_stars, num_results, progress=gr.Progress()):
    """Run a search from the UI and return the rendered results and stats.

    Args:
        query: Search text from the query box.
        language: Language selected in the dropdown.
        min_stars: Minimum star count selected on the slider.
        num_results: Number of results requested; truncated to an integer
            before use because it is used as a slice bound downstream.
        progress: Gradio progress tracker; called with a fraction and a
            description at each stage.

    Returns:
        A ``(results_html, stats_markdown)`` tuple. When the query is empty
        or whitespace-only, or no dataset has been loaded, the HTML is a
        short prompt and the stats string is empty.
    """
    if not query or not query.strip():
        return "<p>Please enter a search query</p>", ""
    if embeddings is None:
        return "<p>Please load the dataset first</p>", ""

    progress(0, desc="Searching code...")
    results = search_code(query, language, min_stars, top_k=int(num_results))

    progress(0.7, desc="Formatting results...")
    formatted = format_code_results(results, query)
    progress(1.0, desc="Done!")

    # Stats (the text starts at column 0 so Markdown does not render it as a
    # code block)
    stats = f"""
### 📊 Search Statistics
- **Total samples**: {len(code_samples)}
- **Results**: {len(results)}
- **Language**: {language}
- **Min stars**: {min_stars}
- **Model**: CodeBERT (Microsoft)
### 🧠 How CodeBERT Works
CodeBERT is trained on code and documentation:
- Understands programming patterns
- Maps code to natural language
- Trained on GitHub repos
- Supports multiple languages
"""
    return formatted, stats
# Gradio Interface
# Layout: a hero banner and method panel, then a two-column row (controls on
# the left, results on the right), followed by an explanatory section.
# Multi-line Markdown literals start at column 0 so their text is not
# rendered as an indented code block.
with gr.Blocks(title="Code Search Engine", theme=gr.themes.Soft()) as demo:
    create_premium_hero(
        "Semantic Code Search Engine",
        "Search code with natural language using code embeddings, dataset sampling, and syntax-highlighted retrieval results.",
        "💻",
        badge="Code Intelligence",
        highlights=["CodeBERT", "The Stack sample", "Semantic retrieval"],
    )
    create_method_panel({
        "Technique": "Encode code snippets into vectors and rank them against natural-language queries.",
        "What it proves": "You can adapt embedding search beyond documents into developer tooling.",
        "HF capability": "Combines Hub datasets with transformer embeddings in an interactive Space.",
    })

    with gr.Row():
        # Left column: dataset loading and search controls.
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Load Dataset")
            load_btn = gr.Button("Load Code Dataset", variant="primary")
            load_status = gr.Textbox(label="Status", interactive=False)

            gr.Markdown("### Step 2: Search Code")
            query_input = gr.Textbox(
                label="What are you looking for?",
                placeholder="e.g., binary search implementation",
                lines=2
            )
            language = gr.Dropdown(
                choices=['python'],
                value='python',
                label="Language (more coming soon)"
            )
            min_stars = gr.Slider(
                minimum=0,
                maximum=1000,
                value=0,
                step=10,
                label="Minimum GitHub Stars"
            )
            num_results = gr.Slider(
                minimum=3,
                maximum=10,
                value=5,
                step=1,
                label="Number of Results"
            )
            search_btn = gr.Button("Search Code", variant="primary")

            gr.Markdown("""
### 💡 Example Searches:
- "binary search tree"
- "web scraper with requests"
- "recursive fibonacci"
- "API client with authentication"
- "data validation decorator"
""")

        # Right column: rendered results plus collapsible statistics.
        with gr.Column(scale=2):
            results_output = gr.HTML(label="Code Results")
            with gr.Accordion("📊 Statistics & Info", open=False):
                stats_output = gr.Markdown()

    gr.Markdown("""
### 🎯 Why Semantic Code Search?

**Traditional search** (GitHub, Google):
- Keyword matching only
- Must know exact function names
- Hard to find by functionality

**Semantic search** (this tool):
- Search by what code does, not what it's called
- "sort a list" finds quicksort, mergesort, etc.
- Understands programming concepts

### 🔧 Features:
- **Syntax highlighting** with Pygments
- **Copy to clipboard** button
- **Filter by stars** (code quality proxy)
- **License information** (know before you use)
- **CodeBERT embeddings** (code + NL understanding)

Perfect for developers learning, debugging, or finding code examples!
""")

    # Event wiring: loading reports into the status box; searching fills the
    # results pane and the statistics accordion.
    load_btn.click(
        load_code_dataset,
        outputs=[load_status]
    )
    search_btn.click(
        perform_code_search,
        inputs=[query_input, language, min_stars, num_results],
        outputs=[results_output, stats_output]
    )

if __name__ == "__main__":
    demo.launch()