| | """ |
| | DevRev Search Evaluation Leaderboard |
| | |
| | An interactive leaderboard for benchmarking search and retrieval systems |
| | on enterprise knowledge bases. Built with Gradio and ready for Hugging Face Spaces. |
| | |
| | Uses MTEB-style standardized JSON format for evaluation results. |
| | """ |
| |
|
| | import base64 |
| | import io |
| | import json |
| | import os |
| | from datetime import datetime |
| | from pathlib import Path |
| |
|
| | import gradio as gr |
| | import matplotlib.pyplot as plt |
| | import pandas as pd |
| | from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns |
| |
|
| |
|
| | def load_results_from_json(): |
| | """Load evaluation results from standardized JSON files""" |
| | results = [] |
| |
|
| | |
| | results_dirs = ["results", "leaderboard/results", "."] |
| | results_dir = None |
| |
|
| | for dir_path in results_dirs: |
| | if os.path.exists(dir_path): |
| | temp_dir = Path(dir_path) |
| | if any(temp_dir.glob("*.json")): |
| | results_dir = temp_dir |
| | break |
| |
|
| | if not results_dir: |
| | print( |
| | "No results directory found. Please create a 'results' directory with JSON files." |
| | ) |
| | return [] |
| |
|
| | |
| | for json_file in results_dir.glob("*.json"): |
| | |
| | if json_file.name == "RESULT_SCHEMA.json": |
| | continue |
| |
|
| | try: |
| | with open(json_file, "r") as f: |
| | data = json.load(f) |
| | |
| | if "model_name" in data and "metrics" in data: |
| | results.append(data) |
| | print(f"Loaded: {json_file.name}") |
| | except Exception as e: |
| | print(f"Error loading {json_file}: {e}") |
| |
|
| | return results |
| |
|
| |
|
| | def create_leaderboard_data(): |
| | """Create the leaderboard dataframe from JSON results""" |
| |
|
| | |
| | results = load_results_from_json() |
| |
|
| | if not results: |
| | print( |
| | "No evaluation results found. Please add JSON files to the 'results' directory." |
| | ) |
| | return pd.DataFrame() |
| |
|
| | |
| | data = [] |
| | for result in results: |
| | metrics = result.get("metrics", {}) |
| |
|
| | |
| | paper_field = result.get("paper", "N/A") |
| | if paper_field and paper_field != "N/A": |
| | |
| | references = [ref.strip() for ref in paper_field.split(";")] |
| | formatted_refs = [] |
| | for ref in references: |
| | if ref.startswith("http"): |
| | |
| | formatted_refs.append(f"[{ref}]({ref})") |
| | else: |
| | |
| | formatted_refs.append(ref) |
| | paper_display = " | ".join(formatted_refs) |
| | else: |
| | paper_display = "N/A" |
| |
|
| | row = { |
| | "π Rank": 0, |
| | "π§ Method": result.get("model_name", "Unknown"), |
| | "π Paper/Details": paper_display, |
| | "π·οΈ Type": result.get("model_type", "Unknown"), |
| | "π Recall@5": metrics.get("recall@5", 0), |
| | "π Recall@10": metrics.get("recall@10", 0), |
| | "π Recall@25": metrics.get("recall@25", 0), |
| | "π Recall@50": metrics.get("recall@50", 0), |
| | "π Precision@5": metrics.get("precision@5", 0), |
| | "π Precision@10": metrics.get("precision@10", 0), |
| | "π Precision@25": metrics.get("precision@25", 0), |
| | "π Precision@50": metrics.get("precision@50", 0), |
| | "π Open Source": "β
" if result.get("open_source", False) else "β", |
| | "π
Date": result.get("evaluation_date", "N/A"), |
| | } |
| | data.append(row) |
| |
|
| | |
| | df = pd.DataFrame(data) |
| |
|
| | |
| | df = df.sort_values(["π Recall@10", "π Precision@10"], ascending=False) |
| |
|
| | |
| | df["π Rank"] = range(1, len(df) + 1) |
| |
|
| | |
| | columns_order = [ |
| | "π Rank", |
| | "π§ Method", |
| | "π Paper/Details", |
| | "π·οΈ Type", |
| | "π Recall@5", |
| | "π Recall@10", |
| | "π Recall@25", |
| | "π Recall@50", |
| | "π Precision@5", |
| | "π Precision@10", |
| | "π Precision@25", |
| | "π Precision@50", |
| | "π Open Source", |
| | "π
Date", |
| | ] |
| | df = df[columns_order] |
| |
|
| | return df |
| |
|
| |
|
| | def create_comparison_plot(): |
| | """Create performance comparison visualizations""" |
| | df = create_leaderboard_data() |
| |
|
| | if df.empty: |
| | return "<p style='text-align: center; color: #666;'>No data available for visualization. Please add evaluation results to the 'results' directory.</p>" |
| |
|
| | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6)) |
| |
|
| | |
| | df_sorted = df.sort_values("π Recall@50", ascending=True) |
| |
|
| | |
| | methods = df_sorted["π§ Method"].tolist() |
| | recall_50 = df_sorted["π Recall@50"].tolist() |
| | colors = ["#ff6b6b" if "DevRev" in m else "#4ecdc4" for m in methods] |
| |
|
| | ax1.barh(methods, recall_50, color=colors, alpha=0.8) |
| | ax1.set_xlabel("Recall@50 (%)", fontsize=12) |
| | ax1.set_title("Recall@50 Comparison", fontsize=14, fontweight="bold") |
| | ax1.grid(True, axis="x", alpha=0.3) |
| |
|
| | |
| | for i, (method, recall) in enumerate(zip(methods, recall_50)): |
| | ax1.text(recall + 0.5, i, f"{recall:.1f}%", va="center", fontsize=10) |
| |
|
| | |
| | precision_50 = df_sorted["π Precision@50"].tolist() |
| |
|
| | ax2.barh(methods, precision_50, color=colors, alpha=0.8) |
| | ax2.set_xlabel("Precision@50 (%)", fontsize=12) |
| | ax2.set_title("Precision@50 Comparison", fontsize=14, fontweight="bold") |
| | ax2.grid(True, axis="x", alpha=0.3) |
| |
|
| | |
| | for i, (method, precision) in enumerate(zip(methods, precision_50)): |
| | ax2.text( |
| | precision + 0.5, |
| | i, |
| | f"{precision:.1f}%", |
| | va="center", |
| | fontsize=10, |
| | ) |
| |
|
| | plt.tight_layout() |
| |
|
| | |
| | buf = io.BytesIO() |
| | plt.savefig(buf, format="png", dpi=150, bbox_inches="tight") |
| | buf.seek(0) |
| | img_base64 = base64.b64encode(buf.read()).decode() |
| | plt.close() |
| |
|
| | return f'<img src="data:image/png;base64,{img_base64}" style="width: 100%; max-width: 1000px; margin: 20px auto; display: block;">' |
| |
|
| |
|
| | def create_interface(): |
| | """Create the Gradio interface with leaderboard and visualizations""" |
| |
|
| | deep_link_js = r""" |
| | () => { |
| | function openAboutAndScroll() { |
| | if (window.location.hash !== "#about") return; |
| | |
| | // Switch to the About tab (Gradio tabs are rendered as role="tab" buttons) |
| | const tabs = Array.from(document.querySelectorAll('button[role="tab"]')); |
| | const aboutTab = tabs.find((b) => (b.innerText || "").includes("About")); |
| | if (aboutTab) aboutTab.click(); |
| | |
| | // The About content is mounted after tab switch; retry briefly. |
| | let attempts = 0; |
| | const timer = setInterval(() => { |
| | const el = document.getElementById("about"); |
| | if (el) { |
| | el.scrollIntoView({ behavior: "smooth", block: "start" }); |
| | clearInterval(timer); |
| | } |
| | attempts += 1; |
| | if (attempts > 25) clearInterval(timer); |
| | }, 200); |
| | } |
| | |
| | window.addEventListener("hashchange", openAboutAndScroll); |
| | openAboutAndScroll(); |
| | setTimeout(openAboutAndScroll, 600); |
| | } |
| | """ |
| |
|
| | with gr.Blocks( |
| | title="DevRev Search Evaluation Leaderboard", js=deep_link_js |
| | ) as demo: |
| | |
| | gr.HTML( |
| | """ |
| | <div style="text-align: center; margin-bottom: 30px;"> |
| | <h1 style="font-size: 3em; font-weight: bold; margin-bottom: 10px;"> |
| | π DevRev Search Evaluation Leaderboard |
| | </h1> |
| | <p style="font-size: 1.2em; color: #666;"> |
| | Benchmarking Search and Retrieval Systems for Enterprise Knowledge Bases |
| | </p> |
| | </div> |
| | """ |
| | ) |
| |
|
| | |
| | with gr.Tabs(): |
| | |
| | with gr.TabItem("π Main Leaderboard"): |
| | gr.Markdown( |
| | """ |
| | ### Evaluation Overview |
| | This leaderboard displays metrics of search systems on the test queries present in [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search). |
| | All methods are evaluated on the same set of agent support queries with consistent evaluation protocols. |
| | |
| | **Metrics**: Recall@K and Precision@K measure the effectiveness of retrieving relevant articles within the top K retrieved articles. |
| | |
| | **Leaderboard ranking**: Sorted by **Recall@10** (primary) and **Precision@10** (secondary). |
| | |
| | **To add your results**: Submission details are available in the [About](#about) section. |
| | """ |
| | ) |
| |
|
| | |
| | df = create_leaderboard_data() |
| |
|
| | if not df.empty: |
| | |
| | default_columns = [ |
| | "π Rank", |
| | "π§ Method", |
| | "π·οΈ Type", |
| | "π Recall@10", |
| | "π Recall@50", |
| | "π Precision@10", |
| | "π Precision@50", |
| | "π Open Source", |
| | ] |
| |
|
| | |
| | type_column = ColumnFilter("π·οΈ Type", type="checkboxgroup") |
| | open_source_column = ColumnFilter( |
| | "π Open Source", type="checkboxgroup" |
| | ) |
| |
|
| | |
| | Leaderboard( |
| | value=df, |
| | datatype=[ |
| | "number", |
| | "markdown", |
| | "markdown", |
| | "str", |
| | "number", |
| | "number", |
| | "number", |
| | "number", |
| | "number", |
| | "number", |
| | "number", |
| | "number", |
| | "str", |
| | "str", |
| | ], |
| | select_columns=SelectColumns( |
| | default_selection=default_columns, |
| | cant_deselect=[ |
| | "π Rank", |
| | "π§ Method", |
| | "π Recall@10", |
| | ], |
| | label="Select Columns to Display", |
| | ), |
| | search_columns=[ |
| | "π§ Method", |
| | "π Paper/Details", |
| | "π·οΈ Type", |
| | ], |
| | hide_columns=["π
Date"], |
| | filter_columns=[type_column, open_source_column], |
| | interactive=False, |
| | ) |
| | else: |
| | gr.HTML( |
| | """ |
| | <div style="text-align: center; padding: 50px; background: #f5f5f5; border-radius: 10px;"> |
| | <h3>No Results Found</h3> |
| | <p>Please add JSON evaluation files to the 'results' directory.</p> |
| | <p>See the About tab for the required format.</p> |
| | </div> |
| | """ |
| | ) |
| |
|
| | |
| | with gr.TabItem("βΉοΈ About"): |
| | gr.Markdown( |
| | """ |
| | ## About This Leaderboard |
| | |
| | This leaderboard tracks the performance of various search and retrieval systems on the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search). |
| | |
| | ### π Evaluation Metrics |
| | |
| | - **Recall@K**: The percentage of relevant article chunks retrieved in the top K article chunks |
| | - **Precision@K**: The percentage of retrieved article chunks that are relevant among the top K article chunks |
| | |
| | ### π€ How to Submit |
| | |
| | 1. Run your retrieval on the test queries in DevRev Search Dataset |
| | 2. Submit the results in same format as annotated_queries in the dataset through email to prateek.jain@devrev.ai |
| | 3. Also include a **one-line system detail/link**, the **system type**, and whether it is **open source** |
| | |
| | ### π Resources |
| | |
| | - [Computer by DevRev](https://devrev.ai/meet-computer) |
| | - [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search) |
| | |
| | ### π Acknowledgments |
| | |
| | Inspired by: |
| | - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) |
| | - [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard) |
| | |
| | ### π Citation |
| | |
| | ```bibtex |
| | @misc{devrev_search_leaderboard_2026, |
| | title={DevRev Search Leaderboard}, |
| | author={Research@DevRev}, |
| | year={2026}, |
| | url={https://huggingface.co/spaces/devrev/search} |
| | } |
| | ``` |
| | """, |
| | elem_id="about", |
| | ) |
| |
|
| | |
| | gr.HTML( |
| | f""" |
| | <div style="text-align: center; margin-top: 50px; padding: 20px; border-top: 1px solid #e0e0e0; color: #666;"> |
| | <p> |
| | Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")} |
| | </p> |
| | </div> |
| | """ |
| | ) |
| |
|
| | return demo |
| |
|
| |
|
| | |
| | if __name__ == "__main__": |
| | demo = create_interface() |
| | demo.launch(server_name="0.0.0.0", server_port=7860, share=True, show_api=False) |
| |
|