Sync from GitHub via hub-sync
- README.md +54 -9
- server_app.py +11 -1
- src/document_processor.py +21 -12
- src/hybrid_search.py +6 -4
- src/qdrant_keepalive.py +83 -0
- src/rag_system.py +38 -12
- src/vector_store.py +17 -7
README.md
CHANGED
@@ -1,6 +1,5 @@
 ---
 title: Code Compass API
-emoji: 🚀
 colorFrom: blue
 colorTo: indigo
 sdk: docker
@@ -9,13 +8,59 @@ app_port: 7860
 
 # Code Compass Backend
 
-FastAPI backend for a
-
-
-
-
-
-
-
-
+FastAPI backend for Code Compass, a personal full-stack RAG project that indexes public GitHub repositories and answers questions with grounded source citations.
+
+## What This Demonstrates
+
+- End-to-end AI application design, not just a prompt wrapper
+- Backend API design with FastAPI, Pydantic validation, and session-scoped state
+- Code-aware retrieval using tree-sitter chunking, vector search, BM25, rank fusion, and reranking
+- Grounded answer generation with file-level citations
+- Deployment-aware tradeoffs for cost, model choice, and free-tier infrastructure
+- Evaluation workflow prepared for retrieval and answer-quality metrics
+
+## Backend Responsibilities
+
+- Clone a public GitHub repository into temporary storage
+- Filter and chunk source files for retrieval
+- Generate embeddings and store chunks in Qdrant
+- Maintain lightweight repository and session metadata in SQLite
+- Run indexing as a background task
+- Retrieve evidence with semantic search, lexical search, fusion, and reranking
+- Generate answers from the selected context and return citations to the UI
+- Delete cloned repository files after indexing
+
+## Runtime Configuration
+
+Local development is configured for higher-quality experimentation:
+
+- `LLM_PROVIDER=bedrock`
+- `EMBEDDING_PROVIDER=bedrock`
+- Claude on Amazon Bedrock for answer generation
+- Cohere Embed on Amazon Bedrock for semantic retrieval
+
+Production is configured for lower-cost hosting:
+
+- `LLM_PROVIDER=groq`
+- `EMBEDDING_PROVIDER=local`
+- Groq-hosted Llama for answer generation
+- Local sentence-transformer embeddings for retrieval
+- Qdrant Cloud for vector storage
+
+## Qdrant Keepalive
+
+The backend starts a lightweight Qdrant keepalive scheduler when `QDRANT_URL` is configured. It calls the configured collection every 12 hours by default so a free-tier Qdrant cluster does not become inactive while the backend process is running.
+
+Configuration:
+
+- `QDRANT_URL`
+- `QDRANT_API_KEY`
+- `QDRANT_COLLECTION=repo_qa_chunks`
+- `QDRANT_KEEPALIVE_ENABLED=true`
+- `QDRANT_KEEPALIVE_INTERVAL_SECONDS=43200`
+
+The main repository also includes a GitHub Actions keepalive workflow for cases where the backend host is asleep.
+
+## Metrics
+
+Metrics will be added after the next benchmark rerun. The evaluation harness is set up to report retrieval hit rate, top-1 hit rate, mean reciprocal rank, source recall, grounded answer rate, checklist pass rate, and optional RAGAS judge metrics.
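For reference, a production deployment matching the configuration above could use an environment file shaped like this sketch (the endpoint and key values are illustrative placeholders, not real credentials):

    LLM_PROVIDER=groq
    EMBEDDING_PROVIDER=local
    QDRANT_URL=https://example-cluster.qdrant.io   # illustrative cluster URL
    QDRANT_API_KEY=replace-me                      # illustrative placeholder
    QDRANT_COLLECTION=repo_qa_chunks
    QDRANT_KEEPALIVE_ENABLED=true
    QDRANT_KEEPALIVE_INTERVAL_SECONDS=43200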
server_app.py
CHANGED
@@ -8,6 +8,7 @@ from pydantic import BaseModel, Field, HttpUrl
 from dotenv import load_dotenv
 
 from src.bedrock_claude import BedrockTransientError, is_bedrock_retryable_error
+from src.qdrant_keepalive import QdrantKeepAliveScheduler
 from src.rag_system import CodebaseRAGSystem
 
 load_dotenv(Path(__file__).with_name(".env"))
@@ -34,6 +35,7 @@ app.add_middleware(
 )
 
 rag_system: Optional[CodebaseRAGSystem] = None
+qdrant_keepalive: Optional[QdrantKeepAliveScheduler] = None
 
 
 class RepoIndexRequest(BaseModel):
@@ -60,9 +62,17 @@ def require_session_id(x_session_id: Optional[str] = Header(None, alias="X-Sessi
 
 @app.on_event("startup")
 def startup():
-    global rag_system
+    global qdrant_keepalive, rag_system
     Path("./data").mkdir(exist_ok=True)
     rag_system = CodebaseRAGSystem()
+    qdrant_keepalive = QdrantKeepAliveScheduler(rag_system.vector_store)
+    qdrant_keepalive.start()
+
+
+@app.on_event("shutdown")
+def shutdown():
+    if qdrant_keepalive is not None:
+        qdrant_keepalive.stop()
 
 
 @app.get("/")
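A side note on the wiring above: `@app.on_event("startup")` and `@app.on_event("shutdown")` are deprecated in recent FastAPI releases in favor of a lifespan handler. A minimal sketch of the equivalent wiring under that API, reusing the same imports as the diff (an alternative shape, not what this commit ships):

    from contextlib import asynccontextmanager
    from pathlib import Path

    from fastapi import FastAPI

    from src.qdrant_keepalive import QdrantKeepAliveScheduler
    from src.rag_system import CodebaseRAGSystem


    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # mirrors startup(): create the data dir, build the RAG system, start keepalive
        Path("./data").mkdir(exist_ok=True)
        app.state.rag_system = CodebaseRAGSystem()
        app.state.qdrant_keepalive = QdrantKeepAliveScheduler(app.state.rag_system.vector_store)
        app.state.qdrant_keepalive.start()
        yield
        # mirrors shutdown(): stop the keepalive thread
        app.state.qdrant_keepalive.stop()


    app = FastAPI(lifespan=lifespan)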
src/document_processor.py
CHANGED
@@ -5,8 +5,9 @@ import pypdf
 
 
 class DocumentProcessor:
-
-
+    # FIX: 512 chars was too small — caused mid-thought splits that hurt faithfulness.
+    # 1200 chars with 150 overlap keeps full function signatures + docstrings intact.
+    def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 150):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
 
@@ -19,7 +20,6 @@ class DocumentProcessor:
                 text += page.extract_text() + "\n"
         except Exception as e:
             raise ValueError(f"Error reading PDF: {str(e)}")
-
         return text.strip()
 
     def chunk_text(self, text: str) -> List[str]:
@@ -35,24 +35,34 @@ class DocumentProcessor:
             chunk = text[start:end]
 
             if end < text_length:
+                # FIX: prefer paragraph breaks > sentence breaks > newlines.
+                # Previously only checked period + newline, missing paragraph boundaries.
+                last_double_newline = chunk.rfind("\n\n")
                 last_period = chunk.rfind(".")
                 last_newline = chunk.rfind("\n")
-                break_point = max(last_period, last_newline)
 
-                if
-
-
+                if last_double_newline > self.chunk_size * 0.4:
+                    break_point = last_double_newline + 1
+                elif last_period > self.chunk_size * 0.5:
+                    break_point = last_period + 1
+                elif last_newline > self.chunk_size * 0.4:
+                    break_point = last_newline + 1
+                else:
+                    break_point = self.chunk_size
+
+                chunk = chunk[:break_point]
+                end = start + break_point
 
-
+            stripped = chunk.strip()
+            if stripped:
+                chunks.append(stripped)
 
             start = end - self.chunk_overlap
 
         return [c for c in chunks if c]
 
     def process_document(self, file_path: str) -> Tuple[str, List[str]]:
-
         file_ext = Path(file_path).suffix.lower()
-
         if file_ext == ".pdf":
             text = self.extract_text_from_pdf(file_path)
         elif file_ext == ".txt":
@@ -62,7 +72,6 @@ class DocumentProcessor:
             raise ValueError(f"Unsupported file type: {file_ext}")
 
         chunks = self.chunk_text(text)
-
         return text, chunks
 
     @staticmethod
@@ -71,4 +80,4 @@ class DocumentProcessor:
         with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                 hash_md5.update(chunk)
-        return hash_md5.hexdigest()
+        return hash_md5.hexdigest()
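A quick way to sanity-check the new break-point preference, sketched as a throwaway script (the synthetic text and the script itself are illustrative, not part of the repo):

    from src.document_processor import DocumentProcessor

    processor = DocumentProcessor()  # defaults: chunk_size=1200, chunk_overlap=150
    text = "Intro paragraph.\n\n" + ("A sentence about the API. " * 120)
    chunks = processor.chunk_text(text)

    # With the fix, interior chunks should end on a paragraph break or a
    # sentence-ending period instead of splitting mid-sentence.
    for chunk in chunks:
        print(repr(chunk[-30:]))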
src/hybrid_search.py
CHANGED
@@ -1,4 +1,3 @@
-import math
 import re
 from collections import defaultdict
 from typing import List
@@ -6,7 +5,6 @@ from typing import List
 from rank_bm25 import BM25Okapi
 from sentence_transformers import CrossEncoder
 
-
 TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_./:-]*")
 
 
@@ -27,7 +25,6 @@ class HybridSearchEngine:
     def bm25_search(self, chunks: List[dict], query: str, top_k: int = 12) -> List[dict]:
         if not chunks:
             return []
-
         tokens = tokenize(query)
         if not tokens:
             return []
@@ -73,6 +70,11 @@ class HybridSearchEngine:
         return merged[:top_k]
 
     def rerank(self, query: str, candidates: List[dict], top_k: int = 6) -> List[dict]:
+        """
+        FIX: top_k now defaults to 6 and callers should pass a small final number (4-6),
+        NOT search_depth (which was up to 120). Reranking 120 items then dumping them
+        all into the LLM context was the main faithfulness killer.
+        """
         if not candidates:
             return []
@@ -99,4 +101,4 @@ class HybridSearchEngine:
             enriched["semantic_rank"] = rank
             enriched["semantic_score"] = float(item.get("semantic_score", 0.0))
             normalized.append(enriched)
-        return normalized
+        return normalized
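The call pattern this docstring asks for, sketched end to end (the chunk payloads are illustrative and the engine is assumed to construct with defaults; real chunks carry file and line metadata as well):

    from src.hybrid_search import HybridSearchEngine

    engine = HybridSearchEngine()
    query = "where is request auth handled?"

    # illustrative chunk dicts; shape assumes a "content" field like the rest of the codebase
    chunks = [
        {"content": "def authenticate(request): ..."},
        {"content": "def parse_config(path): ..."},
    ]

    lexical_hits = engine.bm25_search(chunks, query, top_k=12)
    # in the app the second list comes from vector search via normalize_semantic_results;
    # the lexical list is reused here just to exercise the fusion + rerank path
    fused = engine.reciprocal_rank_fusion(lexical_hits, lexical_hits, top_k=12)
    final = engine.rerank(query, fused, top_k=6)  # small final number, not search_depth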
src/qdrant_keepalive.py
ADDED
@@ -0,0 +1,83 @@
+import os
+import threading
+from typing import Optional
+
+from src.vector_store import QdrantVectorStore
+
+
+class QdrantKeepAliveScheduler:
+    def __init__(self, vector_store: QdrantVectorStore):
+        self.vector_store = vector_store
+        self.interval_seconds = self._interval_seconds()
+        self.run_on_start = self._env_flag("QDRANT_KEEPALIVE_RUN_ON_START", True)
+        self.keepalive_enabled = self._env_flag("QDRANT_KEEPALIVE_ENABLED", True)
+        self.enabled = self.keepalive_enabled and self.vector_store.is_remote()
+        self._stop_event = threading.Event()
+        self._thread: Optional[threading.Thread] = None
+
+    def start(self):
+        if not self.enabled:
+            reason = (
+                "disabled by QDRANT_KEEPALIVE_ENABLED"
+                if not self.keepalive_enabled
+                else "set QDRANT_URL to enable remote Qdrant pings"
+            )
+            print(
+                f"[qdrant-keepalive] Disabled; {reason}",
+                flush=True,
+            )
+            return
+        if self._thread and self._thread.is_alive():
+            return
+
+        self._stop_event.clear()
+        self._thread = threading.Thread(
+            target=self._run,
+            name="qdrant-keepalive",
+            daemon=True,
+        )
+        self._thread.start()
+        print(
+            f"[qdrant-keepalive] Started interval_seconds={self.interval_seconds}",
+            flush=True,
+        )
+
+    def stop(self):
+        self._stop_event.set()
+        if self._thread and self._thread.is_alive():
+            self._thread.join(timeout=5)
+        self._thread = None
+
+    def _run(self):
+        if self.run_on_start:
+            self._ping()
+
+        while not self._stop_event.wait(self.interval_seconds):
+            self._ping()
+
+    def _ping(self):
+        try:
+            stats = self.vector_store.keep_alive()
+            print(
+                "[qdrant-keepalive] Ping succeeded "
+                f"collection={stats['collection_name']} "
+                f"points={stats['total_vectors']}",
+                flush=True,
+            )
+        except Exception as exc:
+            print(f"[qdrant-keepalive] Ping failed: {exc}", flush=True)
+
+    @staticmethod
+    def _env_flag(name: str, default: bool) -> bool:
+        value = os.getenv(name)
+        if value is None:
+            return default
+        return value.strip().lower() not in {"0", "false", "no", "off"}
+
+    @staticmethod
+    def _interval_seconds() -> int:
+        value = os.getenv("QDRANT_KEEPALIVE_INTERVAL_SECONDS", "43200")
+        try:
+            return max(60, int(value))
+        except ValueError:
+            return 43200
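Standalone usage of the new scheduler, as a minimal sketch (the embedding dimension is illustrative; in the app the store comes from `CodebaseRAGSystem` and start/stop are driven by the FastAPI hooks shown earlier):

    from src.qdrant_keepalive import QdrantKeepAliveScheduler
    from src.vector_store import QdrantVectorStore

    store = QdrantVectorStore(embedding_dim=384)  # illustrative dimension
    keepalive = QdrantKeepAliveScheduler(store)

    # start() only spawns the daemon thread when QDRANT_URL is set and
    # QDRANT_KEEPALIVE_ENABLED has not been turned off; otherwise it logs and returns
    keepalive.start()

    # ... application serves traffic ...

    keepalive.stop()  # signals the thread and joins it with a 5s timeout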
src/rag_system.py
CHANGED
@@ -333,7 +333,7 @@ class CodebaseRAGSystem:
         session_key: str,
         question: str,
         top_k: int = 8,
-        history
+        history=None,
     ) -> dict:
         session = get_db_session(self.database_url)
         try:
@@ -369,8 +369,10 @@ class CodebaseRAGSystem:
                 else top_k * shallow_multiplier
             )
             search_depth = max(top_k, min(search_depth, 120))
+
             retrieval_query = self._build_retrieval_query(question, normalized_history)
             query_embedding = self.embedder.embed_text(retrieval_query)
+
             semantic_hits = []
             for score, meta in self.vector_store.search(query_embedding, k=search_depth, repo_filter=repo_id):
                 serialized = dict(meta)
@@ -384,6 +386,7 @@ class CodebaseRAGSystem:
             )
             semantic_hits = self.hybrid_search.normalize_semantic_results(semantic_hits)
             fused = self.hybrid_search.reciprocal_rank_fusion(lexical_hits, semantic_hits, top_k=search_depth)
+
             path_hits = self._path_intent_search(
                 self.repo_chunks[repo_id],
                 question,
@@ -391,13 +394,25 @@ class CodebaseRAGSystem:
                 top_k=search_depth,
             )
             fused = self._merge_ranked_candidates(fused, path_hits, top_k=search_depth)
+
             rerank_query = retrieval_query if question_intent in deep_search_intents else question
-            reranked = self.hybrid_search.rerank(rerank_query, fused, top_k=search_depth)
+
+            # FIX: rerank to a small candidate pool first (20), then let
+            # _prioritize_results and _select_answer_sources trim to final top_k.
+            # Previously rerank was called with search_depth (up to 120), meaning
+            # the LLM received far too many chunks and faithfulness dropped.
+            rerank_pool = min(search_depth, 20)
+            reranked = self.hybrid_search.rerank(rerank_query, fused, top_k=rerank_pool)
+
             reranked = self._prioritize_results(question, retrieval_query, reranked, top_k=top_k)
-            reranked = self._select_answer_sources(question, reranked, top_k=top_k)
 
+            # FIX: cap final sources at 5 instead of top_k (8).
+            # 5 sources × 1500 chars = ~7500 chars context, which the LLM handles well.
+            # 8 sources × 2500 chars = ~20000 chars, which causes lost-in-the-middle issues.
+            final_top_k = min(top_k, 5)
+            reranked = self._select_answer_sources(question, reranked, top_k=final_top_k)
-
 
+            answer = self._generate_answer(repo, question, reranked, normalized_history)
             return answer
         finally:
             session.close()
@@ -411,12 +426,13 @@ class CodebaseRAGSystem:
         finally:
             session.close()
 
+
     def _generate_answer(
         self,
-        repo
+        repo,
         question: str,
-        sources:
-        history
+        sources: list,
+        history=None,
     ) -> dict:
         if not sources:
             return {
@@ -428,7 +444,10 @@ class CodebaseRAGSystem:
 
         context_blocks = []
         slim_sources = []
+
         for index, source in enumerate(sources, start=1):
+            content_preview = source["content"][:1500]
+
             context_blocks.append(
                 "\n".join(
                     [
@@ -436,7 +455,7 @@ class CodebaseRAGSystem:
                         f"File: {source['file_path']}",
                         f"Symbol: {source['symbol_name']}",
                         f"Lines: {source['line_start']}-{source['line_end']}",
-                        source["content"][:2500],
+                        content_preview,
                     ]
                 )
             )
@@ -495,17 +514,23 @@ Rules:
 """
 
         joined_context = "\n\n".join(context_blocks)
+
+        # FIX: context is placed BEFORE the question (prompt ordering fix).
         user_prompt = f"""
 Repository: {repo.owner}/{repo.name}
-
+
+Context from the codebase:
+{joined_context}
+
 Recent conversation:
 {self._format_history(history or [])}
 
-
-{
+Now answer this question using only the context above:
+{question}
 """
 
         answer_text, finish_reason = self._generate_markdown_response(system_prompt, user_prompt)
+
         if self._looks_incomplete(answer_text, finish_reason):
             repair_prompt = f"""
 The draft answer below appears to be cut off or incomplete.
@@ -519,7 +544,7 @@ Draft answer:
             f"{user_prompt.strip()}\n\n{repair_prompt.strip()}",
         )
         if self._looks_incomplete(answer_text, finish_reason):
-            short_prompt =
+            short_prompt = """
 Answer the question again, but keep it concise and complete.
 Use 2 short paragraphs or 4-6 bullets max.
 Do not leave the answer unfinished.
@@ -528,6 +553,7 @@ Do not leave the answer unfinished.
             system_prompt,
             f"{user_prompt.strip()}\n\n{short_prompt.strip()}",
         )
+
         answer_text = self._finalize_answer(answer_text)
         confidence = self._estimate_confidence(sources)
         summary = " ".join(answer_text.split())[:160] if answer_text else ""
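Net effect of the two FIX blocks above, restated as a small sketch using only the numbers quoted in the comments:

    # before: rerank saw up to search_depth candidates, and 8 sources x 2500 chars
    #         put roughly 20,000 chars in front of the LLM
    # after:  the cross-encoder sees at most 20 candidates, and at most
    #         5 sources x 1500 chars keep the context near 7,500 chars
    search_depth = 120
    rerank_pool = min(search_depth, 20)   # 20
    final_top_k = min(8, 5)               # 5
    print(rerank_pool, final_top_k, final_top_k * 1500)  # 20 5 7500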
src/vector_store.py
CHANGED
@@ -11,18 +11,18 @@ class QdrantVectorStore:
         self.embedding_dim = embedding_dim
         self.collection_name = os.getenv("QDRANT_COLLECTION", "repo_qa_chunks")
         self.upsert_batch_size = max(1, int(os.getenv("QDRANT_UPSERT_BATCH_SIZE", "64")))
+        self.qdrant_url = self._clean_env("QDRANT_URL")
+        self.qdrant_api_key = self._clean_env("QDRANT_API_KEY")
+        self.timeout = int(os.getenv("QDRANT_TIMEOUT_SECONDS", "120"))
         self.client = self._create_client()
         self._ensure_collection()
 
     def _create_client(self):
-        url = self._clean_env("QDRANT_URL")
-        api_key = self._clean_env("QDRANT_API_KEY")
-        timeout = int(os.getenv("QDRANT_TIMEOUT_SECONDS", "120"))
-        if url:
+        if self.qdrant_url:
             return QdrantClient(
-                url=url,
-                api_key=api_key,
-                timeout=timeout,
+                url=self.qdrant_url,
+                api_key=self.qdrant_api_key,
+                timeout=self.timeout,
                 check_compatibility=False,
             )
         return QdrantClient(":memory:")
@@ -149,6 +149,16 @@ class QdrantVectorStore:
     def load(self):
         self._ensure_collection()
 
+    def is_remote(self) -> bool:
+        return self.qdrant_url is not None
+
+    def keep_alive(self) -> dict:
+        info = self.client.get_collection(self.collection_name)
+        return {
+            "total_vectors": info.points_count or 0,
+            "collection_name": self.collection_name,
+        }
+
     def get_stats(self) -> dict:
         info = self.client.get_collection(self.collection_name)
         return {
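How the keepalive scheduler consumes the two new methods, sketched (the embedding dimension is illustrative):

    from src.vector_store import QdrantVectorStore

    store = QdrantVectorStore(embedding_dim=384)  # illustrative dimension

    if store.is_remote():             # True only when QDRANT_URL is set
        stats = store.keep_alive()    # one cheap get_collection round trip
        print(stats["collection_name"], stats["total_vectors"])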
|