technophyle committed on
Commit
0815510
·
verified ·
1 Parent(s): 89769f4

Sync from GitHub via hub-sync

Browse files
Files changed (2) hide show
  1. evals/run_eval.py +45 -1
  2. src/rag_system.py +174 -93
evals/run_eval.py CHANGED
@@ -186,7 +186,19 @@ STOPWORDS = {
186
 
187
 
188
  def tokenize_text(text: str):
189
- return re.findall(r"[a-z0-9_./+-]+", (text or "").lower())
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
 
192
  def normalize_keywords(keywords):
@@ -504,6 +516,36 @@ def build_headline_metrics(custom_metrics, audit):
504
  }
505
 
506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  def build_resume_summary(custom_metrics, audit, ragas_report, ragas_error):
508
  lines = [
509
  (
@@ -812,6 +854,7 @@ def run():
812
  category_breakdown = summarize_by_category(details)
813
  ragas_report, ragas_error = run_ragas(rows, outputs)
814
  headline_metrics = build_headline_metrics(custom_metrics, audit)
 
815
  resume_summary = build_resume_summary(custom_metrics, audit, ragas_report, ragas_error)
816
  readiness = benchmark_readiness(audit, ragas_error)
817
 
@@ -836,6 +879,7 @@ def run():
836
  "eval_set_audit": audit,
837
  "headline_metrics": headline_metrics,
838
  "benchmark_readiness": readiness,
 
839
  "ragas": ragas_report,
840
  "ragas_error": ragas_error,
841
  "custom_metrics": custom_metrics,
 
186
 
187
 
188
  def tokenize_text(text: str):
189
+ tokens = []
190
+ for raw_token in re.findall(r"[A-Za-z0-9_./+-]+", text or ""):
191
+ token = raw_token.lower()
192
+ tokens.append(token)
193
+
194
+ camel_parts = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", raw_token).split()
195
+ split_parts = re.split(r"[._/+-]+", token)
196
+ for part in [*camel_parts, *split_parts]:
197
+ normalized = part.strip().lower()
198
+ if normalized and normalized != token:
199
+ tokens.append(normalized)
200
+
201
+ return tokens
202
 
203
 
204
  def normalize_keywords(keywords):
 
516
  }
517
 
518
 
519
+ def build_metric_guidance(custom_metrics, ragas_report):
520
+ retrieval_gate_thresholds = {
521
+ "retrieval_hit_rate": 0.8,
522
+ "top1_hit_rate": 0.8,
523
+ "mrr": 0.75,
524
+ }
525
+ retrieval_gate_pass = all(
526
+ custom_metrics[key] >= threshold
527
+ for key, threshold in retrieval_gate_thresholds.items()
528
+ )
529
+
530
+ next_focus = []
531
+ if custom_metrics["source_recall"] < 0.7:
532
+ next_focus.append("Improve multi-source recall for cross-file and implementation questions.")
533
+ if custom_metrics["duplicate_source_rate"] > 0.15:
534
+ next_focus.append("Reduce duplicate or near-duplicate source chunks before answer generation.")
535
+ if custom_metrics["grounded_answer_rate"] < 0.75:
536
+ next_focus.append("Tighten answer grounding and checklist coverage before presenting this as a broad benchmark.")
537
+ if ragas_report and ragas_report.get("context_precision", 1.0) < 0.7:
538
+ next_focus.append("Treat low RAGAS context precision as a context-selection signal, not as the primary pass/fail gate.")
539
+
540
+ return {
541
+ "primary_gate": "pass" if retrieval_gate_pass else "needs_work",
542
+ "primary_gate_basis": "deterministic_retrieval",
543
+ "primary_gate_thresholds": retrieval_gate_thresholds,
544
+ "ragas_role": "supporting_signal_not_primary_gate",
545
+ "next_focus": next_focus,
546
+ }
547
+
548
+
549
  def build_resume_summary(custom_metrics, audit, ragas_report, ragas_error):
550
  lines = [
551
  (
 
854
  category_breakdown = summarize_by_category(details)
855
  ragas_report, ragas_error = run_ragas(rows, outputs)
856
  headline_metrics = build_headline_metrics(custom_metrics, audit)
857
+ metric_guidance = build_metric_guidance(custom_metrics, ragas_report)
858
  resume_summary = build_resume_summary(custom_metrics, audit, ragas_report, ragas_error)
859
  readiness = benchmark_readiness(audit, ragas_error)
860
 
 
879
  "eval_set_audit": audit,
880
  "headline_metrics": headline_metrics,
881
  "benchmark_readiness": readiness,
882
+ "metric_guidance": metric_guidance,
883
  "ragas": ragas_report,
884
  "ragas_error": ragas_error,
885
  "custom_metrics": custom_metrics,
src/rag_system.py CHANGED
@@ -353,7 +353,15 @@ class CodebaseRAGSystem:
353
 
354
  normalized_history = self._normalize_history(history or [])
355
  question_intent = self._question_intent(question)
356
- search_depth = top_k * 4 if question_intent in {"api", "implementation", "cross_file", "setup"} else top_k * 2
 
 
 
 
 
 
 
 
357
  retrieval_query = self._build_retrieval_query(question, normalized_history)
358
  query_embedding = self.embedder.embed_text(retrieval_query)
359
  semantic_hits = []
@@ -369,7 +377,7 @@ class CodebaseRAGSystem:
369
  )
370
  semantic_hits = self.hybrid_search.normalize_semantic_results(semantic_hits)
371
  fused = self.hybrid_search.reciprocal_rank_fusion(lexical_hits, semantic_hits, top_k=search_depth)
372
- rerank_query = retrieval_query if question_intent in {"api", "implementation", "cross_file", "setup"} else question
373
  reranked = self.hybrid_search.rerank(rerank_query, fused, top_k=search_depth)
374
  reranked = self._prioritize_results(question, retrieval_query, reranked, top_k=top_k)
375
  reranked = self._select_answer_sources(question, reranked, top_k=top_k)
@@ -809,7 +817,7 @@ Do not leave the answer unfinished.
809
  ]
810
  )
811
  if not history:
812
- return normalized
813
 
814
  recent_user = [
815
  turn["content"].strip()
@@ -911,14 +919,6 @@ Do not leave the answer unfinished.
911
  if len(selected) == top_k:
912
  break
913
 
914
- if len(selected) < top_k:
915
- for item in results:
916
- if item in selected:
917
- continue
918
- selected.append(item)
919
- if len(selected) == top_k:
920
- break
921
-
922
  return selected
923
 
924
  @staticmethod
@@ -955,6 +955,8 @@ Do not leave the answer unfinished.
955
  return "general"
956
  if CodebaseRAGSystem._is_repo_overview_question(normalized):
957
  return "overview"
 
 
958
  if any(token in normalized for token in {"error", "invalid", "conflict", "raises", "guard against"}):
959
  return "error_handling"
960
  if any(token in normalized for token in {"how are", "how does", "flow", "across files", "code path"}):
@@ -973,53 +975,33 @@ Do not leave the answer unfinished.
973
  normalized = " ".join((question or "").split())
974
  lowered = normalized.lower()
975
  hints = []
 
976
 
977
  if any(token in lowered for token in {"export", "expose", "import"}):
978
- hints.extend(["package exports", "__init__.py", "public api", "re-export"])
979
- if "how is select exposed to users in sqlmodel" in lowered:
980
- hints.extend(
981
- [
982
- "sqlmodel/__init__.py",
983
- "sqlmodel/sql/expression.py",
984
- "select re-export",
985
- "top-level select import",
986
- ]
987
- )
988
- if "select" in lowered:
989
- hints.extend(
990
- [
991
- "select",
992
- "expression",
993
- "query builder",
994
- "public api",
995
- "sqlmodel/sql/expression.py",
996
- "sqlmodel/__init__.py",
997
- "re-export",
998
- "top-level import",
999
- ]
1000
- )
1001
  if "session.exec" in lowered or ("session" in lowered and "exec" in lowered):
1002
- hints.extend(["session exec", "orm/session.py", "asyncio/session.py"])
1003
- if "relationship" in lowered:
1004
- hints.extend(["relationship", "Relationship", "main.py"])
1005
- if "field" in lowered:
1006
- hints.extend(["Field", "FieldInfo", "main.py"])
1007
- if "create_engine" in lowered:
1008
- hints.extend(["create_engine", "__init__.py", "re-export"])
1009
- if "create_all" in lowered or "metadata" in lowered:
1010
- hints.extend(
1011
- [
1012
- "metadata create_all",
1013
- "table creation",
1014
- "engine",
1015
- "SQLModel.metadata",
1016
- "README.md",
1017
- "sqlmodel/main.py",
1018
- "docs_src",
1019
- ]
1020
- )
1021
  if "__init__" in lowered or "exports" in lowered:
1022
- hints.extend(["sqlmodel/__init__.py", "package exports", "public api"])
 
 
 
 
 
1023
 
1024
  if not hints:
1025
  return normalized
@@ -1069,58 +1051,157 @@ Do not leave the answer unfinished.
1069
 
1070
  def _canonical_path_priority(self, item: dict, question: str) -> int:
1071
  file_path = (item.get("file_path") or "").lower()
1072
- normalized = " ".join((question or "").lower().split())
 
 
 
 
 
 
 
 
 
 
 
 
 
1073
  score = 0
1074
 
1075
- if file_path == "sqlmodel/__init__.py":
1076
- score += 4 if any(token in normalized for token in {"export", "expose", "import", "create_engine", "select"}) else 0
1077
- if file_path == "sqlmodel/sql/expression.py":
1078
- score += 5 if "select" in normalized else 0
1079
- if file_path == "sqlmodel/sql/_expression_select_gen.py":
1080
- score += 2 if "select" in normalized else 0
1081
- if file_path == "sqlmodel/sql/_expression_select_cls.py":
1082
- score += 2 if "select" in normalized else 0
1083
- if file_path == "readme.md":
1084
- score += 4 if any(token in normalized for token in {"metadata", "create_all", "workflow", "readme"}) else 0
1085
- if file_path.startswith("docs_src/"):
1086
- score += 3 if any(token in normalized for token in {"metadata", "create_all", "table", "workflow"}) else 0
1087
- if file_path == "sqlmodel/main.py":
1088
- score += 3 if any(token in normalized for token in {"field", "relationship", "metadata", "table", "sqlmodel"}) else 0
1089
-
1090
- if "__init__.py" in file_path:
1091
- score += 2 if any(token in normalized for token in {"export", "expose", "import", "public api"}) else 0
1092
- if any(token in normalized for token in {"select", "expression"}):
1093
- if "expression" in file_path or "_expression_select" in file_path:
1094
  score += 3
1095
- if normalized == "how is select exposed to users in sqlmodel?":
1096
- if file_path == "sqlmodel/__init__.py":
1097
- score += 6
1098
- if file_path == "sqlmodel/sql/expression.py":
1099
- score += 6
1100
- if "session" in normalized:
1101
- if file_path.endswith("session.py") or "/session.py" in file_path:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1102
  score += 3
1103
- if "relationship" in normalized and file_path.endswith("main.py"):
1104
- score += 2
1105
- if "field" in normalized and file_path.endswith("main.py"):
1106
- score += 2
1107
- if any(token in normalized for token in {"create_engine", "export", "expose"}) and "__init__.py" in file_path:
1108
- score += 2
1109
- if any(token in normalized for token in {"metadata", "create_all", "table"}) and (
1110
- "docs_src/" in file_path or file_path.endswith("main.py") or file_path == "readme.md"
1111
- ):
1112
- score += 2
1113
- if self._is_doc_source(item) and self._question_intent(question) in {
1114
  "api",
1115
  "implementation",
1116
  "cross_file",
1117
  "error_handling",
1118
- "setup",
1119
  }:
1120
  score -= 1
1121
 
1122
  return score
1123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1124
  @staticmethod
1125
  def _is_substantive_assistant_message(content: str) -> bool:
1126
  normalized = " ".join((content or "").strip().lower().split())
 
353
 
354
  normalized_history = self._normalize_history(history or [])
355
  question_intent = self._question_intent(question)
356
+ deep_search_intents = {
357
+ "api",
358
+ "implementation",
359
+ "cross_file",
360
+ "error_handling",
361
+ "setup",
362
+ "tests",
363
+ }
364
+ search_depth = top_k * 4 if question_intent in deep_search_intents else top_k * 2
365
  retrieval_query = self._build_retrieval_query(question, normalized_history)
366
  query_embedding = self.embedder.embed_text(retrieval_query)
367
  semantic_hits = []
 
377
  )
378
  semantic_hits = self.hybrid_search.normalize_semantic_results(semantic_hits)
379
  fused = self.hybrid_search.reciprocal_rank_fusion(lexical_hits, semantic_hits, top_k=search_depth)
380
+ rerank_query = retrieval_query if question_intent in deep_search_intents else question
381
  reranked = self.hybrid_search.rerank(rerank_query, fused, top_k=search_depth)
382
  reranked = self._prioritize_results(question, retrieval_query, reranked, top_k=top_k)
383
  reranked = self._select_answer_sources(question, reranked, top_k=top_k)
 
817
  ]
818
  )
819
  if not history:
820
+ return self._expand_query_for_intent(normalized)
821
 
822
  recent_user = [
823
  turn["content"].strip()
 
919
  if len(selected) == top_k:
920
  break
921
 
 
 
 
 
 
 
 
 
922
  return selected
923
 
924
  @staticmethod
 
955
  return "general"
956
  if CodebaseRAGSystem._is_repo_overview_question(normalized):
957
  return "overview"
958
+ if any(token in normalized for token in {"test", "tests", "pytest", "spec"}):
959
+ return "tests"
960
  if any(token in normalized for token in {"error", "invalid", "conflict", "raises", "guard against"}):
961
  return "error_handling"
962
  if any(token in normalized for token in {"how are", "how does", "flow", "across files", "code path"}):
 
975
  normalized = " ".join((question or "").split())
976
  lowered = normalized.lower()
977
  hints = []
978
+ code_terms = self._query_code_terms(normalized)
979
 
980
  if any(token in lowered for token in {"export", "expose", "import"}):
981
+ hints.extend(["package exports", "__init__.py", "index", "public api", "re-export"])
982
+ if any(token in lowered for token in {"public api", "exposed", "exported"}):
983
+ hints.extend(["public api", "__init__.py", "index"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
984
  if "session.exec" in lowered or ("session" in lowered and "exec" in lowered):
985
+ hints.extend(["session exec", "session.py", "execute", "scalars"])
986
+ if "async" in lowered:
987
+ hints.extend(["async", "await", "asyncio"])
988
+ if any(token in lowered for token in {"relationship", "field", "function", "method", "class"}):
989
+ hints.extend(["class", "function", "method", "metadata"])
990
+ if any(token in lowered for token in {"under the hood", "implementation", "code path", "conversion"}):
991
+ hints.extend(["implementation", "source", "call path", "class", "function"])
992
+ if any(token in lowered for token in {"error", "invalid", "conflict", "raise", "raises", "guard"}):
993
+ hints.extend(["raise", "raises", "exception", "validation", "guard"])
994
+ if any(token in lowered for token in {"test", "tests", "pytest", "spec"}):
995
+ hints.extend(["test", "tests", "pytest", "spec"])
996
+ if any(token in lowered for token in {"create", "setup", "install", "configuration", "metadata", "table"}):
997
+ hints.extend(["create", "setup", "configure", "initialize", "schema", "README.md", "docs"])
 
 
 
 
 
 
998
  if "__init__" in lowered or "exports" in lowered:
999
+ hints.extend(["__init__.py", "package exports", "public api"])
1000
+
1001
+ for term in sorted(code_terms):
1002
+ parts = [part for part in re.split(r"[._/-]+", term) if len(part) > 2]
1003
+ if len(parts) > 1:
1004
+ hints.append(" ".join(parts))
1005
 
1006
  if not hints:
1007
  return normalized
 
1051
 
1052
  def _canonical_path_priority(self, item: dict, question: str) -> int:
1053
  file_path = (item.get("file_path") or "").lower()
1054
+ source_text = " ".join(
1055
+ [
1056
+ file_path,
1057
+ str(item.get("symbol_name") or "").lower(),
1058
+ str(item.get("signature") or "").lower(),
1059
+ ]
1060
+ )
1061
+ basename = file_path.rsplit("/", 1)[-1]
1062
+ stem = basename.rsplit(".", 1)[0]
1063
+ symbol_name = str(item.get("symbol_name") or "").lower()
1064
+ signature = str(item.get("signature") or "").lower()
1065
+ intent = self._question_intent(question)
1066
+ code_terms = self._query_code_terms(question)
1067
+ path_fragments = self._query_path_fragments(question)
1068
  score = 0
1069
 
1070
+ for fragment in path_fragments:
1071
+ if file_path == fragment or file_path.endswith(f"/{fragment}") or fragment in file_path:
1072
+ score += 8
1073
+
1074
+ matched_terms = {term for term in code_terms if term in source_text}
1075
+ score += min(len(matched_terms), 6)
1076
+
1077
+ for term in code_terms:
1078
+ if term == basename or term == stem:
1079
+ score += 4
1080
+ elif term in basename:
1081
+ score += 3
1082
+ if term and term in symbol_name:
 
 
 
 
 
 
1083
  score += 3
1084
+ if term and term in file_path:
1085
+ score += 2
1086
+ if term and term in signature:
1087
+ score += 1
1088
+
1089
+ if intent == "api":
1090
+ if basename == "__init__.py" or stem in {"index", "public", "api"}:
1091
+ score += 4
1092
+ if any(token in file_path for token in {"api", "route", "router", "controller"}):
1093
+ score += 2
1094
+ if intent in {"implementation", "cross_file"}:
1095
+ if not self._is_doc_source(item):
1096
+ score += 2
1097
+ if item.get("symbol_type") != "fallback_chunk":
1098
+ score += 1
1099
+ if intent == "tests":
1100
+ if (
1101
+ file_path.startswith("tests/")
1102
+ or "/tests/" in file_path
1103
+ or basename.startswith("test_")
1104
+ or basename.endswith("_test.py")
1105
+ or basename.endswith(".test.js")
1106
+ or basename.endswith(".spec.js")
1107
+ or basename.endswith(".test.ts")
1108
+ or basename.endswith(".spec.ts")
1109
+ ):
1110
+ score += 5
1111
+ if intent == "error_handling":
1112
+ if any(token in source_text for token in {"raise", "except", "error", "invalid", "exception"}):
1113
+ score += 3
1114
+ if "test" in file_path:
1115
+ score += 1
1116
+ if intent == "setup":
1117
+ setup_files = {
1118
+ "readme.md",
1119
+ "package.json",
1120
+ "pyproject.toml",
1121
+ "requirements.txt",
1122
+ "dockerfile",
1123
+ "docker-compose.yml",
1124
+ "compose.yml",
1125
+ }
1126
+ if basename in setup_files or any(token in file_path for token in {"config", "settings", "setup"}):
1127
  score += 3
1128
+ if any(token in source_text for token in {"create", "configure", "initialize", "metadata", "schema"}):
1129
+ score += 1
1130
+ if intent in {"docs", "overview"} and self._is_doc_source(item):
1131
+ score += self._doc_priority(item) + 1
1132
+ if self._is_doc_source(item) and intent in {
 
 
 
 
 
 
1133
  "api",
1134
  "implementation",
1135
  "cross_file",
1136
  "error_handling",
1137
+ "tests",
1138
  }:
1139
  score -= 1
1140
 
1141
  return score
1142
 
1143
+ @staticmethod
1144
+ def _query_code_terms(text: str) -> set:
1145
+ stopwords = {
1146
+ "about",
1147
+ "against",
1148
+ "also",
1149
+ "and",
1150
+ "are",
1151
+ "between",
1152
+ "code",
1153
+ "does",
1154
+ "file",
1155
+ "for",
1156
+ "from",
1157
+ "happen",
1158
+ "happens",
1159
+ "how",
1160
+ "into",
1161
+ "main",
1162
+ "me",
1163
+ "model",
1164
+ "models",
1165
+ "path",
1166
+ "project",
1167
+ "that",
1168
+ "the",
1169
+ "this",
1170
+ "through",
1171
+ "under",
1172
+ "using",
1173
+ "what",
1174
+ "when",
1175
+ "where",
1176
+ "which",
1177
+ "with",
1178
+ }
1179
+ raw_terms = re.findall(
1180
+ r"[A-Za-z_][A-Za-z0-9_]*(?:[./-][A-Za-z_][A-Za-z0-9_]*)*",
1181
+ text or "",
1182
+ )
1183
+ terms = set()
1184
+ for raw_term in raw_terms:
1185
+ expanded = {raw_term}
1186
+ expanded.update(re.split(r"[._/-]+", raw_term))
1187
+ expanded.update(re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", raw_term).split())
1188
+
1189
+ for term in expanded:
1190
+ normalized = term.strip("_./-").lower()
1191
+ if len(normalized) < 3 or normalized in stopwords:
1192
+ continue
1193
+ terms.add(normalized)
1194
+ return terms
1195
+
1196
+ @staticmethod
1197
+ def _query_path_fragments(text: str) -> set:
1198
+ fragments = set()
1199
+ for fragment in re.findall(r"[A-Za-z0-9_./-]+(?:\.[A-Za-z0-9_./-]+|/[A-Za-z0-9_./-]+)", text or ""):
1200
+ normalized = fragment.strip().strip("./").lower()
1201
+ if "/" in normalized or "." in normalized:
1202
+ fragments.add(normalized)
1203
+ return fragments
1204
+
1205
  @staticmethod
1206
  def _is_substantive_assistant_message(content: str) -> bool:
1207
  normalized = " ".join((content or "").strip().lower().split())