[KM-437][DB] db pipeline + metadata

#1
Files changed (46) hide show
  1. .dockerignore +0 -6
  2. .gitattributes +0 -1
  3. .gitignore +1 -13
  4. .vscode/launch.json +0 -25
  5. Dockerfile +0 -2
  6. README.md +0 -2
  7. main.py +0 -2
  8. pyproject.toml +10 -19
  9. src/agents/chatbot.py +1 -11
  10. src/agents/orchestration.py +0 -5
  11. src/api/v1/chat.py +17 -80
  12. src/api/v1/db_client.py +3 -471
  13. src/api/v1/document.py +128 -43
  14. src/config/agents/system_prompt.md +2 -1
  15. src/config/settings.py +0 -5
  16. src/database_client/database_client_service.py +0 -164
  17. src/db/postgres/init_db.py +1 -43
  18. src/db/postgres/models.py +0 -16
  19. src/document/document_service.py +1 -17
  20. src/knowledge/parquet_service.py +0 -77
  21. src/knowledge/processing_service.py +56 -145
  22. src/models/credentials.py +0 -164
  23. src/models/sql_query.py +0 -8
  24. src/models/structured_output.py +0 -4
  25. src/pipeline/db_pipeline/__init__.py +0 -3
  26. src/pipeline/db_pipeline/db_pipeline_service.py +0 -302
  27. src/pipeline/db_pipeline/extractor.py +0 -283
  28. src/pipeline/document_pipeline/__init__.py +0 -0
  29. src/pipeline/document_pipeline/document_pipeline.py +0 -94
  30. src/query/__init__.py +0 -0
  31. src/query/base.py +0 -32
  32. src/query/executors/__init__.py +0 -0
  33. src/query/executors/db_executor.py +0 -648
  34. src/query/executors/tabular.py +0 -287
  35. src/query/query_executor.py +0 -42
  36. src/rag/base.py +0 -20
  37. src/rag/retriever.py +48 -24
  38. src/rag/retrievers/__init__.py +0 -0
  39. src/rag/retrievers/baseline.py +0 -76
  40. src/rag/retrievers/document.py +0 -158
  41. src/rag/retrievers/schema.py +0 -411
  42. src/rag/router.py +0 -179
  43. src/storage/az_blob/az_blob.py +0 -34
  44. src/tools/search.py +3 -3
  45. src/utils/db_credential_encryption.py +0 -70
  46. uv.lock +10 -440
.dockerignore DELETED
@@ -1,6 +0,0 @@
1
- .venv
2
- software/
3
- __pycache__
4
- *.py[oc]
5
- .env
6
- .env.*
 
 
 
 
 
 
 
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- software/** filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
.gitignore CHANGED
@@ -26,10 +26,6 @@ test/users/user_accounts.csv
26
  .env.prd
27
  .env.example
28
 
29
- CLAUDE.md
30
-
31
- /experiments
32
- src/rag/experiments/
33
  erd/
34
  playground/
35
  playground_retriever.py
@@ -37,12 +33,4 @@ playground_chat.py
37
  playground_flush_cache.py
38
  playground_create_user.py
39
  API_CONTRACT.md
40
- context_engineering/
41
- sample_file/
42
- test_tesseract.py
43
-
44
- # Windows binaries — installed via apt in Docker instead
45
- software/
46
-
47
- tests/
48
- .claude/
 
26
  .env.prd
27
  .env.example
28
 
 
 
 
 
29
  erd/
30
  playground/
31
  playground_retriever.py
 
33
  playground_flush_cache.py
34
  playground_create_user.py
35
  API_CONTRACT.md
36
+ context_engineering/
 
 
 
 
 
 
 
 
.vscode/launch.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- // Use IntelliSense to learn about possible attributes.
3
- // Hover to view descriptions of existing attributes.
4
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
- "version": "0.2.0",
6
- "configurations": [
7
- {
8
- "name": "DataEyond: FastAPI (debug)",
9
- "type": "debugpy",
10
- "request": "launch",
11
- "module": "uvicorn",
12
- "args": [
13
- "main:app",
14
- "--host", "0.0.0.0",
15
- "--port", "7860",
16
- "--reload"
17
- ],
18
- "jinja": true,
19
- "justMyCode": true,
20
- "envFile": "${workspaceFolder}/.env",
21
- "console": "integratedTerminal",
22
- "cwd": "${workspaceFolder}"
23
- }
24
- ]
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile CHANGED
@@ -12,8 +12,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
12
  libpq-dev \
13
  gcc \
14
  libgomp1 \
15
- tesseract-ocr \
16
- poppler-utils \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
  RUN addgroup --system app && \
 
12
  libpq-dev \
13
  gcc \
14
  libgomp1 \
 
 
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
  RUN addgroup --system app && \
README.md CHANGED
@@ -11,8 +11,6 @@ short_description: AI Agent core service
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
 
13
 
14
- # Agentic Service Data Eyond
15
-
16
  How to run:
17
  `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`
18
 
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
 
13
 
 
 
14
  How to run:
15
  `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`
16
 
main.py CHANGED
@@ -10,7 +10,6 @@ from src.api.v1.chat import router as chat_router
10
  from src.api.v1.room import router as room_router
11
  from src.api.v1.users import router as users_router
12
  from src.api.v1.knowledge import router as knowledge_router
13
- from src.api.v1.db_client import router as db_client_router
14
  from src.db.postgres.init_db import init_db
15
  import uvicorn
16
 
@@ -36,7 +35,6 @@ app.include_router(document_router)
36
  app.include_router(knowledge_router)
37
  app.include_router(room_router)
38
  app.include_router(chat_router)
39
- app.include_router(db_client_router)
40
 
41
 
42
  @app.on_event("startup")
 
10
  from src.api.v1.room import router as room_router
11
  from src.api.v1.users import router as users_router
12
  from src.api.v1.knowledge import router as knowledge_router
 
13
  from src.db.postgres.init_db import init_db
14
  import uvicorn
15
 
 
35
  app.include_router(knowledge_router)
36
  app.include_router(room_router)
37
  app.include_router(chat_router)
 
38
 
39
 
40
  @app.on_event("startup")
pyproject.toml CHANGED
@@ -79,18 +79,6 @@ dependencies = [
79
  "jsonpatch>=1.33",
80
  "pymongo>=4.14.0",
81
  "psycopg2>=2.9.11",
82
- # --- SQL parsing / guardrails ---
83
- "sqlglot>=25.0.0",
84
- # --- User-DB connectors (db_pipeline) ---
85
- "pymysql>=1.1.1",
86
- "pymssql>=2.3.0",
87
- "sqlalchemy-bigquery>=1.11.0",
88
- "snowflake-sqlalchemy>=1.7.0",
89
- # --- OCR (pdf processing) ---
90
- "pdf2image>=1.17.0",
91
- "pytesseract>=0.3.13",
92
- "pypdf2>=3.0.1",
93
- "pyarrow>=24.0.0",
94
  ]
95
 
96
  [project.optional-dependencies]
@@ -104,6 +92,16 @@ dev = [
104
  "pre-commit==4.0.1",
105
  ]
106
 
 
 
 
 
 
 
 
 
 
 
107
  [tool.hatch.build.targets.wheel]
108
  packages = ["src/agent_service"]
109
 
@@ -135,10 +133,3 @@ testpaths = ["tests"]
135
  filterwarnings = [
136
  "ignore::DeprecationWarning",
137
  ]
138
-
139
- [dependency-groups]
140
- dev = [
141
- "pytest>=8.3.4",
142
- "pytest-asyncio>=0.24.0",
143
- "ruff>=0.8.4",
144
- ]
 
79
  "jsonpatch>=1.33",
80
  "pymongo>=4.14.0",
81
  "psycopg2>=2.9.11",
 
 
 
 
 
 
 
 
 
 
 
 
82
  ]
83
 
84
  [project.optional-dependencies]
 
92
  "pre-commit==4.0.1",
93
  ]
94
 
95
+ [tool.uv]
96
+ dev-dependencies = [
97
+ "pytest==8.3.4",
98
+ "pytest-asyncio==0.24.0",
99
+ "pytest-cov==6.0.0",
100
+ "ruff==0.8.4",
101
+ "mypy==1.13.0",
102
+ "pre-commit==4.0.1",
103
+ ]
104
+
105
  [tool.hatch.build.targets.wheel]
106
  packages = ["src/agent_service"]
107
 
 
133
  filterwarnings = [
134
  "ignore::DeprecationWarning",
135
  ]
 
 
 
 
 
 
 
src/agents/chatbot.py CHANGED
@@ -1,6 +1,5 @@
1
  """Chatbot agent with RAG capabilities."""
2
 
3
- import tiktoken
4
  from langchain_openai import AzureChatOpenAI
5
  from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
6
  from langchain_core.output_parsers import StrOutputParser
@@ -10,14 +9,6 @@ from langchain_core.messages import HumanMessage, AIMessage
10
 
11
  logger = get_logger("chatbot")
12
 
13
- _enc = tiktoken.get_encoding("cl100k_base")
14
-
15
-
16
- def _count_tokens(messages: list, context: str) -> dict:
17
- msg_tokens = sum(len(_enc.encode(m.content)) for m in messages)
18
- ctx_tokens = len(_enc.encode(context))
19
- return {"messages_tokens": msg_tokens, "context_tokens": ctx_tokens, "total": msg_tokens + ctx_tokens}
20
-
21
 
22
  class ChatbotAgent:
23
  """Chatbot agent with RAG capabilities."""
@@ -73,8 +64,7 @@ class ChatbotAgent:
73
  async def astream_response(self, messages: list, context: str = ""):
74
  """Stream response tokens as they are generated."""
75
  try:
76
- token_counts = _count_tokens(messages, context)
77
- logger.info("LLM input tokens", **token_counts)
78
  async for token in self.chain.astream({"messages": messages, "context": context}):
79
  yield token
80
  except Exception as e:
 
1
  """Chatbot agent with RAG capabilities."""
2
 
 
3
  from langchain_openai import AzureChatOpenAI
4
  from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
5
  from langchain_core.output_parsers import StrOutputParser
 
9
 
10
  logger = get_logger("chatbot")
11
 
 
 
 
 
 
 
 
 
12
 
13
  class ChatbotAgent:
14
  """Chatbot agent with RAG capabilities."""
 
64
  async def astream_response(self, messages: list, context: str = ""):
65
  """Stream response tokens as they are generated."""
66
  try:
67
+ logger.info("Streaming chatbot response")
 
68
  async for token in self.chain.astream({"messages": messages, "context": context}):
69
  yield token
70
  except Exception as e:
src/agents/orchestration.py CHANGED
@@ -35,11 +35,6 @@ Intent Routing:
35
  - greeting -> needs_search=False, direct_response="Hello! How can I assist you today?"
36
  - goodbye -> needs_search=False, direct_response="Goodbye! Have a great day!"
37
  - other -> needs_search=True, search_query=<standalone rewritten query>
38
-
39
- Source Routing (set source_hint):
40
- - Columns, tables, sheets, data types, schema, row counts, statistics -> source_hint=schema
41
- - Document content, paragraphs, reports, articles, text -> source_hint=document
42
- - Unclear or spans both -> source_hint=both
43
  """),
44
  MessagesPlaceholder(variable_name="history"),
45
  ("user", "{message}")
 
35
  - greeting -> needs_search=False, direct_response="Hello! How can I assist you today?"
36
  - goodbye -> needs_search=False, direct_response="Goodbye! Have a great day!"
37
  - other -> needs_search=True, search_query=<standalone rewritten query>
 
 
 
 
 
38
  """),
39
  MessagesPlaceholder(variable_name="history"),
40
  ("user", "{message}")
src/api/v1/chat.py CHANGED
@@ -9,9 +9,6 @@ from src.db.postgres.models import ChatMessage, MessageSource
9
  from src.agents.orchestration import orchestrator
10
  from src.agents.chatbot import chatbot
11
  from src.rag.retriever import retriever
12
- from src.rag.base import RetrievalResult
13
- from src.query.query_executor import query_executor
14
- from src.query.base import QueryResult
15
  from src.db.redis.connection import get_redis
16
  from src.config.settings import settings
17
  from src.middlewares.logging import get_logger, log_execution
@@ -48,66 +45,34 @@ class ChatRequest(BaseModel):
48
  message: str
49
 
50
 
51
- def _format_context(results: List[RetrievalResult]) -> str:
52
  """Format retrieval results as context string for the LLM."""
53
  lines = []
54
  for result in results:
55
- data = result.metadata.get("data", {})
56
- filename = data.get("filename", "Unknown")
57
- page = data.get("page_label")
58
  source_label = f"{filename}, p.{page}" if page else filename
59
- lines.append(f"[Source: {source_label}]\n{result.content}\n")
60
  return "\n".join(lines)
61
 
62
 
63
- def _extract_sources(results: List[RetrievalResult]) -> List[Dict[str, Any]]:
64
  """Extract deduplicated source references from retrieval results."""
65
  seen = set()
66
  sources = []
67
  for result in results:
68
- meta = result.metadata
69
- data = meta.get("data", {})
70
- if "document_id" in data:
71
- key = (data.get("document_id"), data.get("page_label"))
72
- if key not in seen:
73
- seen.add(key)
74
- sources.append({
75
- "document_id": data.get("document_id"),
76
- "filename": data.get("filename", "Unknown"),
77
- "page_label": data.get("page_label", "Unknown"),
78
- })
79
- else:
80
- key = (data.get("table_name"), data.get("column_name"))
81
- if key not in seen:
82
- seen.add(key)
83
- table_name = data.get("table_name")
84
- user_id = meta.get("user_id")
85
- sources.append({
86
- "document_id": f"{user_id}_{table_name}",
87
- "filename": data.get("table_name", "Unknown"),
88
- "page_label": data.get("column_name", "Unknown"),
89
- })
90
-
91
- logger.debug(f"Extracted sources: {sources}")
92
  return sources
93
 
94
 
95
- def _format_query_results(results: list[QueryResult]) -> str:
96
- if not results:
97
- return ""
98
- lines = []
99
- for r in results:
100
- name = r.metadata.get("client_name", r.source_id)
101
- lines.append(f"[Query result — {name}, tables: {r.table_or_file}]")
102
- lines.append(f"SQL: {r.metadata.get('sql', '')}")
103
- if r.columns and r.rows:
104
- lines.append(" | ".join(r.columns))
105
- for row in r.rows[:20]:
106
- lines.append(" | ".join(str(row.get(c, "")) for c in r.columns))
107
- lines.append(f"({r.row_count} rows total)\n")
108
- return "\n".join(lines)
109
-
110
-
111
  async def get_cached_response(redis, cache_key: str) -> Optional[str]:
112
  cached = await redis.get(cache_key)
113
  if cached:
@@ -190,12 +155,9 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
190
  sources: List[Dict[str, Any]] = []
191
 
192
  if intent_result is None:
193
- # Step 2: Launch retrieval and history loading in parallel, then run orchestrator.
194
- # k=5
195
- # tables — db_executor's FK expansion is one-hop and cannot bridge
196
- # 2-hop gaps (e.g. customers -> order_items -> products) on its own.
197
  retrieval_task = asyncio.create_task(
198
- retriever.retrieve(request.message, request.user_id, db, k=5)
199
  )
200
  history_task = asyncio.create_task(
201
  load_history(db, request.room_id, limit=6) # 6 msgs (3 pairs) for orchestrator
@@ -203,28 +165,18 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
203
  history = await history_task # fast DB query (<100ms), done before orchestrator finishes
204
  intent_result = await orchestrator.analyze_message(request.message, history)
205
 
206
- search_query = intent_result.get("search_query", request.message) or request.message
207
  if not intent_result.get("needs_search"):
208
  retrieval_task.cancel()
209
- try:
210
- await retrieval_task
211
- except asyncio.CancelledError:
212
- pass
213
  raw_results = []
214
  else:
 
215
  logger.info(f"Searching for: {search_query}")
216
  if search_query != request.message:
217
  retrieval_task.cancel()
218
- try:
219
- await retrieval_task
220
- except asyncio.CancelledError:
221
- pass
222
  raw_results = await retriever.retrieve(
223
  query=search_query,
224
  user_id=request.user_id,
225
  db=db,
226
- k=5,
227
- source_hint=intent_result.get("source_hint", "both"),
228
  )
229
  else:
230
  raw_results = await retrieval_task
@@ -232,21 +184,6 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
232
  context = _format_context(raw_results)
233
  sources = _extract_sources(raw_results)
234
 
235
- source_hint = intent_result.get("source_hint", "both")
236
- if source_hint in ("schema", "both"):
237
- # Use search_query (orchestrator's standalone rewrite) so follow-up
238
- # messages like "dive deeper" or "show me last year" resolve correctly.
239
- # For first-turn questions search_query == request.message, so no change.
240
- query_results = await query_executor.execute(
241
- results=raw_results,
242
- user_id=request.user_id,
243
- db=db,
244
- question=search_query,
245
- )
246
- query_context = _format_query_results(query_results)
247
- if query_context:
248
- context = query_context + "\n\n" + context
249
-
250
  # Step 3: Direct response for greetings / non-document intents
251
  if intent_result.get("direct_response"):
252
  response = intent_result["direct_response"]
 
9
  from src.agents.orchestration import orchestrator
10
  from src.agents.chatbot import chatbot
11
  from src.rag.retriever import retriever
 
 
 
12
  from src.db.redis.connection import get_redis
13
  from src.config.settings import settings
14
  from src.middlewares.logging import get_logger, log_execution
 
45
  message: str
46
 
47
 
48
+ def _format_context(results: List[Dict[str, Any]]) -> str:
49
  """Format retrieval results as context string for the LLM."""
50
  lines = []
51
  for result in results:
52
+ filename = result["metadata"].get("filename", "Unknown")
53
+ page = result["metadata"].get("page_label")
 
54
  source_label = f"{filename}, p.{page}" if page else filename
55
+ lines.append(f"[Source: {source_label}]\n{result['content']}\n")
56
  return "\n".join(lines)
57
 
58
 
59
+ def _extract_sources(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
60
  """Extract deduplicated source references from retrieval results."""
61
  seen = set()
62
  sources = []
63
  for result in results:
64
+ meta = result["metadata"]
65
+ key = (meta.get("document_id"), meta.get("page_label"))
66
+ if key not in seen:
67
+ seen.add(key)
68
+ sources.append({
69
+ "document_id": meta.get("document_id"),
70
+ "filename": meta.get("filename", "Unknown"),
71
+ "page_label": meta.get("page_label"),
72
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  return sources
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  async def get_cached_response(redis, cache_key: str) -> Optional[str]:
77
  cached = await redis.get(cache_key)
78
  if cached:
 
155
  sources: List[Dict[str, Any]] = []
156
 
157
  if intent_result is None:
158
+ # Step 2: Launch retrieval and history loading in parallel, then run orchestrator
 
 
 
159
  retrieval_task = asyncio.create_task(
160
+ retriever.retrieve(request.message, request.user_id, db)
161
  )
162
  history_task = asyncio.create_task(
163
  load_history(db, request.room_id, limit=6) # 6 msgs (3 pairs) for orchestrator
 
165
  history = await history_task # fast DB query (<100ms), done before orchestrator finishes
166
  intent_result = await orchestrator.analyze_message(request.message, history)
167
 
 
168
  if not intent_result.get("needs_search"):
169
  retrieval_task.cancel()
 
 
 
 
170
  raw_results = []
171
  else:
172
+ search_query = intent_result.get("search_query", request.message)
173
  logger.info(f"Searching for: {search_query}")
174
  if search_query != request.message:
175
  retrieval_task.cancel()
 
 
 
 
176
  raw_results = await retriever.retrieve(
177
  query=search_query,
178
  user_id=request.user_id,
179
  db=db,
 
 
180
  )
181
  else:
182
  raw_results = await retrieval_task
 
184
  context = _format_context(raw_results)
185
  sources = _extract_sources(raw_results)
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  # Step 3: Direct response for greetings / non-document intents
188
  if intent_result.get("direct_response"):
189
  response = intent_result["direct_response"]
src/api/v1/db_client.py CHANGED
@@ -1,473 +1,5 @@
1
- """API endpoints for user-registered database connections.
2
 
3
- Credential schemas (DbType, PostgresCredentials, etc.) live in
4
- `src/models/credentials.py` — they are imported below (with noqa: F401) so
5
- FastAPI/Swagger picks them up for OpenAPI schema generation even though they
6
- are not referenced by name in this file.
7
- """
8
 
9
- from typing import Any, Dict, List, Literal, Optional
10
- from datetime import datetime
11
-
12
- from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
13
- from pydantic import BaseModel, Field
14
- from sqlalchemy.ext.asyncio import AsyncSession
15
-
16
- from src.database_client.database_client_service import database_client_service
17
- from src.db.postgres.connection import get_db
18
- from src.middlewares.logging import get_logger, log_execution
19
- from src.middlewares.rate_limit import limiter
20
- from src.models.credentials import ( # noqa: F401 — re-exported for Swagger schema discovery
21
- BigQueryCredentials,
22
- CredentialSchemas,
23
- DbType,
24
- MysqlCredentials,
25
- PostgresCredentials,
26
- SnowflakeCredentials,
27
- SqlServerCredentials,
28
- SupabaseCredentials,
29
- )
30
- from src.pipeline.db_pipeline import db_pipeline_service
31
- from src.utils.db_credential_encryption import decrypt_credentials_dict
32
-
33
- logger = get_logger("database_client_api")
34
-
35
- router = APIRouter(prefix="/api/v1", tags=["Database Clients"])
36
-
37
-
38
- # ---------------------------------------------------------------------------
39
- # Request / Response schemas
40
- # ---------------------------------------------------------------------------
41
-
42
-
43
- class DatabaseClientCreate(BaseModel):
44
- """
45
- Payload to register a new external database connection.
46
-
47
- The `credentials` object shape depends on `db_type`:
48
-
49
- | db_type | Required fields |
50
- |-------------|----------------------------------------------------------|
51
- | postgres | host, port, database, username, password, ssl_mode |
52
- | mysql | host, port, database, username, password, ssl |
53
- | sqlserver | host, port, database, username, password, driver? |
54
- | supabase | host, port, database, username, password, ssl_mode |
55
- | bigquery | project_id, dataset_id, location?, service_account_json |
56
- | snowflake | account, warehouse, database, schema?, username, password, role? |
57
-
58
- Sensitive fields (`password`, `service_account_json`) are encrypted
59
- at rest using Fernet symmetric encryption.
60
- """
61
-
62
- name: str = Field(..., description="Display name for this connection.", examples=["Production DB"])
63
- db_type: DbType = Field(..., description="Type of the database engine.", examples=["postgres"])
64
- credentials: Dict[str, Any] = Field(
65
- ...,
66
- description="Connection credentials. Shape depends on db_type. See schema descriptions above.",
67
- examples=[
68
- {
69
- "host": "db.example.com",
70
- "port": 5432,
71
- "database": "mydb",
72
- "username": "admin",
73
- "password": "s3cr3t!",
74
- "ssl_mode": "require",
75
- }
76
- ],
77
- )
78
-
79
-
80
- class DatabaseClientUpdate(BaseModel):
81
- """
82
- Payload to update an existing database connection.
83
-
84
- All fields are optional — only provided fields will be updated.
85
- If `credentials` is provided, it replaces the entire credentials object
86
- and sensitive fields are re-encrypted.
87
- """
88
-
89
- name: Optional[str] = Field(None, description="New display name for this connection.", examples=["Staging DB"])
90
- credentials: Optional[Dict[str, Any]] = Field(
91
- None,
92
- description="Updated credentials object. Replaces existing credentials entirely if provided.",
93
- examples=[{"host": "new-host.example.com", "port": 5432, "database": "mydb", "username": "admin", "password": "n3wP@ss!", "ssl_mode": "require"}],
94
- )
95
- status: Optional[Literal["active", "inactive"]] = Field(
96
- None,
97
- description="Set to 'inactive' to soft-disable the connection without deleting it.",
98
- examples=["inactive"],
99
- )
100
-
101
-
102
- class DatabaseClientResponse(BaseModel):
103
- """
104
- Database connection record returned by the API.
105
-
106
- Credentials are **never** included in the response for security reasons.
107
- """
108
-
109
- id: str = Field(..., description="Unique identifier of the database connection.")
110
- user_id: str = Field(..., description="ID of the user who owns this connection.")
111
- name: str = Field(..., description="Display name of the connection.")
112
- db_type: str = Field(..., description="Database engine type.")
113
- status: str = Field(..., description="Connection status: 'active' or 'inactive'.")
114
- created_at: datetime = Field(..., description="Timestamp when the connection was registered.")
115
- updated_at: Optional[datetime] = Field(None, description="Timestamp of the last update, if any.")
116
-
117
- model_config = {"from_attributes": True}
118
-
119
-
120
- # ---------------------------------------------------------------------------
121
- # Supported DB types registry
122
- # ---------------------------------------------------------------------------
123
-
124
- _DB_TYPES: List[Dict[str, Any]] = [
125
- {
126
- "db_type": "postgres",
127
- "display_name": "PostgreSQL",
128
- "logo": "postgres",
129
- "status": "active",
130
- "message": None,
131
- "fields": [
132
- {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
133
- {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number"},
134
- {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
135
- {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
136
- {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
137
- {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["disable", "require", "verify-ca", "verify-full"]},
138
- ],
139
- },
140
- {
141
- "db_type": "mysql",
142
- "display_name": "MySQL",
143
- "logo": "mysql",
144
- "status": "active",
145
- "message": None,
146
- "fields": [
147
- {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
148
- {"name": "port", "type": "integer", "required": False, "default": 3306, "description": "Port number"},
149
- {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
150
- {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
151
- {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
152
- {"name": "ssl", "type": "boolean", "required": False, "default": True, "description": "Enable SSL"},
153
- ],
154
- },
155
- {
156
- "db_type": "supabase",
157
- "display_name": "Supabase",
158
- "logo": "supabase",
159
- "status": "active",
160
- "message": None,
161
- "fields": [
162
- {"name": "host", "type": "string", "required": True, "default": None, "description": "Supabase database host"},
163
- {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number (5432 direct, 6543 pooler)"},
164
- {"name": "database", "type": "string", "required": False, "default": "postgres", "description": "Database name"},
165
- {"name": "username", "type": "string", "required": True, "default": None, "description": "Database user"},
166
- {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
167
- {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["require", "verify-ca", "verify-full"]},
168
- ],
169
- },
170
- {
171
- "db_type": "sqlserver",
172
- "display_name": "SQL Server",
173
- "logo": "sqlserver",
174
- "status": "inactive",
175
- "message": "Coming soon",
176
- "fields": [
177
- {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
178
- {"name": "port", "type": "integer", "required": False, "default": 1433, "description": "Port number"},
179
- {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
180
- {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
181
- {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
182
- {"name": "driver", "type": "string", "required": False, "default": None, "description": "ODBC driver name"},
183
- ],
184
- },
185
- {
186
- "db_type": "bigquery",
187
- "display_name": "BigQuery",
188
- "logo": "bigquery",
189
- "status": "inactive",
190
- "message": "Coming soon",
191
- "fields": [
192
- {"name": "project_id", "type": "string", "required": True, "default": None, "description": "GCP project ID"},
193
- {"name": "dataset_id", "type": "string", "required": True, "default": None, "description": "BigQuery dataset name"},
194
- {"name": "location", "type": "string", "required": False, "default": "US", "description": "Dataset location/region"},
195
- {"name": "service_account_json", "type": "string", "required": True, "default": None, "description": "GCP Service Account key JSON", "sensitive": True},
196
- ],
197
- },
198
- {
199
- "db_type": "snowflake",
200
- "display_name": "Snowflake",
201
- "logo": "snowflake",
202
- "status": "inactive",
203
- "message": "Coming soon",
204
- "fields": [
205
- {"name": "account", "type": "string", "required": True, "default": None, "description": "Snowflake account identifier"},
206
- {"name": "warehouse", "type": "string", "required": True, "default": None, "description": "Virtual warehouse name"},
207
- {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
208
- {"name": "schema", "type": "string", "required": False, "default": "PUBLIC", "description": "Schema name"},
209
- {"name": "username", "type": "string", "required": True, "default": None, "description": "Snowflake username"},
210
- {"name": "password", "type": "string", "required": True, "default": None, "description": "Snowflake password", "sensitive": True},
211
- {"name": "role", "type": "string", "required": False, "default": None, "description": "Snowflake role"},
212
- ],
213
- },
214
- ]
215
-
216
-
217
- # ---------------------------------------------------------------------------
218
- # Endpoints
219
- # ---------------------------------------------------------------------------
220
-
221
-
222
- @router.get(
223
- "/database-clients/dbtypes",
224
- summary="List supported database types",
225
- response_description="All database types supported by DataEyond with their connection parameters.",
226
- )
227
- async def list_db_types():
228
- """
229
- Return every database type DataEyond can connect to, along with the
230
- credential fields the frontend should render, a logo filename, and
231
- an active/inactive status with an optional message.
232
- """
233
- return _DB_TYPES
234
-
235
-
236
- @router.post(
237
- "/database-clients",
238
- response_model=DatabaseClientResponse,
239
- status_code=status.HTTP_201_CREATED,
240
- summary="Register a new database connection",
241
- response_description="The newly created database connection record (credentials excluded).",
242
- responses={
243
- 201: {"description": "Connection registered successfully."},
244
- 422: {"description": "Validation error — check the credentials shape for the given db_type."},
245
- 500: {"description": "Internal server error."},
246
- },
247
- )
248
- @limiter.limit("10/minute")
249
- @log_execution(logger)
250
- async def create_database_client(
251
- request: Request,
252
- payload: DatabaseClientCreate,
253
- user_id: str = Query(..., description="ID of the user registering the connection."),
254
- db: AsyncSession = Depends(get_db),
255
- ):
256
- """
257
- Register a new external database connection for a user.
258
-
259
- The `credentials` object must match the shape for the chosen `db_type`
260
- (see **CredentialSchemas** in the schema section below for exact fields).
261
- Sensitive fields (`password`, `service_account_json`) are encrypted
262
- before being persisted — they are never returned in any response.
263
- """
264
- try:
265
- client = await database_client_service.create(
266
- db=db,
267
- user_id=user_id,
268
- name=payload.name,
269
- db_type=payload.db_type,
270
- credentials=payload.credentials,
271
- )
272
- return DatabaseClientResponse.model_validate(client)
273
- except Exception as e:
274
- logger.error(f"Failed to create database client for user {user_id}", error=str(e))
275
- raise HTTPException(
276
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
277
- detail=f"Failed to create database client: {str(e)}",
278
- )
279
-
280
-
281
- @router.get(
282
- "/database-clients/{user_id}",
283
- response_model=List[DatabaseClientResponse],
284
- summary="List all database connections for a user",
285
- response_description="List of database connections (credentials excluded).",
286
- responses={
287
- 200: {"description": "Returns an empty list if the user has no connections."},
288
- },
289
- )
290
- @log_execution(logger)
291
- async def list_database_clients(
292
- user_id: str,
293
- db: AsyncSession = Depends(get_db),
294
- ):
295
- """
296
- Return all database connections registered by the specified user,
297
- ordered by creation date (newest first).
298
-
299
- Credentials are never included in the response.
300
- """
301
- clients = await database_client_service.get_user_clients(db, user_id)
302
- return [DatabaseClientResponse.model_validate(c) for c in clients]
303
-
304
-
305
- @router.get(
306
- "/database-clients/{user_id}/{client_id}",
307
- response_model=DatabaseClientResponse,
308
- summary="Get a single database connection",
309
- response_description="Database connection detail (credentials excluded).",
310
- responses={
311
- 404: {"description": "Connection not found."},
312
- 403: {"description": "Access denied — user_id does not own this connection."},
313
- },
314
- )
315
- @log_execution(logger)
316
- async def get_database_client(
317
- user_id: str,
318
- client_id: str,
319
- db: AsyncSession = Depends(get_db),
320
- ):
321
- """
322
- Return the detail of a single database connection.
323
-
324
- Returns **403** if the `user_id` in the path does not match the owner
325
- of the requested connection.
326
- """
327
- client = await database_client_service.get(db, client_id)
328
-
329
- if not client:
330
- raise HTTPException(status_code=404, detail="Database client not found")
331
-
332
- if client.user_id != user_id:
333
- raise HTTPException(status_code=403, detail="Access denied")
334
-
335
- return DatabaseClientResponse.model_validate(client)
336
-
337
-
338
- @router.put(
339
- "/database-clients/{client_id}",
340
- response_model=DatabaseClientResponse,
341
- summary="Update a database connection",
342
- response_description="Updated database connection record (credentials excluded).",
343
- responses={
344
- 404: {"description": "Connection not found."},
345
- 403: {"description": "Access denied — user_id does not own this connection."},
346
- },
347
- )
348
- @log_execution(logger)
349
- async def update_database_client(
350
- client_id: str,
351
- payload: DatabaseClientUpdate,
352
- user_id: str = Query(..., description="ID of the user who owns the connection."),
353
- db: AsyncSession = Depends(get_db),
354
- ):
355
- """
356
- Update an existing database connection.
357
-
358
- Only fields present in the request body are updated.
359
- If `credentials` is provided it **replaces** the entire credentials object
360
- and sensitive fields are re-encrypted automatically.
361
- """
362
- client = await database_client_service.get(db, client_id)
363
-
364
- if not client:
365
- raise HTTPException(status_code=404, detail="Database client not found")
366
-
367
- if client.user_id != user_id:
368
- raise HTTPException(status_code=403, detail="Access denied")
369
-
370
- updated = await database_client_service.update(
371
- db=db,
372
- client_id=client_id,
373
- name=payload.name,
374
- credentials=payload.credentials,
375
- status=payload.status,
376
- )
377
- return DatabaseClientResponse.model_validate(updated)
378
-
379
-
380
- @router.delete(
381
- "/database-clients/{client_id}",
382
- status_code=status.HTTP_200_OK,
383
- summary="Delete a database connection",
384
- responses={
385
- 200: {"description": "Connection deleted successfully."},
386
- 404: {"description": "Connection not found."},
387
- 403: {"description": "Access denied — user_id does not own this connection."},
388
- },
389
- )
390
- @log_execution(logger)
391
- async def delete_database_client(
392
- client_id: str,
393
- user_id: str = Query(..., description="ID of the user who owns the connection."),
394
- db: AsyncSession = Depends(get_db),
395
- ):
396
- """
397
- Permanently delete a database connection.
398
-
399
- This action is irreversible. The stored credentials are also removed.
400
- """
401
- client = await database_client_service.get(db, client_id)
402
-
403
- if not client:
404
- raise HTTPException(status_code=404, detail="Database client not found")
405
-
406
- if client.user_id != user_id:
407
- raise HTTPException(status_code=403, detail="Access denied")
408
-
409
- await database_client_service.delete(db, client_id)
410
- return {"status": "success", "message": "Database client deleted successfully"}
411
-
412
-
413
- @router.post(
414
- "/database-clients/{client_id}/ingest",
415
- status_code=status.HTTP_200_OK,
416
- summary="Ingest schema from a registered database into the vector store",
417
- response_description="Count of chunks ingested.",
418
- responses={
419
- 200: {"description": "Ingestion completed successfully."},
420
- 403: {"description": "Access denied — user_id does not own this connection."},
421
- 404: {"description": "Connection not found."},
422
- 501: {"description": "The connection's db_type is not yet supported by the pipeline."},
423
- 500: {"description": "Ingestion failed (connection error, profiling error, etc.)."},
424
- },
425
- )
426
- @limiter.limit("5/minute")
427
- @log_execution(logger)
428
- async def ingest_database_client(
429
- request: Request,
430
- client_id: str,
431
- user_id: str = Query(..., description="ID of the user who owns the connection."),
432
- db: AsyncSession = Depends(get_db),
433
- ):
434
- """
435
- Decrypt the stored credentials, connect to the user's database, introspect
436
- its schema, profile each column, embed the descriptions, and store them in
437
- the shared PGVector collection tagged with `source_type="database"`.
438
-
439
- Chunks become retrievable via the same retriever used for document chunks.
440
- """
441
- client = await database_client_service.get(db, client_id)
442
-
443
- if not client:
444
- raise HTTPException(status_code=404, detail="Database client not found")
445
-
446
- if client.user_id != user_id:
447
- raise HTTPException(status_code=403, detail="Access denied")
448
-
449
- if client.status != "active":
450
- raise HTTPException(
451
- status_code=status.HTTP_409_CONFLICT,
452
- detail="Cannot ingest from an inactive database connection.",
453
- )
454
-
455
- try:
456
- creds = decrypt_credentials_dict(client.credentials)
457
- with db_pipeline_service.engine_scope(
458
- db_type=client.db_type,
459
- credentials=creds,
460
- ) as engine:
461
- total = await db_pipeline_service.run(user_id=user_id, client_id=client_id, engine=engine)
462
- except NotImplementedError as e:
463
- raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail=str(e))
464
- except Exception as e:
465
- logger.error(
466
- f"Ingestion failed for client {client_id}", user_id=user_id, error=str(e)
467
- )
468
- raise HTTPException(
469
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
470
- detail=f"Ingestion failed: {e}",
471
- )
472
-
473
- return {"status": "success", "client_id": client_id, "chunks_ingested": total}
 
1
+ from typing import Literal, Dict
2
 
 
 
 
 
 
3
 
4
+ dbtypes: Literal["postgresql", "mysql", "sqlite"] = Literal["postgresql", "mysql", "sqlite"]
5
+ creds: Dict[str, str]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/api/v1/document.py CHANGED
@@ -1,20 +1,21 @@
1
  """Document management API endpoints."""
2
-
3
- from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
  from src.db.postgres.connection import get_db
6
  from src.document.document_service import document_service
 
 
7
  from src.middlewares.logging import get_logger, log_execution
8
  from src.middlewares.rate_limit import limiter
9
- from src.pipeline.document_pipeline.document_pipeline import document_pipeline
10
  from pydantic import BaseModel
11
  from typing import List
12
-
13
  logger = get_logger("document_api")
14
-
15
  router = APIRouter(prefix="/api/v1", tags=["Documents"])
16
-
17
-
18
  class DocumentResponse(BaseModel):
19
  id: str
20
  filename: str
@@ -22,27 +23,6 @@ class DocumentResponse(BaseModel):
22
  file_size: int
23
  file_type: str
24
  created_at: str
25
-
26
-
27
- # NOTE: Keep in sync with SUPPORTED_FILE_TYPES in src/pipeline/document_pipeline/document_pipeline.py
28
- _DOC_TYPES = [
29
- {"doc_type": "pdf", "max_size": 10, "status": "active", "message": None},
30
- {"doc_type": "docx", "max_size": 10, "status": "active", "message": None},
31
- {"doc_type": "txt", "max_size": 10, "status": "active", "message": None},
32
- {"doc_type": "csv", "max_size": 10, "status": "active", "message": None},
33
- {"doc_type": "xlsx", "max_size": 10, "status": "active", "message": None},
34
- ]
35
-
36
-
37
- @router.get(
38
- "/documents/doctypes",
39
- summary="List supported document types",
40
- response_description="All document types supported by DataEyond with their size limits and status.",
41
- )
42
- @log_execution(logger)
43
- async def get_document_types():
44
- """Return every document type DataEyond can process, with max file size and active/inactive status."""
45
- return {"status": "success", "data": _DOC_TYPES}
46
 
47
 
48
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
@@ -64,8 +44,8 @@ async def list_documents(
64
  )
65
  for doc in documents
66
  ]
67
-
68
-
69
  @router.post("/document/upload")
70
  @limiter.limit("10/minute")
71
  @log_execution(logger)
@@ -77,12 +57,57 @@ async def upload_document(
77
  ):
78
  """Upload a document."""
79
  if not user_id:
80
- raise HTTPException(status_code=400, detail="user_id is required")
81
-
82
- data = await document_pipeline.upload(file, user_id, db)
83
- return {"status": "success", "message": "Document uploaded successfully", "data": data}
84
-
85
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  @router.delete("/document/delete")
87
  @log_execution(logger)
88
  async def delete_document(
@@ -91,10 +116,31 @@ async def delete_document(
91
  db: AsyncSession = Depends(get_db)
92
  ):
93
  """Delete a document."""
94
- await document_pipeline.delete(document_id, user_id, db)
95
- return {"status": "success", "message": "Document deleted successfully"}
96
-
97
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  @router.post("/document/process")
99
  @log_execution(logger)
100
  async def process_document(
@@ -103,6 +149,45 @@ async def process_document(
103
  db: AsyncSession = Depends(get_db)
104
  ):
105
  """Process document and ingest to vector index."""
106
- data = await document_pipeline.process(document_id, user_id, db)
107
- return {"status": "success", "message": "Document processed successfully", "data": data}
108
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Document management API endpoints."""
2
+
3
+ from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File, status
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
  from src.db.postgres.connection import get_db
6
  from src.document.document_service import document_service
7
+ from src.knowledge.processing_service import knowledge_processor
8
+ from src.storage.az_blob.az_blob import blob_storage
9
  from src.middlewares.logging import get_logger, log_execution
10
  from src.middlewares.rate_limit import limiter
 
11
  from pydantic import BaseModel
12
  from typing import List
13
+
14
  logger = get_logger("document_api")
15
+
16
  router = APIRouter(prefix="/api/v1", tags=["Documents"])
17
+
18
+
19
  class DocumentResponse(BaseModel):
20
  id: str
21
  filename: str
 
23
  file_size: int
24
  file_type: str
25
  created_at: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 
44
  )
45
  for doc in documents
46
  ]
47
+
48
+
49
  @router.post("/document/upload")
50
  @limiter.limit("10/minute")
51
  @log_execution(logger)
 
57
  ):
58
  """Upload a document."""
59
  if not user_id:
60
+ raise HTTPException(
61
+ status_code=400,
62
+ detail="user_id is required"
63
+ )
64
+
65
+ try:
66
+ # Read file content
67
+ content = await file.read()
68
+ file_size = len(content)
69
+
70
+ # Get file type
71
+ filename = file.filename
72
+ file_type = filename.split('.')[-1].lower() if '.' in filename else 'txt'
73
+
74
+ if file_type not in ['pdf', 'docx', 'txt']:
75
+ raise HTTPException(
76
+ status_code=400,
77
+ detail="Unsupported file type. Supported: pdf, docx, txt"
78
+ )
79
+
80
+ # Upload to blob storage
81
+ blob_name = await blob_storage.upload_file(content, filename, user_id)
82
+
83
+ # Create document record
84
+ document = await document_service.create_document(
85
+ db=db,
86
+ user_id=user_id,
87
+ filename=filename,
88
+ blob_name=blob_name,
89
+ file_size=file_size,
90
+ file_type=file_type
91
+ )
92
+
93
+ return {
94
+ "status": "success",
95
+ "message": "Document uploaded successfully",
96
+ "data": {
97
+ "id": document.id,
98
+ "filename": document.filename,
99
+ "status": document.status
100
+ }
101
+ }
102
+
103
+ except Exception as e:
104
+ logger.error(f"Upload failed for user {user_id}", error=str(e))
105
+ raise HTTPException(
106
+ status_code=500,
107
+ detail=f"Upload failed: {str(e)}"
108
+ )
109
+
110
+
111
  @router.delete("/document/delete")
112
  @log_execution(logger)
113
  async def delete_document(
 
116
  db: AsyncSession = Depends(get_db)
117
  ):
118
  """Delete a document."""
119
+ document = await document_service.get_document(db, document_id)
120
+
121
+ if not document:
122
+ raise HTTPException(
123
+ status_code=404,
124
+ detail="Document not found"
125
+ )
126
+
127
+ if document.user_id != user_id:
128
+ raise HTTPException(
129
+ status_code=403,
130
+ detail="Access denied"
131
+ )
132
+
133
+ success = await document_service.delete_document(db, document_id)
134
+
135
+ if success:
136
+ return {"status": "success", "message": "Document deleted successfully"}
137
+ else:
138
+ raise HTTPException(
139
+ status_code=500,
140
+ detail="Failed to delete document"
141
+ )
142
+
143
+
144
  @router.post("/document/process")
145
  @log_execution(logger)
146
  async def process_document(
 
149
  db: AsyncSession = Depends(get_db)
150
  ):
151
  """Process document and ingest to vector index."""
152
+ document = await document_service.get_document(db, document_id)
153
+
154
+ if not document:
155
+ raise HTTPException(
156
+ status_code=404,
157
+ detail="Document not found"
158
+ )
159
+
160
+ if document.user_id != user_id:
161
+ raise HTTPException(
162
+ status_code=403,
163
+ detail="Access denied"
164
+ )
165
+
166
+ try:
167
+ # Update status to processing
168
+ await document_service.update_document_status(db, document_id, "processing")
169
+
170
+ # Process document
171
+ chunks_count = await knowledge_processor.process_document(document, db)
172
+
173
+ # Update status to completed
174
+ await document_service.update_document_status(db, document_id, "completed")
175
+
176
+ return {
177
+ "status": "success",
178
+ "message": "Document processed successfully",
179
+ "data": {
180
+ "document_id": document_id,
181
+ "chunks_processed": chunks_count
182
+ }
183
+ }
184
+
185
+ except Exception as e:
186
+ logger.error(f"Processing failed for document {document_id}", error=str(e))
187
+ await document_service.update_document_status(
188
+ db, document_id, "failed", str(e)
189
+ )
190
+ raise HTTPException(
191
+ status_code=500,
192
+ detail=f"Processing failed: {str(e)}"
193
+ )
src/config/agents/system_prompt.md CHANGED
@@ -3,7 +3,8 @@ You are a helpful AI assistant with access to user's uploaded documents. Your ro
3
  1. Answer questions based on provided document context
4
  2. If no relevant information is found in documents, acknowledge this honestly
5
  3. Be concise and direct in your responses
6
- 4. If user's question is unclear, ask for clarification
 
7
 
8
  When document context is provided:
9
  - Use information from documents to answer accurately
 
3
  1. Answer questions based on provided document context
4
  2. If no relevant information is found in documents, acknowledge this honestly
5
  3. Be concise and direct in your responses
6
+ 4. Cite source documents when providing information
7
+ 5. If user's question is unclear, ask for clarification
8
 
9
  When document context is provided:
10
  - Use information from documents to answer accurately
src/config/settings.py CHANGED
@@ -61,11 +61,6 @@ class Settings(BaseSettings):
61
  # Bcrypt salt (for users - existing)
62
  emarcal_bcrypt_salt: str = Field(alias="emarcal__bcrypt__salt", default="")
63
 
64
- # DB credential encryption (Fernet key for user-registered database creds)
65
- dataeyond_db_credential_key: str = Field(
66
- alias="dataeyond__db__credential__key"
67
- )
68
-
69
 
70
  # Singleton instance
71
  settings = Settings()
 
61
  # Bcrypt salt (for users - existing)
62
  emarcal_bcrypt_salt: str = Field(alias="emarcal__bcrypt__salt", default="")
63
 
 
 
 
 
 
64
 
65
  # Singleton instance
66
  settings = Settings()
src/database_client/database_client_service.py DELETED
@@ -1,164 +0,0 @@
1
- """Service for managing user-registered external database connections."""
2
-
3
- import uuid
4
- from typing import List, Optional
5
-
6
- from sqlalchemy import delete, select
7
- from sqlalchemy.ext.asyncio import AsyncSession
8
-
9
- from src.db.postgres.models import DatabaseClient
10
- from src.middlewares.logging import get_logger
11
- from src.utils.db_credential_encryption import (
12
- decrypt_credentials_dict,
13
- encrypt_credentials_dict,
14
- )
15
-
16
- logger = get_logger("database_client_service")
17
-
18
-
19
- # Fields that identify the same physical database per db_type.
20
- _CONNECTION_IDENTITY_KEYS: dict[str, tuple[str, ...]] = {
21
- "postgres": ("host", "port", "database"),
22
- "supabase": ("host", "port", "database"),
23
- "mysql": ("host", "port", "database"),
24
- "sqlserver": ("host", "port", "database"),
25
- "bigquery": ("project_id", "dataset_id"),
26
- "snowflake": ("account", "warehouse", "database"),
27
- }
28
-
29
-
30
- class DatabaseClientService:
31
- """Service for managing user-registered external database connections."""
32
-
33
- async def _find_duplicate(
34
- self,
35
- db: AsyncSession,
36
- user_id: str,
37
- db_type: str,
38
- credentials: dict,
39
- ) -> Optional[DatabaseClient]:
40
- """Return an existing client if it points to the same physical database."""
41
- identity_keys = _CONNECTION_IDENTITY_KEYS.get(db_type, ())
42
- if not identity_keys:
43
- return None
44
-
45
- result = await db.execute(
46
- select(DatabaseClient).where(
47
- DatabaseClient.user_id == user_id,
48
- DatabaseClient.db_type == db_type,
49
- )
50
- )
51
- for existing in result.scalars().all():
52
- decrypted = decrypt_credentials_dict(existing.credentials)
53
- if all(
54
- decrypted.get(k) == credentials.get(k) for k in identity_keys
55
- ):
56
- return existing
57
- return None
58
-
59
- async def create(
60
- self,
61
- db: AsyncSession,
62
- user_id: str,
63
- name: str,
64
- db_type: str,
65
- credentials: dict,
66
- ) -> DatabaseClient:
67
- """Register a new database client connection.
68
-
69
- If a connection to the same physical database already exists for this
70
- user, the existing record is returned instead of creating a duplicate.
71
- Credentials are encrypted before being stored.
72
- """
73
- existing = await self._find_duplicate(db, user_id, db_type, credentials)
74
- if existing:
75
- logger.info(
76
- f"Duplicate connection detected, returning existing client {existing.id}"
77
- )
78
- return existing
79
-
80
- client = DatabaseClient(
81
- id=str(uuid.uuid4()),
82
- user_id=user_id,
83
- name=name,
84
- db_type=db_type,
85
- credentials=encrypt_credentials_dict(credentials),
86
- status="active",
87
- )
88
- db.add(client)
89
- await db.commit()
90
- await db.refresh(client)
91
- logger.info(f"Created database client {client.id} for user {user_id}")
92
- return client
93
-
94
- async def get_user_clients(
95
- self,
96
- db: AsyncSession,
97
- user_id: str,
98
- ) -> List[DatabaseClient]:
99
- """Return all active and inactive database clients for a user."""
100
- result = await db.execute(
101
- select(DatabaseClient)
102
- .where(DatabaseClient.user_id == user_id)
103
- .order_by(DatabaseClient.created_at.desc())
104
- )
105
- return result.scalars().all()
106
-
107
- async def get(
108
- self,
109
- db: AsyncSession,
110
- client_id: str,
111
- ) -> Optional[DatabaseClient]:
112
- """Return a single database client by its ID."""
113
- result = await db.execute(
114
- select(DatabaseClient).where(DatabaseClient.id == client_id)
115
- )
116
- return result.scalars().first()
117
-
118
- async def update(
119
- self,
120
- db: AsyncSession,
121
- client_id: str,
122
- name: Optional[str] = None,
123
- credentials: Optional[dict] = None,
124
- status: Optional[str] = None,
125
- ) -> Optional[DatabaseClient]:
126
- """Update an existing database client connection.
127
-
128
- Only non-None fields are updated.
129
- Credentials are re-encrypted if provided.
130
- """
131
- client = await self.get(db, client_id)
132
- if not client:
133
- return None
134
-
135
- if name is not None:
136
- client.name = name
137
- if credentials is not None:
138
- client.credentials = encrypt_credentials_dict(credentials)
139
- if status is not None:
140
- client.status = status
141
-
142
- await db.commit()
143
- await db.refresh(client)
144
- logger.info(f"Updated database client {client_id}")
145
- return client
146
-
147
- async def delete(
148
- self,
149
- db: AsyncSession,
150
- client_id: str,
151
- ) -> bool:
152
- """Permanently delete a database client connection."""
153
- result = await db.execute(
154
- delete(DatabaseClient).where(DatabaseClient.id == client_id)
155
- )
156
- await db.commit()
157
- deleted = result.rowcount > 0
158
- if deleted:
159
- logger.info(f"Deleted database client {client_id}")
160
- return deleted
161
-
162
-
163
- database_client_service = DatabaseClientService()
164
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/db/postgres/init_db.py CHANGED
@@ -2,14 +2,7 @@
2
 
3
  from sqlalchemy import text
4
  from src.db.postgres.connection import engine, Base
5
- from src.db.postgres.models import (
6
- ChatMessage,
7
- DatabaseClient,
8
- Document,
9
- MessageSource,
10
- Room,
11
- User,
12
- )
13
 
14
 
15
  async def init_db():
@@ -28,38 +21,3 @@ async def init_db():
28
  await conn.execute(text(
29
  "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
30
  ))
31
-
32
- # HNSW index for fast approximate vector similarity search
33
- # Only created when the embedding column has explicit dimensions (HNSW requirement).
34
- # atttypmod > 0 means the vector column was created with a dimension (e.g. vector(1536));
35
- # atttypmod = -1 means dimensionless — HNSW would fail with "column does not have dimensions".
36
- await conn.execute(text("""
37
- DO $$
38
- BEGIN
39
- IF EXISTS (
40
- SELECT FROM pg_attribute a
41
- JOIN pg_class c ON c.oid = a.attrelid
42
- WHERE c.relname = 'langchain_pg_embedding'
43
- AND a.attname = 'embedding'
44
- AND a.atttypmod > 0
45
- ) THEN
46
- CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_hnsw
47
- ON langchain_pg_embedding USING hnsw (embedding vector_cosine_ops);
48
- END IF;
49
- END $$
50
- """))
51
-
52
- # GIN index for FTS on schema chunks — only created if table exists
53
- # (langchain_pg_embedding is created by PGVector on first use, not by create_all)
54
- await conn.execute(text("""
55
- DO $$
56
- BEGIN
57
- IF EXISTS (
58
- SELECT FROM information_schema.tables
59
- WHERE table_name = 'langchain_pg_embedding'
60
- ) THEN
61
- CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_fts
62
- ON langchain_pg_embedding USING GIN (to_tsvector('english', document));
63
- END IF;
64
- END $$
65
- """))
 
2
 
3
  from sqlalchemy import text
4
  from src.db.postgres.connection import engine, Base
5
+ from src.db.postgres.models import Document, Room, ChatMessage, User, MessageSource
 
 
 
 
 
 
 
6
 
7
 
8
  async def init_db():
 
21
  await conn.execute(text(
22
  "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
23
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/db/postgres/models.py CHANGED
@@ -4,7 +4,6 @@ from uuid import uuid4
4
  from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
5
  from sqlalchemy.orm import relationship
6
  from sqlalchemy.sql import func
7
- from sqlalchemy.dialects.postgresql import JSONB
8
  from src.db.postgres.connection import Base
9
 
10
 
@@ -82,18 +81,3 @@ class MessageSource(Base):
82
  created_at = Column(DateTime(timezone=True), server_default=func.now())
83
 
84
  message = relationship("ChatMessage", back_populates="sources")
85
-
86
-
87
- class DatabaseClient(Base):
88
- """User-registered external database connections."""
89
- __tablename__ = "databases"
90
-
91
- id = Column(String, primary_key=True, default=lambda: str(uuid4()))
92
- user_id = Column(String, nullable=False, index=True)
93
- name = Column(String, nullable=False) # display name, e.g. "Prod DB"
94
- db_type = Column(String, nullable=False) # postgres|mysql|sqlserver|supabase|bigquery|snowflake
95
- credentials = Column(JSONB, nullable=False) # per-type JSON; sensitive fields Fernet-encrypted
96
- status = Column(String, nullable=False, default="active") # active | inactive
97
- created_at = Column(DateTime(timezone=True), server_default=func.now())
98
- updated_at = Column(DateTime(timezone=True), onupdate=func.now())
99
-
 
4
  from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
5
  from sqlalchemy.orm import relationship
6
  from sqlalchemy.sql import func
 
7
  from src.db.postgres.connection import Base
8
 
9
 
 
81
  created_at = Column(DateTime(timezone=True), server_default=func.now())
82
 
83
  message = relationship("ChatMessage", back_populates="sources")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/document/document_service.py CHANGED
@@ -1,9 +1,8 @@
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
- from sqlalchemy import select, delete, text
5
  from src.db.postgres.models import Document
6
- from src.db.postgres.connection import _pgvector_engine
7
  from src.storage.az_blob.az_blob import blob_storage
8
  from src.middlewares.logging import get_logger
9
  from typing import List, Optional
@@ -78,21 +77,6 @@ class DocumentService:
78
  # Delete from blob storage
79
  await blob_storage.delete_file(document.blob_name)
80
 
81
- # Delete vector embeddings from pgvector (scoped to user + collection to avoid cross-user over-delete)
82
- async with _pgvector_engine.begin() as conn:
83
- await conn.execute(
84
- text("""
85
- DELETE FROM langchain_pg_embedding
86
- WHERE cmetadata->>'user_id' = :user_id
87
- AND cmetadata->>'source_type' = 'document'
88
- AND cmetadata->'data'->>'document_id' = :doc_id
89
- AND collection_id = (
90
- SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'
91
- )
92
- """),
93
- {"user_id": document.user_id, "doc_id": document_id},
94
- )
95
-
96
  # Delete from database
97
  await db.execute(
98
  delete(Document).where(Document.id == document_id)
 
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
+ from sqlalchemy import select, delete
5
  from src.db.postgres.models import Document
 
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.middlewares.logging import get_logger
8
  from typing import List, Optional
 
77
  # Delete from blob storage
78
  await blob_storage.delete_file(document.blob_name)
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  # Delete from database
81
  await db.execute(
82
  delete(Document).where(Document.id == document_id)
src/knowledge/parquet_service.py DELETED
@@ -1,77 +0,0 @@
1
- """Parquet service — converts, uploads, downloads, and deletes Parquet files for CSV/XLSX.
2
-
3
- Parquet files are stored in Azure Blob alongside the original document using
4
- a deterministic naming convention based on document_id:
5
-
6
- CSV: {user_id}/{document_id}.parquet
7
- XLSX sheet: {user_id}/{document_id}__{safe_sheet_name}.parquet
8
-
9
- This allows tabular.py to construct the correct blob name at retrieval time
10
- without needing to store it separately, and allows document_pipeline.py to
11
- delete all Parquet files for a document using a prefix delete.
12
- """
13
-
14
- import io
15
-
16
- import pandas as pd
17
-
18
- from src.middlewares.logging import get_logger
19
- from src.storage.az_blob.az_blob import blob_storage
20
-
21
- logger = get_logger("parquet_service")
22
-
23
-
24
- def _safe_sheet_name(sheet_name: str) -> str:
25
- return sheet_name.replace("/", "_").replace(" ", "_").replace("\\", "_")
26
-
27
-
28
- def parquet_blob_name(user_id: str, document_id: str, sheet_name: str | None = None) -> str:
29
- """Construct deterministic Parquet blob name."""
30
- if sheet_name:
31
- return f"{user_id}/{document_id}__{_safe_sheet_name(sheet_name)}.parquet"
32
- return f"{user_id}/{document_id}.parquet"
33
-
34
-
35
- def _to_parquet_bytes(df: pd.DataFrame) -> bytes:
36
- buf = io.BytesIO()
37
- df.to_parquet(buf, index=False)
38
- return buf.getvalue()
39
-
40
-
41
- async def upload_parquet(
42
- df: pd.DataFrame,
43
- user_id: str,
44
- document_id: str,
45
- sheet_name: str | None = None,
46
- ) -> str:
47
- """Convert DataFrame to Parquet and upload to Azure Blob. Returns blob_name."""
48
- blob_name = parquet_blob_name(user_id, document_id, sheet_name)
49
- parquet_bytes = _to_parquet_bytes(df)
50
- await blob_storage.upload_bytes(parquet_bytes, blob_name)
51
- logger.info(f"Uploaded Parquet {blob_name} ({len(parquet_bytes)} bytes)")
52
- return blob_name
53
-
54
-
55
- async def download_parquet(
56
- user_id: str,
57
- document_id: str,
58
- sheet_name: str | None = None,
59
- ) -> pd.DataFrame:
60
- """Download Parquet from Azure Blob and return as DataFrame."""
61
- blob_name = parquet_blob_name(user_id, document_id, sheet_name)
62
- content = await blob_storage.download_file(blob_name)
63
- df = pd.read_parquet(io.BytesIO(content))
64
- logger.info(f"Downloaded Parquet {blob_name}: {len(df)} rows, {len(df.columns)} columns")
65
- return df
66
-
67
-
68
- async def delete_document_parquets(user_id: str, document_id: str) -> int:
69
- """Delete all Parquet files for a document (CSV = 1 file, XLSX = one per sheet).
70
-
71
- Uses prefix delete: {user_id}/{document_id} matches all Parquet variants
72
- for this document without touching the original blob (which uses a random UUID name).
73
- """
74
- prefix = f"{user_id}/{document_id}"
75
- deleted = await blob_storage.delete_blobs_with_prefix(prefix)
76
- logger.info(f"Deleted {deleted} Parquet file(s) for document {document_id}")
77
- return deleted
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/knowledge/processing_service.py CHANGED
@@ -5,20 +5,16 @@ from langchain_core.documents import Document as LangChainDocument
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
 
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from src.middlewares.logging import get_logger
10
- from src.knowledge.parquet_service import upload_parquet
 
11
  from typing import List
12
- from datetime import datetime, timezone, timedelta
13
- import sys
14
  import docx
15
- import pandas as pd
16
- import pytesseract
17
- from pdf2image import convert_from_bytes
18
  from io import BytesIO
19
 
20
- _JAKARTA_TZ = timezone(timedelta(hours=7))
21
-
22
  logger = get_logger("knowledge_processing")
23
 
24
 
@@ -44,10 +40,6 @@ class KnowledgeProcessingService:
44
 
45
  if db_doc.file_type == "pdf":
46
  documents = await self._build_pdf_documents(content, db_doc)
47
- elif db_doc.file_type == "csv":
48
- documents = await self._build_csv_documents(content, db_doc)
49
- elif db_doc.file_type == "xlsx":
50
- documents = await self._build_excel_documents(content, db_doc)
51
  else:
52
  text = self._extract_text(content, db_doc.file_type)
53
  if not text.strip():
@@ -57,15 +49,10 @@ class KnowledgeProcessingService:
57
  LangChainDocument(
58
  page_content=chunk,
59
  metadata={
 
60
  "user_id": db_doc.user_id,
61
- "source_type": "document",
62
- "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
63
- "data": {
64
- "document_id": db_doc.id,
65
- "filename": db_doc.filename,
66
- "file_type": db_doc.file_type,
67
- "chunk_index": i,
68
- },
69
  }
70
  )
71
  for i, chunk in enumerate(chunks)
@@ -87,138 +74,62 @@ class KnowledgeProcessingService:
87
  async def _build_pdf_documents(
88
  self, content: bytes, db_doc: DBDocument
89
  ) -> List[LangChainDocument]:
90
- """Build LangChain documents from PDF with page_label metadata using Tesseract OCR."""
 
 
 
 
91
  documents: List[LangChainDocument] = []
92
 
93
- poppler_path = None
94
- if sys.platform == "win32":
95
- pytesseract.pytesseract.tesseract_cmd = r"./software/Tesseract-OCR/tesseract.exe"
96
- poppler_path = "./software/poppler-24.08.0/Library/bin"
97
-
98
- images = convert_from_bytes(content, poppler_path=poppler_path)
99
- logger.info(f"Tesseract OCR: converting {len(images)} pages")
100
-
101
- for page_num, image in enumerate(images, start=1):
102
- page_text = pytesseract.image_to_string(image)
103
- if not page_text.strip():
104
- continue
105
- for chunk in self.text_splitter.split_text(page_text):
106
- documents.append(LangChainDocument(
107
- page_content=chunk,
108
- metadata={
109
- "user_id": db_doc.user_id,
110
- "source_type": "document",
111
- "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
112
- "data": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  "document_id": db_doc.id,
 
114
  "filename": db_doc.filename,
115
- "file_type": db_doc.file_type,
116
  "chunk_index": len(documents),
117
  "page_label": page_num,
118
- },
119
- }
120
- ))
121
-
122
- return documents
123
-
124
- def _profile_dataframe(
125
- self, df: pd.DataFrame, source_name: str, db_doc: DBDocument
126
- ) -> List[LangChainDocument]:
127
- """Profile each column of a dataframe → one chunk per column."""
128
- documents = []
129
- row_count = len(df)
130
-
131
- for col_name in df.columns:
132
- col = df[col_name]
133
- is_numeric = pd.api.types.is_numeric_dtype(col)
134
- null_count = int(col.isnull().sum())
135
- distinct_count = int(col.nunique())
136
- distinct_ratio = distinct_count / row_count if row_count > 0 else 0
137
-
138
- text = f"Source: {source_name} ({row_count} rows)\n"
139
- text += f"Column: {col_name} ({col.dtype})\n"
140
- text += f"Null count: {null_count}\n"
141
- text += f"Distinct count: {distinct_count} ({distinct_ratio:.1%})\n"
142
-
143
- if is_numeric:
144
- text += f"Min: {col.min()}, Max: {col.max()}\n"
145
- text += f"Mean: {col.mean():.4f}, Median: {col.median():.4f}\n"
146
-
147
- if 0 < distinct_ratio <= 0.05:
148
- top_values = col.value_counts().head(10)
149
- top_str = ", ".join(f"{v} ({c})" for v, c in top_values.items())
150
- text += f"Top values: {top_str}\n"
151
-
152
- text += f"Sample values: {col.dropna().head(5).tolist()}"
153
-
154
- documents.append(LangChainDocument(
155
- page_content=text,
156
- metadata={
157
- "user_id": db_doc.user_id,
158
- "source_type": "document",
159
- "chunk_level": "column",
160
- "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
161
- "data": {
162
- "document_id": db_doc.id,
163
- "filename": db_doc.filename,
164
- "file_type": db_doc.file_type,
165
- "source": source_name,
166
- "column_name": col_name,
167
- "column_type": str(col.dtype),
168
- }
169
- }
170
- ))
171
- return documents
172
-
173
- def _to_sheet_document(
174
- self, df: pd.DataFrame, db_doc: DBDocument, sheet_name: str | None, source_name: str
175
- ) -> LangChainDocument:
176
- col_summary = ", ".join(f"{c} ({df[c].dtype})" for c in df.columns)
177
- text = (
178
- f"Source: {source_name} ({len(df)} rows)\n"
179
- f"Columns ({len(df.columns)}): {col_summary}"
180
- )
181
- return LangChainDocument(
182
- page_content=text,
183
- metadata={
184
- "user_id": db_doc.user_id,
185
- "source_type": "document",
186
- "chunk_level": "sheet",
187
- "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
188
- "data": {
189
- "document_id": db_doc.id,
190
- "filename": db_doc.filename,
191
- "file_type": db_doc.file_type,
192
- "sheet_name": sheet_name,
193
- "column_names": list(df.columns),
194
- "row_count": len(df),
195
- },
196
- },
197
- )
198
 
199
- async def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
200
- """Profile each column of a CSV file and upload Parquet to Azure Blob."""
201
- df = pd.read_csv(BytesIO(content))
202
- await upload_parquet(df, db_doc.user_id, db_doc.id)
203
- logger.info(f"Uploaded Parquet for CSV {db_doc.id}")
204
- docs = self._profile_dataframe(df, db_doc.filename, db_doc)
205
- docs.append(self._to_sheet_document(df, db_doc, sheet_name=None, source_name=db_doc.filename))
206
- return docs
207
-
208
- async def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
209
- """Profile each column of every sheet in an Excel file and upload one Parquet per sheet."""
210
- sheets = pd.read_excel(BytesIO(content), sheet_name=None)
211
- documents = []
212
- for sheet_name, df in sheets.items():
213
- source_name = f"{db_doc.filename} / sheet: {sheet_name}"
214
- docs = self._profile_dataframe(df, source_name, db_doc)
215
- for doc in docs:
216
- doc.metadata["data"]["sheet_name"] = sheet_name
217
- doc.metadata["chunk_level"] = "column"
218
- documents.extend(docs)
219
- documents.append(self._to_sheet_document(df, db_doc, sheet_name, source_name))
220
- await upload_parquet(df, db_doc.user_id, db_doc.id, sheet_name)
221
- logger.info(f"Uploaded Parquet for sheet '{sheet_name}' of {db_doc.id}")
222
  return documents
223
 
224
  def _extract_text(self, content: bytes, file_type: str) -> str:
 
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
8
+ from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
11
+ from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
12
+ from azure.core.credentials import AzureKeyCredential
13
  from typing import List
14
+ import pypdf
 
15
  import docx
 
 
 
16
  from io import BytesIO
17
 
 
 
18
  logger = get_logger("knowledge_processing")
19
 
20
 
 
40
 
41
  if db_doc.file_type == "pdf":
42
  documents = await self._build_pdf_documents(content, db_doc)
 
 
 
 
43
  else:
44
  text = self._extract_text(content, db_doc.file_type)
45
  if not text.strip():
 
49
  LangChainDocument(
50
  page_content=chunk,
51
  metadata={
52
+ "document_id": db_doc.id,
53
  "user_id": db_doc.user_id,
54
+ "filename": db_doc.filename,
55
+ "chunk_index": i,
 
 
 
 
 
 
56
  }
57
  )
58
  for i, chunk in enumerate(chunks)
 
74
  async def _build_pdf_documents(
75
  self, content: bytes, db_doc: DBDocument
76
  ) -> List[LangChainDocument]:
77
+ """Build LangChain documents from PDF with page_label metadata.
78
+
79
+ Uses Azure Document Intelligence (per-page) when credentials are present,
80
+ falls back to pypdf (also per-page) otherwise.
81
+ """
82
  documents: List[LangChainDocument] = []
83
 
84
+ if settings.azureai_docintel_endpoint and settings.azureai_docintel_key:
85
+ async with DocumentIntelligenceClient(
86
+ endpoint=settings.azureai_docintel_endpoint,
87
+ credential=AzureKeyCredential(settings.azureai_docintel_key),
88
+ ) as client:
89
+ poller = await client.begin_analyze_document(
90
+ model_id="prebuilt-read",
91
+ body=BytesIO(content),
92
+ content_type="application/pdf",
93
+ )
94
+ result = await poller.result()
95
+ logger.info(f"Azure DI extracted {len(result.pages or [])} pages")
96
+
97
+ for page in result.pages or []:
98
+ page_text = "\n".join(
99
+ line.content for line in (page.lines or [])
100
+ )
101
+ if not page_text.strip():
102
+ continue
103
+ for chunk in self.text_splitter.split_text(page_text):
104
+ documents.append(LangChainDocument(
105
+ page_content=chunk,
106
+ metadata={
107
+ "document_id": db_doc.id,
108
+ "user_id": db_doc.user_id,
109
+ "filename": db_doc.filename,
110
+ "chunk_index": len(documents),
111
+ "page_label": page.page_number,
112
+ }
113
+ ))
114
+ else:
115
+ logger.warning("Azure DI not configured, using pypdf")
116
+ pdf_reader = pypdf.PdfReader(BytesIO(content))
117
+ for page_num, page in enumerate(pdf_reader.pages, start=1):
118
+ page_text = page.extract_text() or ""
119
+ if not page_text.strip():
120
+ continue
121
+ for chunk in self.text_splitter.split_text(page_text):
122
+ documents.append(LangChainDocument(
123
+ page_content=chunk,
124
+ metadata={
125
  "document_id": db_doc.id,
126
+ "user_id": db_doc.user_id,
127
  "filename": db_doc.filename,
 
128
  "chunk_index": len(documents),
129
  "page_label": page_num,
130
+ }
131
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  return documents
134
 
135
  def _extract_text(self, content: bytes, file_type: str) -> str:
src/models/credentials.py DELETED
@@ -1,164 +0,0 @@
1
- """Pydantic credential schemas for user-registered external databases.
2
-
3
- Imported by the `/database-clients` API router (`src/api/v1/db_client.py`) and,
4
- via `DbType`, by the db pipeline connector (`src/pipeline/db_pipeline/connector.py`).
5
-
6
- Sensitive fields (`password`, `service_account_json`) are Fernet-encrypted by
7
- the database_client service before being stored in the JSONB column; these
8
- schemas describe the plaintext wire format, not the stored shape.
9
- """
10
-
11
- from typing import Literal, Optional, Union
12
-
13
- from pydantic import BaseModel, Field
14
-
15
- # ---------------------------------------------------------------------------
16
- # Supported DB types
17
- # ---------------------------------------------------------------------------
18
-
19
- DbType = Literal["postgres", "mysql", "sqlserver", "supabase", "bigquery", "snowflake"]
20
-
21
-
22
- # ---------------------------------------------------------------------------
23
- # Typed credential schemas per DB type
24
- # ---------------------------------------------------------------------------
25
-
26
-
27
- class PostgresCredentials(BaseModel):
28
- """Connection credentials for PostgreSQL."""
29
-
30
- host: str = Field(..., description="Hostname or IP address of the PostgreSQL server.", examples=["db.example.com"])
31
- port: int = Field(5432, description="Port number (default: 5432).", examples=[5432])
32
- database: str = Field(..., description="Name of the target database.", examples=["mydb"])
33
- username: str = Field(..., description="Database username.", examples=["admin"])
34
- password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
35
- ssl_mode: Literal["disable", "require", "verify-ca", "verify-full"] = Field(
36
- "require",
37
- description="SSL mode for the connection.",
38
- examples=["require"],
39
- )
40
-
41
-
42
- class MysqlCredentials(BaseModel):
43
- """Connection credentials for MySQL."""
44
-
45
- host: str = Field(..., description="Hostname or IP address of the MySQL server.", examples=["db.example.com"])
46
- port: int = Field(3306, description="Port number (default: 3306).", examples=[3306])
47
- database: str = Field(..., description="Name of the target database.", examples=["mydb"])
48
- username: str = Field(..., description="Database username.", examples=["admin"])
49
- password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
50
- ssl: bool = Field(True, description="Enable SSL for the connection.", examples=[True])
51
-
52
-
53
- class SqlServerCredentials(BaseModel):
54
- """Connection credentials for Microsoft SQL Server."""
55
-
56
- host: str = Field(..., description="Hostname or IP address of the SQL Server.", examples=["sqlserver.example.com"])
57
- port: int = Field(1433, description="Port number (default: 1433).", examples=[1433])
58
- database: str = Field(..., description="Name of the target database.", examples=["mydb"])
59
- username: str = Field(..., description="Database username.", examples=["sa"])
60
- password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
61
- driver: Optional[str] = Field(
62
- None,
63
- description="ODBC driver name. Leave empty to use the default driver.",
64
- examples=["ODBC Driver 17 for SQL Server"],
65
- )
66
-
67
-
68
- class SupabaseCredentials(BaseModel):
69
- """Connection credentials for Supabase (PostgreSQL-based).
70
-
71
- Use the connection string details from your Supabase project dashboard
72
- under Settings > Database.
73
- """
74
-
75
- host: str = Field(
76
- ...,
77
- description="Supabase database host (e.g. db.<project-ref>.supabase.co, or the pooler host).",
78
- examples=["db.xxxx.supabase.co"],
79
- )
80
- port: int = Field(
81
- 5432,
82
- description="Port number. Use 5432 for direct connection, 6543 for the connection pooler.",
83
- examples=[5432],
84
- )
85
- database: str = Field("postgres", description="Database name (always 'postgres' for Supabase).", examples=["postgres"])
86
- username: str = Field(
87
- ...,
88
- description="Database user. Use 'postgres' for direct connection, or 'postgres.<project-ref>' for the pooler.",
89
- examples=["postgres"],
90
- )
91
- password: str = Field(..., description="Database password (set in Supabase dashboard). Will be encrypted at rest.", examples=["s3cr3t!"])
92
- ssl_mode: Literal["require", "verify-ca", "verify-full"] = Field(
93
- "require",
94
- description="SSL mode. Supabase always requires SSL.",
95
- examples=["require"],
96
- )
97
-
98
-
99
- class BigQueryCredentials(BaseModel):
100
- """Connection credentials for Google BigQuery.
101
-
102
- Requires a GCP Service Account with at least BigQuery Data Viewer
103
- and BigQuery Job User roles.
104
- """
105
-
106
- project_id: str = Field(..., description="GCP project ID where the BigQuery dataset resides.", examples=["my-gcp-project"])
107
- dataset_id: str = Field(..., description="BigQuery dataset name to connect to.", examples=["my_dataset"])
108
- location: Optional[str] = Field(
109
- "US",
110
- description="Dataset location/region (default: US).",
111
- examples=["US", "EU", "asia-southeast1"],
112
- )
113
- service_account_json: str = Field(
114
- ...,
115
- description=(
116
- "Full content of the GCP Service Account key JSON file as a string. "
117
- "Will be encrypted at rest."
118
- ),
119
- examples=['{"type":"service_account","project_id":"my-gcp-project","private_key_id":"..."}'],
120
- )
121
-
122
-
123
- class SnowflakeCredentials(BaseModel):
124
- """Connection credentials for Snowflake."""
125
-
126
- account: str = Field(
127
- ...,
128
- description="Snowflake account identifier, including region if applicable (e.g. myaccount.us-east-1).",
129
- examples=["myaccount.us-east-1"],
130
- )
131
- warehouse: str = Field(..., description="Name of the virtual warehouse to use for queries.", examples=["COMPUTE_WH"])
132
- database: str = Field(..., description="Name of the target Snowflake database.", examples=["MY_DB"])
133
- db_schema: Optional[str] = Field("PUBLIC", alias="schema", description="Schema name (default: PUBLIC).", examples=["PUBLIC"])
134
- username: str = Field(..., description="Snowflake username.", examples=["admin"])
135
- password: str = Field(..., description="Snowflake password. Will be encrypted at rest.", examples=["s3cr3t!"])
136
- role: Optional[str] = Field(None, description="Snowflake role to assume for the session.", examples=["SYSADMIN"])
137
-
138
-
139
- # Union of all credential shapes — reserved for future typed validation on
140
- # DatabaseClientCreate.credentials (currently Dict[str, Any]). Kept exported
141
- # so downstream code can reference it without re-declaring.
142
- CredentialsUnion = Union[
143
- PostgresCredentials,
144
- MysqlCredentials,
145
- SqlServerCredentials,
146
- SupabaseCredentials,
147
- BigQueryCredentials,
148
- SnowflakeCredentials,
149
- ]
150
-
151
-
152
- # Doc-only helper: surfaces per-type credential shapes in the Swagger "Schemas"
153
- # panel so API consumers can discover the exact field set for each db_type.
154
- # Not referenced by any endpoint — importing it in db_client.py is enough for
155
- # FastAPI's OpenAPI generator to pick it up.
156
- class CredentialSchemas(BaseModel):
157
- """Reference schemas for `credentials` per `db_type` (Swagger-only, not used by endpoints)."""
158
-
159
- postgres: PostgresCredentials
160
- mysql: MysqlCredentials
161
- sqlserver: SqlServerCredentials
162
- supabase: SupabaseCredentials
163
- bigquery: BigQueryCredentials
164
- snowflake: SnowflakeCredentials
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/sql_query.py DELETED
@@ -1,8 +0,0 @@
1
- """Structured output model for LLM-generated SQL queries."""
2
-
3
- from pydantic import BaseModel, Field
4
-
5
-
6
- class SQLQuery(BaseModel):
7
- sql: str = Field(description="A single SQL SELECT statement. No markdown, no explanation inline.")
8
- reasoning: str = Field(description="One sentence: what this query answers.")
 
 
 
 
 
 
 
 
 
src/models/structured_output.py CHANGED
@@ -19,7 +19,3 @@ class IntentClassification(BaseModel):
19
  default="",
20
  description="Direct response if no search needed (for greetings, etc.)"
21
  )
22
- source_hint: str = Field(
23
- default="both",
24
- description="Which sources to search: 'document' (PDF/DOCX/TXT), 'schema' (DB/CSV/XLSX), or 'both'"
25
- )
 
19
  default="",
20
  description="Direct response if no search needed (for greetings, etc.)"
21
  )
 
 
 
 
src/pipeline/db_pipeline/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from src.pipeline.db_pipeline.db_pipeline_service import DbPipelineService, db_pipeline_service
2
-
3
- __all__ = ["DbPipelineService", "db_pipeline_service"]
 
 
 
 
src/pipeline/db_pipeline/db_pipeline_service.py DELETED
@@ -1,302 +0,0 @@
1
- """Service for ingesting a user's external database into the vector store.
2
-
3
- End-to-end flow: connect -> introspect schema -> profile columns -> build text
4
- -> embed + store in the shared PGVector collection (tagged with
5
- `source_type="database"`, retrievable via the same retriever used for docs).
6
-
7
- Sync DB work (SQLAlchemy inspect, pandas read_sql) runs in a threadpool;
8
- async vector writes stay on the event loop.
9
- """
10
-
11
- import asyncio
12
- from contextlib import contextmanager
13
- from datetime import datetime, timezone, timedelta
14
- from typing import Any, Iterator, Optional
15
-
16
- from langchain_core.documents import Document as LangChainDocument
17
- from sqlalchemy import URL, create_engine, text
18
- from sqlalchemy.engine import Engine
19
-
20
- from src.db.postgres.connection import _pgvector_engine
21
- from src.db.postgres.vector_store import get_vector_store
22
- from src.middlewares.logging import get_logger
23
- from src.models.credentials import DbType
24
- from src.pipeline.db_pipeline.extractor import (
25
- build_table_chunk,
26
- fetch_sample_row,
27
- get_row_count,
28
- get_schema,
29
- profile_table,
30
- )
31
-
32
- logger = get_logger("db_pipeline")
33
-
34
-
35
- class DbPipelineService:
36
- """End-to-end DB ingestion: connect -> introspect -> profile -> embed -> store."""
37
-
38
- def connect(self, db_type: DbType, credentials: dict[str, Any]) -> Engine:
39
- """Build a SQLAlchemy engine for the user's database.
40
-
41
- `credentials` is the plaintext dict matching the per-type schema in
42
- `src/models/credentials.py`. BigQuery/Snowflake auth models differ
43
- from host/port/user/pass, so every shape flows through one dict.
44
-
45
- Optional driver imports (snowflake-sqlalchemy, json for BigQuery) are
46
- done lazily so an env missing one driver doesn't break module import.
47
- """
48
- logger.info("connecting to user db", db_type=db_type)
49
-
50
- if db_type in ("postgres", "supabase"):
51
- query = (
52
- {"sslmode": credentials["ssl_mode"]} if credentials.get("ssl_mode") else {}
53
- )
54
- url = URL.create(
55
- drivername="postgresql+psycopg2",
56
- username=credentials["username"],
57
- password=credentials["password"],
58
- host=credentials["host"],
59
- port=credentials["port"],
60
- database=credentials["database"],
61
- query=query,
62
- )
63
- return create_engine(url)
64
-
65
- if db_type == "mysql":
66
- url = URL.create(
67
- drivername="mysql+pymysql",
68
- username=credentials["username"],
69
- password=credentials["password"],
70
- host=credentials["host"],
71
- port=credentials["port"],
72
- database=credentials["database"],
73
- )
74
- # pymysql only activates TLS when the `ssl` dict is truthy
75
- # (empty dict is falsy and silently disables TLS). Use system-
76
- # default CAs via certifi + hostname verification — required by
77
- # managed MySQL providers like TiDB Cloud / PlanetScale / Aiven.
78
- if credentials.get("ssl", True):
79
- import certifi
80
-
81
- connect_args = {
82
- "ssl": {
83
- "ca": certifi.where(),
84
- "check_hostname": True,
85
- }
86
- }
87
- else:
88
- connect_args = {}
89
- return create_engine(url, connect_args=connect_args)
90
-
91
- if db_type == "sqlserver":
92
- # `driver` applies to pyodbc only; we ship pymssql. Accept-and-ignore
93
- # keeps the credential schema stable.
94
- if credentials.get("driver"):
95
- logger.info(
96
- "sqlserver driver hint ignored (using pymssql)",
97
- driver=credentials["driver"],
98
- )
99
- url = URL.create(
100
- drivername="mssql+pymssql",
101
- username=credentials["username"],
102
- password=credentials["password"],
103
- host=credentials["host"],
104
- port=credentials["port"],
105
- database=credentials["database"],
106
- )
107
- return create_engine(url)
108
-
109
- if db_type == "bigquery":
110
- import json
111
-
112
- sa_info = json.loads(credentials["service_account_json"])
113
- # sqlalchemy-bigquery URL shape: bigquery://<project>/<dataset>
114
- url = f"bigquery://{credentials['project_id']}/{credentials['dataset_id']}"
115
- return create_engine(
116
- url,
117
- credentials_info=sa_info,
118
- location=credentials.get("location", "US"),
119
- )
120
-
121
- if db_type == "snowflake":
122
- from snowflake.sqlalchemy import URL as SnowflakeURL
123
-
124
- url = SnowflakeURL(
125
- account=credentials["account"],
126
- user=credentials["username"],
127
- password=credentials["password"],
128
- database=credentials["database"],
129
- schema=(
130
- credentials.get("db_schema")
131
- or credentials.get("schema")
132
- or "PUBLIC"
133
- ),
134
- warehouse=credentials["warehouse"],
135
- role=credentials.get("role") or "",
136
- )
137
- return create_engine(url)
138
-
139
- raise NotImplementedError(f"Unsupported db_type: {db_type}")
140
-
141
- @contextmanager
142
- def engine_scope(
143
- self, db_type: DbType, credentials: dict[str, Any]
144
- ) -> Iterator[Engine]:
145
- """Yield a connected Engine and dispose its pool on exit.
146
-
147
- API callers should prefer this over raw `connect(...)` so user DB
148
- connection pools do not leak between pipeline runs.
149
- """
150
- engine = self.connect(db_type, credentials)
151
- try:
152
- yield engine
153
- finally:
154
- engine.dispose()
155
-
156
- def _to_document(
157
- self, user_id: str, client_id: str, table_name: str, entry: dict, updated_at: str
158
- ) -> LangChainDocument:
159
- col = entry["col"]
160
- return LangChainDocument(
161
- page_content=entry["text"],
162
- metadata={
163
- "user_id": user_id,
164
- "source_type": "database",
165
- "chunk_level": "column",
166
- "database_client_id": client_id,
167
- "updated_at": updated_at,
168
- "data": {
169
- "table_name": table_name,
170
- "column_name": col["name"],
171
- "column_type": col["type"],
172
- "is_primary_key": col.get("is_primary_key", False),
173
- "foreign_key": col.get("foreign_key"),
174
- },
175
- },
176
- )
177
-
178
- def _to_table_document(
179
- self,
180
- user_id: str,
181
- client_id: str,
182
- table_name: str,
183
- columns: list[dict],
184
- row_count: int,
185
- text: str,
186
- updated_at: str,
187
- ) -> LangChainDocument:
188
- foreign_keys = []
189
- for c in columns:
190
- fk = c.get("foreign_key")
191
- if not fk:
192
- continue
193
- target_table, _, target_column = fk.partition(".")
194
- foreign_keys.append({
195
- "column": c["name"],
196
- "target_table": target_table,
197
- "target_column": target_column,
198
- })
199
-
200
- return LangChainDocument(
201
- page_content=text,
202
- metadata={
203
- "user_id": user_id,
204
- "source_type": "database",
205
- "chunk_level": "table",
206
- "database_client_id": client_id,
207
- "updated_at": updated_at,
208
- "data": {
209
- "table_name": table_name,
210
- "row_count": row_count,
211
- "primary_key": [c["name"] for c in columns if c.get("is_primary_key")],
212
- "foreign_keys": foreign_keys,
213
- "column_names": [c["name"] for c in columns],
214
- },
215
- },
216
- )
217
-
218
- async def run(
219
- self,
220
- user_id: str,
221
- client_id: str,
222
- engine: Engine,
223
- exclude_tables: Optional[frozenset[str]] = None,
224
- ) -> int:
225
- """Introspect the user's DB, profile columns, embed descriptions, store in PGVector.
226
-
227
- Returns:
228
- Total number of chunks ingested.
229
- """
230
- vector_store = get_vector_store()
231
- logger.info("db pipeline start", user_id=user_id)
232
-
233
- # Profile first — if this fails, old embeddings are untouched
234
- schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
235
-
236
- updated_at = datetime.now(timezone(timedelta(hours=7))).isoformat()
237
- all_docs: list = []
238
- for table_name, columns in schema.items():
239
- logger.info("profiling table", table=table_name, columns=len(columns))
240
- entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
241
- docs = [self._to_document(user_id, client_id, table_name, e, updated_at) for e in entries]
242
- all_docs.extend(docs)
243
-
244
- # Table-level chunk. Failures here are logged and skipped — column
245
- # chunks above are already in all_docs and will still be written.
246
- try:
247
- row_count = await asyncio.to_thread(get_row_count, engine, table_name)
248
- sample_row = (
249
- await asyncio.to_thread(fetch_sample_row, engine, table_name)
250
- if row_count > 0
251
- else None
252
- )
253
- table_text = build_table_chunk(
254
- table_name, row_count, columns, entries, sample_row
255
- )
256
- all_docs.append(
257
- self._to_table_document(
258
- user_id, client_id, table_name, columns, row_count, table_text, updated_at
259
- )
260
- )
261
- except Exception as e:
262
- logger.error(
263
- "table chunk generation failed", table=table_name, error=str(e)
264
- )
265
-
266
- logger.info("profiled table", table=table_name, count=len(docs))
267
-
268
- # Insert new chunks first; only delete stale chunks after the insert succeeds.
269
- # Prevents data loss if aadd_documents fails — old embeddings stay queryable
270
- # until they're proven replaceable. Stale rows are identified by an older
271
- # updated_at than this run.
272
- if not all_docs:
273
- logger.warning(
274
- "no docs produced from schema; skipping delete to preserve existing embeddings",
275
- user_id=user_id,
276
- client_id=client_id,
277
- )
278
- return 0
279
-
280
- await vector_store.aadd_documents(all_docs)
281
-
282
- async with _pgvector_engine.begin() as conn:
283
- result = await conn.execute(
284
- text(
285
- "DELETE FROM langchain_pg_embedding "
286
- "WHERE cmetadata->>'user_id' = :user_id "
287
- " AND cmetadata->>'source_type' = 'database' "
288
- " AND cmetadata->>'database_client_id' = :client_id "
289
- " AND cmetadata->>'updated_at' < :updated_at "
290
- " AND collection_id = ("
291
- " SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'"
292
- " )"
293
- ),
294
- {"user_id": user_id, "client_id": client_id, "updated_at": updated_at},
295
- )
296
- logger.info("cleared stale db embeddings", user_id=user_id, deleted=result.rowcount)
297
-
298
- logger.info("db pipeline complete", user_id=user_id, total=len(all_docs))
299
- return len(all_docs)
300
-
301
-
302
- db_pipeline_service = DbPipelineService()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pipeline/db_pipeline/extractor.py DELETED
@@ -1,283 +0,0 @@
1
- """Schema introspection and per-column profiling for a user's database.
2
-
3
- Identifiers (table/column names) are quoted via the engine's dialect preparer,
4
- which handles reserved words, mixed case, and embedded quotes correctly across
5
- dialects. Values used in SQL come from SQLAlchemy inspection of the DB itself,
6
- not user input.
7
- """
8
-
9
- from typing import Optional
10
-
11
- import pandas as pd
12
- from sqlalchemy import Float, Integer, Numeric, inspect
13
- from sqlalchemy.engine import Engine
14
-
15
- from src.middlewares.logging import get_logger
16
-
17
- logger = get_logger("db_extractor")
18
-
19
- TOP_VALUES_THRESHOLD = 0.05 # show top values if distinct_ratio <= 5%
20
-
21
- # Dialects where PERCENTILE_CONT(...) WITHIN GROUP is supported as an aggregate.
22
- # MySQL has no percentile aggregate; BigQuery has PERCENTILE_CONT only as an
23
- # analytic (window) function — both drop median and keep min/max/mean.
24
- _MEDIAN_DIALECTS = frozenset({"postgresql", "mssql", "snowflake"})
25
-
26
-
27
def _supports_median(engine: Engine) -> bool:
    """Return True when the engine's dialect name is listed in _MEDIAN_DIALECTS."""
    dialect_name = engine.dialect.name
    return dialect_name in _MEDIAN_DIALECTS
29
-
30
-
31
- def _head_query(
32
- engine: Engine,
33
- select_clause: str,
34
- from_clause: str,
35
- n: int,
36
- order_by: str = "",
37
- ) -> str:
38
- """LIMIT/TOP-equivalent head query for the engine's dialect."""
39
- if engine.dialect.name == "mssql":
40
- return f"SELECT TOP {n} {select_clause} FROM {from_clause} {order_by}".strip()
41
- return f"SELECT {select_clause} FROM {from_clause} {order_by} LIMIT {n}".strip()
42
-
43
-
44
- def _qi(engine: Engine, name: str) -> str:
45
- """Dialect-correct identifier quoting (schema.table also handled if dotted)."""
46
- preparer = engine.dialect.identifier_preparer
47
- if "." in name:
48
- schema, _, table = name.partition(".")
49
- return f"{preparer.quote(schema)}.{preparer.quote(table)}"
50
- return preparer.quote(name)
51
-
52
-
53
def get_schema(
    engine: Engine, exclude_tables: Optional[frozenset[str]] = None
) -> dict[str, list[dict]]:
    """Introspect the tables reported by the engine's inspector.

    Args:
        engine: SQLAlchemy engine to inspect.
        exclude_tables: Table names to leave out of the result; ``None``
            means nothing is excluded.

    Returns:
        ``{table_name: [column, ...]}`` where each column is a dict with the
        keys ``name``, ``type`` (stringified), ``is_numeric``,
        ``is_primary_key`` and ``foreign_key`` (``"table.column"`` or None).
    """
    skipped = exclude_tables or frozenset()
    insp = inspect(engine)
    tables: dict[str, list[dict]] = {}

    for table_name in insp.get_table_names():
        if table_name in skipped:
            continue

        pk_info = insp.get_pk_constraint(table_name)
        primary = set(pk_info["constrained_columns"]) if pk_info else set()

        # local column -> "referred_table.referred_column"
        references = {
            local: f"{fk['referred_table']}.{remote}"
            for fk in insp.get_foreign_keys(table_name)
            for local, remote in zip(fk["constrained_columns"], fk["referred_columns"])
        }

        described = []
        for column in insp.get_columns(table_name):
            column_name = column["name"]
            column_type = column["type"]
            described.append(
                {
                    "name": column_name,
                    "type": str(column_type),
                    "is_numeric": isinstance(column_type, (Integer, Numeric, Float)),
                    "is_primary_key": column_name in primary,
                    "foreign_key": references.get(column_name),
                }
            )
        tables[table_name] = described

    logger.info("extracted schema", table_count=len(tables))
    return tables
85
-
86
-
87
def get_row_count(engine: Engine, table_name: str) -> int:
    """Return ``COUNT(*)`` for the table as a plain Python int."""
    count_sql = f"SELECT COUNT(*) FROM {_qi(engine, table_name)}"
    counted = pd.read_sql(count_sql, engine)
    # Cast to plain int — pandas returns numpy.int64 which fails JSONB serialization
    # when the value lands in PGVector cmetadata via the table-level chunk.
    return int(counted.iloc[0, 0])
91
-
92
-
93
def profile_column(
    engine: Engine,
    table_name: str,
    col_name: str,
    is_numeric: bool,
    row_count: int,
) -> dict:
    """Collect summary statistics for a single column.

    Args:
        engine: SQLAlchemy engine used to run the profiling queries.
        table_name: Table that owns the column.
        col_name: Column to profile.
        is_numeric: When True, min/max/mean (and median where the dialect
            supports it) are added to the profile.
        row_count: Total rows in the table, used for the distinct ratio.

    Returns:
        A dict with ``null_count``, ``distinct_count``, ``distinct_ratio``
        and ``sample_values``; numeric columns add ``min``/``max``/``mean``
        (and possibly ``median``); low-cardinality columns add ``top_values``
        as ``(value, count)`` pairs.
    """
    if row_count == 0:
        # Empty table: no queries are issued and the numeric keys are omitted.
        return {
            "null_count": 0,
            "distinct_count": 0,
            "distinct_ratio": 0.0,
            "sample_values": [],
        }

    qt = _qi(engine, table_name)
    qc = _qi(engine, col_name)
    with_median = is_numeric and _supports_median(engine)

    # All aggregates go into one SELECT so the column costs a single round-trip.
    aggregates = [
        f"COUNT(*) - COUNT({qc}) AS nulls",
        f"COUNT(DISTINCT {qc}) AS distincts",
    ]
    if is_numeric:
        aggregates += [
            f"MIN({qc}) AS min_val",
            f"MAX({qc}) AS max_val",
            f"AVG({qc}) AS mean_val",
        ]
    if with_median:
        aggregates.append(
            f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
        )
    stats_row = pd.read_sql(f"SELECT {', '.join(aggregates)} FROM {qt}", engine).iloc[0]

    null_count = int(stats_row["nulls"])
    distinct_count = int(stats_row["distincts"])
    distinct_ratio = distinct_count / row_count if row_count > 0 else 0

    profile = {
        "null_count": null_count,
        "distinct_count": distinct_count,
        "distinct_ratio": round(distinct_ratio, 4),
    }

    if is_numeric:
        profile.update(
            min=stats_row["min_val"],
            max=stats_row["max_val"],
            mean=stats_row["mean_val"],
        )
    if with_median:
        profile["median"] = stats_row["median_val"]

    # Only low-cardinality columns get a "top values" breakdown.
    if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
        top = pd.read_sql(
            _head_query(
                engine,
                select_clause=f"{qc}, COUNT(*) AS cnt",
                from_clause=f"{qt} GROUP BY {qc}",
                n=10,
                order_by="ORDER BY cnt DESC",
            ),
            engine,
        )
        values = top.iloc[:, 0].tolist()
        counts = top["cnt"].tolist()
        profile["top_values"] = list(zip(values, counts))

    head = pd.read_sql(_head_query(engine, qc, qt, 5), engine)
    profile["sample_values"] = head.iloc[:, 0].tolist()

    return profile
160
-
161
-
162
def profile_table(engine: Engine, table_name: str, columns: list[dict]) -> list[dict]:
    """Profile each column of a table.

    Returns a list of ``{"col", "profile", "text"}`` dicts, one per column
    that profiled successfully. An empty table yields ``[]``. A column whose
    profiling raises is logged and left out, so one bad column does not
    abort the rest of the table.
    """
    row_count = get_row_count(engine, table_name)
    if row_count == 0:
        logger.info("skipping empty table", table=table_name)
        return []

    profiled: list[dict] = []
    for column in columns:
        try:
            stats = profile_column(
                engine,
                table_name,
                column["name"],
                column.get("is_numeric", False),
                row_count,
            )
            rendered = build_text(table_name, row_count, column, stats)
        except Exception as exc:
            logger.error(
                "column profiling failed",
                table=table_name,
                column=column["name"],
                error=str(exc),
            )
        else:
            profiled.append({"col": column, "profile": stats, "text": rendered})
    return profiled
190
-
191
-
192
def fetch_sample_row(engine: Engine, table_name: str) -> Optional[dict]:
    """Return the first row the table yields as a dict, or None when it is empty.

    The query is built with _qi (identifier quoting) and _head_query
    (TOP/LIMIT selection); no ordering is applied.
    """
    first_row_sql = _head_query(engine, "*", _qi(engine, table_name), 1)
    frame = pd.read_sql(first_row_sql, engine)
    return None if frame.empty else frame.iloc[0].to_dict()
203
-
204
-
205
def build_table_chunk(
    table_name: str,
    row_count: int,
    columns: list[dict],
    column_profiles: list[dict],
    sample_row: Optional[dict],
) -> str:
    """Render the table-level chunk text.

    Layout (a line is left out when it has nothing to show):
        Table: {name} ({row_count} rows)
        Primary key: {pk_cols}
        Foreign keys: {col} -> {target_table}.{target_col}, ...
        Columns ({n}): {col1}, {col2}, ...
        Numeric ranges: {col} [{min}-{max}], ...
        Sample row: {dict}

    This only formats its arguments and performs no DB I/O. ``column_profiles``
    is expected to be a list of entries carrying ``"col"`` and ``"profile"``
    keys, as produced by profile_table.
    """
    names: list[str] = []
    primary: list[str] = []
    foreign: list[str] = []
    for column in columns:
        names.append(column["name"])
        if column.get("is_primary_key"):
            primary.append(column["name"])
        if column.get("foreign_key"):
            foreign.append(f"{column['name']} -> {column['foreign_key']}")

    ranges: list[str] = []
    for entry in column_profiles:
        described, stats = entry["col"], entry["profile"]
        if described.get("is_numeric"):
            low, high = stats.get("min"), stats.get("max")
            # A range is only shown when both bounds are known.
            if low is not None and high is not None:
                ranges.append(f"{described['name']} [{low}-{high}]")

    out = [f"Table: {table_name} ({row_count} rows)"]
    if primary:
        out.append(f"Primary key: {', '.join(primary)}")
    if foreign:
        out.append(f"Foreign keys: {', '.join(foreign)}")
    out.append(f"Columns ({len(names)}): {', '.join(names)}")
    if ranges:
        out.append(f"Numeric ranges: {', '.join(ranges)}")
    if sample_row is not None:
        out.append(f"Sample row: {sample_row}")
    return "\n".join(out)
258
-
259
-
260
def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str:
    """Render the column-level chunk text from a column dict and its profile.

    The output always carries the table header, the column line (with a
    PRIMARY KEY or FK marker when applicable), null/distinct counts and the
    sample values. Min/Max/Mean (plus Median when present and not None) are
    added when the profile has a ``"min"`` key, and a "Top values" line when
    it has ``"top_values"``.
    """
    if col.get("is_primary_key"):
        key_label = " [PRIMARY KEY]"
    elif col.get("foreign_key"):
        key_label = f" [FK -> {col['foreign_key']}]"
    else:
        key_label = ""

    parts = [
        f"Table: {table_name} ({row_count} rows)",
        f"Column: {col['name']} ({col['type']}){key_label}",
        f"Null count: {profile['null_count']}",
        f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})",
    ]

    if "min" in profile:
        parts.append(f"Min: {profile['min']}, Max: {profile['max']}")
        parts.append(f"Mean: {profile['mean']}")
        if profile.get("median") is not None:
            parts.append(f"Median: {profile['median']}")

    if "top_values" in profile:
        ranked = ", ".join(f"{value} ({count})" for value, count in profile["top_values"])
        parts.append(f"Top values: {ranked}")

    parts.append(f"Sample values: {profile['sample_values']}")
    return "\n".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pipeline/document_pipeline/__init__.py DELETED
File without changes
src/pipeline/document_pipeline/document_pipeline.py DELETED
@@ -1,94 +0,0 @@
1
- """Document upload and processing pipeline."""
2
-
3
- from fastapi import HTTPException, UploadFile
4
- from sqlalchemy.ext.asyncio import AsyncSession
5
-
6
- from src.document.document_service import document_service
7
- from src.knowledge.processing_service import knowledge_processor
8
- from src.knowledge.parquet_service import delete_document_parquets
9
- from src.middlewares.logging import get_logger
10
- from src.storage.az_blob.az_blob import blob_storage
11
-
12
- logger = get_logger("document_pipeline")
13
-
14
- # NOTE: Keep in sync with _DOC_TYPES in src/api/v1/document.py
15
- SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
16
- MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024 # 10 MB
17
-
18
-
19
class DocumentPipeline:
    """Coordinates the upload, processing, and deletion flows for documents."""

    async def upload(self, file: UploadFile, user_id: str, db: AsyncSession) -> dict:
        """Validate an uploaded file, store it in blob storage, and record it.

        Raises:
            HTTPException: 400 when the filename is missing, the content is
                larger than MAX_FILE_SIZE_BYTES, or the extension is not in
                SUPPORTED_FILE_TYPES.

        Returns:
            ``{"id", "filename", "status"}`` of the created document.
        """
        payload = await file.read()
        filename = file.filename
        if not filename:
            raise HTTPException(status_code=400, detail="Filename is required.")

        # Extension is whatever follows the last dot; no dot at all means "txt".
        _, dot, extension = filename.rpartition(".")
        file_type = extension.lower() if dot else "txt"

        if len(payload) > MAX_FILE_SIZE_BYTES:
            raise HTTPException(
                status_code=400,
                detail="File size exceeds maximum allowed size of 10 MB.",
            )

        if file_type not in SUPPORTED_FILE_TYPES:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}",
            )

        blob_name = await blob_storage.upload_file(payload, filename, user_id)
        document = await document_service.create_document(
            db=db,
            user_id=user_id,
            filename=filename,
            blob_name=blob_name,
            file_size=len(payload),
            file_type=file_type,
        )

        logger.info(f"Uploaded document {document.id} for user {user_id}")
        return {"id": document.id, "filename": document.filename, "status": document.status}

    async def _load_owned_document(self, document_id: str, user_id: str, db: AsyncSession):
        """Fetch a document; raise 404 if it is missing or 403 if owned by another user."""
        document = await document_service.get_document(db, document_id)
        if not document:
            raise HTTPException(status_code=404, detail="Document not found")
        if document.user_id != user_id:
            raise HTTPException(status_code=403, detail="Access denied")
        return document

    async def process(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
        """Check ownership, then run the knowledge processor on the document.

        The document status is moved to "processing", then "completed" on
        success. Any exception marks it "failed" (with the error text) and is
        re-raised to the caller as an HTTP 500.

        Returns:
            ``{"document_id", "chunks_processed"}``.
        """
        document = await self._load_owned_document(document_id, user_id, db)

        try:
            await document_service.update_document_status(db, document_id, "processing")
            chunk_total = await knowledge_processor.process_document(document, db)
            await document_service.update_document_status(db, document_id, "completed")

            logger.info(f"Processed document {document_id}: {chunk_total} chunks")
            return {"document_id": document_id, "chunks_processed": chunk_total}

        except Exception as e:
            logger.error(f"Processing failed for document {document_id}", error=str(e))
            await document_service.update_document_status(db, document_id, "failed", str(e))
            raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

    async def delete(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
        """Check ownership, delete the document, and clean up parquet files.

        Parquet cleanup only runs for "csv" and "xlsx" documents.

        Returns:
            ``{"document_id"}`` of the deleted document.
        """
        document = await self._load_owned_document(document_id, user_id, db)

        await document_service.delete_document(db, document_id)

        is_tabular = document.file_type in ("csv", "xlsx")
        if is_tabular:
            await delete_document_parquets(user_id, document_id)

        logger.info(f"Deleted document {document_id} for user {user_id}")
        return {"document_id": document_id}
92
-
93
-
94
- document_pipeline = DocumentPipeline()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/query/__init__.py DELETED
File without changes
src/query/base.py DELETED
@@ -1,32 +0,0 @@
1
- """Shared contract for query executors."""
2
-
3
- from abc import ABC, abstractmethod
4
- from dataclasses import dataclass, field
5
-
6
- from sqlalchemy.ext.asyncio import AsyncSession
7
-
8
- from src.rag.base import RetrievalResult
9
-
10
-
11
@dataclass
class QueryResult:
    """Rows returned by a query executor for one source, plus descriptive metadata."""

    source_type: str  # "database" or "document"
    source_id: str  # database_client_id or document_id
    # Name of the table or file the rows came from.
    # NOTE(review): may hold several comma-joined table names — confirm against the executors.
    table_or_file: str
    columns: list[str]
    rows: list[dict]
    row_count: int
    # Fresh dict per instance via default_factory (avoids a shared mutable default).
    metadata: dict = field(default_factory=dict)
    # metadata should include "column_types": {"col_name": "dtype"} when available
21
-
22
-
23
class BaseExecutor(ABC):
    """Abstract base class that query executors subclass."""

    # Subclasses must implement this coroutine. It receives the retrieval
    # results, the requesting user's id, a database session, the user's
    # question, and a row limit (default 100), and returns a list of
    # QueryResult objects.
    @abstractmethod
    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        db: AsyncSession,
        question: str,
        limit: int = 100,
    ) -> list[QueryResult]: ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/query/executors/__init__.py DELETED
File without changes
src/query/executors/db_executor.py DELETED
@@ -1,648 +0,0 @@
1
- """Executor for registered database sources (source_type="database").
2
-
3
- Flow per (client_id, question):
4
- 1. Collect all relevant (table_name, column_name) pairs from retrieval results.
5
- 2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
6
- 3. Build a schema context string and send to LLM → structured SQLQuery output.
7
- 4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
8
- 5. Execute on the user's DB via engine_scope + asyncio.to_thread.
9
- 6. Return QueryResult per client_id (may span multiple tables via JOINs).
10
-
11
- Supported db_types: postgres, supabase, mysql.
12
- Other types are skipped with a warning — they do not raise.
13
- """
14
-
15
- import asyncio
16
- from collections import defaultdict
17
- from typing import Any
18
-
19
- import sqlglot
20
- import sqlglot.expressions as exp
21
- import tiktoken
22
- from langchain_core.prompts import ChatPromptTemplate
23
- from langchain_openai import AzureChatOpenAI
24
- from sqlalchemy import text
25
- from sqlalchemy.ext.asyncio import AsyncSession
26
-
27
- from src.config.settings import settings
28
- from src.database_client.database_client_service import database_client_service
29
- from src.db.postgres.connection import _pgvector_engine
30
- from src.middlewares.logging import get_logger
31
- from src.models.sql_query import SQLQuery
32
- from src.pipeline.db_pipeline import db_pipeline_service
33
- from src.query.base import BaseExecutor, QueryResult
34
- from src.rag.base import RetrievalResult
35
- from src.utils.db_credential_encryption import decrypt_credentials_dict
36
-
37
- logger = get_logger("db_executor")
38
-
39
- _enc = tiktoken.get_encoding("cl100k_base")
40
-
41
- _SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
42
- _MAX_RETRIES = 3
43
- _MAX_LIMIT = 500
44
- _FK_EXPANSION_MAX_TABLES = 5
45
-
46
- _SQL_SYSTEM_PROMPT = """\
47
- You are a SQL data analyst working with a user's database.
48
- Generate a single SQL SELECT statement that answers the user's question.
49
-
50
- Database dialect: {dialect}
51
-
52
- Rules:
53
- - ONLY reference tables and columns listed in the schema below. Do not invent names.
54
- - Always include a LIMIT clause (max {limit}).
55
- - Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
56
- - Prefer explicit JOINs over subqueries when combining tables.
57
- - For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
58
- - For date filtering, use dialect-appropriate functions ({dialect} syntax).
59
-
60
- Schema:
61
- {schema}
62
-
63
- {error_section}"""
64
-
65
-
66
- class DbExecutor(BaseExecutor):
67
- def __init__(self) -> None:
68
- self._llm = AzureChatOpenAI(
69
- azure_deployment=settings.azureai_deployment_name_4o,
70
- openai_api_version=settings.azureai_api_version_4o,
71
- azure_endpoint=settings.azureai_endpoint_url_4o,
72
- api_key=settings.azureai_api_key_4o,
73
- temperature=0,
74
- )
75
- self._prompt = ChatPromptTemplate.from_messages([
76
- ("system", _SQL_SYSTEM_PROMPT),
77
- ("human", "{question}"),
78
- ])
79
- self._chain = self._prompt | self._llm.with_structured_output(SQLQuery)
80
-
81
- # ------------------------------------------------------------------
82
- # Public interface
83
- # ------------------------------------------------------------------
84
-
85
- async def execute(
86
- self,
87
- results: list[RetrievalResult],
88
- user_id: str,
89
- db: AsyncSession,
90
- question: str,
91
- limit: int = 100,
92
- ) -> list[QueryResult]:
93
- db_results = [r for r in results if r.source_type == "database"]
94
- if not db_results:
95
- return []
96
-
97
- # Group by client_id — one SQL generation + execution pass per client
98
- by_client: dict[str, list[RetrievalResult]] = defaultdict(list)
99
- for r in db_results:
100
- client_id = r.metadata.get("database_client_id", "")
101
- if client_id:
102
- by_client[client_id].append(r)
103
- else:
104
- logger.warning("db result missing database_client_id, skipping")
105
-
106
- query_results: list[QueryResult] = []
107
- for client_id, client_results in by_client.items():
108
- try:
109
- qr = await self._execute_for_client(client_id, client_results, user_id, db, question, limit)
110
- if qr:
111
- query_results.append(qr)
112
- except Exception as e:
113
- logger.error("db executor failed for client", client_id=client_id, error=str(e))
114
-
115
- return query_results
116
-
117
- # ------------------------------------------------------------------
118
- # Per-client execution
119
- # ------------------------------------------------------------------
120
-
121
- async def _execute_for_client(
122
- self,
123
- client_id: str,
124
- results: list[RetrievalResult],
125
- user_id: str,
126
- db: AsyncSession,
127
- question: str,
128
- limit: int,
129
- ) -> QueryResult | None:
130
- client = await database_client_service.get(db, client_id)
131
- if not client:
132
- logger.warning("database client not found", client_id=client_id)
133
- return None
134
- if client.user_id != user_id:
135
- logger.warning("client ownership mismatch", client_id=client_id)
136
- return None
137
- if client.db_type not in _SUPPORTED_DB_TYPES:
138
- logger.warning("unsupported db_type for query execution", db_type=client.db_type)
139
- return None
140
-
141
- # Hit tables = tables retrieval pointed at directly. Get full per-column
142
- # schema for these. Related tables (one FK hop away, both directions) are
143
- # fetched separately in abbreviated form to give the LLM enough context
144
- # to JOIN without paying the per-column profile token cost.
145
- hit_tables = list({
146
- r.metadata.get("data", {}).get("table_name")
147
- for r in results
148
- if r.metadata.get("data", {}).get("table_name")
149
- })
150
- if not hit_tables:
151
- logger.warning("no table_name on any retrieval result", client_id=client_id)
152
- return None
153
-
154
- full_schema = await self._fetch_full_schema(client_id, hit_tables, user_id)
155
- if not full_schema:
156
- logger.warning("no schema found in vector store", client_id=client_id, tables=hit_tables)
157
- return None
158
-
159
- related_tables = await self._find_related_tables(client_id, user_id, hit_tables)
160
- related_schema = (
161
- await self._fetch_abbreviated_schema(client_id, user_id, related_tables)
162
- if related_tables else {}
163
- )
164
-
165
- schema_ctx = self._build_schema_context(full_schema, related_schema)
166
- capped_limit = min(limit, _MAX_LIMIT)
167
- dialect = client.db_type
168
-
169
- # SQL generation with retry
170
- validated_sql: str | None = None
171
- prev_error: str = ""
172
- prev_reasoning: str = ""
173
- for attempt in range(_MAX_RETRIES):
174
- if prev_error:
175
- error_section = (
176
- f"Previous attempt reasoning: {prev_reasoning}\n"
177
- f"Previous attempt failed: {prev_error}\n"
178
- "Fix the issue above."
179
- )
180
- else:
181
- error_section = ""
182
- try:
183
- prompt_text = schema_ctx + error_section + question
184
- input_tokens = len(_enc.encode(prompt_text))
185
- logger.info("sql generation input tokens", attempt=attempt + 1, tokens=input_tokens)
186
-
187
- result: SQLQuery = await self._chain.ainvoke({
188
- "schema": schema_ctx,
189
- "dialect": dialect,
190
- "limit": capped_limit,
191
- "error_section": error_section,
192
- "question": question,
193
- })
194
- sql = result.sql.strip()
195
- allowed_tables = set(full_schema) | set(related_schema)
196
- column_map: dict[str, set[str]] = {
197
- t: {c["name"] for c in cols} for t, cols in full_schema.items()
198
- }
199
- for t, info in related_schema.items():
200
- column_map[t] = set(info.get("column_names") or [])
201
- validation_error = self._validate(sql, allowed_tables, capped_limit, column_map)
202
- if validation_error:
203
- prev_error = validation_error
204
- prev_reasoning = result.reasoning
205
- logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
206
- continue
207
- validated_sql = self._enforce_limit(sql, capped_limit)
208
- output_tokens = len(_enc.encode(result.sql)) + len(_enc.encode(result.reasoning))
209
- logger.info(
210
- "sql generated",
211
- attempt=attempt + 1,
212
- input_tokens=input_tokens,
213
- output_tokens=output_tokens,
214
- total_tokens=input_tokens + output_tokens,
215
- reasoning=result.reasoning,
216
- )
217
- break
218
- except Exception as e:
219
- prev_error = str(e)
220
- logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
221
-
222
- if not validated_sql:
223
- logger.error("sql generation failed after retries", client_id=client_id)
224
- return None
225
-
226
- # Execute on user's DB
227
- creds = decrypt_credentials_dict(client.credentials)
228
- with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
229
- rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
230
-
231
- column_types = {
232
- col["name"]: col["type"]
233
- for cols in full_schema.values()
234
- for col in cols
235
- }
236
- columns = list(rows[0].keys()) if rows else []
237
-
238
- return QueryResult(
239
- source_type="database",
240
- source_id=client_id,
241
- table_or_file=", ".join(hit_tables),
242
- columns=columns,
243
- rows=rows,
244
- row_count=len(rows),
245
- metadata={
246
- "db_type": client.db_type,
247
- "client_name": client.name,
248
- "sql": validated_sql,
249
- "column_types": {c: column_types.get(c, "unknown") for c in columns},
250
- },
251
- )
252
-
253
- # ------------------------------------------------------------------
254
- # Schema helpers
255
- # ------------------------------------------------------------------
256
-
257
- async def _find_related_tables(
258
- self,
259
- client_id: str,
260
- user_id: str,
261
- hit_tables: list[str],
262
- ) -> list[str]:
263
- """One-hop FK neighbours of `hit_tables`, both directions, excluding hits.
264
-
265
- Prefers chunk_level='table' rows; if none exist for the client (legacy
266
- ingest predating Phase 1), falls back to aggregating from column-chunk
267
- metadata. Returns [] when no FK metadata is available.
268
-
269
- Capped at _FK_EXPANSION_MAX_TABLES, ranked by edge count desc then
270
- table name asc. A warning is logged when the cap kicks in.
271
- """
272
- if not hit_tables:
273
- return []
274
-
275
- hit_set = set(hit_tables)
276
- # edge_counts[related_table] = number of FK edges connecting it to the hit set
277
- edge_counts: dict[str, int] = defaultdict(int)
278
-
279
- # ---- Primary path: table-level chunks ----
280
- sql = text("""
281
- SELECT lpe.cmetadata
282
- FROM langchain_pg_embedding lpe
283
- JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
284
- WHERE lpc.name = 'document_embeddings'
285
- AND lpe.cmetadata->>'user_id' = :user_id
286
- AND lpe.cmetadata->>'source_type' = 'database'
287
- AND lpe.cmetadata->>'database_client_id' = :client_id
288
- AND lpe.cmetadata->>'chunk_level' = 'table'
289
- """)
290
- async with _pgvector_engine.connect() as conn:
291
- result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
292
- table_rows = result.fetchall()
293
-
294
- if table_rows:
295
- for row in table_rows:
296
- data = row.cmetadata.get("data", {})
297
- table = data.get("table_name")
298
- fks = data.get("foreign_keys") or []
299
- if not table:
300
- continue
301
- if table in hit_set:
302
- # Outgoing: this hit's FKs point at related tables
303
- for fk in fks:
304
- target = fk.get("target_table")
305
- if target and target not in hit_set:
306
- edge_counts[target] += 1
307
- else:
308
- # Incoming: this non-hit table's FKs point into the hit set
309
- for fk in fks:
310
- target = fk.get("target_table")
311
- if target in hit_set:
312
- edge_counts[table] += 1
313
- else:
314
- # ---- Fallback: aggregate from column chunks ----
315
- sql = text("""
316
- SELECT lpe.cmetadata->'data'->>'table_name' AS src_table,
317
- lpe.cmetadata->'data'->>'foreign_key' AS fk
318
- FROM langchain_pg_embedding lpe
319
- JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
320
- WHERE lpc.name = 'document_embeddings'
321
- AND lpe.cmetadata->>'user_id' = :user_id
322
- AND lpe.cmetadata->>'source_type' = 'database'
323
- AND lpe.cmetadata->>'database_client_id' = :client_id
324
- AND lpe.cmetadata->>'chunk_level' = 'column'
325
- AND lpe.cmetadata->'data'->>'foreign_key' IS NOT NULL
326
- """)
327
- async with _pgvector_engine.connect() as conn:
328
- result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
329
- col_rows = result.fetchall()
330
-
331
- for row in col_rows:
332
- src = row.src_table
333
- fk = row.fk
334
- if not src or not fk:
335
- continue
336
- target = fk.split(".", 1)[0]
337
- if src in hit_set and target and target not in hit_set:
338
- edge_counts[target] += 1
339
- elif src not in hit_set and target in hit_set:
340
- edge_counts[src] += 1
341
-
342
- if not edge_counts:
343
- return []
344
-
345
- ranked = sorted(edge_counts.items(), key=lambda kv: (-kv[1], kv[0]))
346
- if len(ranked) > _FK_EXPANSION_MAX_TABLES:
347
- logger.warning(
348
- "fk expansion cap hit",
349
- client_id=client_id,
350
- total=len(ranked),
351
- cap=_FK_EXPANSION_MAX_TABLES,
352
- dropped=[t for t, _ in ranked[_FK_EXPANSION_MAX_TABLES:]],
353
- )
354
- ranked = ranked[:_FK_EXPANSION_MAX_TABLES]
355
-
356
- related = [t for t, _ in ranked]
357
- logger.info("fk-related tables", hit=sorted(hit_set), related=related)
358
- return related
359
-
360
- async def _fetch_abbreviated_schema(
361
- self,
362
- client_id: str,
363
- user_id: str,
364
- table_names: list[str],
365
- ) -> dict[str, dict[str, Any]]:
366
- """Abbreviated schema: name, row_count, PK, FKs, column names — no profiles.
367
-
368
- Prefers chunk_level='table' rows. Falls back to aggregating column-chunk
369
- metadata when table chunks are missing for a given table_name.
370
-
371
- Returns {table_name: {"row_count": int|None, "primary_key": [str],
372
- "foreign_keys": [{column, target_table, target_column}],
373
- "column_names": [str]}}.
374
- """
375
- if not table_names:
376
- return {}
377
-
378
- placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
379
- params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
380
- for i, name in enumerate(table_names):
381
- params[f"t{i}"] = name
382
-
383
- # Primary path: one row per table from chunk_level='table'
384
- sql_table = text(f"""
385
- SELECT lpe.cmetadata
386
- FROM langchain_pg_embedding lpe
387
- JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
388
- WHERE lpc.name = 'document_embeddings'
389
- AND lpe.cmetadata->>'user_id' = :user_id
390
- AND lpe.cmetadata->>'source_type' = 'database'
391
- AND lpe.cmetadata->>'database_client_id' = :client_id
392
- AND lpe.cmetadata->>'chunk_level' = 'table'
393
- AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
394
- """)
395
- async with _pgvector_engine.connect() as conn:
396
- result = await conn.execute(sql_table, params)
397
- t_rows = result.fetchall()
398
-
399
- out: dict[str, dict[str, Any]] = {}
400
- for row in t_rows:
401
- data = row.cmetadata.get("data", {})
402
- tname = data.get("table_name")
403
- if not tname:
404
- continue
405
- out[tname] = {
406
- "row_count": data.get("row_count"),
407
- "primary_key": list(data.get("primary_key") or []),
408
- "foreign_keys": list(data.get("foreign_keys") or []),
409
- "column_names": list(data.get("column_names") or []),
410
- }
411
-
412
- # Fallback for tables with no table-chunk: aggregate column chunks
413
- missing = [t for t in table_names if t not in out]
414
- if missing:
415
- placeholders_m = ", ".join(f":m{i}" for i in range(len(missing)))
416
- params_m: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
417
- for i, name in enumerate(missing):
418
- params_m[f"m{i}"] = name
419
- sql_col = text(f"""
420
- SELECT lpe.cmetadata
421
- FROM langchain_pg_embedding lpe
422
- JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
423
- WHERE lpc.name = 'document_embeddings'
424
- AND lpe.cmetadata->>'user_id' = :user_id
425
- AND lpe.cmetadata->>'source_type' = 'database'
426
- AND lpe.cmetadata->>'database_client_id' = :client_id
427
- AND lpe.cmetadata->>'chunk_level' = 'column'
428
- AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders_m})
429
- ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
430
- """)
431
- async with _pgvector_engine.connect() as conn:
432
- result = await conn.execute(sql_col, params_m)
433
- c_rows = result.fetchall()
434
-
435
- agg: dict[str, dict[str, Any]] = {
436
- t: {"row_count": None, "primary_key": [], "foreign_keys": [], "column_names": []}
437
- for t in missing
438
- }
439
- for row in c_rows:
440
- data = row.cmetadata.get("data", {})
441
- tname = data.get("table_name")
442
- cname = data.get("column_name")
443
- if not tname or tname not in agg or not cname:
444
- continue
445
- bucket = agg[tname]
446
- bucket["column_names"].append(cname)
447
- if data.get("is_primary_key"):
448
- bucket["primary_key"].append(cname)
449
- fk = data.get("foreign_key")
450
- if fk:
451
- target_table, _, target_col = fk.partition(".")
452
- bucket["foreign_keys"].append({
453
- "column": cname,
454
- "target_table": target_table,
455
- "target_column": target_col,
456
- })
457
- for t, v in agg.items():
458
- if v["column_names"]:
459
- out[t] = v
460
-
461
- return out
462
-
463
- async def _fetch_full_schema(
464
- self,
465
- client_id: str,
466
- table_names: list[str],
467
- user_id: str,
468
- ) -> dict[str, list[dict[str, Any]]]:
469
- """Fetch ALL column chunks for the given tables from PGVector.
470
-
471
- Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
472
- "foreign_key": ..., "content": ...}]}
473
- """
474
- placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
475
- sql = text(f"""
476
- SELECT lpe.cmetadata, lpe.document
477
- FROM langchain_pg_embedding lpe
478
- JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
479
- WHERE lpc.name = 'document_embeddings'
480
- AND lpe.cmetadata->>'user_id' = :user_id
481
- AND lpe.cmetadata->>'source_type' = 'database'
482
- AND lpe.cmetadata->>'chunk_level' = 'column'
483
- AND lpe.cmetadata->>'database_client_id' = :client_id
484
- AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
485
- ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
486
- """)
487
-
488
- params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
489
- for i, name in enumerate(table_names):
490
- params[f"t{i}"] = name
491
-
492
- async with _pgvector_engine.connect() as conn:
493
- result = await conn.execute(sql, params)
494
- rows = result.fetchall()
495
-
496
- schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
497
- for row in rows:
498
- data = row.cmetadata.get("data", {})
499
- table = data.get("table_name")
500
- if table:
501
- schema[table].append({
502
- "name": data.get("column_name", ""),
503
- "type": data.get("column_type", ""),
504
- "is_primary_key": data.get("is_primary_key", False),
505
- "foreign_key": data.get("foreign_key"),
506
- "content": row.document, # chunk text includes top values / samples
507
- })
508
- return dict(schema)
509
-
510
- def _build_schema_context(
511
- self,
512
- schema: dict[str, list[dict[str, Any]]],
513
- related_schema: dict[str, dict[str, Any]] | None = None,
514
- ) -> str:
515
- lines: list[str] = []
516
- for table, columns in schema.items():
517
- lines.append(f"Table: {table}")
518
- for col in columns:
519
- flags = []
520
- if col["is_primary_key"]:
521
- flags.append("PRIMARY KEY")
522
- if col["foreign_key"]:
523
- flags.append(f"FK -> {col['foreign_key']}")
524
- flag_str = f" [{', '.join(flags)}]" if flags else ""
525
- lines.append(f" - {col['name']} {col['type']}{flag_str}")
526
- # Include sample/top-values line from chunk content if present
527
- for line in col["content"].splitlines():
528
- if line.startswith(("Top values:", "Sample values:")):
529
- lines.append(f" {line}")
530
- break
531
- lines.append("")
532
-
533
- related_block = self._build_related_schema_block(related_schema or {})
534
- if related_block:
535
- lines.append(related_block)
536
-
537
- return "\n".join(lines).strip()
538
-
539
- def _build_related_schema_block(self, related_schema: dict[str, dict[str, Any]]) -> str:
540
- """Format the abbreviated FK-related-tables section. Empty string when no related."""
541
- if not related_schema:
542
- return ""
543
- lines: list[str] = ["Related tables (one hop via FK, abbreviated — use for JOINs only):"]
544
- for table, info in related_schema.items():
545
- row_count = info.get("row_count")
546
- header = f"- {table} ({row_count} rows)" if row_count is not None else f"- {table}"
547
- lines.append(header)
548
- pk = info.get("primary_key") or []
549
- lines.append(f" Primary key: {', '.join(pk) if pk else '(none)'}")
550
- fks = info.get("foreign_keys") or []
551
- if fks:
552
- fk_strs = [
553
- f"{fk.get('column')} -> {fk.get('target_table')}.{fk.get('target_column')}"
554
- for fk in fks
555
- ]
556
- lines.append(f" Foreign keys: {', '.join(fk_strs)}")
557
- else:
558
- lines.append(" Foreign keys: (none)")
559
- cols = info.get("column_names") or []
560
- lines.append(f" Columns: {', '.join(cols)}")
561
- return "\n".join(lines)
562
-
563
- # ------------------------------------------------------------------
564
- # Guardrails
565
- # ------------------------------------------------------------------
566
-
567
- def _validate(
568
- self,
569
- sql: str,
570
- allowed_tables: set[str],
571
- limit: int,
572
- column_map: dict[str, set[str]] | None = None,
573
- ) -> str:
574
- """Return an error string if validation fails, empty string if OK.
575
-
576
- `allowed_tables` is the union of hit-table names and FK-related table
577
- names — both are legal targets for SELECT/JOIN.
578
-
579
- `column_map` maps table_name → set of valid column names. When provided,
580
- any qualified table.column reference not found in the map triggers a retry
581
- with an informative error so the LLM can self-correct without hallucinating.
582
- """
583
- # Layer 1: sqlglot parse + SELECT-only check
584
- try:
585
- parsed = sqlglot.parse_one(sql)
586
- except sqlglot.errors.ParseError as e:
587
- return f"SQL parse error: {e}"
588
-
589
- if not isinstance(parsed, exp.Select):
590
- return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
591
-
592
- # Check for DML anywhere in the AST (including writeable CTEs)
593
- for node in parsed.find_all((exp.Insert, exp.Update, exp.Delete)):
594
- return f"DML ({type(node).__name__}) is not allowed."
595
-
596
- # Layer 2: schema grounding — table names
597
- known_tables = {t.lower() for t in allowed_tables}
598
- alias_to_table: dict[str, str] = {}
599
- for tbl in parsed.find_all(exp.Table):
600
- name = tbl.name.lower()
601
- if name and name not in known_tables:
602
- return f"Unknown table '{tbl.name}'. Only use tables from the schema."
603
- alias = (tbl.alias or tbl.name).lower()
604
- alias_to_table[alias] = name
605
-
606
- # Layer 3: column grounding — qualified references only (table.column)
607
- if column_map:
608
- normalized_map = {t.lower(): {c.lower() for c in cols} for t, cols in column_map.items()}
609
- for col_node in parsed.find_all(exp.Column):
610
- tbl_ref = col_node.table
611
- if not tbl_ref:
612
- continue # unqualified — skip, can't resolve without full alias tracking
613
- tbl_name = alias_to_table.get(tbl_ref.lower(), tbl_ref.lower())
614
- col_name = col_node.name.lower()
615
- if tbl_name in normalized_map and col_name not in normalized_map[tbl_name]:
616
- available = ", ".join(sorted(normalized_map[tbl_name]))
617
- return (
618
- f"Column '{col_node.name}' does not exist on table '{tbl_name}'. "
619
- f"Available columns: {available}."
620
- )
621
-
622
- # Layer 4: LIMIT enforcement (inject if missing — done before execution)
623
- return ""
624
-
625
- # ------------------------------------------------------------------
626
- # SQL execution
627
- # ------------------------------------------------------------------
628
-
629
- def _enforce_limit(self, sql: str, limit: int) -> str:
630
- """Inject or cap LIMIT using sqlglot AST manipulation."""
631
- parsed = sqlglot.parse_one(sql)
632
- existing = parsed.find(exp.Limit)
633
- if existing:
634
- current = int(existing.expression.this)
635
- if current > limit:
636
- return parsed.limit(limit).sql()
637
- else:
638
- return parsed.limit(limit).sql()
639
- return parsed.sql()
640
-
641
- def _run_sql(self, engine: Any, sql: str) -> list[dict]:
642
- # Ensure the user DB connection is a read-only credential — sqlglot validation alone is not sufficient.
643
- with engine.connect() as conn:
644
- result = conn.execute(text(sql))
645
- return [dict(row) for row in result.mappings()]
646
-
647
-
648
- db_executor = DbExecutor()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/query/executors/tabular.py DELETED
@@ -1,287 +0,0 @@
1
- """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
2
-
3
- Flow:
4
- 1. Group RetrievalResult chunks by (document_id, sheet_name).
5
- 2. Per group: download Parquet from Azure Blob → pandas DataFrame.
6
- 3. Build schema context from DataFrame columns + sample values.
7
- 4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
8
- 5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
9
- 6. Fallback to raw rows if all retries fail.
10
- 7. Return QueryResult per group.
11
- """
12
- import asyncio
13
- from typing import Literal, TypedDict
14
-
15
- import pandas as pd
16
- from langchain_core.prompts import ChatPromptTemplate
17
- from langchain_openai import AzureChatOpenAI
18
- from pydantic import BaseModel
19
- from sqlalchemy.ext.asyncio import AsyncSession
20
-
21
- from src.config.settings import settings
22
- from src.knowledge.parquet_service import download_parquet
23
- from src.middlewares.logging import get_logger
24
- from src.query.base import BaseExecutor, QueryResult
25
- from src.rag.base import RetrievalResult
26
-
27
- logger = get_logger("tabular_executor")
28
-
29
-
30
- class _GroupInfo(TypedDict):
31
- filename: str
32
- file_type: str
33
-
34
-
35
- _TABULAR_FILE_TYPES = ("csv", "xlsx")
36
- _MAX_RETRIES = 3
37
-
38
- _SYSTEM_PROMPT = """\
39
- You are a data analyst. Given a DataFrame schema and a user question, \
40
- decide which pandas operation to perform.
41
-
42
- IMPORTANT rules:
43
- - Use ONLY the exact column names as written in the schema below. Never translate or rename them.
44
- - For top_n: always set value_col to the column to sort by. Do NOT use sort_col for top_n.
45
- - For sort: use sort_col for the column to sort by.
46
- - For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
47
- - For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
48
- Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
49
- - For OR conditions on a column (e.g. value is A or B), use or_filters. Combine with filters for mixed AND+OR logic.
50
- Example: (status=FAILED OR status=REVERSED) AND payment_channel=X → or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}], filters=[{{"col":"payment_channel","value":"X","op":"eq"}}]
51
- - For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
52
-
53
- Schema:
54
- {schema}
55
-
56
- {error_section}"""
57
-
58
-
59
- class TabularOperation(BaseModel):
60
- operation: Literal[
61
- "filter", "groupby_sum", "groupby_avg", "groupby_count",
62
- "top_n", "sort", "aggregate", "raw"
63
- ]
64
- group_col: str | None = None # for groupby_*
65
- value_col: str | None = None # for groupby_*, top_n, aggregate
66
- filter_col: str | None = None # for single filter
67
- filter_value: str | None = None # for single filter
68
- filter_operator: Literal["eq", "ne", "gt", "gte", "lt", "lte"] = "eq" # for single filter
69
- filters: list[dict] | None = None # for multi-condition AND: [{"col": ..., "value": ..., "op": ...}]
70
- or_filters: list[dict] | None = None # for OR conditions, applied before AND filters
71
- sort_col: str | None = None # for sort
72
- ascending: bool = True # for sort
73
- n: int | None = None # for top_n
74
- agg_func: Literal["sum", "avg", "min", "max", "count"] | None = None # for aggregate
75
- reasoning: str
76
-
77
-
78
- def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
79
- numeric = pd.to_numeric(df[col], errors="coerce")
80
- if operator == "eq":
81
- return df[col].astype(str) == str(value)
82
- elif operator == "ne":
83
- return df[col].astype(str) != str(value)
84
- elif operator == "gt":
85
- return numeric > float(value)
86
- elif operator == "gte":
87
- return numeric >= float(value)
88
- elif operator == "lt":
89
- return numeric < float(value)
90
- elif operator == "lte":
91
- return numeric <= float(value)
92
- raise ValueError(f"Unknown operator: {operator}")
93
-
94
-
95
- def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
96
- return df[_get_filter_mask(df, col, value, operator)]
97
-
98
-
99
- def _build_schema_context(df: pd.DataFrame) -> str:
100
- lines = []
101
- for col in df.columns:
102
- sample = df[col].dropna().head(3).tolist()
103
- lines.append(f"- {col} ({df[col].dtype}): sample values: {sample}")
104
- return "\n".join(lines)
105
-
106
-
107
- def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.DataFrame:
108
- if op.operation == "groupby_sum":
109
- if not op.group_col or not op.value_col:
110
- raise ValueError(f"groupby_sum requires group_col and value_col, got {op}")
111
- return df.groupby(op.group_col)[op.value_col].sum().reset_index().nlargest(limit, op.value_col)
112
- elif op.operation == "groupby_avg":
113
- if not op.group_col or not op.value_col:
114
- raise ValueError(f"groupby_avg requires group_col and value_col, got {op}")
115
- return df.groupby(op.group_col)[op.value_col].mean().reset_index().nlargest(limit, op.value_col)
116
- elif op.operation == "groupby_count":
117
- if not op.group_col:
118
- raise ValueError(f"groupby_count requires group_col, got {op}")
119
- df_filtered = df.copy()
120
- if op.or_filters:
121
- or_mask = pd.Series([False] * len(df_filtered), index=df_filtered.index)
122
- for f in op.or_filters:
123
- or_mask = or_mask | _get_filter_mask(df_filtered, f["col"], f["value"], f.get("op", "eq"))
124
- df_filtered = df_filtered[or_mask]
125
- if op.filters:
126
- for f in op.filters:
127
- df_filtered = _apply_single_filter(df_filtered, f["col"], f["value"], f.get("op", "eq"))
128
- elif op.filter_col and op.filter_value is not None:
129
- df_filtered = _apply_single_filter(df_filtered, op.filter_col, op.filter_value, op.filter_operator)
130
- return df_filtered.groupby(op.group_col).size().reset_index(name="count").nlargest(limit, "count")
131
- elif op.operation == "filter":
132
- result = df.copy()
133
- if op.or_filters:
134
- or_mask = pd.Series([False] * len(result), index=result.index)
135
- for f in op.or_filters:
136
- or_mask = or_mask | _get_filter_mask(result, f["col"], f["value"], f.get("op", "eq"))
137
- result = result[or_mask]
138
- if op.filters:
139
- for f in op.filters:
140
- result = _apply_single_filter(result, f["col"], f["value"], f.get("op", "eq"))
141
- elif op.filter_col and op.filter_value is not None and not op.or_filters:
142
- result = _apply_single_filter(result, op.filter_col, op.filter_value, op.filter_operator)
143
- elif not op.or_filters and not op.filters and (not op.filter_col or op.filter_value is None):
144
- raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
145
- return result.head(limit)
146
- elif op.operation == "top_n":
147
- col = op.value_col
148
- if not col:
149
- raise ValueError(f"top_n requires value_col, got {op}")
150
- n = op.n or limit
151
- return df.nlargest(n, col)
152
- elif op.operation == "sort":
153
- if not op.sort_col:
154
- raise ValueError(f"sort requires sort_col, got {op}")
155
- return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
156
- elif op.operation == "aggregate":
157
- if not op.value_col or not op.agg_func:
158
- raise ValueError(f"aggregate requires value_col and agg_func, got {op}")
159
- funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max", "count": "count"}
160
- value = getattr(df[op.value_col], funcs[op.agg_func])()
161
- return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
162
- else: # "raw"
163
- return df.head(limit)
164
-
165
-
166
- class TabularExecutor(BaseExecutor):
167
- def __init__(self) -> None:
168
- self._llm = AzureChatOpenAI(
169
- azure_deployment=settings.azureai_deployment_name_4o,
170
- openai_api_version=settings.azureai_api_version_4o,
171
- azure_endpoint=settings.azureai_endpoint_url_4o,
172
- api_key=settings.azureai_api_key_4o,
173
- temperature=0,
174
- )
175
- self._prompt = ChatPromptTemplate.from_messages([
176
- ("system", _SYSTEM_PROMPT),
177
- ("human", "{question}"),
178
- ])
179
- self._chain = self._prompt | self._llm.with_structured_output(TabularOperation)
180
-
181
- async def execute(
182
- self,
183
- results: list[RetrievalResult],
184
- user_id: str,
185
- _db: AsyncSession,
186
- question: str,
187
- limit: int = 100,
188
- ) -> list[QueryResult]:
189
- tabular = [
190
- r for r in results
191
- if r.source_type == "document"
192
- and r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
193
- ]
194
-
195
- if not tabular:
196
- return []
197
-
198
- # Group by (document_id, sheet_name) — one parquet download per group
199
- groups: dict[tuple[str, str | None], _GroupInfo] = {}
200
- for r in tabular:
201
- data = r.metadata.get("data", {})
202
- doc_id = data.get("document_id")
203
- if not doc_id:
204
- continue
205
- sheet_name = data.get("sheet_name") # None for CSV
206
- key = (doc_id, sheet_name)
207
- if key not in groups:
208
- groups[key] = {
209
- "filename": data.get("filename", ""),
210
- "file_type": data.get("file_type", ""),
211
- }
212
-
213
- async def _process_group(
214
- doc_id: str, sheet_name: str | None, info: _GroupInfo
215
- ) -> QueryResult | None:
216
- try:
217
- df = await download_parquet(user_id, doc_id, sheet_name)
218
- df_result = await self._query_with_agent(df, question, limit)
219
-
220
- table_label = info["filename"]
221
- if sheet_name:
222
- table_label += f" / sheet: {sheet_name}"
223
-
224
- logger.info(
225
- "tabular query complete",
226
- document_id=doc_id,
227
- sheet=sheet_name,
228
- file_type=info["file_type"],
229
- rows=len(df_result),
230
- columns=len(df_result.columns),
231
- )
232
- return QueryResult(
233
- source_type="document",
234
- source_id=doc_id,
235
- table_or_file=table_label,
236
- columns=list(df_result.columns),
237
- rows=df_result.to_dict(orient="records"),
238
- row_count=len(df_result),
239
- )
240
- except Exception as e:
241
- logger.error(
242
- "tabular query failed",
243
- document_id=doc_id,
244
- sheet=sheet_name,
245
- error=str(e),
246
- )
247
- return None
248
-
249
- gathered = await asyncio.gather(*[
250
- _process_group(doc_id, sheet_name, info)
251
- for (doc_id, sheet_name), info in groups.items()
252
- ])
253
- return [r for r in gathered if r is not None]
254
-
255
- async def _query_with_agent(
256
- self, df: pd.DataFrame, question: str, limit: int
257
- ) -> pd.DataFrame:
258
- schema_ctx = _build_schema_context(df)
259
- prev_error = ""
260
-
261
- for attempt in range(_MAX_RETRIES):
262
- error_section = (
263
- f"Previous attempt failed: {prev_error}\nFix the issue."
264
- if prev_error else ""
265
- )
266
- try:
267
- op: TabularOperation = await self._chain.ainvoke({
268
- "schema": schema_ctx,
269
- "error_section": error_section,
270
- "question": question,
271
- })
272
- logger.info(
273
- "tabular operation decided",
274
- operation=op.operation,
275
- reasoning=op.reasoning,
276
- )
277
- return _apply_operation(df, op, limit)
278
- except Exception as e:
279
- prev_error = str(e)
280
- logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
281
-
282
- # Fallback: return raw rows
283
- logger.warning("tabular agent failed after retries, returning raw rows")
284
- return df.head(limit)
285
-
286
-
287
- tabular_executor = TabularExecutor()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/query/query_executor.py DELETED
@@ -1,42 +0,0 @@
1
- """QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
2
-
3
- import asyncio
4
-
5
- from sqlalchemy.ext.asyncio import AsyncSession
6
-
7
- from src.middlewares.logging import get_logger
8
- from src.query.base import QueryResult
9
- from src.query.executors.db_executor import db_executor
10
- from src.query.executors.tabular import tabular_executor
11
- from src.rag.base import RetrievalResult
12
-
13
- logger = get_logger("query_executor")
14
-
15
-
16
- class QueryExecutor:
17
- async def execute(
18
- self,
19
- results: list[RetrievalResult],
20
- user_id: str,
21
- db: AsyncSession,
22
- question: str,
23
- limit: int = 100,
24
- ) -> list[QueryResult]:
25
- batches = await asyncio.gather(
26
- db_executor.execute(results, user_id, db, question, limit),
27
- tabular_executor.execute(results, user_id, db, question, limit),
28
- return_exceptions=True,
29
- )
30
-
31
- query_results: list[QueryResult] = []
32
- for batch in batches:
33
- if isinstance(batch, Exception):
34
- logger.error("executor failed", error=str(batch))
35
- continue
36
- query_results.extend(batch)
37
-
38
- logger.info("query execution complete", total=len(query_results))
39
- return query_results
40
-
41
-
42
- query_executor = QueryExecutor()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rag/base.py DELETED
@@ -1,20 +0,0 @@
1
- """Shared contract for all retriever implementations."""
2
-
3
- from abc import ABC, abstractmethod
4
- from dataclasses import dataclass
5
- from typing import Any
6
-
7
-
8
- @dataclass
9
- class RetrievalResult:
10
- content: str
11
- metadata: dict[str, Any]
12
- score: float
13
- source_type: str # "document" | "database"
14
-
15
-
16
- class BaseRetriever(ABC):
17
- @abstractmethod
18
- async def retrieve(
19
- self, query: str, user_id: str, k: int = 5
20
- ) -> list[RetrievalResult]: ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rag/retriever.py CHANGED
@@ -1,45 +1,69 @@
1
- """Public retrieval API thin wrapper around RetrievalRouter."""
2
 
 
 
 
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
-
5
  from src.middlewares.logging import get_logger
6
- from src.rag.base import RetrievalResult
7
- from src.rag.retrievers.document import document_retriever
8
- from src.rag.retrievers.schema import schema_retriever
9
- from src.rag.router import RetrievalRouter, SourceHint
10
 
11
  logger = get_logger("retriever")
12
 
 
13
 
14
- class RetrieverService:
15
- """Public retrieval service used by chat.py and search tools.
16
 
17
- Delegates to RetrievalRouter which dispatches based on source_hint.
18
- Returns RetrievalResult objects directly so downstream consumers
19
- (db_executor, tabular_executor) can be fed without lossy dict
20
- conversion. The `db` parameter is accepted for call-site compatibility
21
- but currently unused — retrieval reads PGVector via _pgvector_engine
22
- inside each retriever.
23
- """
24
 
25
  def __init__(self):
26
- self._router = RetrievalRouter(
27
- schema_retriever=schema_retriever,
28
- document_retriever=document_retriever,
29
- )
30
 
31
  async def retrieve(
32
  self,
33
  query: str,
34
  user_id: str,
35
  db: AsyncSession,
36
- k: int = 5,
37
- source_hint: SourceHint = "both",
38
- ) -> list[RetrievalResult]:
 
 
 
 
 
39
  try:
40
- return await self._router.retrieve(query, user_id, source_hint, k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  except Exception as e:
42
- logger.error("retrieval failed", error=str(e))
43
  return []
44
 
45
 
 
1
+ """Service for retrieving relevant documents from vector store."""
2
 
3
+ import hashlib
4
+ import json
5
+ from src.db.postgres.vector_store import get_vector_store
6
+ from src.db.redis.connection import get_redis
7
  from sqlalchemy.ext.asyncio import AsyncSession
 
8
  from src.middlewares.logging import get_logger
9
+ from typing import List, Dict, Any
 
 
 
10
 
11
  logger = get_logger("retriever")
12
 
13
+ _RETRIEVAL_CACHE_TTL = 3600 # 1 hour
14
 
 
 
15
 
16
+ class RetrieverService:
17
+ """Service for retrieving relevant documents."""
 
 
 
 
 
18
 
19
  def __init__(self):
20
+ self.vector_store = get_vector_store()
 
 
 
21
 
22
  async def retrieve(
23
  self,
24
  query: str,
25
  user_id: str,
26
  db: AsyncSession,
27
+ k: int = 5
28
+ ) -> List[Dict[str, Any]]:
29
+ """Retrieve relevant chunks for a query, scoped to the user's documents.
30
+
31
+ Returns:
32
+ List of dicts with keys: content, metadata
33
+ metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
34
+ """
35
  try:
36
+ redis = await get_redis()
37
+ query_hash = hashlib.md5(query.encode()).hexdigest()
38
+ cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
39
+
40
+ cached = await redis.get(cache_key)
41
+ if cached:
42
+ logger.info("Returning cached retrieval results")
43
+ return json.loads(cached)
44
+
45
+ logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
46
+
47
+ docs = await self.vector_store.asimilarity_search(
48
+ query=query,
49
+ k=k,
50
+ filter={"user_id": user_id}
51
+ )
52
+
53
+ results = [
54
+ {
55
+ "content": doc.page_content,
56
+ "metadata": doc.metadata,
57
+ }
58
+ for doc in docs
59
+ ]
60
+
61
+ logger.info(f"Retrieved {len(results)} chunks")
62
+ await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
63
+ return results
64
+
65
  except Exception as e:
66
+ logger.error("Retrieval failed", error=str(e))
67
  return []
68
 
69
 
src/rag/retrievers/__init__.py DELETED
File without changes
src/rag/retrievers/baseline.py DELETED
@@ -1,76 +0,0 @@
1
- """Service for retrieving relevant documents from vector store."""
2
-
3
- import hashlib
4
- import json
5
- from src.db.postgres.vector_store import get_vector_store
6
- from src.db.redis.connection import get_redis
7
- from sqlalchemy.ext.asyncio import AsyncSession
8
- from src.middlewares.logging import get_logger
9
- from typing import List, Dict, Any
10
-
11
- logger = get_logger("retriever")
12
-
13
- _RETRIEVAL_CACHE_TTL = 3600 # 1 hour
14
-
15
-
16
- class BaselineRetrieverService:
17
- """Baseline (pre-Phase-1) retriever — preserved for benchmark comparison.
18
-
19
- Renamed from RetrieverService so it doesn't shadow the production wrapper
20
- at src/rag/retriever.py. Production code imports from src.rag.retriever;
21
- benchmark scripts that want this baseline must import explicitly from
22
- src.rag.retrievers.baseline.
23
- """
24
-
25
- def __init__(self):
26
- self.vector_store = get_vector_store()
27
-
28
- async def retrieve(
29
- self,
30
- query: str,
31
- user_id: str,
32
- db: AsyncSession,
33
- k: int = 5
34
- ) -> List[Dict[str, Any]]:
35
- """Retrieve relevant chunks for a query, scoped to the user's documents.
36
-
37
- Returns:
38
- List of dicts with keys: content, metadata
39
- metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
40
- """
41
- try:
42
- redis = await get_redis()
43
- query_hash = hashlib.md5(query.encode()).hexdigest()
44
- cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
45
-
46
- cached = await redis.get(cache_key)
47
- if cached:
48
- logger.info("Returning cached retrieval results")
49
- return json.loads(cached)
50
-
51
- logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
52
-
53
- docs = await self.vector_store.asimilarity_search(
54
- query=query,
55
- k=k,
56
- filter={"user_id": user_id}
57
- )
58
-
59
- results = [
60
- {
61
- "content": doc.page_content,
62
- "metadata": doc.metadata,
63
- }
64
- for doc in docs
65
- ]
66
-
67
- logger.info(f"Retrieved {len(results)} chunks")
68
- await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
69
- return results
70
-
71
- except Exception as e:
72
- logger.error("Retrieval failed", error=str(e))
73
- return []
74
-
75
-
76
- baseline_retriever = BaselineRetrieverService()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rag/retrievers/document.py DELETED
@@ -1,158 +0,0 @@
1
- """Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular)."""
2
-
3
- import math
4
-
5
- from langchain_postgres import PGVector
6
- from langchain_postgres.vectorstores import DistanceStrategy
7
- from langchain_openai import AzureOpenAIEmbeddings
8
- from sqlalchemy import text
9
-
10
- from src.config.settings import settings
11
- from src.db.postgres.connection import _pgvector_engine
12
- from src.db.postgres.vector_store import get_vector_store
13
- from src.middlewares.logging import get_logger
14
- from src.rag.base import BaseRetriever, RetrievalResult
15
-
16
# Module-level structured logger.
logger = get_logger("document_retriever")

# Change this one line to switch retrieval method
# Options: "mmr" | "cosine" | "euclidean" | "inner_product" | "manhattan"
_RETRIEVAL_METHOD = "mmr"

# File types treated as tabular; chunks with these types are dropped from
# DocumentRetriever results.
_TABULAR_TYPES = {"csv", "xlsx"}
# Passed as `fetch_k` to the MMR search.
_FETCH_K = 20
# Passed as `lambda_mult` to the MMR search.
_LAMBDA_MULT = 0.5
# PGVector collection that holds the embeddings queried by this module.
_COLLECTION_NAME = "document_embeddings"

# Embedding client shared by the two stores below and by the manhattan path.
# NOTE: constructed at import time from application settings.
_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=settings.azureai_deployment_name_embedding,
    openai_api_version=settings.azureai_api_version_embedding,
    azure_endpoint=settings.azureai_endpoint_url_embedding,
    api_key=settings.azureai_api_key_embedding,
)

# Store over the same collection, configured with the EUCLIDEAN strategy.
_euclidean_store = PGVector(
    embeddings=_embeddings,
    connection=_pgvector_engine,
    collection_name=_COLLECTION_NAME,
    distance_strategy=DistanceStrategy.EUCLIDEAN,
    use_jsonb=True,
    async_mode=True,
    create_extension=False,
)

# Store over the same collection, configured with the MAX_INNER_PRODUCT strategy.
_ip_store = PGVector(
    embeddings=_embeddings,
    connection=_pgvector_engine,
    collection_name=_COLLECTION_NAME,
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
    use_jsonb=True,
    async_mode=True,
    create_extension=False,
)

# Raw query for the "manhattan" method: orders the user's source_type='document'
# rows by `embedding <+> :embedding` ascending (presumably pgvector's L1
# distance operator — confirm against the installed pgvector version).
# All values are passed as bind parameters.
_MANHATTAN_SQL = text("""
    SELECT
        lpe.document,
        lpe.cmetadata,
        lpe.embedding <+> CAST(:embedding AS vector) AS distance
    FROM langchain_pg_embedding lpe
    JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
    WHERE lpc.name = :collection
        AND lpe.cmetadata->>'user_id' = :user_id
        AND lpe.cmetadata->>'source_type' = 'document'
    ORDER BY distance ASC
    LIMIT :k
""")
67
-
68
-
69
class DocumentRetriever(BaseRetriever):
    """Retriever for non-tabular document chunks (source_type="document").

    The search strategy is selected by the module constant `_RETRIEVAL_METHOD`.
    Every strategy over-fetches slightly and then drops chunks whose
    `data.file_type` is tabular (csv/xlsx), returning at most `k` results.
    """

    def __init__(self) -> None:
        self.vector_store = get_vector_store()

    async def retrieve(
        self, query: str, user_id: str, k: int = 5
    ) -> list[RetrievalResult]:
        """Return up to `k` non-tabular chunks for `query`, scoped to `user_id`.

        Args:
            query: Natural-language query text.
            user_id: Owner id used as a metadata filter.
            k: Maximum number of results to return.

        Returns:
            RetrievalResult objects in the order produced by the selected
            search method. `score` is whatever that method reports (for
            "manhattan" it is a distance).
        """
        filter_ = {"user_id": user_id, "source_type": "document"}
        # Over-fetch a little so dropping tabular chunks still leaves k results.
        fetch_k = k + len(_TABULAR_TYPES)

        if _RETRIEVAL_METHOD == "manhattan":
            return await self._retrieve_manhattan(query, user_id, k, fetch_k)

        if _RETRIEVAL_METHOD == "mmr":
            # The MMR candidate pool must be at least as large as the number of
            # results requested, otherwise large k values are silently capped.
            pool_size = max(_FETCH_K, fetch_k)
            docs = await self.vector_store.amax_marginal_relevance_search(
                query=query,
                k=fetch_k,
                fetch_k=pool_size,
                lambda_mult=_LAMBDA_MULT,
                filter=filter_,
            )
            # MMR returns no scores; look them up from a similarity search over
            # the same candidate pool so that every MMR pick can be scored
            # (a lookup limited to fetch_k left diverse picks at 0.0).
            cosine = await self.vector_store.asimilarity_search_with_score(
                query=query, k=pool_size, filter=filter_,
            )
            score_map = {doc.page_content: score for doc, score in cosine}
            docs_with_scores = [(doc, score_map.get(doc.page_content, 0.0)) for doc in docs]
        elif _RETRIEVAL_METHOD == "euclidean":
            docs_with_scores = await _euclidean_store.asimilarity_search_with_score(
                query=query, k=fetch_k, filter=filter_,
            )
        elif _RETRIEVAL_METHOD == "inner_product":
            docs_with_scores = await _ip_store.asimilarity_search_with_score(
                query=query, k=fetch_k, filter=filter_,
            )
        else:  # cosine
            docs_with_scores = await self.vector_store.asimilarity_search_with_score(
                query=query, k=fetch_k, filter=filter_,
            )

        results = self._take_non_tabular(
            ((doc.page_content, doc.metadata, score) for doc, score in docs_with_scores),
            k,
        )

        logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
        return results

    async def _retrieve_manhattan(
        self, query: str, user_id: str, k: int, fetch_k: int
    ) -> list[RetrievalResult]:
        """Run the raw-SQL "manhattan" search and return up to `k` non-tabular chunks.

        Raises:
            ValueError: if the query embedding contains NaN or Infinity.
        """
        query_vector = await _embeddings.aembed_query(query)
        if not all(math.isfinite(v) for v in query_vector):
            raise ValueError("Embedding vector contains NaN or Infinity values.")
        # Vector literal text; it is cast to `vector` inside the SQL statement.
        vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"

        async with _pgvector_engine.connect() as conn:
            result = await conn.execute(_MANHATTAN_SQL, {
                "embedding": vector_str,
                "collection": _COLLECTION_NAME,
                "user_id": user_id,
                "k": fetch_k,
            })
            rows = result.fetchall()

        results = self._take_non_tabular(
            ((row.document, row.cmetadata, float(row.distance)) for row in rows),
            k,
        )

        logger.info("retrieved chunks", method="manhattan", count=len(results))
        return results

    @staticmethod
    def _take_non_tabular(candidates, k: int) -> list[RetrievalResult]:
        """Keep the first `k` candidates whose file type is not tabular.

        `candidates` yields (content, metadata, score) tuples in rank order.
        A missing or null `data` entry in the metadata is treated as
        "no file type", i.e. the chunk is kept.
        """
        results: list[RetrievalResult] = []
        for content, metadata, score in candidates:
            if len(results) >= k:
                break
            file_type = (metadata.get("data") or {}).get("file_type", "")
            if file_type in _TABULAR_TYPES:
                continue
            results.append(RetrievalResult(
                content=content,
                metadata=metadata,
                score=score,
                source_type="document",
            ))
        return results


document_retriever = DocumentRetriever()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rag/retrievers/schema.py DELETED
@@ -1,411 +0,0 @@
1
- """Schema retriever — handles DB schemas (source_type="database") and tabular file
2
- columns stored as source_type="document" with file_type in ("csv","xlsx").
3
-
4
- Strategy: hybrid_bm25 — RRF merge of dense cosine search (DB columns + DB tables
5
- + tabular columns + tabular sheets) and PostgreSQL full-text search (DB columns only).
6
- Embeds the query once, fans out five legs in parallel.
7
-
8
- The DB-tables leg surfaces table-level summary chunks (chunk_level='table') as
9
- a recall signal for multi-table questions: when a relevant table's columns
10
- don't individually win on similarity, the table chunk can still pull the table
11
- into the hit set, where db_executor's downstream full-schema fetch picks up
12
- the per-column detail.
13
-
14
- FTS requires a GIN index on langchain_pg_embedding.document (created by init_db.py).
15
- """
16
-
17
- import asyncio
18
-
19
- from sqlalchemy import text
20
-
21
- from src.db.postgres.connection import _pgvector_engine
22
- from src.db.postgres.vector_store import get_vector_store
23
- from src.middlewares.logging import get_logger
24
- from src.rag.base import BaseRetriever, RetrievalResult
25
-
26
# Module-level structured logger.
logger = get_logger("schema_retriever")

# File types considered tabular. NOTE(review): not referenced in the visible
# part of this module — the SQL below hard-codes 'csv' / 'xlsx'.
_TABULAR_FILE_TYPES = ("csv", "xlsx")
_TABLE_CHUNK_K_MULTIPLIER = 2  # how many table chunks to pull before RRF
30
-
31
-
32
class SchemaRetriever(BaseRetriever):
    """Retriever for DB-schema chunks and tabular (CSV/XLSX) chunks.

    `retrieve` embeds the query once, runs four dense cosine searches plus one
    PostgreSQL full-text search concurrently, then ranks DB tables and tabular
    sheets separately with Reciprocal Rank Fusion (RRF).

    All dense searches share `_vector_search`, which passes the embedding as a
    bind parameter instead of interpolating it into the SQL text.
    """

    # Static predicate that limits a dense search to CSV/XLSX chunks.
    # Contains no caller-supplied data.
    _TABULAR_FILE_FILTER = (
        "AND (lpe.cmetadata->'data'->>'file_type' = 'csv'\n"
        "                   OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')"
    )

    def __init__(self):
        self.vector_store = get_vector_store()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    async def _embed_query(self, query: str) -> list[float]:
        """Embed `query` in a worker thread via the vector store's embedding model."""
        return await asyncio.to_thread(self.vector_store.embeddings.embed_query, query)

    async def _vector_search(
        self,
        embedding: list[float],
        user_id: str,
        limit: int,
        *,
        source_type: str,
        chunk_level: str,
        tabular_only: bool = False,
    ) -> list[RetrievalResult]:
        """Cosine search over one (source_type, chunk_level) slice of the collection.

        Args:
            embedding: Query embedding.
            user_id: Owner id; only this user's chunks are searched.
            limit: Maximum number of rows to return.
            source_type: Value matched against cmetadata->>'source_type'; also
                set as `source_type` on each result.
            chunk_level: Value matched against cmetadata->>'chunk_level'.
            tabular_only: When true, additionally require file_type csv/xlsx.

        Returns:
            Results ordered best-first; `score` is 1 - cosine distance.
        """
        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
        extra_filter = self._TABULAR_FILE_FILTER if tabular_only else ""

        # Only the static `extra_filter` fragment is formatted into the text;
        # every value (including the embedding) is a bind parameter.
        sql = text(f"""
            SELECT lpe.document, lpe.cmetadata,
                   1.0 - (lpe.embedding <=> CAST(:embedding AS vector)) AS score
            FROM langchain_pg_embedding lpe
            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
            WHERE lpc.name = 'document_embeddings'
              AND lpe.cmetadata->>'user_id' = :user_id
              AND lpe.cmetadata->>'source_type' = :source_type
              AND lpe.cmetadata->>'chunk_level' = :chunk_level
              {extra_filter}
            ORDER BY lpe.embedding <=> CAST(:embedding AS vector) ASC
            LIMIT :k
        """)
        params = {
            "embedding": emb_str,
            "user_id": user_id,
            "source_type": source_type,
            "chunk_level": chunk_level,
            "k": limit,
        }

        async with _pgvector_engine.connect() as conn:
            result = await conn.execute(sql, params)
            rows = result.fetchall()

        return [
            RetrievalResult(
                content=row.document,
                metadata=row.cmetadata,
                score=float(row.score),
                source_type=source_type,
            )
            for row in rows
        ]

    async def _search_db(
        self, embedding: list[float], user_id: str, k: int
    ) -> list[RetrievalResult]:
        """Cosine vector search over database column chunks (fetches k * 4 rows)."""
        return await self._vector_search(
            embedding, user_id, k * 4,
            source_type="database", chunk_level="column",
        )

    async def _search_db_tables(
        self, embedding: list[float], user_id: str, k: int
    ) -> list[RetrievalResult]:
        """Cosine vector search over database TABLE-level chunks.

        Recall channel for multi-table questions. The chunk's content is
        discarded downstream — db_executor only consumes its `data.table_name`
        to seed full-schema fetch.
        """
        return await self._vector_search(
            embedding, user_id, k * _TABLE_CHUNK_K_MULTIPLIER,
            source_type="database", chunk_level="table",
        )

    async def _search_tabular(
        self, embedding: list[float], user_id: str, k: int
    ) -> list[RetrievalResult]:
        """Cosine vector search over tabular column chunks (csv/xlsx; fetches k * 4 rows)."""
        return await self._vector_search(
            embedding, user_id, k * 4,
            source_type="document", chunk_level="column", tabular_only=True,
        )

    async def _search_tabular_sheets(
        self, embedding: list[float], user_id: str, k: int
    ) -> list[RetrievalResult]:
        """Leg 5: sheet-level summary chunks from CSV/XLSX files."""
        return await self._vector_search(
            embedding, user_id, k,
            source_type="document", chunk_level="sheet", tabular_only=True,
        )

    async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
        """Full-text search over DB schema chunks using PostgreSQL tsvector."""
        sql = text("""
            SELECT lpe.document, lpe.cmetadata,
                   ts_rank(to_tsvector('english', lpe.document),
                           plainto_tsquery('english', :query)) AS rank
            FROM langchain_pg_embedding lpe
            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
            WHERE lpc.name = 'document_embeddings'
              AND lpe.cmetadata->>'user_id' = :user_id
              AND lpe.cmetadata->>'source_type' = 'database'
              AND lpe.cmetadata->>'chunk_level' = 'column'
              AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
            ORDER BY rank DESC
            LIMIT :k
        """)

        async with _pgvector_engine.connect() as conn:
            result = await conn.execute(sql, {"query": query, "user_id": user_id, "k": k})
            rows = result.fetchall()

        return [
            RetrievalResult(
                content=row.document,
                metadata=row.cmetadata,
                score=float(row.rank),
                source_type="database",
            )
            for row in rows
        ]

    @staticmethod
    def _rrf_scores(ranked_lists: list[list], k_rrf: int) -> dict:
        """Aggregate reciprocal-rank scores, 1 / (k_rrf + rank + 1), per key."""
        scores: dict = {}
        for ranked in ranked_lists:
            for rank, key in enumerate(ranked):
                scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
        return scores

    def _rank_tabular_sheets(
        self,
        sheet_results: list[RetrievalResult],
        column_results: list[RetrievalResult],
        top_k: int,
        k_rrf: int = 60,
    ) -> list[RetrievalResult]:
        """Rank tabular sheets by RRF across two voting legs:
          L1 (primary): sheet-chunk cosine score
          L2 (vote):    best column-chunk position per (doc_id, sheet_name)

        Returns top-k sheet-level RetrievalResults. The full column list of
        each sheet is already in the sheet chunk's data.column_names from
        ingestion, so downstream tabular_executor can read full sheet context.

        For sheets surfaced by column votes but missing a sheet chunk (rare —
        ingestion always creates one), a minimal stub is returned and
        tabular_executor falls back to reading columns from the parquet.

        Note: `score` on the returned sheet chunks is overwritten in place
        with the RRF score.
        """
        def sheet_key(result: RetrievalResult) -> tuple:
            data = result.metadata.get("data", {})
            return (data.get("document_id"), data.get("sheet_name"))

        # L1: first sheet chunk per (doc_id, sheet_name); dict order is the ranking.
        sheet_index: dict[tuple, RetrievalResult] = {}
        for r in sheet_results:
            key = sheet_key(r)
            if key[0]:
                sheet_index.setdefault(key, r)

        # L2: first column chunk per (doc_id, sheet_name); dict order is the ranking.
        first_column: dict[tuple, RetrievalResult] = {}
        for r in column_results:
            key = sheet_key(r)
            if key[0]:
                first_column.setdefault(key, r)

        # RRF over (doc_id, sheet_name) across the two legs
        rrf_scores = self._rrf_scores([list(sheet_index), list(first_column)], k_rrf)
        top_sheets = sorted(rrf_scores, key=rrf_scores.get, reverse=True)[:top_k]

        results: list[RetrievalResult] = []
        for key in top_sheets:
            if key in sheet_index:
                r = sheet_index[key]
                r.score = rrf_scores[key]
                results.append(r)
                continue

            # Surfaced by column votes only — build stub from a representative
            # column result so tabular_executor can group correctly.
            rep = first_column.get(key)
            if rep is None:
                continue
            _, sheet_name = key
            stub_data = dict(rep.metadata.get("data", {}))
            stub_data.pop("column_name", None)
            stub_data.pop("column_type", None)
            results.append(RetrievalResult(
                content=f"Sheet: {stub_data.get('filename', '')}"
                        + (f" / sheet: {sheet_name}" if sheet_name else ""),
                metadata={**rep.metadata, "data": stub_data, "chunk_level": "sheet"},
                score=rrf_scores[key],
                source_type="document",
            ))
        return results

    def _rank_db_tables(
        self,
        tbl_results: list[RetrievalResult],
        col_results: list[RetrievalResult],
        fts_results: list[RetrievalResult],
        top_k: int,
        k_rrf: int = 60,
    ) -> list[RetrievalResult]:
        """Rank DB tables by RRF across three legs:
          L1 (primary): table-summary chunk similarity
          L2 (vote):    best column-chunk position per table
          L3 (vote):    best FTS position per table

        Returns top-k table-chunk RetrievalResults. For tables surfaced by
        L2/L3 but missing a table chunk, a minimal stub is returned so that
        db_executor._fetch_full_schema can seed off data.table_name.

        Note: `score` on the returned table chunks is overwritten in place
        with the RRF score.
        """
        def table_names(results: list[RetrievalResult]) -> list[str]:
            # Distinct table names in first-appearance order.
            names = (r.metadata.get("data", {}).get("table_name") for r in results)
            return list(dict.fromkeys(name for name in names if name))

        # L1: first table chunk per table name, in table-chunk rank order.
        tbl_index: dict[str, RetrievalResult] = {}
        for r in tbl_results:
            tname = r.metadata.get("data", {}).get("table_name")
            if tname:
                tbl_index.setdefault(tname, r)

        # RRF over table names across the three legs (L1, L2 columns, L3 FTS)
        rrf_scores = self._rrf_scores(
            [list(tbl_index), table_names(col_results), table_names(fts_results)],
            k_rrf,
        )
        top_tables = sorted(rrf_scores, key=rrf_scores.get, reverse=True)[:top_k]

        results: list[RetrievalResult] = []
        for tname in top_tables:
            if tname in tbl_index:
                r = tbl_index[tname]
                r.score = rrf_scores[tname]
                results.append(r)
            else:
                # Surfaced by column/FTS votes with no table chunk — minimal stub
                results.append(RetrievalResult(
                    content=f"Table: {tname}",
                    metadata={"data": {"table_name": tname}, "source_type": "database"},
                    score=rrf_scores[tname],
                    source_type="database",
                ))
        return results

    # ------------------------------------------------------------------
    # Public interface — called by the router
    # ------------------------------------------------------------------

    async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
        """Table-first retrieval for DB sources; sheet-level for tabular.

        DB tables are ranked via RRF across three legs:
          L1 (primary): table-summary chunk similarity
          L2 (vote):    top-K column-chunk cosine, grouped by table
          L3 (vote):    top-K FTS column hits, grouped by table

        db_executor downstream fetches the full per-column schema for the
        ranked table set via _fetch_full_schema — the column chunks returned
        here are intentionally NOT used as the schema source, only for voting.

        Tabular (CSV/XLSX) sheets are ranked via RRF across two legs:
          L1: sheet-chunk cosine
          L2: column-chunk votes (best position per sheet)
        Returns sheet-level RetrievalResults so tabular_executor receives
        full sheet context (all columns) rather than fragmented column hits.

        The returned list holds up to k DB tables plus up to k sheets, sorted
        by RRF score, best first.
        """
        embedding = await self._embed_query(query)
        db_col_results, db_tbl_results, tabular_results, fts_results, sheet_results = await asyncio.gather(
            self._search_db(embedding, user_id, k),
            self._search_db_tables(embedding, user_id, k),
            self._search_tabular(embedding, user_id, k),
            self._search_fts_db(query, user_id, k * 4),
            self._search_tabular_sheets(embedding, user_id, k),
        )

        db_ranked = self._rank_db_tables(db_tbl_results, db_col_results, fts_results, top_k=k)
        tabular_ranked = self._rank_tabular_sheets(sheet_results, tabular_results, top_k=k)

        results = sorted(db_ranked + tabular_ranked, key=lambda r: r.score, reverse=True)
        logger.info(
            "schema retrieval",
            count=len(results),
            db_tables_ranked=len(db_ranked),
            db_cols=len(db_col_results),
            db_tables=len(db_tbl_results),
            tabular_cols=len(tabular_results),
            tabular_sheets=len(sheet_results),
            tabular_ranked=len(tabular_ranked),
            fts=len(fts_results),
        )
        return results


schema_retriever = SchemaRetriever()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rag/router.py DELETED
@@ -1,179 +0,0 @@
1
- """Routes retrieval requests to the appropriate retriever based on source_hint.
2
-
3
- Cross-retriever merging uses Reciprocal Rank Fusion (RRF) on per-retriever
4
- ranked lists — score scales differ across retrievers (RRF, cosine, distance)
5
- and aren't directly comparable, so we rank-merge instead of score-merge.
6
- """
7
-
8
- import asyncio
9
- import hashlib
10
- import json
11
- from dataclasses import asdict
12
- from typing import Literal
13
-
14
- from src.db.redis.connection import get_redis
15
- from src.middlewares.logging import get_logger
16
- from src.rag.base import BaseRetriever, RetrievalResult
17
-
18
# Module-level structured logger.
logger = get_logger("retrieval_router")

_CACHE_TTL = 3600  # 1 hour
# First segment of every cache key; also the prefix of the pattern used when
# invalidating a user's cached entries.
_CACHE_KEY_PREFIX = "retrieval"
_RRF_K = 60  # standard RRF constant
# Which retriever(s) a request should be routed to.
SourceHint = Literal["document", "schema", "both"]
24
-
25
-
26
def _result_dedup_key(r: RetrievalResult) -> tuple:
    """Build the identity tuple used to dedupe results across retrievers.

    The tuple combines the result's source type with the table, column, file,
    sheet and chunk-index fields of `metadata["data"]`, plus the top-level
    `chunk_level`, so DB columns, DB tables, tabular columns, prose chunks and
    sheet-level chunks all get distinct keys.
    """
    payload = r.metadata.get("data", {})
    # chunk_index disambiguates multiple prose chunks per doc.
    data_fields = tuple(
        payload.get(field)
        for field in ("table_name", "column_name", "filename", "sheet_name", "chunk_index")
    )
    # chunk_level distinguishes sheet vs column chunks.
    return (r.source_type, *data_fields, r.metadata.get("chunk_level"))
39
-
40
-
41
def _rrf_merge(
    ranked_lists: list[list[RetrievalResult]],
    top_k: int,
    k_rrf: int = _RRF_K,
) -> list[RetrievalResult]:
    """Fuse several best-first result lists with Reciprocal Rank Fusion.

    Each list contributes 1 / (k_rrf + position) per item, with positions
    counted from 1. Items sharing a `_result_dedup_key` are merged: their
    contributions are summed and the first occurrence seen is the one kept.
    The `score` of every kept item is overwritten in place with its fused
    score, and the `top_k` highest-scoring items are returned.
    """
    fused: dict[tuple, float] = {}
    first_seen: dict[tuple, RetrievalResult] = {}

    for batch in ranked_lists:
        for position, item in enumerate(batch, start=1):
            identity = _result_dedup_key(item)
            fused[identity] = fused.get(identity, 0.0) + 1.0 / (k_rrf + position)
            # Any copy is fine for a repeated key; keep the earliest one.
            first_seen.setdefault(identity, item)

    merged: list[RetrievalResult] = []
    for identity in sorted(first_seen, key=fused.__getitem__, reverse=True):
        item = first_seen[identity]
        # Downstream consumers see one uniform score scale.
        item.score = fused[identity]
        merged.append(item)
    return merged[:top_k]
70
-
71
-
72
async def invalidate_retrieval_cache(user_id: str) -> int:
    """Remove all cached retrieval entries that belong to `user_id`.

    Intended to run after a successful ingest/upload/delete so the next
    retrieval does not serve a stale cached top-k.

    Returns:
        The number of keys Redis reports as deleted (0 when none matched).
    """
    client = await get_redis()
    match_expr = f"{_CACHE_KEY_PREFIX}:{user_id}:*"

    stale_keys = []
    async for cache_key in client.scan_iter(match=match_expr):
        stale_keys.append(cache_key)

    if len(stale_keys) == 0:
        return 0

    removed = await client.delete(*stale_keys)
    logger.info("retrieval cache invalidated", user_id=user_id, deleted=removed)
    return int(removed)
87
-
88
-
89
class RetrievalRouter:
    """Dispatch retrieval to the schema and/or document retriever.

    Results are cached in Redis per (user, source_hint, query, k). Batches
    from several retrievers are combined with `_rrf_merge`.
    """

    def __init__(
        self,
        schema_retriever: BaseRetriever,
        document_retriever: BaseRetriever,
    ):
        self._retrievers: dict[str, BaseRetriever] = {
            "schema": schema_retriever,
            "document": document_retriever,
        }

    def _route(self, source_hint: SourceHint) -> list[tuple[str, BaseRetriever]]:
        """Return the (name, retriever) pairs selected by `source_hint`.

        Any value other than "schema" or "document" selects every retriever.
        """
        if source_hint == "schema":
            return [("schema", self._retrievers["schema"])]
        if source_hint == "document":
            return [("document", self._retrievers["document"])]
        return list(self._retrievers.items())

    async def retrieve(
        self,
        query: str,
        user_id: str,
        source_hint: SourceHint = "both",
        k: int = 10,
    ) -> list[RetrievalResult]:
        """Return up to `k` results for `query`, served from cache when possible.

        Args:
            query: Natural-language query text.
            user_id: Owner id; part of the cache key and passed to retrievers.
            source_hint: Which retriever(s) to use.
            k: Maximum number of results.

        Returns:
            Merged results, best first. The final list (including an empty
            one, or one produced by the "both" fallback) is cached under the
            key of the original `source_hint`.
        """
        redis = await get_redis()
        query_hash = hashlib.md5(query.encode()).hexdigest()
        cache_key = f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"

        cached = await redis.get(cache_key)
        if cached:
            try:
                raw = json.loads(cached)
                logger.info("returning cached retrieval results", source_hint=source_hint)
                return [RetrievalResult(**r) for r in raw]
            except Exception:
                # An undecodable entry is treated as a miss and overwritten below.
                logger.warning("corrupted retrieval cache, fetching fresh", cache_key=cache_key)

        results = await self._retrieve_uncached(query, user_id, source_hint, k)

        # Empty-result fallback: orchestrator may have misclassified intent.
        # Retry once with "both" before giving up. No-op when source_hint is
        # already "both".
        if not results and source_hint != "both":
            logger.warning(
                "empty retrieval, falling back to source_hint='both'",
                original_source_hint=source_hint,
            )
            results = await self._retrieve_uncached(query, user_id, "both", k)

        await redis.setex(
            cache_key,
            _CACHE_TTL,
            json.dumps([asdict(r) for r in results]),
        )
        return results

    async def _retrieve_uncached(
        self,
        query: str,
        user_id: str,
        source_hint: SourceHint,
        k: int,
    ) -> list[RetrievalResult]:
        """Run the routed retrievers concurrently and RRF-merge their batches.

        A retriever that fails is logged and skipped; the remaining batches
        are still merged.
        """
        routed = self._route(source_hint)
        batches = await asyncio.gather(
            *[r.retrieve(query, user_id, k) for _, r in routed],
            return_exceptions=True,
        )

        valid_lists: list[list[RetrievalResult]] = []
        per_retriever: dict[str, int | str] = {}
        for (name, _), batch in zip(routed, batches):
            # With return_exceptions=True a child can hand back any
            # BaseException (e.g. CancelledError), not only Exception
            # subclasses; none of them is a result list.
            if isinstance(batch, BaseException):
                logger.error("retriever failed", retriever=name, error=str(batch))
                per_retriever[name] = "error"
                continue
            valid_lists.append(batch)
            per_retriever[name] = len(batch)

        results = _rrf_merge(valid_lists, top_k=k)

        logger.info(
            "router result",
            source_hint=source_hint,
            per_retriever=per_retriever,
            final_count=len(results),
            top_score=results[0].score if results else None,
            bottom_score=results[-1].score if results else None,
        )
        return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/storage/az_blob/az_blob.py CHANGED
@@ -57,22 +57,6 @@ class AzureBlobStorage:
57
  logger.error(f"Failed to download blob {blob_name}", error=str(e))
58
  raise
59
 
60
async def upload_bytes(self, content: bytes, blob_name: str) -> str:
    """Write `content` to the blob called `blob_name`, replacing any existing blob.

    The caller chooses the blob name, which makes it suitable for files whose
    name must be deterministic (e.g. Parquet files derived from a document_id).

    Returns:
        The `blob_name` that was written.

    Raises:
        Exception: any upload error is logged and re-raised unchanged.
    """
    try:
        async with self._get_blob_client(blob_name) as target:
            logger.info(f"Uploading bytes to blob {blob_name}")
            await target.upload_blob(content, overwrite=True)
            logger.info(f"Successfully uploaded {blob_name}")
            return blob_name
    except Exception as exc:
        logger.error(f"Failed to upload bytes to {blob_name}", error=str(exc))
        raise
75
-
76
  async def delete_file(self, blob_name: str) -> bool:
77
  """Delete file from Azure Blob Storage."""
78
  try:
@@ -87,24 +71,6 @@ class AzureBlobStorage:
87
  logger.error(f"Failed to delete blob {blob_name}", error=str(e))
88
  return False
89
 
90
async def delete_blobs_with_prefix(self, prefix: str) -> int:
    """Remove every blob whose name begins with `prefix`.

    Best-effort: if listing or deleting fails part-way, the error is logged
    and the count of blobs deleted so far is returned.

    Returns:
        Number of blobs deleted.
    """
    from azure.storage.blob.aio import ContainerClient

    removed = 0
    sas_url = f"{self.account_url}/{self.container_name}?{self.sas_token}"
    try:
        async with ContainerClient.from_container_url(sas_url) as client:
            async for entry in client.list_blobs(name_starts_with=prefix):
                await client.delete_blob(entry.name)
                removed += 1
        logger.info(f"Deleted {removed} blobs with prefix {prefix}")
    except Exception as exc:
        logger.error(f"Failed to delete blobs with prefix {prefix}", error=str(exc))
    return removed
107
-
108
 
109
  # Singleton instance
110
  blob_storage = AzureBlobStorage()
 
57
  logger.error(f"Failed to download blob {blob_name}", error=str(e))
58
  raise
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  async def delete_file(self, blob_name: str) -> bool:
61
  """Delete file from Azure Blob Storage."""
62
  try:
 
71
  logger.error(f"Failed to delete blob {blob_name}", error=str(e))
72
  return False
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # Singleton instance
76
  blob_storage = AzureBlobStorage()
src/tools/search.py CHANGED
@@ -34,10 +34,10 @@ async def search_documents(
34
 
35
  formatted_results = []
36
  for result in results:
37
- filename = result.metadata.get("filename", "Unknown")
38
- page = result.metadata.get("page_label")
39
  source_label = f"{filename}, p.{page}" if page else filename
40
- formatted_results.append(f"[Source: {source_label}]\n{result.content}\n")
41
 
42
  return "\n".join(formatted_results)
43
 
 
34
 
35
  formatted_results = []
36
  for result in results:
37
+ filename = result["metadata"].get("filename", "Unknown")
38
+ page = result["metadata"].get("page_label")
39
  source_label = f"{filename}, p.{page}" if page else filename
40
+ formatted_results.append(f"[Source: {source_label}]\n{result['content']}\n")
41
 
42
  return "\n".join(formatted_results)
43
 
src/utils/db_credential_encryption.py DELETED
@@ -1,70 +0,0 @@
1
- """Fernet encryption utilities for user-registered database credentials.
2
-
3
- Encryption key is sourced from `dataeyond__db__credential__key` env variable,
4
- intentionally separate from the user-auth bcrypt salt (`emarcal__bcrypt__salt`).
5
-
6
- Usage:
7
- from src.utils.db_credential_encryption import encrypt_credentials_dict, decrypt_credentials_dict
8
-
9
- # Before INSERT:
10
- safe_creds = encrypt_credentials_dict(raw_credentials)
11
-
12
- # After SELECT:
13
- plain_creds = decrypt_credentials_dict(row.credentials)
14
- """
15
-
16
- from cryptography.fernet import Fernet
17
- from src.config.settings import settings
18
-
19
- # Sensitive credential field names that must be encrypted at rest.
20
- # Covers all supported DB types:
21
- # - password : postgres, mysql, sqlserver, supabase, snowflake
22
- # - service_account_json : bigquery
23
- SENSITIVE_FIELDS: frozenset[str] = frozenset({"password", "service_account_json"})
24
-
25
-
26
- def _get_cipher() -> Fernet:
27
- key = settings.dataeyond_db_credential_key
28
- if not key:
29
- raise ValueError(
30
- "dataeyond__db__credential__key is not set. "
31
- "Generate one with: Fernet.generate_key().decode()"
32
- )
33
- return Fernet(key.encode())
34
-
35
-
36
- def encrypt_credential(value: str) -> str:
37
- """Encrypt a single credential string value."""
38
- return _get_cipher().encrypt(value.encode()).decode()
39
-
40
-
41
- def decrypt_credential(value: str) -> str:
42
- """Decrypt a single Fernet-encrypted credential string."""
43
- return _get_cipher().decrypt(value.encode()).decode()
44
-
45
-
46
- def encrypt_credentials_dict(creds: dict) -> dict:
47
- """Return a copy of the credentials dict with sensitive fields encrypted.
48
-
49
- Call this before inserting a new DatabaseClient record.
50
- """
51
- cipher = _get_cipher()
52
- result = dict(creds)
53
- for field in SENSITIVE_FIELDS:
54
- if result.get(field):
55
- result[field] = cipher.encrypt(result[field].encode()).decode()
56
- return result
57
-
58
-
59
- def decrypt_credentials_dict(creds: dict) -> dict:
60
- """Return a copy of the credentials dict with sensitive fields decrypted.
61
-
62
- Call this after fetching a DatabaseClient record from DB.
63
- """
64
- cipher = _get_cipher()
65
- result = dict(creds)
66
- for field in SENSITIVE_FIELDS:
67
- if result.get(field):
68
- result[field] = cipher.decrypt(result[field].encode()).decode()
69
- return result
70
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
uv.lock CHANGED
@@ -1,5 +1,5 @@
1
  version = 1
2
- revision = 3
3
  requires-python = "==3.12.*"
4
  resolution-markers = [
5
  "python_full_version >= '3.12.4'",
@@ -39,7 +39,6 @@ dependencies = [
39
  { name = "orjson" },
40
  { name = "pandas" },
41
  { name = "passlib", extra = ["bcrypt"] },
42
- { name = "pdf2image" },
43
  { name = "pgvector" },
44
  { name = "plotly" },
45
  { name = "presidio-analyzer" },
@@ -47,15 +46,10 @@ dependencies = [
47
  { name = "prometheus-client" },
48
  { name = "psycopg", extra = ["binary", "pool"] },
49
  { name = "psycopg2" },
50
- { name = "pyarrow" },
51
  { name = "pydantic" },
52
  { name = "pydantic-settings" },
53
  { name = "pymongo" },
54
- { name = "pymssql" },
55
- { name = "pymysql" },
56
  { name = "pypdf" },
57
- { name = "pypdf2" },
58
- { name = "pytesseract" },
59
  { name = "python-docx" },
60
  { name = "python-dotenv" },
61
  { name = "python-multipart" },
@@ -63,11 +57,8 @@ dependencies = [
63
  { name = "redis" },
64
  { name = "sentence-transformers" },
65
  { name = "slowapi" },
66
- { name = "snowflake-sqlalchemy" },
67
  { name = "spacy" },
68
  { name = "sqlalchemy", extra = ["asyncio"] },
69
- { name = "sqlalchemy-bigquery" },
70
- { name = "sqlglot" },
71
  { name = "sse-starlette" },
72
  { name = "starlette" },
73
  { name = "structlog" },
@@ -89,8 +80,11 @@ dev = [
89
 
90
  [package.dev-dependencies]
91
  dev = [
 
 
92
  { name = "pytest" },
93
  { name = "pytest-asyncio" },
 
94
  { name = "ruff" },
95
  ]
96
 
@@ -126,7 +120,6 @@ requires-dist = [
126
  { name = "orjson", specifier = "==3.10.12" },
127
  { name = "pandas", specifier = "==2.2.3" },
128
  { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
129
- { name = "pdf2image", specifier = ">=1.17.0" },
130
  { name = "pgvector", specifier = "==0.3.6" },
131
  { name = "plotly", specifier = "==5.24.1" },
132
  { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
@@ -135,15 +128,10 @@ requires-dist = [
135
  { name = "prometheus-client", specifier = "==0.21.1" },
136
  { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
137
  { name = "psycopg2", specifier = ">=2.9.11" },
138
- { name = "pyarrow", specifier = ">=24.0.0" },
139
  { name = "pydantic", specifier = "==2.10.3" },
140
  { name = "pydantic-settings", specifier = "==2.7.0" },
141
  { name = "pymongo", specifier = ">=4.14.0" },
142
- { name = "pymssql", specifier = ">=2.3.0" },
143
- { name = "pymysql", specifier = ">=1.1.1" },
144
  { name = "pypdf", specifier = "==5.1.0" },
145
- { name = "pypdf2", specifier = ">=3.0.1" },
146
- { name = "pytesseract", specifier = ">=0.3.13" },
147
  { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
148
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
149
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
@@ -155,11 +143,8 @@ requires-dist = [
155
  { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
156
  { name = "sentence-transformers", specifier = "==3.3.1" },
157
  { name = "slowapi", specifier = "==0.1.9" },
158
- { name = "snowflake-sqlalchemy", specifier = ">=1.7.0" },
159
  { name = "spacy", specifier = "==3.8.3" },
160
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
161
- { name = "sqlalchemy-bigquery", specifier = ">=1.11.0" },
162
- { name = "sqlglot", specifier = ">=25.0.0" },
163
  { name = "sse-starlette", specifier = "==2.1.3" },
164
  { name = "starlette", specifier = "==0.41.3" },
165
  { name = "structlog", specifier = "==24.4.0" },
@@ -171,9 +156,12 @@ provides-extras = ["dev"]
171
 
172
  [package.metadata.requires-dev]
173
  dev = [
174
- { name = "pytest", specifier = ">=8.3.4" },
175
- { name = "pytest-asyncio", specifier = ">=0.24.0" },
176
- { name = "ruff", specifier = ">=0.8.4" },
 
 
 
177
  ]
178
 
179
  [[package]]
@@ -292,15 +280,6 @@ wheels = [
292
  { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
293
  ]
294
 
295
- [[package]]
296
- name = "asn1crypto"
297
- version = "1.5.1"
298
- source = { registry = "https://pypi.org/simple" }
299
- sdist = { url = "https://files.pythonhosted.org/packages/de/cf/d547feed25b5244fcb9392e288ff9fdc3280b10260362fc45d37a798a6ee/asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c", size = 121080, upload-time = "2022-03-15T14:46:52.889Z" }
300
- wheels = [
301
- { url = "https://files.pythonhosted.org/packages/c9/7f/09065fd9e27da0eda08b4d6897f1c13535066174cc023af248fc2a8d5e5a/asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67", size = 105045, upload-time = "2022-03-15T14:46:51.055Z" },
302
- ]
303
-
304
  [[package]]
305
  name = "asyncpg"
306
  version = "0.30.0"
@@ -449,34 +428,6 @@ wheels = [
449
  { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
450
  ]
451
 
452
- [[package]]
453
- name = "boto3"
454
- version = "1.42.89"
455
- source = { registry = "https://pypi.org/simple" }
456
- dependencies = [
457
- { name = "botocore" },
458
- { name = "jmespath" },
459
- { name = "s3transfer" },
460
- ]
461
- sdist = { url = "https://files.pythonhosted.org/packages/bb/0c/f7bccb22b245cabf392816baba20f9e95f78ace7dbc580fd40136e80e732/boto3-1.42.89.tar.gz", hash = "sha256:3e43aacc0801bba9bcd23a8c271c089af297a69565f783fcdd357ae0e330bf1e", size = 113165, upload-time = "2026-04-13T19:36:17.516Z" }
462
- wheels = [
463
- { url = "https://files.pythonhosted.org/packages/b9/33/55103ba5ef9975ea54b8d39e69b76eb6e9fded3beae5f01065e26951a3a1/boto3-1.42.89-py3-none-any.whl", hash = "sha256:6204b189f4d0c655535f43d7eaa57ff4e8d965b8463c97e45952291211162932", size = 140556, upload-time = "2026-04-13T19:36:13.894Z" },
464
- ]
465
-
466
- [[package]]
467
- name = "botocore"
468
- version = "1.42.89"
469
- source = { registry = "https://pypi.org/simple" }
470
- dependencies = [
471
- { name = "jmespath" },
472
- { name = "python-dateutil" },
473
- { name = "urllib3" },
474
- ]
475
- sdist = { url = "https://files.pythonhosted.org/packages/0f/cc/e6be943efa9051bd15c2ee14077c2b10d6e27c9e9385fc43a03a5c4ed8b5/botocore-1.42.89.tar.gz", hash = "sha256:95ac52f472dad29942f3088b278ab493044516c16dbf9133c975af16527baa99", size = 15206290, upload-time = "2026-04-13T19:36:02.321Z" }
476
- wheels = [
477
- { url = "https://files.pythonhosted.org/packages/91/f1/90a7b8eda38b7c3a65ca7ee0075bdf310b6b471cb1b95fab6e8994323a50/botocore-1.42.89-py3-none-any.whl", hash = "sha256:d9b786c8d9db6473063b4cc5be0ba7e6a381082307bd6afb69d4216f9fa95f35", size = 14887287, upload-time = "2026-04-13T19:35:56.677Z" },
478
- ]
479
-
480
  [[package]]
481
  name = "cachetools"
482
  version = "5.5.0"
@@ -990,109 +941,6 @@ wheels = [
990
  { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
991
  ]
992
 
993
- [[package]]
994
- name = "google-api-core"
995
- version = "2.30.3"
996
- source = { registry = "https://pypi.org/simple" }
997
- dependencies = [
998
- { name = "google-auth" },
999
- { name = "googleapis-common-protos" },
1000
- { name = "proto-plus" },
1001
- { name = "protobuf" },
1002
- { name = "requests" },
1003
- ]
1004
- sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" }
1005
- wheels = [
1006
- { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" },
1007
- ]
1008
-
1009
- [package.optional-dependencies]
1010
- grpc = [
1011
- { name = "grpcio" },
1012
- { name = "grpcio-status" },
1013
- ]
1014
-
1015
- [[package]]
1016
- name = "google-auth"
1017
- version = "2.49.2"
1018
- source = { registry = "https://pypi.org/simple" }
1019
- dependencies = [
1020
- { name = "cryptography" },
1021
- { name = "pyasn1-modules" },
1022
- ]
1023
- sdist = { url = "https://files.pythonhosted.org/packages/c6/fc/e925290a1ad95c975c459e2df070fac2b90954e13a0370ac505dff78cb99/google_auth-2.49.2.tar.gz", hash = "sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409", size = 333958, upload-time = "2026-04-10T00:41:21.888Z" }
1024
- wheels = [
1025
- { url = "https://files.pythonhosted.org/packages/73/76/d241a5c927433420507215df6cac1b1fa4ac0ba7a794df42a84326c68da8/google_auth-2.49.2-py3-none-any.whl", hash = "sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5", size = 240638, upload-time = "2026-04-10T00:41:14.501Z" },
1026
- ]
1027
-
1028
- [[package]]
1029
- name = "google-cloud-bigquery"
1030
- version = "3.41.0"
1031
- source = { registry = "https://pypi.org/simple" }
1032
- dependencies = [
1033
- { name = "google-api-core", extra = ["grpc"] },
1034
- { name = "google-auth" },
1035
- { name = "google-cloud-core" },
1036
- { name = "google-resumable-media" },
1037
- { name = "packaging" },
1038
- { name = "python-dateutil" },
1039
- { name = "requests" },
1040
- ]
1041
- sdist = { url = "https://files.pythonhosted.org/packages/ce/13/6515c7aab55a4a0cf708ffd309fb9af5bab54c13e32dc22c5acd6497193c/google_cloud_bigquery-3.41.0.tar.gz", hash = "sha256:2217e488b47ed576360c9b2cc07d59d883a54b83167c0ef37f915c26b01a06fe", size = 513434, upload-time = "2026-03-30T22:50:55.347Z" }
1042
- wheels = [
1043
- { url = "https://files.pythonhosted.org/packages/40/33/1d3902efadef9194566d499d61507e1f038454e0b55499d2d7f8ab2a4fee/google_cloud_bigquery-3.41.0-py3-none-any.whl", hash = "sha256:2a5b5a737b401cbd824a6e5eac7554100b878668d908e6548836b5d8aaa4dcaa", size = 262343, upload-time = "2026-03-30T22:48:45.444Z" },
1044
- ]
1045
-
1046
- [[package]]
1047
- name = "google-cloud-core"
1048
- version = "2.5.1"
1049
- source = { registry = "https://pypi.org/simple" }
1050
- dependencies = [
1051
- { name = "google-api-core" },
1052
- { name = "google-auth" },
1053
- ]
1054
- sdist = { url = "https://files.pythonhosted.org/packages/dc/24/6ca08b0a03c7b0c620427503ab00353a4ae806b848b93bcea18b6b76fde6/google_cloud_core-2.5.1.tar.gz", hash = "sha256:3dc94bdec9d05a31d9f355045ed0f369fbc0d8c665076c734f065d729800f811", size = 36078, upload-time = "2026-03-30T22:50:08.057Z" }
1055
- wheels = [
1056
- { url = "https://files.pythonhosted.org/packages/73/d9/5bb050cb32826466aa9b25f79e2ca2879fe66cb76782d4ed798dd7506151/google_cloud_core-2.5.1-py3-none-any.whl", hash = "sha256:ea62cdf502c20e3e14be8a32c05ed02113d7bef454e40ff3fab6fe1ec9f1f4e7", size = 29452, upload-time = "2026-03-30T22:48:31.567Z" },
1057
- ]
1058
-
1059
- [[package]]
1060
- name = "google-crc32c"
1061
- version = "1.8.0"
1062
- source = { registry = "https://pypi.org/simple" }
1063
- sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" }
1064
- wheels = [
1065
- { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" },
1066
- { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" },
1067
- { url = "https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" },
1068
- { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" },
1069
- { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" },
1070
- ]
1071
-
1072
- [[package]]
1073
- name = "google-resumable-media"
1074
- version = "2.8.2"
1075
- source = { registry = "https://pypi.org/simple" }
1076
- dependencies = [
1077
- { name = "google-crc32c" },
1078
- ]
1079
- sdist = { url = "https://files.pythonhosted.org/packages/3f/d1/b1ea14b93b6b78f57fc580125de44e9f593ab88dd2460f1a8a8d18f74754/google_resumable_media-2.8.2.tar.gz", hash = "sha256:f3354a182ebd193ae3f42e3ef95e6c9b10f128320de23ac7637236713b1acd70", size = 2164510, upload-time = "2026-03-30T23:34:25.369Z" }
1080
- wheels = [
1081
- { url = "https://files.pythonhosted.org/packages/5e/f8/50bfaf4658431ff9de45c5c3935af7ab01157a4903c603cd0eee6e78e087/google_resumable_media-2.8.2-py3-none-any.whl", hash = "sha256:82b6d8ccd11765268cdd2a2123f417ec806b8eef3000a9a38dfe3033da5fb220", size = 81511, upload-time = "2026-03-30T23:34:09.671Z" },
1082
- ]
1083
-
1084
- [[package]]
1085
- name = "googleapis-common-protos"
1086
- version = "1.74.0"
1087
- source = { registry = "https://pypi.org/simple" }
1088
- dependencies = [
1089
- { name = "protobuf" },
1090
- ]
1091
- sdist = { url = "https://files.pythonhosted.org/packages/20/18/a746c8344152d368a5aac738d4c857012f2c5d1fd2eac7e17b647a7861bd/googleapis_common_protos-1.74.0.tar.gz", hash = "sha256:57971e4eeeba6aad1163c1f0fc88543f965bb49129b8bb55b2b7b26ecab084f1", size = 151254, upload-time = "2026-04-02T21:23:26.679Z" }
1092
- wheels = [
1093
- { url = "https://files.pythonhosted.org/packages/b6/b0/be5d3329badb9230b765de6eea66b73abd5944bdeb5afb3562ddcd80ae84/googleapis_common_protos-1.74.0-py3-none-any.whl", hash = "sha256:702216f78610bb510e3f12ac3cafd281b7ac45cc5d86e90ad87e4d301a3426b5", size = 300743, upload-time = "2026-04-02T21:22:49.108Z" },
1094
- ]
1095
-
1096
  [[package]]
1097
  name = "greenlet"
1098
  version = "3.3.2"
@@ -1110,41 +958,6 @@ wheels = [
1110
  { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
1111
  ]
1112
 
1113
- [[package]]
1114
- name = "grpcio"
1115
- version = "1.80.0"
1116
- source = { registry = "https://pypi.org/simple" }
1117
- dependencies = [
1118
- { name = "typing-extensions" },
1119
- ]
1120
- sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
1121
- wheels = [
1122
- { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616, upload-time = "2026-03-30T08:47:13.428Z" },
1123
- { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204, upload-time = "2026-03-30T08:47:15.873Z" },
1124
- { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866, upload-time = "2026-03-30T08:47:18.588Z" },
1125
- { url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060, upload-time = "2026-03-30T08:47:21.113Z" },
1126
- { url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121, upload-time = "2026-03-30T08:47:23.827Z" },
1127
- { url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811, upload-time = "2026-03-30T08:47:26.517Z" },
1128
- { url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860, upload-time = "2026-03-30T08:47:29.439Z" },
1129
- { url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132, upload-time = "2026-03-30T08:47:33.254Z" },
1130
- { url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904, upload-time = "2026-03-30T08:47:35.319Z" },
1131
- { url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944, upload-time = "2026-03-30T08:47:37.831Z" },
1132
- ]
1133
-
1134
- [[package]]
1135
- name = "grpcio-status"
1136
- version = "1.80.0"
1137
- source = { registry = "https://pypi.org/simple" }
1138
- dependencies = [
1139
- { name = "googleapis-common-protos" },
1140
- { name = "grpcio" },
1141
- { name = "protobuf" },
1142
- ]
1143
- sdist = { url = "https://files.pythonhosted.org/packages/b1/ed/105f619bdd00cb47a49aa2feea6232ea2bbb04199d52a22cc6a7d603b5cb/grpcio_status-1.80.0.tar.gz", hash = "sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd", size = 13901, upload-time = "2026-03-30T08:54:34.784Z" }
1144
- wheels = [
1145
- { url = "https://files.pythonhosted.org/packages/76/80/58cd2dfc19a07d022abe44bde7c365627f6c7cb6f692ada6c65ca437d09a/grpcio_status-1.80.0-py3-none-any.whl", hash = "sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe", size = 14638, upload-time = "2026-03-30T08:54:01.569Z" },
1146
- ]
1147
-
1148
  [[package]]
1149
  name = "h11"
1150
  version = "0.16.0"
@@ -1314,15 +1127,6 @@ wheels = [
1314
  { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
1315
  ]
1316
 
1317
- [[package]]
1318
- name = "jmespath"
1319
- version = "1.1.0"
1320
- source = { registry = "https://pypi.org/simple" }
1321
- sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
1322
- wheels = [
1323
- { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
1324
- ]
1325
-
1326
  [[package]]
1327
  name = "joblib"
1328
  version = "1.5.3"
@@ -2150,18 +1954,6 @@ bcrypt = [
2150
  { name = "bcrypt" },
2151
  ]
2152
 
2153
- [[package]]
2154
- name = "pdf2image"
2155
- version = "1.17.0"
2156
- source = { registry = "https://pypi.org/simple" }
2157
- dependencies = [
2158
- { name = "pillow" },
2159
- ]
2160
- sdist = { url = "https://files.pythonhosted.org/packages/00/d8/b280f01045555dc257b8153c00dee3bc75830f91a744cd5f84ef3a0a64b1/pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57", size = 12811, upload-time = "2024-01-07T20:33:01.965Z" }
2161
- wheels = [
2162
- { url = "https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2", size = 11618, upload-time = "2024-01-07T20:32:59.957Z" },
2163
- ]
2164
-
2165
  [[package]]
2166
  name = "pgvector"
2167
  version = "0.3.6"
@@ -2329,33 +2121,6 @@ wheels = [
2329
  { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
2330
  ]
2331
 
2332
- [[package]]
2333
- name = "proto-plus"
2334
- version = "1.27.2"
2335
- source = { registry = "https://pypi.org/simple" }
2336
- dependencies = [
2337
- { name = "protobuf" },
2338
- ]
2339
- sdist = { url = "https://files.pythonhosted.org/packages/81/0d/94dfe80193e79d55258345901acd2917523d56e8381bc4dee7fd38e3868a/proto_plus-1.27.2.tar.gz", hash = "sha256:b2adde53adadf75737c44d3dcb0104fde65250dfc83ad59168b4aa3e574b6a24", size = 57204, upload-time = "2026-03-26T22:18:57.174Z" }
2340
- wheels = [
2341
- { url = "https://files.pythonhosted.org/packages/84/f3/1fba73eeffafc998a25d59703b63f8be4fe8a5cb12eaff7386a0ba0f7125/proto_plus-1.27.2-py3-none-any.whl", hash = "sha256:6432f75893d3b9e70b9c412f1d2f03f65b11fb164b793d14ae2ca01821d22718", size = 50450, upload-time = "2026-03-26T22:13:42.927Z" },
2342
- ]
2343
-
2344
- [[package]]
2345
- name = "protobuf"
2346
- version = "6.33.6"
2347
- source = { registry = "https://pypi.org/simple" }
2348
- sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" }
2349
- wheels = [
2350
- { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" },
2351
- { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" },
2352
- { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" },
2353
- { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" },
2354
- { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" },
2355
- { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" },
2356
- { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" },
2357
- ]
2358
-
2359
  [[package]]
2360
  name = "psycopg"
2361
  version = "3.2.3"
@@ -2416,42 +2181,6 @@ wheels = [
2416
  { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
2417
  ]
2418
 
2419
- [[package]]
2420
- name = "pyarrow"
2421
- version = "24.0.0"
2422
- source = { registry = "https://pypi.org/simple" }
2423
- sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" }
2424
- wheels = [
2425
- { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" },
2426
- { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" },
2427
- { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" },
2428
- { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" },
2429
- { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" },
2430
- { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" },
2431
- { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" },
2432
- ]
2433
-
2434
- [[package]]
2435
- name = "pyasn1"
2436
- version = "0.6.3"
2437
- source = { registry = "https://pypi.org/simple" }
2438
- sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" }
2439
- wheels = [
2440
- { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" },
2441
- ]
2442
-
2443
- [[package]]
2444
- name = "pyasn1-modules"
2445
- version = "0.4.2"
2446
- source = { registry = "https://pypi.org/simple" }
2447
- dependencies = [
2448
- { name = "pyasn1" },
2449
- ]
2450
- sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" }
2451
- wheels = [
2452
- { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
2453
- ]
2454
-
2455
  [[package]]
2456
  name = "pycparser"
2457
  version = "3.0"
@@ -2581,43 +2310,6 @@ wheels = [
2581
  { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
2582
  ]
2583
 
2584
- [[package]]
2585
- name = "pymssql"
2586
- version = "2.3.13"
2587
- source = { registry = "https://pypi.org/simple" }
2588
- sdist = { url = "https://files.pythonhosted.org/packages/7a/cc/843c044b7f71ee329436b7327c578383e2f2499313899f88ad267cdf1f33/pymssql-2.3.13.tar.gz", hash = "sha256:2137e904b1a65546be4ccb96730a391fcd5a85aab8a0632721feb5d7e39cfbce", size = 203153, upload-time = "2026-02-14T05:00:36.865Z" }
2589
- wheels = [
2590
- { url = "https://files.pythonhosted.org/packages/ba/60/a2e8a8a38f7be21d54402e2b3365cd56f1761ce9f2706c97f864e8aa8300/pymssql-2.3.13-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cf4f32b4a05b66f02cb7d55a0f3bcb0574a6f8cf0bee4bea6f7b104038364733", size = 3158689, upload-time = "2026-02-14T04:59:46.982Z" },
2591
- { url = "https://files.pythonhosted.org/packages/43/9e/0cf0ffb9e2f73238baf766d8e31d7237b5bee3cc1bb29a376b404610994a/pymssql-2.3.13-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:2b056eb175955f7fb715b60dc1c0c624969f4d24dbdcf804b41ab1e640a2b131", size = 2960018, upload-time = "2026-02-14T04:59:48.668Z" },
2592
- { url = "https://files.pythonhosted.org/packages/93/ea/bc27354feaca717faa4626911f6b19bb62985c87dda28957c63de4de5895/pymssql-2.3.13-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:319810b89aa64b99d9c5c01518752c813938df230496fa2c4c6dda0603f04c4c", size = 3065719, upload-time = "2026-02-14T04:59:50.369Z" },
2593
- { url = "https://files.pythonhosted.org/packages/1e/7a/8028681c96241fb5fc850b87c8959402c353e4b83c6e049a99ffa67ded54/pymssql-2.3.13-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0ea72641cb0f8bce7ad8565dbdbda4a7437aa58bce045f2a3a788d71af2e4be", size = 3190567, upload-time = "2026-02-14T04:59:52.202Z" },
2594
- { url = "https://files.pythonhosted.org/packages/aa/f1/ab5b76adbbd6db9ce746d448db34b044683522e7e7b95053f9dd0165297b/pymssql-2.3.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1493f63d213607f708a5722aa230776ada726ccdb94097fab090a1717a2534e0", size = 3710481, upload-time = "2026-02-14T04:59:54.01Z" },
2595
- { url = "https://files.pythonhosted.org/packages/59/aa/2fa0951475cd0a1829e0b8bfbe334d04ece4bce11546a556b005c4100689/pymssql-2.3.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb3275985c23479e952d6462ae6c8b2b6993ab6b99a92805a9c17942cf3d5b3d", size = 3453789, upload-time = "2026-02-14T04:59:56.841Z" },
2596
- { url = "https://files.pythonhosted.org/packages/78/08/8cd2af9003f9fc03912b658a64f5a4919dcd68f0dd3bbc822b49a3d14fd9/pymssql-2.3.13-cp312-cp312-win_amd64.whl", hash = "sha256:a930adda87bdd8351a5637cf73d6491936f34e525a5e513068a6eac742f69cdb", size = 1994709, upload-time = "2026-02-14T04:59:58.972Z" },
2597
- ]
2598
-
2599
- [[package]]
2600
- name = "pymysql"
2601
- version = "1.1.2"
2602
- source = { registry = "https://pypi.org/simple" }
2603
- sdist = { url = "https://files.pythonhosted.org/packages/f5/ae/1fe3fcd9f959efa0ebe200b8de88b5a5ce3e767e38c7ac32fb179f16a388/pymysql-1.1.2.tar.gz", hash = "sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03", size = 48258, upload-time = "2025-08-24T12:55:55.146Z" }
2604
- wheels = [
2605
- { url = "https://files.pythonhosted.org/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" },
2606
- ]
2607
-
2608
- [[package]]
2609
- name = "pyopenssl"
2610
- version = "25.1.0"
2611
- source = { registry = "https://pypi.org/simple" }
2612
- dependencies = [
2613
- { name = "cryptography" },
2614
- { name = "typing-extensions" },
2615
- ]
2616
- sdist = { url = "https://files.pythonhosted.org/packages/04/8c/cd89ad05804f8e3c17dea8f178c3f40eeab5694c30e0c9f5bcd49f576fc3/pyopenssl-25.1.0.tar.gz", hash = "sha256:8d031884482e0c67ee92bf9a4d8cceb08d92aba7136432ffb0703c5280fc205b", size = 179937, upload-time = "2025-05-17T16:28:31.31Z" }
2617
- wheels = [
2618
- { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
2619
- ]
2620
-
2621
  [[package]]
2622
  name = "pyparsing"
2623
  version = "3.3.2"
@@ -2636,28 +2328,6 @@ wheels = [
2636
  { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
2637
  ]
2638
 
2639
- [[package]]
2640
- name = "pypdf2"
2641
- version = "3.0.1"
2642
- source = { registry = "https://pypi.org/simple" }
2643
- sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
2644
- wheels = [
2645
- { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
2646
- ]
2647
-
2648
- [[package]]
2649
- name = "pytesseract"
2650
- version = "0.3.13"
2651
- source = { registry = "https://pypi.org/simple" }
2652
- dependencies = [
2653
- { name = "packaging" },
2654
- { name = "pillow" },
2655
- ]
2656
- sdist = { url = "https://files.pythonhosted.org/packages/9f/a6/7d679b83c285974a7cb94d739b461fa7e7a9b17a3abfd7bf6cbc5c2394b0/pytesseract-0.3.13.tar.gz", hash = "sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9", size = 17689, upload-time = "2024-08-16T02:33:56.762Z" }
2657
- wheels = [
2658
- { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" },
2659
- ]
2660
-
2661
  [[package]]
2662
  name = "pytest"
2663
  version = "8.3.4"
@@ -2940,18 +2610,6 @@ wheels = [
2940
  { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
2941
  ]
2942
 
2943
- [[package]]
2944
- name = "s3transfer"
2945
- version = "0.16.0"
2946
- source = { registry = "https://pypi.org/simple" }
2947
- dependencies = [
2948
- { name = "botocore" },
2949
- ]
2950
- sdist = { url = "https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" }
2951
- wheels = [
2952
- { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" },
2953
- ]
2954
-
2955
  [[package]]
2956
  name = "safetensors"
2957
  version = "0.7.0"
@@ -3106,60 +2764,6 @@ wheels = [
3106
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
3107
  ]
3108
 
3109
- [[package]]
3110
- name = "snowflake-connector-python"
3111
- version = "4.0.0"
3112
- source = { registry = "https://pypi.org/simple" }
3113
- dependencies = [
3114
- { name = "asn1crypto" },
3115
- { name = "boto3" },
3116
- { name = "botocore" },
3117
- { name = "certifi" },
3118
- { name = "charset-normalizer" },
3119
- { name = "cryptography" },
3120
- { name = "filelock" },
3121
- { name = "idna" },
3122
- { name = "packaging" },
3123
- { name = "platformdirs" },
3124
- { name = "pyjwt" },
3125
- { name = "pyopenssl" },
3126
- { name = "pytz" },
3127
- { name = "requests" },
3128
- { name = "sortedcontainers" },
3129
- { name = "tomlkit" },
3130
- { name = "typing-extensions" },
3131
- ]
3132
- sdist = { url = "https://files.pythonhosted.org/packages/1d/f1/4aff125021a9c5e0183f2f55dd7d04b7256a0e1e10db50d537a7415d9c55/snowflake_connector_python-4.0.0.tar.gz", hash = "sha256:4b10a865c4a5e1fa60c365c7fe41e0433605e6e5edc824e8730a9038f330b3a6", size = 813937, upload-time = "2025-10-09T10:11:34.631Z" }
3133
- wheels = [
3134
- { url = "https://files.pythonhosted.org/packages/ea/b0/462c0deee35d6d03d3d729b3f923615bae665beb7f9a94673a23a52080fe/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bfd3b8523d7adc830f99c5c4c635689ceca61700a05368d5bbb34c6811f2ec54", size = 1029568, upload-time = "2025-10-09T10:11:42.125Z" },
3135
- { url = "https://files.pythonhosted.org/packages/ff/4b/bb3ae3f07e7927c8f16c4c0f1283d3c721978d16e8bf4193fc8e41025c1e/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:835161dd46ef8f5fc9d2f135ca654c2f3fbdf57b035d3e1980506aa8eac671dc", size = 1041337, upload-time = "2025-10-09T10:11:43.692Z" },
3136
- { url = "https://files.pythonhosted.org/packages/9c/75/4bfac89f10c6dbb75e97adf1e217737fc599ebf964031c9298b6cbd807d0/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65e4e36dd1b0c7235d84cddef8a3c97c5ea0dc8fea85e31e45fc485000b77a83", size = 2699730, upload-time = "2025-10-09T10:11:25.295Z" },
3137
- { url = "https://files.pythonhosted.org/packages/cd/78/0e916416c50909dbae511fe38b1e671a9efa62decdce51b174a0396804e4/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6132986d6965e4005b0167270612fbc7fa4bc4ef42726a40b85a8f57475a78d", size = 2731336, upload-time = "2025-10-09T10:11:27.028Z" },
3138
- { url = "https://files.pythonhosted.org/packages/83/f0/3db8a2f3f5ee724d309c661af739a70d0643070b9b4597728151ef900f9b/snowflake_connector_python-4.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a790f06808e4481c23cfed1396d2c9a786060ddd62408b1fda1a63e1e6bc4b07", size = 1176292, upload-time = "2025-10-09T10:11:54.956Z" },
3139
- ]
3140
-
3141
- [[package]]
3142
- name = "snowflake-sqlalchemy"
3143
- version = "1.9.0"
3144
- source = { registry = "https://pypi.org/simple" }
3145
- dependencies = [
3146
- { name = "snowflake-connector-python" },
3147
- { name = "sqlalchemy" },
3148
- ]
3149
- sdist = { url = "https://files.pythonhosted.org/packages/ff/6a/fcc5c00c3a253029a7b7b293a3958ba07d5e97623b643de47be0cc9e5530/snowflake_sqlalchemy-1.9.0.tar.gz", hash = "sha256:fb32baf559f7f933ae8fde2ec535bcea5381bb15188777cd8c006b3226efa3b1", size = 141707, upload-time = "2026-03-04T13:48:17.905Z" }
3150
- wheels = [
3151
- { url = "https://files.pythonhosted.org/packages/88/28/b7ae8df80847e8157b74669ad7e1b0180e82ac0e3daf950612effd232fea/snowflake_sqlalchemy-1.9.0-py3-none-any.whl", hash = "sha256:f0b1528173e93c8c80bd9ca510985054667e0e514dd90b890271ac1cfae261c1", size = 78953, upload-time = "2026-03-04T13:48:16.393Z" },
3152
- ]
3153
-
3154
- [[package]]
3155
- name = "sortedcontainers"
3156
- version = "2.4.0"
3157
- source = { registry = "https://pypi.org/simple" }
3158
- sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
3159
- wheels = [
3160
- { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
3161
- ]
3162
-
3163
  [[package]]
3164
  name = "spacy"
3165
  version = "3.8.3"
@@ -3238,31 +2842,6 @@ asyncio = [
3238
  { name = "greenlet" },
3239
  ]
3240
 
3241
- [[package]]
3242
- name = "sqlalchemy-bigquery"
3243
- version = "1.16.0"
3244
- source = { registry = "https://pypi.org/simple" }
3245
- dependencies = [
3246
- { name = "google-api-core" },
3247
- { name = "google-auth" },
3248
- { name = "google-cloud-bigquery" },
3249
- { name = "packaging" },
3250
- { name = "sqlalchemy" },
3251
- ]
3252
- sdist = { url = "https://files.pythonhosted.org/packages/7e/6a/c49932b3d9c44cab9202b1866c5b36b7f0d0455d4653fbc0af4466aeaa76/sqlalchemy_bigquery-1.16.0.tar.gz", hash = "sha256:fe937a0d1f4cf7219fcf5d4995c6718805b38d4df43e29398dec5dc7b6d1987e", size = 119632, upload-time = "2025-11-06T01:35:40.373Z" }
3253
- wheels = [
3254
- { url = "https://files.pythonhosted.org/packages/c0/87/11e6de00ef7949bb8ea06b55304a1a4911c329fdf0d9882b464db240c2c5/sqlalchemy_bigquery-1.16.0-py3-none-any.whl", hash = "sha256:0fe7634cd954f3e74f5e2db6d159f9e5ee87a47fbe8d52eac3cd3bb3dadb3a77", size = 40615, upload-time = "2025-11-06T01:35:39.358Z" },
3255
- ]
3256
-
3257
- [[package]]
3258
- name = "sqlglot"
3259
- version = "30.6.0"
3260
- source = { registry = "https://pypi.org/simple" }
3261
- sdist = { url = "https://files.pythonhosted.org/packages/3c/66/6ece15f197874e56c76e1d0269cebf284ba992a80dfadca9d1972fdf7edf/sqlglot-30.6.0.tar.gz", hash = "sha256:246d34d39927422a50a3fa155f37b2f6346fba85f1a755b13c941eb32ef93361", size = 5835307, upload-time = "2026-04-20T20:11:08.164Z" }
3262
- wheels = [
3263
- { url = "https://files.pythonhosted.org/packages/dc/e7/64fe971cbca33a0446b06f4a5ff8e3fa4a1dbd0a039ceabcc3e6cf4087a9/sqlglot-30.6.0-py3-none-any.whl", hash = "sha256:e005fc2f47994f90d7d8df341f1cbe937518497b0b7b1507d4c03c4c9dfd2778", size = 673920, upload-time = "2026-04-20T20:11:05.758Z" },
3264
- ]
3265
-
3266
  [[package]]
3267
  name = "srsly"
3268
  version = "2.5.3"
@@ -3436,15 +3015,6 @@ wheels = [
3436
  { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
3437
  ]
3438
 
3439
- [[package]]
3440
- name = "tomlkit"
3441
- version = "0.14.0"
3442
- source = { registry = "https://pypi.org/simple" }
3443
- sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
3444
- wheels = [
3445
- { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
3446
- ]
3447
-
3448
  [[package]]
3449
  name = "torch"
3450
  version = "2.11.0"
 
1
  version = 1
2
+ revision = 2
3
  requires-python = "==3.12.*"
4
  resolution-markers = [
5
  "python_full_version >= '3.12.4'",
 
39
  { name = "orjson" },
40
  { name = "pandas" },
41
  { name = "passlib", extra = ["bcrypt"] },
 
42
  { name = "pgvector" },
43
  { name = "plotly" },
44
  { name = "presidio-analyzer" },
 
46
  { name = "prometheus-client" },
47
  { name = "psycopg", extra = ["binary", "pool"] },
48
  { name = "psycopg2" },
 
49
  { name = "pydantic" },
50
  { name = "pydantic-settings" },
51
  { name = "pymongo" },
 
 
52
  { name = "pypdf" },
 
 
53
  { name = "python-docx" },
54
  { name = "python-dotenv" },
55
  { name = "python-multipart" },
 
57
  { name = "redis" },
58
  { name = "sentence-transformers" },
59
  { name = "slowapi" },
 
60
  { name = "spacy" },
61
  { name = "sqlalchemy", extra = ["asyncio"] },
 
 
62
  { name = "sse-starlette" },
63
  { name = "starlette" },
64
  { name = "structlog" },
 
80
 
81
  [package.dev-dependencies]
82
  dev = [
83
+ { name = "mypy" },
84
+ { name = "pre-commit" },
85
  { name = "pytest" },
86
  { name = "pytest-asyncio" },
87
+ { name = "pytest-cov" },
88
  { name = "ruff" },
89
  ]
90
 
 
120
  { name = "orjson", specifier = "==3.10.12" },
121
  { name = "pandas", specifier = "==2.2.3" },
122
  { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
 
123
  { name = "pgvector", specifier = "==0.3.6" },
124
  { name = "plotly", specifier = "==5.24.1" },
125
  { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
 
128
  { name = "prometheus-client", specifier = "==0.21.1" },
129
  { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
130
  { name = "psycopg2", specifier = ">=2.9.11" },
 
131
  { name = "pydantic", specifier = "==2.10.3" },
132
  { name = "pydantic-settings", specifier = "==2.7.0" },
133
  { name = "pymongo", specifier = ">=4.14.0" },
 
 
134
  { name = "pypdf", specifier = "==5.1.0" },
 
 
135
  { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
136
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
137
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
 
143
  { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
144
  { name = "sentence-transformers", specifier = "==3.3.1" },
145
  { name = "slowapi", specifier = "==0.1.9" },
 
146
  { name = "spacy", specifier = "==3.8.3" },
147
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
 
 
148
  { name = "sse-starlette", specifier = "==2.1.3" },
149
  { name = "starlette", specifier = "==0.41.3" },
150
  { name = "structlog", specifier = "==24.4.0" },
 
156
 
157
  [package.metadata.requires-dev]
158
  dev = [
159
+ { name = "mypy", specifier = "==1.13.0" },
160
+ { name = "pre-commit", specifier = "==4.0.1" },
161
+ { name = "pytest", specifier = "==8.3.4" },
162
+ { name = "pytest-asyncio", specifier = "==0.24.0" },
163
+ { name = "pytest-cov", specifier = "==6.0.0" },
164
+ { name = "ruff", specifier = "==0.8.4" },
165
  ]
166
 
167
  [[package]]
 
280
  { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
281
  ]
282
 
 
 
 
 
 
 
 
 
 
283
  [[package]]
284
  name = "asyncpg"
285
  version = "0.30.0"
 
428
  { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
429
  ]
430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  [[package]]
432
  name = "cachetools"
433
  version = "5.5.0"
 
941
  { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
942
  ]
943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
944
  [[package]]
945
  name = "greenlet"
946
  version = "3.3.2"
 
958
  { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
959
  ]
960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961
  [[package]]
962
  name = "h11"
963
  version = "0.16.0"
 
1127
  { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
1128
  ]
1129
 
 
 
 
 
 
 
 
 
 
1130
  [[package]]
1131
  name = "joblib"
1132
  version = "1.5.3"
 
1954
  { name = "bcrypt" },
1955
  ]
1956
 
 
 
 
 
 
 
 
 
 
 
 
 
1957
  [[package]]
1958
  name = "pgvector"
1959
  version = "0.3.6"
 
2121
  { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
2122
  ]
2123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2124
  [[package]]
2125
  name = "psycopg"
2126
  version = "3.2.3"
 
2181
  { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
2182
  ]
2183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2184
  [[package]]
2185
  name = "pycparser"
2186
  version = "3.0"
 
2310
  { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
2311
  ]
2312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2313
  [[package]]
2314
  name = "pyparsing"
2315
  version = "3.3.2"
 
2328
  { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
2329
  ]
2330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2331
  [[package]]
2332
  name = "pytest"
2333
  version = "8.3.4"
 
2610
  { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
2611
  ]
2612
 
 
 
 
 
 
 
 
 
 
 
 
 
2613
  [[package]]
2614
  name = "safetensors"
2615
  version = "0.7.0"
 
2764
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
2765
  ]
2766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2767
  [[package]]
2768
  name = "spacy"
2769
  version = "3.8.3"
 
2842
  { name = "greenlet" },
2843
  ]
2844
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2845
  [[package]]
2846
  name = "srsly"
2847
  version = "2.5.3"
 
3015
  { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
3016
  ]
3017
 
 
 
 
 
 
 
 
 
 
3018
  [[package]]
3019
  name = "torch"
3020
  version = "2.11.0"