Spaces:
Sleeping
Sleeping
Update app/rag_system.py
Browse files
- app/rag_system.py +10 -13
app/rag_system.py
CHANGED
|
@@ -17,12 +17,14 @@ except Exception:
|
|
| 17 |
|
| 18 |
from sentence_transformers import SentenceTransformer
|
| 19 |
|
| 20 |
-
# ---------------- Paths & Cache ----------------
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
|
| 27 |
d.mkdir(parents=True, exist_ok=True)
|
| 28 |
|
|
@@ -45,7 +47,6 @@ def _fix_mojibake(s: str) -> str:
|
|
| 45 |
return s
|
| 46 |
|
| 47 |
def _split_sentences(text: str) -> List[str]:
|
| 48 |
-
# Split on punctuation boundaries and line breaks
|
| 49 |
return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
|
| 50 |
|
| 51 |
def _mostly_numeric(s: str) -> bool:
|
|
@@ -206,7 +207,6 @@ class SimpleRAG:
|
|
| 206 |
|
| 207 |
# ---------- Fallbacks ----------
|
| 208 |
def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
|
| 209 |
-
"""Pick sentences sharing keywords with the question (question-dependent even if dense retrieval is weak)."""
|
| 210 |
qk = set(_keywords(question))
|
| 211 |
if not qk:
|
| 212 |
return []
|
|
@@ -237,7 +237,6 @@ class SimpleRAG:
|
|
| 237 |
|
| 238 |
# ---------- Answer Synthesis ----------
|
| 239 |
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
|
| 240 |
-
"""Extractive summary over retrieved contexts; falls back to keyword selection; EN translation if needed."""
|
| 241 |
if not contexts and self.is_empty:
|
| 242 |
return "No relevant context found. Index is empty — upload a PDF first."
|
| 243 |
|
|
@@ -246,7 +245,7 @@ class SimpleRAG:
|
|
| 246 |
|
| 247 |
# Build candidate sentences from nearby contexts
|
| 248 |
local_pool: List[str] = []
|
| 249 |
-
for c in (contexts or [])[:5]:
|
| 250 |
cleaned = _clean_for_summary(c)
|
| 251 |
for s in _split_sentences(cleaned):
|
| 252 |
w = s.split()
|
|
@@ -270,15 +269,13 @@ class SimpleRAG:
|
|
| 270 |
if len(selected) >= max_sentences:
|
| 271 |
break
|
| 272 |
|
| 273 |
-
# Keyword fallback if needed
|
| 274 |
if not selected:
|
| 275 |
selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
|
| 276 |
|
| 277 |
if not selected:
|
| 278 |
return "No readable sentences matched the question. Try a more specific query."
|
| 279 |
|
| 280 |
-
|
| 281 |
-
if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
|
| 282 |
selected = self._translate_to_en(selected)
|
| 283 |
|
| 284 |
bullets = "\n".join(f"- {s}" for s in selected)
|
|
|
|
| 17 |
|
| 18 |
from sentence_transformers import SentenceTransformer
|
| 19 |
|
| 20 |
+
# ---------------- Paths & Cache (HF-safe) ----------------
|
| 21 |
+
# Writeable base is /app in HF Spaces. Allow ENV overrides.
|
| 22 |
+
ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
|
| 23 |
+
DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
|
| 24 |
+
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
|
| 25 |
+
INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
|
| 26 |
+
CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache"))) # transformers prefers HF_HOME
|
| 27 |
+
|
| 28 |
for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
|
| 29 |
d.mkdir(parents=True, exist_ok=True)
|
| 30 |
|
|
|
|
| 47 |
return s
|
| 48 |
|
| 49 |
def _split_sentences(text: str) -> List[str]:
|
|
|
|
| 50 |
return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
|
| 51 |
|
| 52 |
def _mostly_numeric(s: str) -> bool:
|
|
|
|
| 207 |
|
| 208 |
# ---------- Fallbacks ----------
|
| 209 |
def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
|
|
|
|
| 210 |
qk = set(_keywords(question))
|
| 211 |
if not qk:
|
| 212 |
return []
|
|
|
|
| 237 |
|
| 238 |
# ---------- Answer Synthesis ----------
|
| 239 |
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
|
|
|
|
| 240 |
if not contexts and self.is_empty:
|
| 241 |
return "No relevant context found. Index is empty — upload a PDF first."
|
| 242 |
|
|
|
|
| 245 |
|
| 246 |
# Build candidate sentences from nearby contexts
|
| 247 |
local_pool: List[str] = []
|
| 248 |
+
for c in (contexts or [])[:5]:
|
| 249 |
cleaned = _clean_for_summary(c)
|
| 250 |
for s in _split_sentences(cleaned):
|
| 251 |
w = s.split()
|
|
|
|
| 269 |
if len(selected) >= max_sentences:
|
| 270 |
break
|
| 271 |
|
|
|
|
| 272 |
if not selected:
|
| 273 |
selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
|
| 274 |
|
| 275 |
if not selected:
|
| 276 |
return "No readable sentences matched the question. Try a more specific query."
|
| 277 |
|
| 278 |
+
if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
|
|
|
|
| 279 |
selected = self._translate_to_en(selected)
|
| 280 |
|
| 281 |
bullets = "\n".join(f"- {s}" for s in selected)
|