Spaces:
Sleeping
Sleeping
Commit
·
a7ef914
1
Parent(s):
70b60a8
Clean Dockerfile; pre-translate paragraphs; add /debug/translate
Browse files
- Dockerfile +1 -1
- app/api.py +13 -0
- app/rag_system.py +6 -6
Dockerfile
CHANGED
|
@@ -14,7 +14,7 @@ WORKDIR /app
|
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
|
| 15 |
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
-
# Bust pip cache
|
| 18 |
ARG CACHEBUST=20250810
|
| 19 |
COPY requirements.txt .
|
| 20 |
RUN pip install --no-cache-dir -r requirements.txt
|
|
|
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
|
| 15 |
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
+
# Bust pip cache when requirements change
|
| 18 |
ARG CACHEBUST=20250810
|
| 19 |
COPY requirements.txt .
|
| 20 |
RUN pip install --no-cache-dir -r requirements.txt
|
app/api.py
CHANGED
|
@@ -5,6 +5,19 @@ from fastapi.responses import JSONResponse, RedirectResponse
|
|
| 5 |
from pathlib import Path
|
| 6 |
import shutil
|
| 7 |
import traceback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from .rag_system import SimpleRAG, UPLOAD_DIR, synthesize_answer as summarize
|
| 10 |
from .schemas import AskRequest, AskResponse, UploadResponse, HistoryResponse, HistoryItem
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
import shutil
|
| 7 |
import traceback
|
| 8 |
+
# app/api.py (placed below the imports)
|
| 9 |
+
from fastapi.responses import JSONResponse
|
| 10 |
+
|
| 11 |
+
@app.get("/debug/translate")
|
| 12 |
+
def debug_translate():
|
| 13 |
+
try:
|
| 14 |
+
from transformers import pipeline
|
| 15 |
+
tr = pipeline("translation", model="Helsinki-NLP/opus-mt-az-en", cache_dir=str(CACHE_DIR), device=-1)
|
| 16 |
+
out = tr("Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", max_length=80)[0]["translation_text"]
|
| 17 |
+
return {"ok": True, "example_in": "Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", "example_out": out}
|
| 18 |
+
except Exception as e:
|
| 19 |
+
return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})
|
| 20 |
+
|
| 21 |
|
| 22 |
from .rag_system import SimpleRAG, UPLOAD_DIR, synthesize_answer as summarize
|
| 23 |
from .schemas import AskRequest, AskResponse, UploadResponse, HistoryResponse, HistoryItem
|
app/rag_system.py
CHANGED
|
@@ -114,7 +114,7 @@ class SimpleRAG:
|
|
| 114 |
chunks: List[str] = []
|
| 115 |
for txt in pages:
|
| 116 |
for i in range(0, len(txt), step):
|
| 117 |
-
part = txt[i:i+step].strip()
|
| 118 |
if part:
|
| 119 |
chunks.append(part)
|
| 120 |
return chunks
|
|
@@ -162,20 +162,20 @@ class SimpleRAG:
|
|
| 162 |
if not contexts:
|
| 163 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
| 164 |
|
| 165 |
-
# 1) Clean
|
| 166 |
cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
|
| 167 |
cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
|
| 168 |
if not cleaned_contexts:
|
| 169 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
| 170 |
|
| 171 |
-
# 2) Pre-translate paragraphs to EN (if
|
| 172 |
if OUTPUT_LANG == "en":
|
| 173 |
try:
|
| 174 |
cleaned_contexts = self._translate_to_en(cleaned_contexts)
|
| 175 |
except Exception:
|
| 176 |
pass
|
| 177 |
|
| 178 |
-
# 3) Split into
|
| 179 |
candidates: List[str] = []
|
| 180 |
for para in cleaned_contexts:
|
| 181 |
for s in _split_sentences(para):
|
|
@@ -189,13 +189,13 @@ class SimpleRAG:
|
|
| 189 |
if not candidates:
|
| 190 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
| 191 |
|
| 192 |
-
# 4) Rank by similarity
|
| 193 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
| 194 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
| 195 |
scores = (cand_emb @ q_emb.T).ravel()
|
| 196 |
order = np.argsort(-scores)
|
| 197 |
|
| 198 |
-
# 5) Aggressive near-duplicate removal
|
| 199 |
selected: List[str] = []
|
| 200 |
for i in order:
|
| 201 |
s = candidates[i].strip()
|
|
|
|
| 114 |
chunks: List[str] = []
|
| 115 |
for txt in pages:
|
| 116 |
for i in range(0, len(txt), step):
|
| 117 |
+
part = txt[i : i + step].strip()
|
| 118 |
if part:
|
| 119 |
chunks.append(part)
|
| 120 |
return chunks
|
|
|
|
| 162 |
if not contexts:
|
| 163 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
| 164 |
|
| 165 |
+
# 1) Clean top contexts
|
| 166 |
cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
|
| 167 |
cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
|
| 168 |
if not cleaned_contexts:
|
| 169 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
| 170 |
|
| 171 |
+
# 2) Pre-translate paragraphs to EN (if target is EN)
|
| 172 |
if OUTPUT_LANG == "en":
|
| 173 |
try:
|
| 174 |
cleaned_contexts = self._translate_to_en(cleaned_contexts)
|
| 175 |
except Exception:
|
| 176 |
pass
|
| 177 |
|
| 178 |
+
# 3) Split into sentence candidates & filter
|
| 179 |
candidates: List[str] = []
|
| 180 |
for para in cleaned_contexts:
|
| 181 |
for s in _split_sentences(para):
|
|
|
|
| 189 |
if not candidates:
|
| 190 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
| 191 |
|
| 192 |
+
# 4) Rank by similarity to question
|
| 193 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
| 194 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
| 195 |
scores = (cand_emb @ q_emb.T).ravel()
|
| 196 |
order = np.argsort(-scores)
|
| 197 |
|
| 198 |
+
# 5) Aggressive near-duplicate removal (Jaccard >= 0.90)
|
| 199 |
selected: List[str] = []
|
| 200 |
for i in order:
|
| 201 |
s = candidates[i].strip()
|