Commit 70b60a8
Parent(s): a0a7164

Clean Dockerfile; pre-translate contexts to EN before summarizing

Files changed:
- Dockerfile (+7 -14)
- app/rag_system.py (+21 -14)
Dockerfile
CHANGED
@@ -1,9 +1,4 @@
 FROM python:3.11-slim
-WORKDIR /app
-ARG CACHEBUST=20250810
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-FROM python:3.11-slim
 
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
@@ -15,25 +10,23 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 
 WORKDIR /app
 
+# System deps
 RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
     && rm -rf /var/lib/apt/lists/*
 
-
+# Bust pip cache layer when requirements change
+ARG CACHEBUST=20250810
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+# App code
 COPY . .
 
-#
-RUN mkdir -p /app/.cache /app/data/uploads /app/data/index
+# Writable caches/data
+RUN mkdir -p /app/.cache /app/data/uploads /app/data/index \
+    && chmod -R 777 /app/.cache /app/data
 
 ENV PORT=7860
 EXPOSE 7860
 
 CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
-
-COPY . .
-RUN mkdir -p /app/data/uploads /app/data/index
-ENV PORT=7860
-EXPOSE 7860
-CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
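Note on the rebuilt Dockerfile: it is now a single stage that installs dependencies before copying the app code, so the pip layer is reused until requirements.txt changes. ARG CACHEBUST=20250810 sits just above the COPY/RUN pair as a manual cache-bust knob; rebuilding with a different value (for example docker build --build-arg CACHEBUST=20250811 ., given here only as an illustration) invalidates the pip install layer on demand. The chmod -R 777 /app/.cache /app/data step appears intended to keep those paths writable when the container runs as a non-root user, as Hugging Face Spaces does.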
app/rag_system.py
CHANGED
@@ -114,7 +114,7 @@ class SimpleRAG:
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i
+                part = txt[i:i+step].strip()
                 if part:
                     chunks.append(part)
         return chunks
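For context, the corrected slice in this hunk implements plain fixed-width chunking over each page's text. A minimal standalone sketch of the same idea (the function name and default step value are illustrative, not from the repo):

from typing import List

def chunk_pages(pages: List[str], step: int = 800) -> List[str]:
    # Fixed-width windows over each page; whitespace-only slices are dropped,
    # mirroring the txt[i:i+step].strip() line added in this hunk.
    chunks: List[str] = []
    for txt in pages:
        for i in range(0, len(txt), step):
            part = txt[i:i + step].strip()
            if part:
                chunks.append(part)
    return chunks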
@@ -153,7 +153,7 @@ class SimpleRAG:
                 cache_dir=str(self.cache_dir),
                 device=-1,
             )
-            outs = self._translator(texts, max_length=
+            outs = self._translator(texts, max_length=800)
             return [o["translation_text"].strip() for o in outs]
         except Exception:
             return texts
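The max_length=800 cap bounds the length of each generated translation. A minimal sketch of how such a Hugging Face translation pipeline is typically built and called (the checkpoint name Helsinki-NLP/opus-mt-az-en is an assumption; the repo may pin a different model and pass cache_dir as SimpleRAG does):

from typing import List
from transformers import pipeline

# Assumed checkpoint for Azerbaijani-to-English; device=-1 runs on CPU.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-az-en", device=-1)

def translate_to_en(texts: List[str]) -> List[str]:
    try:
        outs = translator(texts, max_length=800)  # same cap as the new diff line
        return [o["translation_text"].strip() for o in outs]
    except Exception:
        return texts  # fall back to the originals, as the method does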
@@ -162,11 +162,23 @@ class SimpleRAG:
         if not contexts:
             return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-        # 1)
+        # 1) Clean & keep top contexts
+        cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
+        cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
+        if not cleaned_contexts:
+            return "The document appears largely tabular/numeric; couldn't extract readable sentences."
+
+        # 2) Pre-translate paragraphs to EN (if output language is EN)
+        if OUTPUT_LANG == "en":
+            try:
+                cleaned_contexts = self._translate_to_en(cleaned_contexts)
+            except Exception:
+                pass
+
+        # 3) Split into candidate sentences and filter
         candidates: List[str] = []
-        for
-
-            for s in _split_sentences(cleaned):
+        for para in cleaned_contexts:
+            for s in _split_sentences(para):
                 w = s.split()
                 if not (8 <= len(w) <= 35):
                     continue
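Steps 1-3 above clean the top contexts, translate them to English before sentence extraction, and keep only mid-length sentences as summary candidates. A rough standalone sketch of the splitting and filtering step, assuming a naive regex splitter as a stand-in for the repo's _split_sentences helper:

import re
from typing import List

def split_sentences(text: str) -> List[str]:
    # Naive stand-in for _split_sentences: break on sentence-ending punctuation.
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

def candidate_sentences(paragraphs: List[str], min_words: int = 8, max_words: int = 35) -> List[str]:
    # Mirrors step 3: keep only readable, mid-length sentences.
    out: List[str] = []
    for para in paragraphs:
        for s in split_sentences(para):
            if min_words <= len(s.split()) <= max_words:
                out.append(s)
    return out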
@@ -177,17 +189,17 @@
         if not candidates:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        #
+        # 4) Rank by similarity
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        #
+        # 5) Aggressive near-duplicate removal
         selected: List[str] = []
         for i in order:
             s = candidates[i].strip()
-            if any(_sim_jaccard(s, t) >= 0.
+            if any(_sim_jaccard(s, t) >= 0.90 for t in selected):
                 continue
             selected.append(s)
             if len(selected) >= max_sentences:
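Steps 4-5 rank candidates by cosine similarity to the question (the embeddings are already L2-normalized, so a dot product suffices) and skip any sentence whose token overlap with an already selected one reaches 0.90. A hedged sketch of the same selection loop; the jaccard helper and the default max_sentences value are illustrative stand-ins for the repo's _sim_jaccard and its caller:

from typing import List
import numpy as np

def jaccard(a: str, b: str) -> float:
    # Token-set Jaccard overlap; stand-in for the repo's _sim_jaccard helper.
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa & sb) / max(1, len(sa | sb))

def select_sentences(q_emb: np.ndarray, cand_emb: np.ndarray, candidates: List[str],
                     max_sentences: int = 5, dup_threshold: float = 0.90) -> List[str]:
    # Dot product of normalized embeddings equals cosine similarity.
    scores = (cand_emb @ q_emb.T).ravel()
    selected: List[str] = []
    for i in np.argsort(-scores):
        s = candidates[i].strip()
        if any(jaccard(s, t) >= dup_threshold for t in selected):
            continue  # near-duplicate of an already selected sentence
        selected.append(s)
        if len(selected) >= max_sentences:
            break
    return selected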
@@ -196,11 +208,6 @@
         if not selected:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 4) translate to EN if needed
-        if OUTPUT_LANG == "en":
-            if any(_looks_azerbaijani(s) for s in selected):
-                selected = self._translate_to_en(selected)
-
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 