Commit 70b60a8
Parent(s): a0a7164

Clean Dockerfile; pre-translate contexts to EN before summarizing

Files changed:
- Dockerfile (+7 -14)
- app/rag_system.py (+21 -14)
Dockerfile
CHANGED
@@ -1,9 +1,4 @@
 FROM python:3.11-slim
-WORKDIR /app
-ARG CACHEBUST=20250810
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-FROM python:3.11-slim
 
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
@@ -15,25 +10,23 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 
 WORKDIR /app
 
+# System deps
 RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
     && rm -rf /var/lib/apt/lists/*
 
-
+# Bust pip cache layer when requirements change
+ARG CACHEBUST=20250810
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+# App code
 COPY . .
 
-#
-RUN mkdir -p /app/.cache /app/data/uploads /app/data/index
+# Writable caches/data
+RUN mkdir -p /app/.cache /app/data/uploads /app/data/index \
+    && chmod -R 777 /app/.cache /app/data
 
 ENV PORT=7860
 EXPOSE 7860
 
 CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
-
-COPY . .
-RUN mkdir -p /app/data/uploads /app/data/index
-ENV PORT=7860
-EXPOSE 7860
-CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
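Note on the rebuilt Dockerfile: it is now a single stage that installs dependencies before copying the app code, so the pip layer is reused until requirements.txt changes. ARG CACHEBUST=20250810 sits just above the COPY/RUN pair as a manual cache-bust knob; rebuilding with a different value (for example docker build --build-arg CACHEBUST=20250811 ., given here only as an illustration) invalidates the pip install layer on demand. The chmod -R 777 /app/.cache /app/data step appears intended to keep those paths writable when the container runs as a non-root user, as Hugging Face Spaces does.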
app/rag_system.py
CHANGED
@@ -114,7 +114,7 @@ class SimpleRAG:
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i
+                part = txt[i:i+step].strip()
                 if part:
                     chunks.append(part)
         return chunks
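For context, the corrected slice in this hunk implements plain fixed-width chunking over each page's text. A minimal standalone sketch of the same idea (the function name and default step value are illustrative, not from the repo):

from typing import List

def chunk_pages(pages: List[str], step: int = 800) -> List[str]:
    # Fixed-width windows over each page; whitespace-only slices are dropped,
    # mirroring the txt[i:i+step].strip() line added in this hunk.
    chunks: List[str] = []
    for txt in pages:
        for i in range(0, len(txt), step):
            part = txt[i:i + step].strip()
            if part:
                chunks.append(part)
    return chunks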
@@ -153,7 +153,7 @@ class SimpleRAG:
                 cache_dir=str(self.cache_dir),
                 device=-1,
             )
-            outs = self._translator(texts, max_length=
+            outs = self._translator(texts, max_length=800)
             return [o["translation_text"].strip() for o in outs]
         except Exception:
             return texts
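The max_length=800 cap bounds the length of each generated translation. A minimal sketch of how such a Hugging Face translation pipeline is typically built and called (the checkpoint name Helsinki-NLP/opus-mt-az-en is an assumption; the repo may pin a different model and pass cache_dir as SimpleRAG does):

from typing import List
from transformers import pipeline

# Assumed checkpoint for Azerbaijani-to-English; device=-1 runs on CPU.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-az-en", device=-1)

def translate_to_en(texts: List[str]) -> List[str]:
    try:
        outs = translator(texts, max_length=800)  # same cap as the new diff line
        return [o["translation_text"].strip() for o in outs]
    except Exception:
        return texts  # fall back to the originals, as the method does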
@@ -162,11 +162,23 @@ class SimpleRAG:
         if not contexts:
             return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-        # 1)
+        # 1) Clean & keep top contexts
+        cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
+        cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
+        if not cleaned_contexts:
+            return "The document appears largely tabular/numeric; couldn't extract readable sentences."
+
+        # 2) Pre-translate paragraphs to EN (if output language is EN)
+        if OUTPUT_LANG == "en":
+            try:
+                cleaned_contexts = self._translate_to_en(cleaned_contexts)
+            except Exception:
+                pass
+
+        # 3) Split into candidate sentences and filter
         candidates: List[str] = []
-        for
-
-            for s in _split_sentences(cleaned):
+        for para in cleaned_contexts:
+            for s in _split_sentences(para):
                 w = s.split()
                 if not (8 <= len(w) <= 35):
                     continue
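Steps 1-3 above clean the top contexts, translate them to English before sentence extraction, and keep only mid-length sentences as summary candidates. A rough standalone sketch of the splitting and filtering step, assuming a naive regex splitter as a stand-in for the repo's _split_sentences helper:

import re
from typing import List

def split_sentences(text: str) -> List[str]:
    # Naive stand-in for _split_sentences: break on sentence-ending punctuation.
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

def candidate_sentences(paragraphs: List[str], min_words: int = 8, max_words: int = 35) -> List[str]:
    # Mirrors step 3: keep only readable, mid-length sentences.
    out: List[str] = []
    for para in paragraphs:
        for s in split_sentences(para):
            if min_words <= len(s.split()) <= max_words:
                out.append(s)
    return out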
@@ -177,17 +189,17 @@
         if not candidates:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        #
+        # 4) Rank by similarity
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        #
+        # 5) Aggressive near-duplicate removal
         selected: List[str] = []
         for i in order:
             s = candidates[i].strip()
-            if any(_sim_jaccard(s, t) >= 0.
+            if any(_sim_jaccard(s, t) >= 0.90 for t in selected):
                 continue
             selected.append(s)
             if len(selected) >= max_sentences:
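Steps 4-5 rank candidates by cosine similarity to the question (the embeddings are already L2-normalized, so a dot product suffices) and skip any sentence whose token overlap with an already selected one reaches 0.90. A hedged sketch of the same selection loop; the jaccard helper and the default max_sentences value are illustrative stand-ins for the repo's _sim_jaccard and its caller:

from typing import List
import numpy as np

def jaccard(a: str, b: str) -> float:
    # Token-set Jaccard overlap; stand-in for the repo's _sim_jaccard helper.
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa & sb) / max(1, len(sa | sb))

def select_sentences(q_emb: np.ndarray, cand_emb: np.ndarray, candidates: List[str],
                     max_sentences: int = 5, dup_threshold: float = 0.90) -> List[str]:
    # Dot product of normalized embeddings equals cosine similarity.
    scores = (cand_emb @ q_emb.T).ravel()
    selected: List[str] = []
    for i in np.argsort(-scores):
        s = candidates[i].strip()
        if any(jaccard(s, t) >= dup_threshold for t in selected):
            continue  # near-duplicate of an already selected sentence
        selected.append(s)
        if len(selected) >= max_sentences:
            break
    return selected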
@@ -196,11 +208,6 @@
         if not selected:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 4) translate to EN if needed
-        if OUTPUT_LANG == "en":
-            if any(_looks_azerbaijani(s) for s in selected):
-                selected = self._translate_to_en(selected)
-
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 