Spaces:
Sleeping
Sleeping
Commit
·
f06409c
1
Parent(s):
a46e32d
Force EN translation + strong dedup filtering
Browse files- app/rag_system.py +26 -15
app/rag_system.py
CHANGED
|
@@ -153,47 +153,58 @@ class SimpleRAG:
|
|
| 153 |
out.append((self.chunks[idx], float(score)))
|
| 154 |
return out
|
| 155 |
|
| 156 |
-
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int =
|
| 157 |
if not contexts:
|
| 158 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
| 159 |
|
| 160 |
-
# Candidate
|
| 161 |
-
|
| 162 |
for c in contexts[:5]:
|
| 163 |
-
|
| 164 |
for s in _split_sentences(cleaned):
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
if not candidates:
|
| 169 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
| 170 |
|
| 171 |
-
#
|
| 172 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
| 173 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
| 174 |
scores = (cand_emb @ q_emb.T).ravel()
|
| 175 |
order = np.argsort(-scores)
|
| 176 |
|
| 177 |
-
#
|
| 178 |
selected: List[str] = []
|
| 179 |
-
seen = set()
|
| 180 |
for i in order:
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
if key in seen:
|
| 184 |
continue
|
| 185 |
-
seen.add(key)
|
| 186 |
selected.append(s)
|
| 187 |
if len(selected) >= max_sentences:
|
| 188 |
break
|
| 189 |
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
| 192 |
selected = self._translate_to_en(selected)
|
| 193 |
|
| 194 |
bullets = "\n".join(f"- {s}" for s in selected)
|
| 195 |
return f"Answer (based on document context):\n{bullets}"
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
def synthesize_answer(question: str, contexts: List[str]) -> str:
    """Module-level convenience wrapper around ``SimpleRAG.synthesize_answer``.

    Args:
        question: The user question the answer should address.
        contexts: Retrieved context passages to draw sentences from.

    Returns:
        The answer string produced by ``SimpleRAG.synthesize_answer`` using
        that method's default ``max_sentences``.
    """
    # NOTE(review): a fresh SimpleRAG is constructed on every call; if its
    # constructor loads a model this may be expensive -- confirm with callers.
    return SimpleRAG().synthesize_answer(question, contexts)
|
|
|
|
| 153 |
out.append((self.chunks[idx], float(score)))
|
| 154 |
return out
|
| 155 |
|
| 156 |
+
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
    """Build a short bulleted answer from the context sentences closest to the question.

    Pipeline:
      1. Split the first five contexts into sentences and keep only those
         that pass the length and quality filters.
      2. Rank the surviving sentences by embedding similarity to the question.
      3. Walk the ranking, skipping sentences that are near-duplicates
         (word-set Jaccard >= 0.82) of an already selected one.
      4. Translate the selection to English unless the ``OUTPUT_LANG``
         environment variable is set to something other than ``"en"``.

    Args:
        question: The user question used to rank candidate sentences.
        contexts: Retrieved context passages; only the first five are used.
        max_sentences: Upper bound on the number of bullets. The limit is
            checked after a sentence is appended, so at least one sentence
            is returned whenever any candidate exists.

    Returns:
        A string starting with ``"Answer (based on document context):"``
        followed by one ``- `` bullet per selected sentence, or a fixed
        explanatory message when there are no contexts or no usable
        sentences.
    """
    if not contexts:
        return "No relevant context found. Please upload a PDF or ask a more specific question."

    # 1) Candidate sentences (aggressive clean).
    candidates: List[str] = []
    for c in contexts[:5]:
        cleaned = _clean_for_summary(c)
        for s in _split_sentences(cleaned):
            # Length and quality filters.
            w = s.split()
            if not (8 <= len(w) <= 35):
                continue
            if _tabular_like(s) or _mostly_numeric(s):
                continue
            # Normalization: collapse every whitespace run to one space.
            # The result has no leading/trailing whitespace either.
            candidates.append(" ".join(w))

    if not candidates:
        return "The document appears largely tabular/numeric; couldn't extract readable sentences."

    # 2) Rank by similarity to the question. Embeddings are requested
    # normalized, so the dot product is intended to act as cosine similarity.
    q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores = (cand_emb @ q_emb.T).ravel()
    order = np.argsort(-scores)  # indices sorted by descending score

    # 3) Near-duplicate dedup (Jaccard over word sets), threshold 0.82.
    # The top-ranked candidate is always accepted because nothing has been
    # selected yet, so `selected` is never empty after this loop.
    selected: List[str] = []
    for i in order:
        s = candidates[i]
        if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
            continue
        selected.append(s)
        if len(selected) >= max_sentences:
            break

    # 4) Translate to English by default; setting OUTPUT_LANG to anything
    # other than "en" (case-insensitive) skips the translation step.
    if os.getenv("OUTPUT_LANG", "en").lower() == "en":
        selected = self._translate_to_en(selected)

    bullets = "\n".join(f"- {s}" for s in selected)
    return f"Answer (based on document context):\n{bullets}"
|
| 201 |
|
| 202 |
+
def _sim_jaccard(a: str, b: str) -> float:
|
| 203 |
+
aw = set(a.lower().split())
|
| 204 |
+
bw = set(b.lower().split())
|
| 205 |
+
if not aw or not bw:
|
| 206 |
+
return 0.0
|
| 207 |
+
return len(aw & bw) / len(aw | bw)
|
| 208 |
|
| 209 |
def synthesize_answer(question: str, contexts: List[str]) -> str:
    """Module-level convenience wrapper around ``SimpleRAG.synthesize_answer``.

    Args:
        question: The user question the answer should address.
        contexts: Retrieved context passages to draw sentences from.

    Returns:
        The answer string produced by ``SimpleRAG.synthesize_answer`` using
        that method's default ``max_sentences``.
    """
    # NOTE(review): a fresh SimpleRAG is constructed on every call; if its
    # constructor loads a model this may be expensive -- confirm with callers.
    return SimpleRAG().synthesize_answer(question, contexts)
|