Spaces:
Running
Running
Sync from GitHub (Code Only)
Browse files- services/news_lookup.py +23 -14
- services/text_service.py +1 -12
services/news_lookup.py
CHANGED
|
@@ -31,8 +31,8 @@ FACTCHECK_DOMAINS = {
|
|
| 31 |
"factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
|
| 32 |
}
|
| 33 |
|
| 34 |
-
# Domains eligible for truth-override (weight >= 0.
|
| 35 |
-
_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.
|
| 36 |
|
| 37 |
# Thresholds per BUILD_PLAN §13.2
|
| 38 |
_OVERRIDE_SIMILARITY_THRESHOLD = 0.6
|
|
@@ -67,21 +67,22 @@ def _page_size() -> int:
|
|
| 67 |
return max(1, min(int(settings.NEWS_API_PAGE_SIZE or 10), 50))
|
| 68 |
|
| 69 |
|
|
|
|
|
|
|
| 70 |
def _sanitize_keywords(keywords: List[str]) -> List[str]:
|
| 71 |
-
"""Remove non-ASCII garbage from keywords.
|
| 72 |
|
| 73 |
Filters out:
|
| 74 |
-
- Non-ASCII characters (e.g., Devanagari numerals
|
| 75 |
-
-
|
| 76 |
-
|
| 77 |
-
Returns: List of cleaned keywords
|
| 78 |
"""
|
| 79 |
cleaned = []
|
| 80 |
for kw in keywords:
|
| 81 |
-
# Remove non-ASCII characters, keep only ASCII printable
|
| 82 |
ascii_only = ''.join(c for c in kw if ord(c) < 128 and c.isprintable())
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
return cleaned
|
| 86 |
|
| 87 |
|
|
@@ -196,7 +197,7 @@ def _compute_truth_override(
|
|
| 196 |
input_cmp = input_text[:512]
|
| 197 |
input_terms = {
|
| 198 |
t for t in input_cmp.lower().split()
|
| 199 |
-
if len(t.strip(".,!?;:()[]{}\"'")) >=
|
| 200 |
for t in [t.strip(".,!?;:()[]{}\"'")]
|
| 201 |
}
|
| 202 |
all_texts = [input_cmp] + source_texts
|
|
@@ -219,7 +220,7 @@ def _compute_truth_override(
|
|
| 219 |
|
| 220 |
best_terms = {
|
| 221 |
t for t in f"{best_source.title} {best_source.description or ''}".lower().split()
|
| 222 |
-
if len(t.strip(".,!?;:()[]{}\"'")) >=
|
| 223 |
for t in [t.strip(".,!?;:()[]{}\"'")]
|
| 224 |
}
|
| 225 |
lexical_overlap = len(input_terms & best_terms) / max(len(input_terms), 1)
|
|
@@ -362,8 +363,16 @@ async def search_news_full(
|
|
| 362 |
if not cleaned_keywords:
|
| 363 |
return NewsLookupResult([], [], 0)
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
# Fix 1: Parallel India + Global search
|
| 369 |
# Run both searches concurrently to catch both India-focused and global stories
|
|
|
|
| 31 |
"factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
|
| 32 |
}
|
| 33 |
|
| 34 |
+
# Domains eligible for truth-override (weight >= 0.8)
|
| 35 |
+
_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.8}
|
| 36 |
|
| 37 |
# Thresholds per BUILD_PLAN §13.2
|
| 38 |
_OVERRIDE_SIMILARITY_THRESHOLD = 0.6
|
|
|
|
| 67 |
return max(1, min(int(settings.NEWS_API_PAGE_SIZE or 10), 50))
|
| 68 |
|
| 69 |
|
| 70 |
+
import re
|
| 71 |
+
|
| 72 |
def _sanitize_keywords(keywords: List[str]) -> List[str]:
|
| 73 |
+
"""Remove non-ASCII garbage and special characters from keywords.
|
| 74 |
|
| 75 |
Filters out:
|
| 76 |
+
- Non-ASCII characters (e.g., Devanagari numerals)
|
| 77 |
+
- Special characters like colons, quotes that break the NewsData API
|
|
|
|
|
|
|
| 78 |
"""
|
| 79 |
cleaned = []
|
| 80 |
for kw in keywords:
|
|
|
|
| 81 |
ascii_only = ''.join(c for c in kw if ord(c) < 128 and c.isprintable())
|
| 82 |
+
safe_kw = re.sub(r'[^A-Za-z0-9\s]', ' ', ascii_only)
|
| 83 |
+
safe_kw = " ".join(safe_kw.split())
|
| 84 |
+
if safe_kw.strip():
|
| 85 |
+
cleaned.append(safe_kw.strip())
|
| 86 |
return cleaned
|
| 87 |
|
| 88 |
|
|
|
|
| 197 |
input_cmp = input_text[:512]
|
| 198 |
input_terms = {
|
| 199 |
t for t in input_cmp.lower().split()
|
| 200 |
+
if len(t.strip(".,!?;:()[]{}\"'")) >= 4
|
| 201 |
for t in [t.strip(".,!?;:()[]{}\"'")]
|
| 202 |
}
|
| 203 |
all_texts = [input_cmp] + source_texts
|
|
|
|
| 220 |
|
| 221 |
best_terms = {
|
| 222 |
t for t in f"{best_source.title} {best_source.description or ''}".lower().split()
|
| 223 |
+
if len(t.strip(".,!?;:()[]{}\"'")) >= 4
|
| 224 |
for t in [t.strip(".,!?;:()[]{}\"'")]
|
| 225 |
}
|
| 226 |
lexical_overlap = len(input_terms & best_terms) / max(len(input_terms), 1)
|
|
|
|
| 363 |
if not cleaned_keywords:
|
| 364 |
return NewsLookupResult([], [], 0)
|
| 365 |
|
| 366 |
+
seen_words = set()
|
| 367 |
+
query_words = []
|
| 368 |
+
for kw in cleaned_keywords[:4]:
|
| 369 |
+
for word in kw.split():
|
| 370 |
+
wl = word.lower()
|
| 371 |
+
if wl not in seen_words:
|
| 372 |
+
seen_words.add(wl)
|
| 373 |
+
query_words.append(word)
|
| 374 |
+
q = " ".join(query_words[:8])
|
| 375 |
+
logger.info(f"News lookup query (after sanitization & deduplication): {q!r}")
|
| 376 |
|
| 377 |
# Fix 1: Parallel India + Global search
|
| 378 |
# Run both searches concurrently to catch both India-focused and global stories
|
services/text_service.py
CHANGED
|
@@ -227,18 +227,7 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
|
|
| 227 |
|
| 228 |
numeric: List[str] = []
|
| 229 |
|
| 230 |
-
|
| 231 |
-
for chunk in doc.noun_chunks:
|
| 232 |
-
parts = chunk.text.strip().split()
|
| 233 |
-
if len(parts) > 1 and parts[0].lower() in {"a", "an", "the", "some", "several", "many", "these", "those", "this", "that", "their", "our", "my", "your", "its"}:
|
| 234 |
-
parts = parts[1:]
|
| 235 |
-
chunk_text = " ".join(parts)
|
| 236 |
-
if len(parts) > 1 and len(chunk_text) > 4:
|
| 237 |
-
if not all(p.lower() in {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her"} for p in parts):
|
| 238 |
-
norm_lower = chunk_text.lower()
|
| 239 |
-
if norm_lower not in seen:
|
| 240 |
-
preferred.append(chunk_text)
|
| 241 |
-
seen.add(norm_lower)
|
| 242 |
|
| 243 |
for ent in doc.ents:
|
| 244 |
norm = ent.text.strip()
|
|
|
|
| 227 |
|
| 228 |
numeric: List[str] = []
|
| 229 |
|
| 230 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
for ent in doc.ents:
|
| 233 |
norm = ent.text.strip()
|