Spaces:

ar07xd
/

deepshield

Running

App Files Community

ar07xd committed on 14 days ago

Commit

ed9f9c4

verified ·

1 Parent(s): f51c5bd

Sync from GitHub (Code Only)

Browse files

Files changed (2) hide show

services/news_lookup.py +23 -14
services/text_service.py +1 -12

services/news_lookup.py CHANGED Viewed

@@ -31,8 +31,8 @@ FACTCHECK_DOMAINS = {
     "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
 }
-# Domains eligible for truth-override (weight >= 0.9 per BUILD_PLAN spec)
-_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.9}
 # Thresholds per BUILD_PLAN §13.2
 _OVERRIDE_SIMILARITY_THRESHOLD = 0.6
@@ -67,21 +67,22 @@ def _page_size() -> int:
     return max(1, min(int(settings.NEWS_API_PAGE_SIZE or 10), 50))
 def _sanitize_keywords(keywords: List[str]) -> List[str]:
-    """Remove non-ASCII garbage from keywords.
     Filters out:
-    - Non-ASCII characters (e.g., Devanagari numerals '13२')
-    - Leaves ASCII alphanumeric and spaces intact
-    Returns: List of cleaned keywords
     """
     cleaned = []
     for kw in keywords:
-        # Remove non-ASCII characters, keep only ASCII printable
         ascii_only = ''.join(c for c in kw if ord(c) < 128 and c.isprintable())
-        if ascii_only.strip():  # Only add if something remains after cleaning
-            cleaned.append(ascii_only.strip())
     return cleaned
@@ -196,7 +197,7 @@ def _compute_truth_override(
         input_cmp = input_text[:512]
         input_terms = {
             t for t in input_cmp.lower().split()
-            if len(t.strip(".,!?;:()[]{}\"'")) >= 5
             for t in [t.strip(".,!?;:()[]{}\"'")]
         }
         all_texts = [input_cmp] + source_texts
@@ -219,7 +220,7 @@ def _compute_truth_override(
         best_terms = {
             t for t in f"{best_source.title} {best_source.description or ''}".lower().split()
-            if len(t.strip(".,!?;:()[]{}\"'")) >= 5
             for t in [t.strip(".,!?;:()[]{}\"'")]
         }
         lexical_overlap = len(input_terms & best_terms) / max(len(input_terms), 1)
@@ -362,8 +363,16 @@ async def search_news_full(
     if not cleaned_keywords:
         return NewsLookupResult([], [], 0)
-    q = " ".join(cleaned_keywords[:4])
-    logger.info(f"News lookup query (after sanitization): {q!r}")
     # Fix 1: Parallel India + Global search
     # Run both searches concurrently to catch both India-focused and global stories

     "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
 }
+# Domains eligible for truth-override (weight >= 0.8)
+_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.8}
 # Thresholds per BUILD_PLAN §13.2
 _OVERRIDE_SIMILARITY_THRESHOLD = 0.6
     return max(1, min(int(settings.NEWS_API_PAGE_SIZE or 10), 50))
+import re
 def _sanitize_keywords(keywords: List[str]) -> List[str]:
+    """Remove non-ASCII garbage and special characters from keywords.
     Filters out:
+    - Non-ASCII characters (e.g., Devanagari numerals)
+    - Special characters like colons, quotes that break the NewsData API
     """
     cleaned = []
     for kw in keywords:
         ascii_only = ''.join(c for c in kw if ord(c) < 128 and c.isprintable())
+        safe_kw = re.sub(r'[^A-Za-z0-9\s]', ' ', ascii_only)
+        safe_kw = " ".join(safe_kw.split())
+        if safe_kw.strip():
+            cleaned.append(safe_kw.strip())
     return cleaned
         input_cmp = input_text[:512]
         input_terms = {
             t for t in input_cmp.lower().split()
+            if len(t.strip(".,!?;:()[]{}\"'")) >= 4
             for t in [t.strip(".,!?;:()[]{}\"'")]
         }
         all_texts = [input_cmp] + source_texts
         best_terms = {
             t for t in f"{best_source.title} {best_source.description or ''}".lower().split()
+            if len(t.strip(".,!?;:()[]{}\"'")) >= 4
             for t in [t.strip(".,!?;:()[]{}\"'")]
         }
         lexical_overlap = len(input_terms & best_terms) / max(len(input_terms), 1)
     if not cleaned_keywords:
         return NewsLookupResult([], [], 0)
+    seen_words = set()
+    query_words = []
+    for kw in cleaned_keywords[:4]:
+        for word in kw.split():
+            wl = word.lower()
+            if wl not in seen_words:
+                seen_words.add(wl)
+                query_words.append(word)
+    q = " ".join(query_words[:8])
+    logger.info(f"News lookup query (after sanitization & deduplication): {q!r}")
     # Fix 1: Parallel India + Global search
     # Run both searches concurrently to catch both India-focused and global stories

services/text_service.py CHANGED Viewed

@@ -227,18 +227,7 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
         numeric: List[str] = []
-        # Extract meaningful multi-word noun chunks first
-        for chunk in doc.noun_chunks:
-            parts = chunk.text.strip().split()
-            if len(parts) > 1 and parts[0].lower() in {"a", "an", "the", "some", "several", "many", "these", "those", "this", "that", "their", "our", "my", "your", "its"}:
-                parts = parts[1:]
-            chunk_text = " ".join(parts)
-            if len(parts) > 1 and len(chunk_text) > 4:
-                if not all(p.lower() in {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her"} for p in parts):
-                    norm_lower = chunk_text.lower()
-                    if norm_lower not in seen:
-                        preferred.append(chunk_text)
-                        seen.add(norm_lower)
         for ent in doc.ents:
             norm = ent.text.strip()

         numeric: List[str] = []
         for ent in doc.ents:
             norm = ent.text.strip()