ar07xd committed on
Commit
ed9f9c4
·
verified ·
1 Parent(s): f51c5bd

Sync from GitHub (Code Only)

Browse files
Files changed (2) hide show
  1. services/news_lookup.py +23 -14
  2. services/text_service.py +1 -12
services/news_lookup.py CHANGED
@@ -31,8 +31,8 @@ FACTCHECK_DOMAINS = {
31
  "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
32
  }
33
 
34
- # Domains eligible for truth-override (weight >= 0.9 per BUILD_PLAN spec)
35
- _HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.9}
36
 
37
  # Thresholds per BUILD_PLAN §13.2
38
  _OVERRIDE_SIMILARITY_THRESHOLD = 0.6
@@ -67,21 +67,22 @@ def _page_size() -> int:
67
  return max(1, min(int(settings.NEWS_API_PAGE_SIZE or 10), 50))
68
 
69
 
 
 
70
  def _sanitize_keywords(keywords: List[str]) -> List[str]:
71
- """Remove non-ASCII garbage from keywords.
72
 
73
  Filters out:
74
- - Non-ASCII characters (e.g., Devanagari numerals '13२')
75
- - Leaves ASCII alphanumeric and spaces intact
76
-
77
- Returns: List of cleaned keywords
78
  """
79
  cleaned = []
80
  for kw in keywords:
81
- # Remove non-ASCII characters, keep only ASCII printable
82
  ascii_only = ''.join(c for c in kw if ord(c) < 128 and c.isprintable())
83
- if ascii_only.strip(): # Only add if something remains after cleaning
84
- cleaned.append(ascii_only.strip())
 
 
85
  return cleaned
86
 
87
 
@@ -196,7 +197,7 @@ def _compute_truth_override(
196
  input_cmp = input_text[:512]
197
  input_terms = {
198
  t for t in input_cmp.lower().split()
199
- if len(t.strip(".,!?;:()[]{}\"'")) >= 5
200
  for t in [t.strip(".,!?;:()[]{}\"'")]
201
  }
202
  all_texts = [input_cmp] + source_texts
@@ -219,7 +220,7 @@ def _compute_truth_override(
219
 
220
  best_terms = {
221
  t for t in f"{best_source.title} {best_source.description or ''}".lower().split()
222
- if len(t.strip(".,!?;:()[]{}\"'")) >= 5
223
  for t in [t.strip(".,!?;:()[]{}\"'")]
224
  }
225
  lexical_overlap = len(input_terms & best_terms) / max(len(input_terms), 1)
@@ -362,8 +363,16 @@ async def search_news_full(
362
  if not cleaned_keywords:
363
  return NewsLookupResult([], [], 0)
364
 
365
- q = " ".join(cleaned_keywords[:4])
366
- logger.info(f"News lookup query (after sanitization): {q!r}")
 
 
 
 
 
 
 
 
367
 
368
  # Fix 1: Parallel India + Global search
369
  # Run both searches concurrently to catch both India-focused and global stories
 
31
  "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
32
  }
33
 
34
+ # Domains eligible for truth-override (weight >= 0.8)
35
+ _HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.8}
36
 
37
  # Thresholds per BUILD_PLAN §13.2
38
  _OVERRIDE_SIMILARITY_THRESHOLD = 0.6
 
67
  return max(1, min(int(settings.NEWS_API_PAGE_SIZE or 10), 50))
68
 
69
 
70
+ import re
71
+
72
  def _sanitize_keywords(keywords: List[str]) -> List[str]:
73
+ """Remove non-ASCII garbage and special characters from keywords.
74
 
75
  Filters out:
76
+ - Non-ASCII characters (e.g., Devanagari numerals)
77
+ - Special characters like colons, quotes that break the NewsData API
 
 
78
  """
79
  cleaned = []
80
  for kw in keywords:
 
81
  ascii_only = ''.join(c for c in kw if ord(c) < 128 and c.isprintable())
82
+ safe_kw = re.sub(r'[^A-Za-z0-9\s]', ' ', ascii_only)
83
+ safe_kw = " ".join(safe_kw.split())
84
+ if safe_kw.strip():
85
+ cleaned.append(safe_kw.strip())
86
  return cleaned
87
 
88
 
 
197
  input_cmp = input_text[:512]
198
  input_terms = {
199
  t for t in input_cmp.lower().split()
200
+ if len(t.strip(".,!?;:()[]{}\"'")) >= 4
201
  for t in [t.strip(".,!?;:()[]{}\"'")]
202
  }
203
  all_texts = [input_cmp] + source_texts
 
220
 
221
  best_terms = {
222
  t for t in f"{best_source.title} {best_source.description or ''}".lower().split()
223
+ if len(t.strip(".,!?;:()[]{}\"'")) >= 4
224
  for t in [t.strip(".,!?;:()[]{}\"'")]
225
  }
226
  lexical_overlap = len(input_terms & best_terms) / max(len(input_terms), 1)
 
363
  if not cleaned_keywords:
364
  return NewsLookupResult([], [], 0)
365
 
366
+ seen_words = set()
367
+ query_words = []
368
+ for kw in cleaned_keywords[:4]:
369
+ for word in kw.split():
370
+ wl = word.lower()
371
+ if wl not in seen_words:
372
+ seen_words.add(wl)
373
+ query_words.append(word)
374
+ q = " ".join(query_words[:8])
375
+ logger.info(f"News lookup query (after sanitization & deduplication): {q!r}")
376
 
377
  # Fix 1: Parallel India + Global search
378
  # Run both searches concurrently to catch both India-focused and global stories
services/text_service.py CHANGED
@@ -227,18 +227,7 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
227
 
228
  numeric: List[str] = []
229
 
230
- # Extract meaningful multi-word noun chunks first
231
- for chunk in doc.noun_chunks:
232
- parts = chunk.text.strip().split()
233
- if len(parts) > 1 and parts[0].lower() in {"a", "an", "the", "some", "several", "many", "these", "those", "this", "that", "their", "our", "my", "your", "its"}:
234
- parts = parts[1:]
235
- chunk_text = " ".join(parts)
236
- if len(parts) > 1 and len(chunk_text) > 4:
237
- if not all(p.lower() in {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her"} for p in parts):
238
- norm_lower = chunk_text.lower()
239
- if norm_lower not in seen:
240
- preferred.append(chunk_text)
241
- seen.add(norm_lower)
242
 
243
  for ent in doc.ents:
244
  norm = ent.text.strip()
 
227
 
228
  numeric: List[str] = []
229
 
230
+
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  for ent in doc.ents:
233
  norm = ent.text.strip()