Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /external_research.py
| """External research loop for source-grounded answers. | |
| This is a practical retrieval policy, not a claim of autonomous perfect RL. | |
| It searches the live web, fetches candidate pages, hashes evidence, and returns | |
| only source-backed snippets that downstream answer guards may use. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from html.parser import HTMLParser | |
| import hashlib | |
| import json | |
| from pathlib import Path | |
| import re | |
| from typing import Iterable | |
| from urllib.parse import quote_plus, unquote, urlparse, parse_qs | |
| import httpx | |
# Tokenizer used for query/page term overlap: one or more word characters
# (``\w``) or code points from the Thai Unicode block (U+0E00-U+0E7F).
# NOTE(review): the explicit Thai range presumably keeps Thai combining marks,
# which ``\w`` alone does not match, inside a single token - confirm intent.
TOKEN_RE = re.compile(
    r"[\w\u0E00-\u0E7F]+",
    flags=re.UNICODE,
)
| def _sha256(text: str) -> str: | |
| return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() | |
def _terms(text: str) -> set[str]:
    """Return the distinct lowercased tokens of *text*.

    Tokens are the matches of ``TOKEN_RE``; matches shorter than two
    characters are dropped before lowercasing.
    """
    found: set[str] = set()
    for match in TOKEN_RE.finditer(text):
        word = match.group(0)
        if len(word) < 2:
            continue
        found.add(word.lower())
    return found
| class _TextExtractor(HTMLParser): | |
| def __init__(self): | |
| super().__init__() | |
| self.parts: list[str] = [] | |
| self.links: list[str] = [] | |
| self._skip = False | |
| def handle_starttag(self, tag, attrs): | |
| if tag in {"script", "style", "noscript"}: | |
| self._skip = True | |
| if tag == "a": | |
| attrs_dict = dict(attrs) | |
| href = attrs_dict.get("href") | |
| if href: | |
| self.links.append(href) | |
| def handle_endtag(self, tag): | |
| if tag in {"script", "style", "noscript"}: | |
| self._skip = False | |
| def handle_data(self, data): | |
| if not self._skip and data.strip(): | |
| self.parts.append(data.strip()) | |
| def text(self) -> str: | |
| return re.sub(r"\s+", " ", " ".join(self.parts)).strip() | |
@dataclass
class ResearchSource:
    """One fetched page that shares at least one term with the query.

    The ``@dataclass`` decorator supplies the keyword constructor that
    ``ExternalResearcher.fetch_source`` relies on. ``slots`` is deliberately
    not enabled because the report builder serialises instances through
    their ``__dict__``.
    """

    # Address the content was requested from.
    url: str
    # Short label for the page; the fetcher fills it with the start of ``text``.
    title: str
    # Extracted (and length-capped) page text used as evidence.
    text: str
    # Hex SHA-256 digest of ``text``.
    sha256: str
    # Sorted query terms that also occur in ``text``.
    matched_terms: list[str]
    # Fraction of query terms found in ``text`` (greater than 0, at most 1).
    score: float
class ExternalResearcher:
    """Find web pages for a query and keep those sharing terms with it.

    The workflow is: query DuckDuckGo's HTML endpoint, fetch each result,
    reduce it to text, hash that text and score it by query-term overlap.
    ``research`` ties the steps together and writes a JSON report.

    The instance can be used as a context manager; on exit it closes the
    HTTP client if, and only if, the instance created that client itself.
    """

    # Upper bound on the characters of page text that are kept, hashed and scored.
    MAX_TEXT_CHARS = 12000
    # Number of leading text characters used as a stand-in title.
    TITLE_CHARS = 90

    def __init__(self, client: httpx.Client | None = None, timeout_s: float = 15.0):
        """Store *client*, or build a default one when none is given.

        Args:
            client: Optional pre-configured ``httpx.Client``. It stays owned
                by the caller and is never closed by this class.
            timeout_s: Timeout in seconds for the default client; ignored
                when *client* is supplied.
        """
        # Remember ownership so close() never shuts down a caller's client.
        self._owns_client = client is None
        if client is None:
            client = httpx.Client(
                timeout=timeout_s,
                follow_redirects=True,
                headers={"User-Agent": "TinyMindResearch/1.0 source-grounded evaluation"},
            )
        self.client = client

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        # getattr keeps this safe on instances built without __init__.
        if getattr(self, "_owns_client", False):
            self.client.close()

    def __enter__(self) -> "ExternalResearcher":
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()

    def search_urls(self, query: str, max_results: int = 5) -> list[str]:
        """Return up to *max_results* distinct result URLs for *query*.

        Args:
            query: Free-text search query.
            max_results: Maximum number of URLs to return; values of zero or
                less yield an empty list without touching the network.

        Returns:
            Distinct absolute URLs in the order they appear on the result
            page, excluding anything that mentions ``duckduckgo.com``.

        Raises:
            httpx.HTTPStatusError: If the search request returns an error status.
        """
        if max_results <= 0:
            return []
        search_url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
        resp = self.client.get(search_url)
        resp.raise_for_status()
        parser = _TextExtractor()
        parser.feed(resp.text)
        # A dict preserves insertion order and de-duplicates while collecting,
        # so repeated links do not eat into the max_results budget.
        unique: dict[str, None] = {}
        for href in parser.links:
            try:
                parsed = urlparse(href)
            except ValueError:
                # Malformed href (e.g. broken IPv6 literal): skip it rather
                # than abort the whole search.
                continue
            if parsed.path == "/l/":
                # Redirect link: the real target is in the "uddg" parameter.
                # parse_qs already percent-decodes it; decoding a second time
                # would corrupt targets containing encoded characters.
                href = parse_qs(parsed.query).get("uddg", [""])[0]
            if href.startswith("http") and "duckduckgo.com" not in href:
                unique.setdefault(href, None)
                if len(unique) >= max_results:
                    break
        return list(unique)

    def fetch_source(self, url: str, query: str) -> ResearchSource | None:
        """Fetch *url* and return it as a source if it overlaps with *query*.

        Args:
            url: Page to download.
            query: Query whose terms are matched against the page text.

        Returns:
            A ``ResearchSource`` whose score is the fraction of query terms
            found in the page text, or ``None`` when the request fails or no
            query term occurs in the text.
        """
        try:
            resp = self.client.get(url)
            resp.raise_for_status()
        except Exception:
            # Deliberate best-effort: one unreachable or failing page must
            # not abort the research loop, so any fetch error means "no source".
            return None
        body = resp.text
        # Header values are case-insensitive, so normalise before matching.
        content_type = resp.headers.get("content-type", "").lower()
        if "text/html" in content_type or "<html" in body[:500].lower():
            parser = _TextExtractor()
            parser.feed(body)
            text = parser.text()
        else:
            text = body
        text = text[: self.MAX_TEXT_CHARS]
        q_terms = _terms(query)
        matched = sorted(q_terms & _terms(text))
        if not matched:
            return None
        # matched is a non-empty subset of q_terms, so the divisor is >= 1.
        score = len(matched) / len(q_terms)
        return ResearchSource(
            url=url,
            title=text[: self.TITLE_CHARS],
            text=text,
            sha256=_sha256(text),
            matched_terms=matched,
            score=score,
        )

    def research(self, query: str, out_dir: str | Path, max_results: int = 5) -> dict:
        """Search, fetch and score sources for *query*, then write a report.

        Args:
            query: Free-text question to research.
            out_dir: Directory for the report; created if it does not exist.
            max_results: Maximum number of search results to fetch.

        Returns:
            The report dictionary, which is also written to
            ``external_research_report.json`` inside *out_dir*. Sources are
            ordered by descending score.

        Raises:
            httpx.HTTPStatusError: If the search request returns an error status.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        urls = self.search_urls(query, max_results=max_results)
        fetched = (self.fetch_source(url, query) for url in urls)
        sources = [source for source in fetched if source is not None]
        sources.sort(key=lambda src: src.score, reverse=True)
        report = {
            "schema_version": "tinymind-external-research-v1",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "query": query,
            "policy": "search_live_web_fetch_hash_verify_then_answer_only_from_sources",
            "deep_rl_policy_proxy": {
                "state": "missing_or_insufficient_internal_evidence",
                "actions": ["search", "fetch", "hash", "score_overlap", "answer_or_refuse"],
                "reward": "maximize grounded evidence coverage and minimize unsupported claims",
            },
            "searched_urls": urls,
            "sources": [vars(source) for source in sources],
            "source_count": len(sources),
        }
        path = out / "external_research_report.json"
        report["report_path"] = str(path)
        path.write_text(
            json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        return report
Xet Storage Details
- Size: 5.18 kB
- Xet hash: 3a5444c75721f674bafca4a0d4021d63ede6a03acefca991183c4b22f8e3e6d3
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.