| """ |
| Live research-context lookup tools. |
| """ |
|
|
| import re |
| import xml.etree.ElementTree as ET |
|
|
| import requests |
|
|
| SEMANTIC_SCHOLAR_SEARCH_URL = "https://api.semanticscholar.org/graph/v1/paper/search" |
| ARXIV_SEARCH_URL = "https://export.arxiv.org/api/query" |
| PAPER_FIELDS = "title,abstract,year,authors,url,citationCount,venue,fieldsOfStudy" |
|
|
|
|
| def _clean_text(value: str | None, max_length: int | None = None) -> str: |
| text = re.sub(r"\s+", " ", value or "").strip() |
| if max_length and len(text) > max_length: |
| return f"{text[: max_length - 3].rstrip()}..." |
| return text |
|
|
|
|
| def _build_research_query(concept: str, paper_context: str, domain: str) -> str: |
| query_parts = [ |
| _clean_text(concept, 120), |
| _clean_text(paper_context, 220), |
| _clean_text(domain, 80), |
| ] |
| return " ".join(part for part in query_parts if part) |
|
|
|
|
| def _normalize_max_results(max_results: int) -> int: |
| return max(1, min(int(max_results or 5), 10)) |
|
|
|
|
| def _semantic_scholar_papers(query: str, max_results: int) -> list[dict]: |
| response = requests.get( |
| SEMANTIC_SCHOLAR_SEARCH_URL, |
| params={ |
| "query": query, |
| "limit": max_results, |
| "fields": PAPER_FIELDS, |
| }, |
| timeout=10, |
| ) |
| response.raise_for_status() |
| data = response.json() |
|
|
| papers: list[dict] = [] |
| for paper in data.get("data", []): |
| title = _clean_text(paper.get("title")) |
| if not title: |
| continue |
|
|
| papers.append( |
| { |
| "title": title, |
| "year": paper.get("year"), |
| "authors": [ |
| _clean_text(author.get("name")) |
| for author in paper.get("authors", [])[:5] |
| if author.get("name") |
| ], |
| "venue": _clean_text(paper.get("venue")), |
| "url": paper.get("url"), |
| "citation_count": paper.get("citationCount"), |
| "fields_of_study": paper.get("fieldsOfStudy") or [], |
| "abstract": _clean_text(paper.get("abstract"), 700), |
| "source": "Semantic Scholar", |
| } |
| ) |
|
|
| return papers |
|
|
|
|
| def _arxiv_papers(query: str, max_results: int) -> list[dict]: |
| response = requests.get( |
| ARXIV_SEARCH_URL, |
| params={ |
| "search_query": f"all:{query}", |
| "start": 0, |
| "max_results": max_results, |
| "sortBy": "relevance", |
| "sortOrder": "descending", |
| }, |
| timeout=10, |
| ) |
| response.raise_for_status() |
|
|
| root = ET.fromstring(response.text) |
| namespace = {"atom": "http://www.w3.org/2005/Atom"} |
| papers: list[dict] = [] |
|
|
| for entry in root.findall("atom:entry", namespace): |
| title = _clean_text( |
| entry.findtext("atom:title", default="", namespaces=namespace) |
| ) |
| if not title: |
| continue |
|
|
| authors = [ |
| _clean_text(author.findtext("atom:name", default="", namespaces=namespace)) |
| for author in entry.findall("atom:author", namespace)[:5] |
| ] |
| papers.append( |
| { |
| "title": title, |
| "year": ( |
| entry.findtext("atom:published", default="", namespaces=namespace) |
| or "" |
| )[:4], |
| "authors": [author for author in authors if author], |
| "venue": "arXiv", |
| "url": entry.findtext("atom:id", default="", namespaces=namespace), |
| "citation_count": None, |
| "fields_of_study": [], |
| "abstract": _clean_text( |
| entry.findtext("atom:summary", default="", namespaces=namespace), |
| 700, |
| ), |
| "source": "arXiv", |
| } |
| ) |
|
|
| return papers |
|
|
|
|
| def _suggest_research_directions(concept: str, papers: list[dict]) -> list[str]: |
| title_and_abstract = " ".join( |
| f"{paper.get('title', '')} {paper.get('abstract', '')}" for paper in papers |
| ).lower() |
| directions: list[str] = [] |
|
|
| keyword_directions = [ |
| ( |
| ("efficient", "linear", "sparse", "compression"), |
| f"More efficient versions of {concept}", |
| ), |
| ( |
| ("scaling", "large-scale", "foundation", "pretraining"), |
| f"Scaling {concept} to larger models or datasets", |
| ), |
| ( |
| ("vision", "image", "multimodal", "video"), |
| f"Using {concept} in vision or multimodal systems", |
| ), |
| ( |
| ("retrieval", "knowledge", "rag", "memory"), |
| f"Combining {concept} with retrieval or external knowledge", |
| ), |
| ( |
| ("robust", "safety", "bias", "privacy"), |
| f"Studying robustness, safety, or privacy around {concept}", |
| ), |
| ] |
|
|
| for keywords, direction in keyword_directions: |
| if any(keyword in title_and_abstract for keyword in keywords): |
| directions.append(direction) |
|
|
| if not directions: |
| directions = [ |
| f"Foundational papers that introduced or popularized {concept}", |
| f"Recent applications that adapt {concept} to new tasks", |
| f"Limitations and follow-up methods that improve on {concept}", |
| ] |
|
|
| return directions[:5] |
|
|
|
|
| async def find_research_context( |
| concept: str, |
| paper_context: str, |
| domain: str = "machine learning", |
| max_results: int = 5, |
| ) -> dict: |
| """ |
| Finds external research context for a concept discussed in the uploaded paper. |
| |
| Use this when the user asks where a concept leads, what uses it, related work, |
| follow-up reading, or how the idea connects to broader research. |
| |
| Args: |
| concept (str): The concept or method to investigate. |
| paper_context (str): Paper-specific context that makes the search precise. |
| domain (str): The broader research domain, such as machine learning. |
| max_results (int): Maximum number of papers to return, capped at 10. |
| |
| Returns: |
| dict: Related papers, suggested directions, source metadata, or error details. |
| """ |
| concept = _clean_text(concept, 120) |
| paper_context = _clean_text(paper_context, 500) |
| domain = _clean_text(domain, 80) or "machine learning" |
| max_results = _normalize_max_results(max_results) |
|
|
| if not concept: |
| return { |
| "status": "failed", |
| "detail": "Provide a non-empty concept to search for research context.", |
| } |
|
|
| query = _build_research_query(concept, paper_context, domain) |
| errors: list[str] = [] |
|
|
| try: |
| papers = _semantic_scholar_papers(query, max_results) |
| source = "Semantic Scholar" |
| except Exception as exc: |
| papers = [] |
| source = "arXiv" |
| errors.append(f"Semantic Scholar search failed: {exc}") |
|
|
| if not papers: |
| try: |
| papers = _arxiv_papers(query, max_results) |
| source = "arXiv" |
| except Exception as exc: |
| errors.append(f"arXiv search failed: {exc}") |
|
|
| if not papers: |
| return { |
| "status": "failed", |
| "query": query, |
| "detail": "No related papers found from Semantic Scholar or arXiv.", |
| "errors": errors, |
| } |
|
|
| return { |
| "status": "success", |
| "concept": concept, |
| "query": query, |
| "source": source, |
| "suggested_directions": _suggest_research_directions(concept, papers), |
| "papers": papers[:max_results], |
| "errors": errors, |
| } |
|
|