Spaces:

decodingdatascience
/

Research-analyst-ADK

Running

App Files Files Community

Research-analyst-ADK / research_explainer /tools /research_context.py

decodingdatascience

Upload 15 files

8bd78d1 verified 4 days ago

Raw

History Blame Contribute Delete

7.54 kB

	"""
	Live research-context lookup tools.
	"""

	import re
	import xml.etree.ElementTree as ET

	import requests

	SEMANTIC_SCHOLAR_SEARCH_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
	ARXIV_SEARCH_URL = "https://export.arxiv.org/api/query"
	PAPER_FIELDS = "title,abstract,year,authors,url,citationCount,venue,fieldsOfStudy"


	def _clean_text(value: str \| None, max_length: int \| None = None) -> str:
	text = re.sub(r"\s+", " ", value or "").strip()
	if max_length and len(text) > max_length:
	return f"{text[: max_length - 3].rstrip()}..."
	return text


	def _build_research_query(concept: str, paper_context: str, domain: str) -> str:
	query_parts = [
	_clean_text(concept, 120),
	_clean_text(paper_context, 220),
	_clean_text(domain, 80),
	]
	return " ".join(part for part in query_parts if part)


	def _normalize_max_results(max_results: int) -> int:
	return max(1, min(int(max_results or 5), 10))


	def _semantic_scholar_papers(query: str, max_results: int) -> list[dict]:
	response = requests.get(
	SEMANTIC_SCHOLAR_SEARCH_URL,
	params={
	"query": query,
	"limit": max_results,
	"fields": PAPER_FIELDS,
	},
	timeout=10,
	)
	response.raise_for_status()
	data = response.json()

	papers: list[dict] = []
	for paper in data.get("data", []):
	title = _clean_text(paper.get("title"))
	if not title:
	continue

	papers.append(
	{
	"title": title,
	"year": paper.get("year"),
	"authors": [
	_clean_text(author.get("name"))
	for author in paper.get("authors", [])[:5]
	if author.get("name")
	],
	"venue": _clean_text(paper.get("venue")),
	"url": paper.get("url"),
	"citation_count": paper.get("citationCount"),
	"fields_of_study": paper.get("fieldsOfStudy") or [],
	"abstract": _clean_text(paper.get("abstract"), 700),
	"source": "Semantic Scholar",
	}
	)

	return papers


	def _arxiv_papers(query: str, max_results: int) -> list[dict]:
	response = requests.get(
	ARXIV_SEARCH_URL,
	params={
	"search_query": f"all:{query}",
	"start": 0,
	"max_results": max_results,
	"sortBy": "relevance",
	"sortOrder": "descending",
	},
	timeout=10,
	)
	response.raise_for_status()

	root = ET.fromstring(response.text)
	namespace = {"atom": "http://www.w3.org/2005/Atom"}
	papers: list[dict] = []

	for entry in root.findall("atom:entry", namespace):
	title = _clean_text(
	entry.findtext("atom:title", default="", namespaces=namespace)
	)
	if not title:
	continue

	authors = [
	_clean_text(author.findtext("atom:name", default="", namespaces=namespace))
	for author in entry.findall("atom:author", namespace)[:5]
	]
	papers.append(
	{
	"title": title,
	"year": (
	entry.findtext("atom:published", default="", namespaces=namespace)
	or ""
	)[:4],
	"authors": [author for author in authors if author],
	"venue": "arXiv",
	"url": entry.findtext("atom:id", default="", namespaces=namespace),
	"citation_count": None,
	"fields_of_study": [],
	"abstract": _clean_text(
	entry.findtext("atom:summary", default="", namespaces=namespace),
	700,
	),
	"source": "arXiv",
	}
	)

	return papers


	def _suggest_research_directions(concept: str, papers: list[dict]) -> list[str]:
	title_and_abstract = " ".join(
	f"{paper.get('title', '')} {paper.get('abstract', '')}" for paper in papers
	).lower()
	directions: list[str] = []

	keyword_directions = [
	(
	("efficient", "linear", "sparse", "compression"),
	f"More efficient versions of {concept}",
	),
	(
	("scaling", "large-scale", "foundation", "pretraining"),
	f"Scaling {concept} to larger models or datasets",
	),
	(
	("vision", "image", "multimodal", "video"),
	f"Using {concept} in vision or multimodal systems",
	),
	(
	("retrieval", "knowledge", "rag", "memory"),
	f"Combining {concept} with retrieval or external knowledge",
	),
	(
	("robust", "safety", "bias", "privacy"),
	f"Studying robustness, safety, or privacy around {concept}",
	),
	]

	for keywords, direction in keyword_directions:
	if any(keyword in title_and_abstract for keyword in keywords):
	directions.append(direction)

	if not directions:
	directions = [
	f"Foundational papers that introduced or popularized {concept}",
	f"Recent applications that adapt {concept} to new tasks",
	f"Limitations and follow-up methods that improve on {concept}",
	]

	return directions[:5]


	async def find_research_context(
	concept: str,
	paper_context: str,
	domain: str = "machine learning",
	max_results: int = 5,
	) -> dict:
	"""
	Finds external research context for a concept discussed in the uploaded paper.

	Use this when the user asks where a concept leads, what uses it, related work,
	follow-up reading, or how the idea connects to broader research.

	Args:
	concept (str): The concept or method to investigate.
	paper_context (str): Paper-specific context that makes the search precise.
	domain (str): The broader research domain, such as machine learning.
	max_results (int): Maximum number of papers to return, capped at 10.

	Returns:
	dict: Related papers, suggested directions, source metadata, or error details.
	"""
	concept = _clean_text(concept, 120)
	paper_context = _clean_text(paper_context, 500)
	domain = _clean_text(domain, 80) or "machine learning"
	max_results = _normalize_max_results(max_results)

	if not concept:
	return {
	"status": "failed",
	"detail": "Provide a non-empty concept to search for research context.",
	}

	query = _build_research_query(concept, paper_context, domain)
	errors: list[str] = []

	try:
	papers = _semantic_scholar_papers(query, max_results)
	source = "Semantic Scholar"
	except Exception as exc:
	papers = []
	source = "arXiv"
	errors.append(f"Semantic Scholar search failed: {exc}")

	if not papers:
	try:
	papers = _arxiv_papers(query, max_results)
	source = "arXiv"
	except Exception as exc:
	errors.append(f"arXiv search failed: {exc}")

	if not papers:
	return {
	"status": "failed",
	"query": query,
	"detail": "No related papers found from Semantic Scholar or arXiv.",
	"errors": errors,
	}

	return {
	"status": "success",
	"concept": concept,
	"query": query,
	"source": source,
	"suggested_directions": _suggest_research_directions(concept, papers),
	"papers": papers[:max_results],
	"errors": errors,
	}