Spaces:

Elfsong
/

Daily_Paper_Reader

Running

elfsong

Add .gitignore, update requirements, and implement Hugging Face paper crawler and reader

ea972e7 1 day ago

5.97 kB

	"""
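	HF Paper Reader — use Gemini to generate two kinds of summary for each paper (single call, JSON output)
	Usage:
	    python reader.py                                              # read the newest hf_papers_*.json by default
	    python reader.py --input hf_papers_2026-03-10.json            # read a specific input file
	    python reader.py --input hf_papers_2026-03-10.json --top 10   # only process the first 10 papers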
	"""

	import argparse
	import json
	import os
	import time
	from pathlib import Path

	from dotenv import load_dotenv
	from google import genai

	# Load environment variables from the .env file that sits next to this script.
	load_dotenv(Path(__file__).with_name(".env"))

	# Fail at import time when the key is absent: the client below is built from it.
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
	if not GEMINI_API_KEY:
	    raise RuntimeError("GEMINI_API_KEY not found in .env")

	# Module-level Gemini client shared by every request made from this file.
	client = genai.Client(api_key=GEMINI_API_KEY)

	# System instruction passed to Gemini by summarize_paper. It asks for a JSON
	# object with the keys "concise_summary" and "detailed_analysis". A trailing
	# backslash joins a physical line with the next one, so no newline is emitted
	# at those positions.
	SYSTEM_PROMPT = """\
	You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \
	with exactly two keys:

	1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \
	and WHY it matters. Avoid jargon; end with the key result or takeaway.

	2. "detailed_analysis": A longer analysis with your own understanding, structured as:
	- "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \
	this paper addresses. What prior work fell short, and why is this research needed now?
	- "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \
	and explain how it fits into the broader research landscape.
	- "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.)
	- "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.)

	Reply with ONLY valid JSON — no markdown fences, no extra text. English only."""


	def summarize_paper(title: str, abstract: str, *, model: str = "gemini-3-pro-preview") -> dict:
	    """Ask Gemini for both summaries of one paper in a single call.

	    Args:
	        title: Paper title, inserted into the prompt.
	        abstract: Paper abstract, inserted into the prompt.
	        model: Gemini model name. Defaults to the model this script has
	            always used, so existing callers are unaffected.

	    Returns:
	        The JSON object parsed from the model's reply. SYSTEM_PROMPT requests
	        the keys "concise_summary" and "detailed_analysis"; their presence is
	        not verified here.

	    Raises:
	        ValueError: If the reply carries no text, is not valid JSON
	            (json.JSONDecodeError is a ValueError subclass), or is valid JSON
	            but not an object.
	    """
	    prompt = f"Title: {title}\n\nAbstract: {abstract}"
	    resp = client.models.generate_content(
	        model=model,
	        contents=prompt,
	        config=genai.types.GenerateContentConfig(
	            system_instruction=SYSTEM_PROMPT,
	            temperature=0.3,
	            max_output_tokens=16384 * 4,
	            response_mime_type="application/json",
	        ),
	    )

	    # NOTE(review): resp.text appears to be None when the response has no text
	    # part (e.g. blocked output) — confirm against the SDK docs. Passing None
	    # to json.loads would surface as an opaque TypeError, so fail clearly here.
	    text = resp.text
	    if not text:
	        raise ValueError("Gemini returned an empty response")

	    result = json.loads(text)
	    # Callers use dict methods on the result; reject arrays/scalars up front.
	    if not isinstance(result, dict):
	        raise ValueError(f"expected a JSON object, got {type(result).__name__}")
	    return result


	def _latest_input(directory: Path):
	    """Return the lexicographically last hf_papers_*.json in directory, or None.

	    Files whose name contains "_summarized" (this script's own output) are skipped.
	    """
	    candidates = sorted(
	        p for p in directory.glob("hf_papers_*.json")
	        if "_summarized" not in p.name
	    )
	    return candidates[-1] if candidates else None


	def _format_paper_text(index: int, paper: dict) -> str:
	    """Render one paper as a section of the plain-text report."""
	    rule = "=" * 80
	    parts = [
	        f"{rule}\n",
	        f"[{index}] {paper.get('title', '')}\n",
	        # A plain "|" separator: the previous "\|" was an invalid escape
	        # sequence and put a literal backslash into the report.
	        f" Upvotes: {paper.get('upvotes', 0)} | {paper.get('hf_url', '')}\n",
	        f"{rule}\n\n",
	        f"--- Concise Summary ---\n{paper.get('concise_summary', 'N/A')}\n\n",
	    ]
	    analysis = paper.get("detailed_analysis", {})
	    # Skipped/failed papers carry {} or {"error": ...}; only render real analyses.
	    if isinstance(analysis, dict) and "summary" in analysis:
	        parts.append("--- Detailed Analysis ---\n")
	        if analysis.get("background_and_motivation"):
	            parts.append(
	                f"Background & Motivation:\n{analysis['background_and_motivation']}\n\n"
	            )
	        parts.append(f"Summary:\n{analysis['summary']}\n\n")
	        parts.append("Pros:\n")
	        parts.extend(f" + {pro}\n" for pro in analysis.get("pros") or [])
	        parts.append("\nCons:\n")
	        parts.extend(f" - {con}\n" for con in analysis.get("cons") or [])
	    parts.append("\n\n")
	    return "".join(parts)


	def main():
	    """Command-line entry point: add Gemini summaries to a papers JSON file.

	    Reads the file given by --input (or, without it, the lexicographically
	    last hf_papers_*.json next to this script), optionally keeps only the
	    first --top entries, and calls summarize_paper for every paper that has a
	    non-empty "summary" field. Each paper dict gains "concise_summary" and
	    "detailed_analysis". The annotated list is written to --output (default:
	    "<input stem>_summarized.json") and a plain-text report is written beside it.

	    A failure while summarizing one paper is recorded on that paper and does
	    not stop the run.
	    """
	    parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要")
	    parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径")
	    parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径")
	    parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)")
	    args = parser.parse_args()

	    # Without --input, pick the newest crawler output automatically.
	    if args.input:
	        input_path = Path(args.input)
	    else:
	        input_path = _latest_input(Path(__file__).parent)
	        if input_path is None:
	            print("未找到 hf_papers_*.json 文件，请用 --input 指定")
	            return

	    with open(input_path, "r", encoding="utf-8") as f:
	        papers = json.load(f)

	    # Keeps the first N entries in file order; nothing is sorted here, so the
	    # "by upvotes" wording in the help text presumably relies on the input
	    # already being upvote-sorted — TODO confirm against the crawler.
	    if args.top > 0:
	        papers = papers[: args.top]

	    total = len(papers)
	    print(f"读取 {input_path}，共 {total} 篇论文，开始生成摘要...\n")

	    for i, paper in enumerate(papers, 1):
	        # .get instead of indexing: one record without a title must not abort the run.
	        title = paper.get("title", "")
	        abstract = paper.get("summary", "")
	        if not abstract:
	            print(f"[{i}/{total}] 跳过（无摘要）: {title}")
	            paper["concise_summary"] = ""
	            paper["detailed_analysis"] = {}
	            continue

	        print(f"[{i}/{total}] {title}")
	        try:
	            result = summarize_paper(title, abstract)
	            concise = result.get("concise_summary", "")
	            analysis = result.get("detailed_analysis", {})
	        except Exception as e:
	            # Per-paper boundary: record the failure and move on to the next paper.
	            print(f" ✗ 生成失败: {e}\n")
	            paper["concise_summary"] = f"ERROR: {e}"
	            paper["detailed_analysis"] = {"error": str(e)}
	        else:
	            paper["concise_summary"] = concise
	            paper["detailed_analysis"] = analysis
	            # Previews are cosmetic; an unexpected shape must not discard the result.
	            preview = analysis.get("summary", "") if isinstance(analysis, dict) else ""
	            print(f" [concise] {str(concise)[:120]}...")
	            print(f" [detailed] {str(preview)[:120]}...\n")

	        # Simple throttle between API calls to stay clear of rate limits.
	        if i < total:
	            time.sleep(1)

	    # Save the annotated papers as JSON.
	    if args.output:
	        output_path = Path(args.output)
	    else:
	        output_path = input_path.with_name(input_path.stem + "_summarized.json")
	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(papers, f, ensure_ascii=False, indent=2)
	    print(f"\n已保存到 {output_path}")

	    # Write a human-readable text version next to the JSON.
	    txt_path = output_path.with_suffix(".txt")
	    if txt_path == output_path:
	        # --output already ends in .txt; do not overwrite the JSON just written.
	        txt_path = output_path.with_name(output_path.name + ".txt")
	    with open(txt_path, "w", encoding="utf-8") as f:
	        f.writelines(_format_paper_text(i, p) for i, p in enumerate(papers, 1))
	    print(f"文本版已保存到 {txt_path}")


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()