elfsong
Add .gitignore, update requirements, and implement Hugging Face paper crawler and reader
ea972e7
"""
HF Paper Reader — 用 Gemini 为每篇论文生成两种摘要 (单次调用,JSON 输出)
Usage:
python reader.py # 默认读取最新 json
python reader.py --input hf_papers_2026-03-10.json # 指定输入
python reader.py --input hf_papers_2026-03-10.json --top 10 # 只处理前10篇
"""
import argparse
import json
import os
import time
from pathlib import Path
from dotenv import load_dotenv
from google import genai
# Read settings from the .env file that sits next to this script, then build
# the shared Gemini client. Importing this module fails fast when no key is set.
load_dotenv(Path(__file__).with_name(".env"))

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise RuntimeError("GEMINI_API_KEY not found in .env")

client = genai.Client(api_key=GEMINI_API_KEY)
# System instruction passed to Gemini by summarize_paper(). It asks the model
# for a JSON object with two keys, "concise_summary" and "detailed_analysis";
# main() reads exactly those keys from the parsed reply. The trailing
# backslashes join physical lines inside the literal without adding newlines.
SYSTEM_PROMPT = """\
You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \
with exactly two keys:
1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \
and WHY it matters. Avoid jargon; end with the key result or takeaway.
2. "detailed_analysis": A longer analysis with your own understanding, structured as:
- "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \
this paper addresses. What prior work fell short, and why is this research needed now?
- "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \
and explain how it fits into the broader research landscape.
- "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.)
- "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.)
Reply with ONLY valid JSON — no markdown fences, no extra text. English only."""
def summarize_paper(title: str, abstract: str) -> dict:
    """Summarize one paper with a single Gemini call.

    Args:
        title: Paper title.
        abstract: Paper abstract.

    Returns:
        The JSON object decoded from the model's reply. SYSTEM_PROMPT asks for
        the keys "concise_summary" and "detailed_analysis"; their presence is
        not validated here.

    Raises:
        ValueError: If the reply carries no text, is not valid JSON
            (json.JSONDecodeError is a ValueError subclass), or decodes to
            something other than a JSON object.
    """
    prompt = f"Title: {title}\n\nAbstract: {abstract}"
    resp = client.models.generate_content(
        model="gemini-3-pro-preview",
        contents=prompt,
        config=genai.types.GenerateContentConfig(
            system_instruction=SYSTEM_PROMPT,
            temperature=0.3,
            max_output_tokens=16384 * 4,
            response_mime_type="application/json",
        ),
    )

    # resp.text may be None/empty when the reply has no text parts; passing
    # that to json.loads would surface as an opaque TypeError.
    raw = resp.text
    if not raw:
        raise ValueError("Gemini response contained no text")

    parsed = json.loads(raw)
    # Callers use dict methods on the result, so reject arrays/scalars here.
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from Gemini, got {type(parsed).__name__}"
        )
    return parsed
def _find_latest_input(directory):
    """Return the last hf_papers_*.json in *directory* by sorted order, or None.

    Files whose name contains "_summarized" are outputs of this script and are
    skipped. With date-stamped names such as hf_papers_2026-03-10.json the
    last entry in sorted order is the most recent one.
    """
    candidates = sorted(
        p for p in directory.glob("hf_papers_*.json")
        if "_summarized" not in p.name
    )
    return candidates[-1] if candidates else None


def _preview(value, limit=120):
    """Return at most *limit* characters of *value* for console output."""
    if value is None:
        return ""
    text = value if isinstance(value, str) else str(value)
    return text[:limit]


def _as_items(value):
    """Normalize a pros/cons field to a list so a bare string is one item."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return value
    return [value]


def _summarize_in_place(paper, index, total):
    """Add "concise_summary" and "detailed_analysis" to *paper*.

    Returns True when the API was called (successfully or not) and False when
    the paper was skipped because it has no abstract, so the caller can decide
    whether to rate-limit.
    """
    title = paper.get("title", "")
    abstract = paper.get("summary", "")
    if not abstract:
        print(f"[{index}/{total}] 跳过(无摘要): {title}")
        paper["concise_summary"] = ""
        paper["detailed_analysis"] = {}
        return False

    print(f"[{index}/{total}] {title}")
    # Only the API call is guarded: a failure while printing the preview must
    # not replace an otherwise valid result with an error marker.
    try:
        result = summarize_paper(title, abstract)
        if not isinstance(result, dict):
            raise TypeError(
                f"expected a JSON object, got {type(result).__name__}"
            )
    except Exception as e:
        # Deliberate best-effort: record the failure and keep processing.
        print(f" ✗ 生成失败: {e}\n")
        paper["concise_summary"] = f"ERROR: {e}"
        paper["detailed_analysis"] = {"error": str(e)}
        return True

    concise = result.get("concise_summary", "")
    detailed = result.get("detailed_analysis", {})
    paper["concise_summary"] = concise
    paper["detailed_analysis"] = detailed

    detailed_summary = detailed.get("summary", "") if isinstance(detailed, dict) else ""
    print(f" [concise] {_preview(concise)}...")
    print(f" [detailed] {_preview(detailed_summary)}...\n")
    return True


def _write_text_report(papers, txt_path):
    """Write a human-readable text version of the summarized papers."""
    rule = "=" * 80
    with open(txt_path, "w", encoding="utf-8") as f:
        for i, p in enumerate(papers, 1):
            f.write(f"{rule}\n")
            f.write(f"[{i}] {p.get('title', '')}\n")
            f.write(f" Upvotes: {p.get('upvotes', 0)} | {p.get('hf_url', '')}\n")
            f.write(f"{rule}\n\n")
            f.write(f"--- Concise Summary ---\n{p.get('concise_summary', 'N/A')}\n\n")

            da = p.get("detailed_analysis", {})
            if isinstance(da, dict) and "summary" in da:
                f.write("--- Detailed Analysis ---\n")
                if da.get("background_and_motivation"):
                    f.write(f"Background & Motivation:\n{da['background_and_motivation']}\n\n")
                f.write(f"Summary:\n{da['summary']}\n\n")
                f.write("Pros:\n")
                for pro in _as_items(da.get("pros")):
                    f.write(f" + {pro}\n")
                f.write("\nCons:\n")
                for con in _as_items(da.get("cons")):
                    f.write(f" - {con}\n")
            # NOTE(review): written once per paper as a separator; the source
            # indentation was lost, so confirm this was not meant to be inside
            # the detailed-analysis branch only.
            f.write("\n\n")


def main():
    """Summarize every paper in an input JSON file and save JSON + text output.

    Command-line options:
        --input/-i: Path to the papers JSON; defaults to the latest
            hf_papers_*.json next to this script.
        --output/-o: Path for the summarized JSON; defaults to
            "<input stem>_summarized.json" beside the input.
        --top: Process only the first N papers of the input when N > 0.
    """
    parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要")
    parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径")
    parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径")
    parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)")
    args = parser.parse_args()

    # Without --input, fall back to the newest crawler output beside the script
    # (previously generated *_summarized files are excluded).
    if args.input:
        input_path = Path(args.input)
    else:
        input_path = _find_latest_input(Path(__file__).parent)
        if input_path is None:
            print("未找到 hf_papers_*.json 文件,请用 --input 指定")
            return

    with open(input_path, "r", encoding="utf-8") as f:
        papers = json.load(f)

    # NOTE(review): this keeps the first N entries in file order; the help text
    # says "by upvotes", so the input is presumably already sorted — confirm.
    if args.top > 0:
        papers = papers[: args.top]

    total = len(papers)
    print(f"读取 {input_path},共 {total} 篇论文,开始生成摘要...\n")

    for i, paper in enumerate(papers, 1):
        called_api = _summarize_in_place(paper, i, total)
        # Simple throttle between API calls to stay under the rate limit.
        if called_api and i < total:
            time.sleep(1)

    # Save the enriched records as JSON.
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = input_path.with_name(input_path.stem + "_summarized.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"\n已保存到 {output_path}")

    # Also write a readable text version next to the JSON. If --output already
    # ends in .txt, append a second suffix so the report cannot overwrite it.
    txt_path = output_path.with_suffix(".txt")
    if txt_path == output_path:
        txt_path = output_path.with_name(output_path.name + ".txt")
    _write_text_report(papers, txt_path)
    print(f"文本版已保存到 {txt_path}")


if __name__ == "__main__":
    main()