""" HF Paper Reader — 用 Gemini 为每篇论文生成两种摘要 (单次调用,JSON 输出) Usage: python reader.py # 默认读取最新 json python reader.py --input hf_papers_2026-03-10.json # 指定输入 python reader.py --input hf_papers_2026-03-10.json --top 10 # 只处理前10篇 """ import argparse import json import os import time from pathlib import Path from dotenv import load_dotenv from google import genai load_dotenv(Path(__file__).parent / ".env") GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") if not GEMINI_API_KEY: raise RuntimeError("GEMINI_API_KEY not found in .env") client = genai.Client(api_key=GEMINI_API_KEY) SYSTEM_PROMPT = """\ You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \ with exactly two keys: 1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \ and WHY it matters. Avoid jargon; end with the key result or takeaway. 2. "detailed_analysis": A longer analysis with your own understanding, structured as: - "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \ this paper addresses. What prior work fell short, and why is this research needed now? - "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \ and explain how it fits into the broader research landscape. - "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.) - "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.) Reply with ONLY valid JSON — no markdown fences, no extra text. English only.""" def summarize_paper(title: str, abstract: str) -> dict: """单次调用 Gemini,返回包含两种摘要的 dict""" prompt = f"Title: {title}\n\nAbstract: {abstract}" resp = client.models.generate_content( model="gemini-3-pro-preview", contents=prompt, config=genai.types.GenerateContentConfig( system_instruction=SYSTEM_PROMPT, temperature=0.3, max_output_tokens=16384*4, response_mime_type="application/json", ), ) return json.loads(resp.text) def main(): parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要") parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径") parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径") parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)") args = parser.parse_args() # 自动查找最新的 hf_papers_*.json (排除已生成的 _summarized 文件) if args.input: input_path = Path(args.input) else: candidates = sorted( p for p in Path(__file__).parent.glob("hf_papers_*.json") if "_summarized" not in p.name ) if not candidates: print("未找到 hf_papers_*.json 文件,请用 --input 指定") return input_path = candidates[-1] with open(input_path, "r", encoding="utf-8") as f: papers = json.load(f) if args.top > 0: papers = papers[: args.top] total = len(papers) print(f"读取 {input_path},共 {total} 篇论文,开始生成摘要...\n") for i, paper in enumerate(papers, 1): title = paper["title"] abstract = paper.get("summary", "") if not abstract: print(f"[{i}/{total}] 跳过(无摘要): {title}") paper["concise_summary"] = "" paper["detailed_analysis"] = {} continue print(f"[{i}/{total}] {title}") try: result = summarize_paper(title, abstract) paper["concise_summary"] = result.get("concise_summary", "") paper["detailed_analysis"] = result.get("detailed_analysis", {}) print(f" [concise] {paper['concise_summary'][:120]}...") summary_preview = paper["detailed_analysis"].get("summary", "")[:120] print(f" [detailed] {summary_preview}...\n") except Exception as e: print(f" ✗ 生成失败: {e}\n") paper["concise_summary"] = f"ERROR: {e}" paper["detailed_analysis"] = {"error": str(e)} # 简单限速,避免 API rate limit if i < total: time.sleep(1) # 保存 JSON output_path = Path(args.output) if args.output else input_path.with_name( input_path.stem + "_summarized.json" ) with open(output_path, "w", encoding="utf-8") as f: json.dump(papers, f, ensure_ascii=False, indent=2) print(f"\n已保存到 {output_path}") # 输出易读的文本版 txt_path = output_path.with_suffix(".txt") with open(txt_path, "w", encoding="utf-8") as f: for i, p in enumerate(papers, 1): f.write(f"{'='*80}\n") f.write(f"[{i}] {p['title']}\n") f.write(f" Upvotes: {p.get('upvotes', 0)} | {p['hf_url']}\n") f.write(f"{'='*80}\n\n") f.write(f"--- Concise Summary ---\n{p.get('concise_summary', 'N/A')}\n\n") da = p.get("detailed_analysis", {}) if isinstance(da, dict) and "summary" in da: f.write(f"--- Detailed Analysis ---\n") if da.get("background_and_motivation"): f.write(f"Background & Motivation:\n{da['background_and_motivation']}\n\n") f.write(f"Summary:\n{da['summary']}\n\n") f.write(f"Pros:\n") for pro in da.get("pros", []): f.write(f" + {pro}\n") f.write(f"\nCons:\n") for con in da.get("cons", []): f.write(f" - {con}\n") f.write(f"\n\n") print(f"文本版已保存到 {txt_path}") if __name__ == "__main__": main()