# NOTE: this file was recovered from the Hugging Face Space "elfsong" web
# file viewer (commit "Add .gitignore, update requirements, and implement
# Hugging Face paper crawler and reader"). The Space status lines
# ("Spaces: Running") were page chrome, not part of the source.
ea972e7 | """ | |
| HF Paper Reader — 用 Gemini 为每篇论文生成两种摘要 (单次调用,JSON 输出) | |
| Usage: | |
| python reader.py # 默认读取最新 json | |
| python reader.py --input hf_papers_2026-03-10.json # 指定输入 | |
| python reader.py --input hf_papers_2026-03-10.json --top 10 # 只处理前10篇 | |
| """ | |
import argparse
import json
import os
import time
from pathlib import Path

from dotenv import load_dotenv
from google import genai

# Load environment variables from the .env file sitting next to this script,
# so the key works regardless of the current working directory.
load_dotenv(Path(__file__).parent / ".env")

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast at import time: every code path below needs the API key.
    raise RuntimeError("GEMINI_API_KEY not found in .env")

# Single shared Gemini client for all requests in this run.
client = genai.Client(api_key=GEMINI_API_KEY)
| SYSTEM_PROMPT = """\ | |
| You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \ | |
| with exactly two keys: | |
| 1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \ | |
| and WHY it matters. Avoid jargon; end with the key result or takeaway. | |
| 2. "detailed_analysis": A longer analysis with your own understanding, structured as: | |
| - "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \ | |
| this paper addresses. What prior work fell short, and why is this research needed now? | |
| - "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \ | |
| and explain how it fits into the broader research landscape. | |
| - "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.) | |
| - "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.) | |
| Reply with ONLY valid JSON — no markdown fences, no extra text. English only.""" | |
def summarize_paper(title: str, abstract: str) -> dict:
    """Call Gemini once and return a dict with both summary variants.

    Args:
        title: Paper title.
        abstract: Paper abstract text.

    Returns:
        Parsed JSON dict; per SYSTEM_PROMPT it should contain the keys
        "concise_summary" and "detailed_analysis".

    Raises:
        ValueError: If the model returns an empty response (e.g. a safety or
            length stop), instead of letting json.loads(None) raise an opaque
            TypeError.
        json.JSONDecodeError: If the response is not valid JSON.
    """
    prompt = f"Title: {title}\n\nAbstract: {abstract}"
    resp = client.models.generate_content(
        model="gemini-3-pro-preview",
        contents=prompt,
        config=genai.types.GenerateContentConfig(
            system_instruction=SYSTEM_PROMPT,
            temperature=0.3,
            # Generous cap so long "detailed_analysis" sections are not truncated.
            max_output_tokens=16384 * 4,
            # Ask the API to emit raw JSON (no markdown fences).
            response_mime_type="application/json",
        ),
    )
    if not resp.text:
        raise ValueError("empty response from Gemini")
    return json.loads(resp.text)
| def main(): | |
| parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要") | |
| parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径") | |
| parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径") | |
| parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)") | |
| args = parser.parse_args() | |
| # 自动查找最新的 hf_papers_*.json (排除已生成的 _summarized 文件) | |
| if args.input: | |
| input_path = Path(args.input) | |
| else: | |
| candidates = sorted( | |
| p for p in Path(__file__).parent.glob("hf_papers_*.json") | |
| if "_summarized" not in p.name | |
| ) | |
| if not candidates: | |
| print("未找到 hf_papers_*.json 文件,请用 --input 指定") | |
| return | |
| input_path = candidates[-1] | |
| with open(input_path, "r", encoding="utf-8") as f: | |
| papers = json.load(f) | |
| if args.top > 0: | |
| papers = papers[: args.top] | |
| total = len(papers) | |
| print(f"读取 {input_path},共 {total} 篇论文,开始生成摘要...\n") | |
| for i, paper in enumerate(papers, 1): | |
| title = paper["title"] | |
| abstract = paper.get("summary", "") | |
| if not abstract: | |
| print(f"[{i}/{total}] 跳过(无摘要): {title}") | |
| paper["concise_summary"] = "" | |
| paper["detailed_analysis"] = {} | |
| continue | |
| print(f"[{i}/{total}] {title}") | |
| try: | |
| result = summarize_paper(title, abstract) | |
| paper["concise_summary"] = result.get("concise_summary", "") | |
| paper["detailed_analysis"] = result.get("detailed_analysis", {}) | |
| print(f" [concise] {paper['concise_summary'][:120]}...") | |
| summary_preview = paper["detailed_analysis"].get("summary", "")[:120] | |
| print(f" [detailed] {summary_preview}...\n") | |
| except Exception as e: | |
| print(f" ✗ 生成失败: {e}\n") | |
| paper["concise_summary"] = f"ERROR: {e}" | |
| paper["detailed_analysis"] = {"error": str(e)} | |
| # 简单限速,避免 API rate limit | |
| if i < total: | |
| time.sleep(1) | |
| # 保存 JSON | |
| output_path = Path(args.output) if args.output else input_path.with_name( | |
| input_path.stem + "_summarized.json" | |
| ) | |
| with open(output_path, "w", encoding="utf-8") as f: | |
| json.dump(papers, f, ensure_ascii=False, indent=2) | |
| print(f"\n已保存到 {output_path}") | |
| # 输出易读的文本版 | |
| txt_path = output_path.with_suffix(".txt") | |
| with open(txt_path, "w", encoding="utf-8") as f: | |
| for i, p in enumerate(papers, 1): | |
| f.write(f"{'='*80}\n") | |
| f.write(f"[{i}] {p['title']}\n") | |
| f.write(f" Upvotes: {p.get('upvotes', 0)} | {p['hf_url']}\n") | |
| f.write(f"{'='*80}\n\n") | |
| f.write(f"--- Concise Summary ---\n{p.get('concise_summary', 'N/A')}\n\n") | |
| da = p.get("detailed_analysis", {}) | |
| if isinstance(da, dict) and "summary" in da: | |
| f.write(f"--- Detailed Analysis ---\n") | |
| if da.get("background_and_motivation"): | |
| f.write(f"Background & Motivation:\n{da['background_and_motivation']}\n\n") | |
| f.write(f"Summary:\n{da['summary']}\n\n") | |
| f.write(f"Pros:\n") | |
| for pro in da.get("pros", []): | |
| f.write(f" + {pro}\n") | |
| f.write(f"\nCons:\n") | |
| for con in da.get("cons", []): | |
| f.write(f" - {con}\n") | |
| f.write(f"\n\n") | |
| print(f"文本版已保存到 {txt_path}") | |
| if __name__ == "__main__": | |
| main() | |