"""
HF Paper Reader — 用 Gemini 为每篇论文生成两种摘要 (单次调用,JSON 输出)
Usage:
python reader.py # 默认读取最新 json
python reader.py --input hf_papers_2026-03-10.json # 指定输入
python reader.py --input hf_papers_2026-03-10.json --top 10 # 只处理前10篇
"""
import argparse
import json
import os
import time
from pathlib import Path
from dotenv import load_dotenv
from google import genai
# Load GEMINI_API_KEY from the .env file sitting next to this script.
load_dotenv(Path(__file__).parent / ".env")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast at import time — every code path below needs the key.
    raise RuntimeError("GEMINI_API_KEY not found in .env")
# Single shared Gemini client reused for all summarization calls.
client = genai.Client(api_key=GEMINI_API_KEY)
# System instruction sent with every request. It pins the reply to a strict
# two-key JSON object so summarize_paper() can json.loads() the response text
# directly (reinforced by response_mime_type="application/json" in the config).
SYSTEM_PROMPT = """\
You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \
with exactly two keys:
1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \
and WHY it matters. Avoid jargon; end with the key result or takeaway.
2. "detailed_analysis": A longer analysis with your own understanding, structured as:
- "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \
this paper addresses. What prior work fell short, and why is this research needed now?
- "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \
and explain how it fits into the broader research landscape.
- "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.)
- "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.)
Reply with ONLY valid JSON — no markdown fences, no extra text. English only."""
def summarize_paper(title: str, abstract: str) -> dict:
    """Generate both summaries for one paper with a single Gemini call.

    Args:
        title: Paper title.
        abstract: Paper abstract text.

    Returns:
        Dict parsed from the model's JSON reply; expected to contain the
        keys "concise_summary" and "detailed_analysis" per SYSTEM_PROMPT.

    Raises:
        ValueError: If the model returned no text (e.g. blocked or truncated
            generation).
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    prompt = f"Title: {title}\n\nAbstract: {abstract}"
    resp = client.models.generate_content(
        model="gemini-3-pro-preview",
        contents=prompt,
        config=genai.types.GenerateContentConfig(
            system_instruction=SYSTEM_PROMPT,
            # Low temperature for stable, factual summaries.
            temperature=0.3,
            max_output_tokens=16384 * 4,
            # Ask the API to emit JSON directly (no markdown fences).
            response_mime_type="application/json",
        ),
    )
    # resp.text can be None when the response carries no text parts; raise a
    # clear error instead of letting json.loads(None) fail with a TypeError.
    if not resp.text:
        raise ValueError("Gemini returned an empty response")
    return json.loads(resp.text)
def _find_latest_input():
    """Return the newest ``hf_papers_*.json`` beside this script, or ``None``.

    Our own output files (names containing "_summarized") are excluded.
    Lexicographic sort is sufficient because the filenames embed ISO dates.
    """
    candidates = sorted(
        p for p in Path(__file__).parent.glob("hf_papers_*.json")
        if "_summarized" not in p.name
    )
    return candidates[-1] if candidates else None


def _write_txt_report(papers, txt_path):
    """Write a human-readable text report of the summarized papers."""
    with open(txt_path, "w", encoding="utf-8") as f:
        for i, p in enumerate(papers, 1):
            f.write(f"{'='*80}\n")
            f.write(f"[{i}] {p['title']}\n")
            # .get() keeps the report writer from crashing on entries that
            # lack optional metadata (consistent with 'upvotes' handling).
            f.write(f" Upvotes: {p.get('upvotes', 0)} | {p.get('hf_url', '')}\n")
            f.write(f"{'='*80}\n\n")
            f.write(f"--- Concise Summary ---\n{p.get('concise_summary', 'N/A')}\n\n")
            da = p.get("detailed_analysis", {})
            # Error placeholders ({"error": ...}) and empty dicts are skipped.
            if isinstance(da, dict) and "summary" in da:
                f.write(f"--- Detailed Analysis ---\n")
                if da.get("background_and_motivation"):
                    f.write(f"Background & Motivation:\n{da['background_and_motivation']}\n\n")
                f.write(f"Summary:\n{da['summary']}\n\n")
                f.write(f"Pros:\n")
                for pro in da.get("pros", []):
                    f.write(f" + {pro}\n")
                f.write(f"\nCons:\n")
                for con in da.get("cons", []):
                    f.write(f" - {con}\n")
            f.write(f"\n\n")


def main():
    """CLI entry point: load a papers JSON, attach Gemini summaries, save.

    Writes two outputs next to the input: ``<stem>_summarized.json`` (or
    ``--output``) and a plain-text report with the same stem and ``.txt``.
    """
    parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要")
    parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径")
    parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径")
    parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)")
    args = parser.parse_args()

    # Resolve input: explicit --input wins, otherwise the newest crawl file.
    if args.input:
        input_path = Path(args.input)
    else:
        input_path = _find_latest_input()
        if input_path is None:
            print("未找到 hf_papers_*.json 文件,请用 --input 指定")
            return

    with open(input_path, "r", encoding="utf-8") as f:
        papers = json.load(f)
    if args.top > 0:
        papers = papers[: args.top]
    total = len(papers)
    print(f"读取 {input_path},共 {total} 篇论文,开始生成摘要...\n")

    for i, paper in enumerate(papers, 1):
        title = paper["title"]
        # The crawler stores the abstract under "summary".
        abstract = paper.get("summary", "")
        if not abstract:
            # Keep the schema uniform even for skipped papers.
            print(f"[{i}/{total}] 跳过(无摘要): {title}")
            paper["concise_summary"] = ""
            paper["detailed_analysis"] = {}
            continue
        print(f"[{i}/{total}] {title}")
        try:
            result = summarize_paper(title, abstract)
            paper["concise_summary"] = result.get("concise_summary", "")
            paper["detailed_analysis"] = result.get("detailed_analysis", {})
            print(f" [concise] {paper['concise_summary'][:120]}...")
            summary_preview = paper["detailed_analysis"].get("summary", "")[:120]
            print(f" [detailed] {summary_preview}...\n")
        except Exception as e:
            # Record the failure on the paper so the output JSON stays complete.
            print(f" ✗ 生成失败: {e}\n")
            paper["concise_summary"] = f"ERROR: {e}"
            paper["detailed_analysis"] = {"error": str(e)}
        # Crude rate limiting between API calls; skipped after the last paper.
        if i < total:
            time.sleep(1)

    # Save the machine-readable JSON output.
    output_path = Path(args.output) if args.output else input_path.with_name(
        input_path.stem + "_summarized.json"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"\n已保存到 {output_path}")

    # Companion human-readable report.
    txt_path = output_path.with_suffix(".txt")
    _write_txt_report(papers, txt_path)
    print(f"文本版已保存到 {txt_path}")
# Script entry point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()