| import argparse |
| import json |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional |
|
|
|
|
# Presence of this file marks a subdirectory as a processed-paper dir (see iter_paper_dirs).
PAPER_META_FILE = "paper_metadata.json"
# Per-paper citations dump read by build_usage_contexts_for_paper.
CITATIONS_FILE = "citations_metadata.json"
# Default filename for the aggregated output written inside each paper dir.
DEFAULT_OUT_NAME = "usage_contexts.json"
|
|
|
|
| def load_json(path: Path) -> Any | None: |
| if not path.exists(): |
| return None |
| try: |
| return json.loads(path.read_text(encoding="utf-8")) |
| except Exception as e: |
| print(f"[WARN] could not parse JSON at {path}: {e}") |
| return None |
|
|
|
|
def iter_paper_dirs(root: Path) -> List[Path]:
    """Return the immediate children of *root* that look like paper dirs.

    A child qualifies when it is a directory containing PAPER_META_FILE.
    """
    return [
        entry
        for entry in root.iterdir()
        if entry.is_dir() and (entry / PAPER_META_FILE).exists()
    ]
|
|
|
|
| def _extract_contexts(item: Dict[str, Any]) -> List[Dict[str, Any]]: |
| contexts: List[Dict[str, Any]] = [] |
|
|
| raw = item.get("contextsWithIntent") or [] |
| if isinstance(raw, list) and raw: |
| for entry in raw: |
| if not isinstance(entry, dict): |
| continue |
| text_raw = (entry.get("context") or "").strip() |
| text = (entry.get("context_with_marker") or text_raw).strip() |
| intents = entry.get("intents") or [] |
| contexts.append( |
| { |
| "text": text, |
| "text_raw": text_raw, |
| "intents": intents, |
| } |
| ) |
|
|
| |
| if not contexts: |
| raw_alt = item.get("contexts") or [] |
| if isinstance(raw_alt, list): |
| for text in raw_alt: |
| if not isinstance(text, str): |
| continue |
| text = text.strip() |
| if text: |
| contexts.append( |
| { |
| "text": text, |
| "intents": [], |
| } |
| ) |
|
|
| return contexts |
|
|
|
|
def build_usage_contexts_for_paper(paper_dir: Path) -> Optional[Dict[str, Any]]:
    """Aggregate citation usage contexts for one paper directory.

    Reads CITATIONS_FILE from *paper_dir*, normalizes each citing paper's
    contexts via _extract_contexts, and computes summary statistics
    (including influential-citation counts).

    Args:
        paper_dir: Directory named after the paper id, holding the citations file.

    Returns:
        The summary payload dict, or None when the citations file is missing
        or is not a JSON list.
    """
    data = load_json(paper_dir / CITATIONS_FILE)
    if data is None:
        return None
    if not isinstance(data, list):
        print(f"[WARN] {paper_dir.name}: {CITATIONS_FILE} is not a list")
        return None

    citing_entries: List[Dict[str, Any]] = []
    influential_contexts: List[Dict[str, Any]] = []
    total_contexts = 0
    citing_with_context = 0
    influential_citations = 0
    influential_with_context = 0

    for record in data:
        # Non-dict records are skipped entirely (they still count toward
        # total_citations via len(data), as before).
        if not isinstance(record, dict):
            continue
        citing = record.get("citingPaper") or {}
        contexts = _extract_contexts(record)
        influential = bool(record.get("isInfluential", False))

        if influential:
            influential_citations += 1
        if contexts:
            citing_with_context += 1
            total_contexts += len(contexts)
            if influential:
                influential_with_context += 1

        # Shared fields of both per-record output shapes.
        base = {
            "citing_paper_id": citing.get("paperId"),
            "title": citing.get("title"),
            "external_ids": citing.get("externalIds") or {},
        }
        citing_entries.append(
            {**base, "is_influential": influential, "contexts": contexts}
        )
        if influential and contexts:
            influential_contexts.append({**base, "contexts": contexts})

    return {
        "paper_id": paper_dir.name,
        "total_citations": len(data),
        "num_contexts": total_contexts,
        "num_citing_with_context": citing_with_context,
        "num_citing_without_context": len(data) - citing_with_context,
        "num_influential_citations": influential_citations,
        "num_influential_with_context": influential_with_context,
        "influential_contexts": influential_contexts,
        "citing_papers": citing_entries,
    }
|
|
|
|
def run(root: Path, out_name: str, overwrite: bool) -> None:
    """Write an aggregated usage-contexts file into every paper dir under *root*.

    Args:
        root: Directory holding the per-paper subdirectories.
        out_name: Filename to write inside each paper directory.
        overwrite: When False, directories that already contain *out_name*
            are skipped.

    Raises:
        SystemExit: If *root* does not exist.
    """
    root = root.resolve()
    if not root.exists():
        raise SystemExit(f"Root directory does not exist: {root}")

    # Deterministic order makes the log output reproducible across runs.
    paper_dirs = sorted(iter_paper_dirs(root), key=lambda d: d.name)
    print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")

    for paper_dir in paper_dirs:
        target = paper_dir / out_name
        if target.exists() and not overwrite:
            print(f"[SKIP] {paper_dir.name}: {out_name} already exists")
            continue

        payload = build_usage_contexts_for_paper(paper_dir)
        if payload is None:
            # Missing/malformed citations file; the warning was already printed.
            continue

        target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        n_ctx, n_cit = payload["num_contexts"], payload["total_citations"]
        print(
            f"[OK] {paper_dir.name}: wrote {out_name} "
            f"({n_ctx} contexts from {n_cit} citations)"
        )
|
|
|
|
def main() -> None:
    """CLI entry point: parse command-line options and dispatch to run()."""
    parser = argparse.ArgumentParser(
        description="Build usage_contexts.json from citations_metadata.json files."
    )
    parser.add_argument(
        "--root",
        type=str,
        default="processed_papers/acl_2024",
        help="Root directory containing processed_papers/acl_2024/<paper_id> dirs.",
    )
    parser.add_argument(
        "--out-name",
        type=str,
        default=DEFAULT_OUT_NAME,
        help="Output filename to write inside each paper dir.",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing usage_contexts.json files.",
    )
    opts = parser.parse_args()
    run(Path(opts.root), out_name=opts.out_name, overwrite=opts.overwrite)
|
|
|
|
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|