#!/usr/bin/env python3
# File size: 4,670 Bytes (id 11a28db) -- header carried over from the file viewer.
"""
Build a title-based index from downloaded DBLP bib files.
Reads all .bib files in data/raw/ and produces sharded JSON files
under data/index_shards/ (~25MB each) for GitHub-friendly storage.
Usage:
python scripts/build_index.py
"""
import json
import os
import re
import shutil
import sys
from pathlib import Path
# bibtexparser is the one non-stdlib dependency of this script; when it is
# missing, print an install hint and exit with status 1 instead of letting
# the ImportError traceback surface.
try:
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
except ImportError:
    print("Error: bibtexparser required. Install: pip install bibtexparser")
    sys.exit(1)
# Size budget per shard file; write_shards() multiplies it by 1024 * 1024 to
# get the byte threshold at which a new shard is started.
MAX_SHARD_MB = 25 # Target shard size in MB
def normalize_title(title: str) -> str:
    """Normalize a title into the key used for index lookup.

    Brace groups are unwrapped (``{B}ERT`` -> ``BERT``), the text is
    lower-cased, every character that is neither a word character nor
    whitespace is replaced by a space, and whitespace runs are collapsed
    to a single space with the ends trimmed.

    Args:
        title: Raw title string, possibly containing BibTeX-style braces.

    Returns:
        The normalized key. It is empty when nothing but punctuation and
        whitespace remains.
    """
    # A single substitution pass removes only one nesting level: "{{B}}ERT"
    # would become "{B}ERT", and the leftover braces would then be turned
    # into spaces ("b ert"). Re-apply the same pattern until the string
    # stops changing so nested groups are unwrapped completely. Each
    # successful pass removes at least two characters, so this terminates.
    before = None
    while before != title:
        before = title
        title = re.sub(r'\{([^}]*)\}', r'\1', before)
    title = re.sub(r'[^\w\s]', ' ', title.lower())
    return re.sub(r'\s+', ' ', title).strip()
def _flush_shard(shard_dir: Path, shard_num: int, items: list) -> None:
    """Write one shard as ``index_NN.json`` and print its entry count and size."""
    name = f"index_{shard_num:02d}.json"
    path = shard_dir / name
    path.write_text(
        json.dumps(dict(items), ensure_ascii=False),
        encoding="utf-8"
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f" β {name}: {len(items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_shard_mb=None) -> int:
    """Split ``index`` into size-bounded JSON shard files.

    ``shard_dir`` is deleted first if it already exists and is then
    recreated, so shards from an earlier run never survive. Entries are
    written in the iteration order of ``index``. A new shard is started
    whenever adding the next entry would push the current shard past the
    size limit. A single entry larger than the limit still gets written,
    alone in its own shard.

    Args:
        index: Mapping of normalized title -> JSON-serializable record.
        shard_dir: Directory that receives ``index_00.json``,
            ``index_01.json``, ...
        max_shard_mb: Size limit per shard in MiB. ``None`` (the default)
            uses the module-level ``MAX_SHARD_MB``.

    Returns:
        The number of shard files written (0 for an empty index).
    """
    if max_shard_mb is None:
        max_shard_mb = MAX_SHARD_MB
    max_bytes = max_shard_mb * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    shard_num = 0
    shard_items = []
    shard_size = 0
    for key, val in index.items():
        # Size of the entry serialized on its own. The two braces counted
        # here stand in for the ", " separator (or the outer braces) the
        # entry costs inside the real shard, so the running total tracks
        # the final file size.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))
        if shard_items and shard_size + entry_size > max_bytes:
            _flush_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0
        shard_items.append((key, val))
        shard_size += entry_size

    # Whatever is left over forms the last, usually smaller, shard.
    if shard_items:
        _flush_shard(shard_dir, shard_num, shard_items)
        shard_num += 1
    return shard_num
def main():
    """Build the title index from ``data/raw/*.bib`` and write it as shards.

    Paths are resolved relative to the parent of this script's directory:
    bib files are read from ``data/raw`` and shards are written to
    ``data/index_shards``. Files are processed in sorted name order and
    the first entry seen for a normalized title wins; later duplicates are
    ignored. A file that fails to load is reported, counted and skipped.

    Exits with status 1 when ``data/raw`` is missing or holds no ``.bib``
    files.
    """
    # NOTE(review): the glyphs "π¦", "π" and "β" in the messages below look
    # like mis-decoded UTF-8 symbols. They are kept exactly as found;
    # confirm the intended characters against the upstream file.
    project_root = Path(__file__).resolve().parent.parent
    raw_dir = project_root / "data" / "raw"
    shard_dir = project_root / "data" / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"π¦ Building index from {len(bib_files)} bib files...")
    index = {}
    skipped_files = 0
    for bib_file in bib_files:
        try:
            # A fresh parser is created for every file, as in the original
            # code, so nothing carries over between files.
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            # Undecodable bytes are replaced rather than aborting the file.
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            # Deliberate best effort: one bad file must not stop the whole
            # build. The skip is reported here and in the final summary.
            print(f" β Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            if not title:
                continue
            key = normalize_title(title)
            if not key or key in index:
                # Nothing usable to index, or an earlier entry already
                # owns this normalized title.
                continue
            index[key] = {
                "title": title.rstrip('.'),
                "author": entry.get("author", ""),
                "year": entry.get("year", ""),
                "booktitle": entry.get("booktitle", ""),
                "journal": entry.get("journal", ""),
                "doi": entry.get("doi", ""),
                "url": entry.get("url", ""),
                "pages": entry.get("pages", ""),
                "volume": entry.get("volume", ""),
                "_type": entry.get("ENTRYTYPE", "inproceedings"),
                "_source": bib_file.stem,
            }

    print("\nπ Writing sharded index...")
    n_shards = write_shards(index, shard_dir)
    total_mb = sum(f.stat().st_size for f in shard_dir.glob("*.json")) / 1024 / 1024
    # This message was split across two physical lines in the source, which
    # left the string literal unterminated; it is one line again here.
    print(
        f"\nβ Index: {len(index):,} unique entries β {n_shards} shards "
        f"({total_mb:.1f} MB total)"
    )
    print(f" Saved to: {shard_dir}/")
    if skipped_files:
        print(f" β {skipped_files} file(s) skipped due to parse errors")
# Run the build only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()