Spaces:

voidful
/

RefCheck

Sleeping

File size: 4,670 Bytes

11a28db

#!/usr/bin/env python3
"""
Build a title-based index from downloaded DBLP bib files.

Reads all .bib files in data/raw/ and produces sharded JSON files
under data/index_shards/ (~25MB each) for GitHub-friendly storage.

Usage:
    python scripts/build_index.py
"""
import json
import os
import re
import shutil
import sys
from pathlib import Path

try:
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
except ImportError:
    print("Error: bibtexparser required. Install: pip install bibtexparser")
    sys.exit(1)

# Soft cap on the serialized size of one shard, in units of 1024 * 1024 bytes.
# write_shards starts a new shard before this would be exceeded, so a shard is
# only larger than the cap when a single entry is itself bigger than the cap.
MAX_SHARD_MB = 25  # Target shard size in MB


def normalize_title(title: str) -> str:
    """Return the canonical lookup key for a paper title.

    The steps, in order:

    1. Unwrap BibTeX-style ``{...}`` groups, keeping their content
       (a single, non-recursive pass).
    2. Lowercase the text.
    3. Replace every character that is neither a word character nor
       whitespace with a space.
    4. Collapse whitespace runs to one space and trim both ends.

    An empty string is returned when nothing but punctuation/whitespace
    was present.
    """
    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
    lowered = unbraced.lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()


def _write_shard(shard_dir: Path, shard_num: int, items: list) -> None:
    """Write one shard as ``index_NN.json`` in *shard_dir* and print a summary line."""
    path = shard_dir / f"index_{shard_num:02d}.json"
    path.write_text(
        json.dumps(dict(items), ensure_ascii=False),
        encoding="utf-8"
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f"  ✓ index_{shard_num:02d}.json: {len(items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_shard_mb: "float | None" = None) -> int:
    """Split *index* into size-limited JSON shard files.

    Any existing *shard_dir* is deleted and recreated, so stale shards from
    a previous run never survive. Entries are written in the iteration order
    of *index* to files named ``index_00.json``, ``index_01.json``, ...

    Args:
        index: Mapping of lookup key to JSON-serializable record.
        shard_dir: Output directory (removed first if it already exists).
        max_shard_mb: Size cap per shard in units of 1024 * 1024 bytes.
            Defaults to the module-level ``MAX_SHARD_MB`` when omitted.
            Fractional values are accepted.

    Returns:
        The number of shard files written (0 for an empty index).

    Raises:
        ValueError: If *max_shard_mb* is given and is not positive.
    """
    limit_mb = MAX_SHARD_MB if max_shard_mb is None else max_shard_mb
    if limit_mb <= 0:
        raise ValueError(f"max_shard_mb must be positive, got {limit_mb!r}")
    max_bytes = limit_mb * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # Size of the entry serialized alone, braces included. The two brace
        # bytes stand in for the ", " separator (or the outer braces) that the
        # entry costs inside the merged shard, so the running total equals the
        # byte size of the file that is eventually written.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))

        # Flush before overflowing, but never emit an empty shard: an entry
        # larger than the cap is placed in a shard of its own.
        if shard_size + entry_size > max_bytes and shard_items:
            _write_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0

        shard_items.append((key, val))
        shard_size += entry_size

    # Whatever is left over forms the final (possibly only) shard.
    if shard_items:
        _write_shard(shard_dir, shard_num, shard_items)
        shard_num += 1

    return shard_num


def main():
    """Build the sharded title index from every .bib file in data/raw/.

    Paths are resolved relative to the parent of this script's directory.
    The process exits with status 1 when data/raw/ is missing or holds no
    .bib files. Files are parsed in sorted name order; a file that fails to
    parse is reported and skipped. When several entries normalize to the
    same title key, only the first one encountered is kept. The collected
    index is then handed to write_shards and a summary is printed.
    """
    project_root = Path(__file__).resolve().parent.parent
    source_dir = project_root / "data" / "raw"
    output_dir = project_root / "data" / "index_shards"

    if not source_dir.exists():
        print(f"Error: {source_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_paths = sorted(source_dir.glob("*.bib"))
    if not bib_paths:
        print(f"No .bib files found in {source_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_paths)} bib files...")

    title_index = {}
    failed_count = 0

    for bib_path in bib_paths:
        # A fresh parser is created for each file, inside the try so that a
        # failure at any stage skips just this file.
        try:
            bib_parser = BibTexParser(common_strings=True)
            bib_parser.customization = convert_to_unicode
            with open(bib_path, encoding="utf-8", errors="replace") as handle:
                database = bibtexparser.load(handle, parser=bib_parser)
        except Exception as err:
            print(f"  ⚠ Skip {bib_path.name}: {err}")
            failed_count += 1
            continue

        for record in database.entries:
            raw_title = record.get("title", "")
            lookup_key = normalize_title(raw_title) if raw_title else ""

            # Skip untitled entries, titles that normalize to nothing, and
            # titles already indexed (first occurrence wins).
            if not lookup_key or lookup_key in title_index:
                continue

            title_index[lookup_key] = {
                "title": raw_title.rstrip('.'),
                "author": record.get("author", ""),
                "year": record.get("year", ""),
                "booktitle": record.get("booktitle", ""),
                "journal": record.get("journal", ""),
                "doi": record.get("doi", ""),
                "url": record.get("url", ""),
                "pages": record.get("pages", ""),
                "volume": record.get("volume", ""),
                "_type": record.get("ENTRYTYPE", "inproceedings"),
                "_source": bib_path.stem,
            }

    print(f"\n📂 Writing sharded index...")
    shard_count = write_shards(title_index, output_dir)

    total_bytes = sum(shard.stat().st_size for shard in output_dir.glob("*.json"))
    total_mb = total_bytes / 1024 / 1024
    print(f"\n✅ Index: {len(title_index):,} unique entries → {shard_count} shards ({total_mb:.1f} MB total)")
    print(f"   Saved to: {output_dir}/")
    if failed_count:
        print(f"   ⚠ {failed_count} file(s) skipped due to parse errors")


# Run the index build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()