File size: 4,670 Bytes
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
"""
Build a title-based index from downloaded DBLP bib files.

Reads all .bib files in data/raw/ and produces sharded JSON files
under data/index_shards/ (~25MB each) for GitHub-friendly storage.

Usage:
    python scripts/build_index.py
"""
import json
import os
import re
import shutil
import sys
from pathlib import Path

try:
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
except ImportError:
    print("Error: bibtexparser required. Install: pip install bibtexparser")
    sys.exit(1)

# Size budget, in megabytes, that write_shards() uses to decide when to start a new shard file.
MAX_SHARD_MB = 25


def normalize_title(title: str) -> str:
    """Return the canonical lookup key for a BibTeX title.

    The key is built in three passes:

    1. Brace groups that contain no closing brace are unwrapped, so
       ``{B}ayesian`` becomes ``Bayesian``.
    2. The text is lower-cased and every character that is neither a word
       character nor whitespace is replaced by a space.
    3. Whitespace runs are collapsed to single spaces and the ends trimmed.

    An input with no word characters yields the empty string.
    """
    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
    words_only = re.sub(r'[^\w\s]', ' ', unbraced.lower())
    # Splitting on whitespace and re-joining collapses runs and trims both
    # ends in a single step.
    return ' '.join(words_only.split())


def _write_shard(shard_dir: Path, shard_num: int, shard_items: list) -> None:
    """Serialize one shard to ``index_NN.json`` and print a progress line."""
    name = f"index_{shard_num:02d}.json"
    path = shard_dir / name
    path.write_text(
        json.dumps(dict(shard_items), ensure_ascii=False),
        encoding="utf-8"
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f"  ✓ {name}: {len(shard_items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_shard_mb=None):
    """Split ``index`` into JSON shard files of bounded size.

    Any existing ``shard_dir`` is deleted and recreated, so the directory
    only ever contains the shards produced by this call. Entries are packed
    in the iteration order of ``index`` into files named ``index_00.json``,
    ``index_01.json``, ...; a new shard is started whenever adding the next
    entry would push the running size estimate over the limit.

    Args:
        index: Mapping of lookup key to a JSON-serializable record.
        shard_dir: Directory that receives the shard files.
        max_shard_mb: Size budget per shard in megabytes. ``None`` (the
            default) uses the module-level ``MAX_SHARD_MB``.

    Returns:
        The number of shard files written (0 for an empty index).
    """
    limit_mb = MAX_SHARD_MB if max_shard_mb is None else max_shard_mb
    max_bytes = limit_mb * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # The size is estimated from the entry serialized on its own, which
        # is close to (not exactly) its share of the combined shard file.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))

        # Never flush an empty shard: an entry larger than the budget still
        # gets written, in a shard of its own.
        if shard_items and shard_size + entry_size > max_bytes:
            _write_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0

        shard_items.append((key, val))
        shard_size += entry_size

    # Whatever is left over forms the final, possibly smaller, shard.
    if shard_items:
        _write_shard(shard_dir, shard_num, shard_items)
        shard_num += 1

    return shard_num


def _load_bib_entries(bib_file: Path) -> list:
    """Parse one .bib file and return its list of entry dicts."""
    # A new parser is created for every file so nothing is shared between files.
    parser = BibTexParser(common_strings=True)
    parser.customization = convert_to_unicode
    # Undecodable bytes are replaced rather than aborting the whole file.
    with open(bib_file, encoding="utf-8", errors="replace") as f:
        return bibtexparser.load(f, parser=parser).entries


def _make_record(entry: dict, title: str, source: str) -> dict:
    """Build the index record stored for one bibliography entry."""
    return {
        "title":     title.rstrip('.'),
        "author":    entry.get("author", ""),
        "year":      entry.get("year", ""),
        "booktitle": entry.get("booktitle", ""),
        "journal":   entry.get("journal", ""),
        "doi":       entry.get("doi", ""),
        "url":       entry.get("url", ""),
        "pages":     entry.get("pages", ""),
        "volume":    entry.get("volume", ""),
        "_type":     entry.get("ENTRYTYPE", "inproceedings"),
        "_source":   source,
    }


def main():
    """Build the sharded title index from every .bib file in data/raw/.

    Paths are resolved relative to the parent of this script's directory:
    input is read from ``data/raw/*.bib`` and shards are written to
    ``data/index_shards/``. Files that fail to parse are reported and
    skipped. When several entries normalize to the same title key, the
    first one encountered (files in sorted order) is kept.

    Exits with status 1 when the raw directory is missing, when it holds
    no .bib files, or when no entry could be indexed at all. In the last
    case the existing shard directory is left untouched instead of being
    replaced by an empty one.
    """
    project_root = Path(__file__).resolve().parent.parent
    raw_dir = project_root / "data" / "raw"
    shard_dir = project_root / "data" / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")

    index = {}
    skipped_files = 0

    for bib_file in bib_files:
        try:
            entries = _load_bib_entries(bib_file)
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the build.
            print(f"  ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in entries:
            title = entry.get("title", "")
            if not title:
                continue

            key = normalize_title(title)
            if not key:
                continue

            # First occurrence wins; later duplicates of a title are ignored.
            if key not in index:
                index[key] = _make_record(entry, title, bib_file.stem)

    if not index:
        # write_shards() deletes the existing shard directory first, so calling
        # it with nothing to write would wipe a previously built index.
        print("Error: no entries could be indexed; existing shards left untouched")
        if skipped_files:
            print(f"   ⚠ {skipped_files} file(s) skipped due to parse errors")
        sys.exit(1)

    print("\n📂 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)

    total_mb = sum(f.stat().st_size for f in shard_dir.glob("*.json")) / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f"   Saved to: {shard_dir}/")
    if skipped_files:
        print(f"   ⚠ {skipped_files} file(s) skipped due to parse errors")

# Run the index build only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()