File size: 3,690 Bytes
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
Local Conference Database: fast, offline title lookup against DBLP index.

This module provides a local database of conference/journal proceedings
downloaded from DBLP. It serves as a "ground truth" source that eliminates
the need for network API calls for entries that match known publications.
"""
import json
import re
from pathlib import Path
from typing import Optional
from dataclasses import dataclass


def _normalize(title: str) -> str:
    """Normalize a title for index lookup (must match build_index.py)."""
    title = re.sub(r'\{([^}]*)\}', r'\1', title)
    title = re.sub(r'[^\w\s]', ' ', title.lower())
    return re.sub(r'\s+', ' ', title).strip()


@dataclass
class LocalMatch:
    """One record returned by a successful local index lookup.

    Every field is annotated as a string. Field order is part of the
    positional constructor signature and must not be rearranged.
    """
    # Bibliographic fields, copied from the same-named keys of the index record.
    title: str
    author: str
    year: str
    booktitle: str
    journal: str
    doi: str
    url: str
    pages: str
    volume: str
    # Taken from the record's "_type" key (lookup falls back to "inproceedings").
    entry_type: str
    # Taken from the record's "_source" key.
    source_file: str


class LocalConferenceDB:
    """Title-based lookup against a locally cached DBLP proceedings index.

    The index maps a normalized title (see ``_normalize``) to a record dict.
    It is read either from a directory of ``index_*.json`` shards or, when no
    shards are present, from a single legacy ``conference_index.json`` file.
    Nothing is read from disk until :meth:`load` is called.
    """

    def __init__(self, index_dir: Optional[str] = None):
        """Resolve the index locations without touching the filesystem.

        Args:
            index_dir: Directory holding the ``index_*.json`` shards; the
                legacy single-file index is then expected in that
                directory's parent. When ``None``, shards are looked up in
                ``<module dir>/../data/index_shards`` and the legacy file in
                ``<module dir>/../data``.
        """
        if index_dir is None:
            base = Path(__file__).resolve().parent.parent / "data"
            self._shard_dir = base / "index_shards"
            self._legacy_path = base / "conference_index.json"
        else:
            self._shard_dir = Path(index_dir)
            self._legacy_path = Path(index_dir).parent / "conference_index.json"
        self._idx: dict = {}
        self._loaded = False

    @staticmethod
    def _read_index_file(path: Path) -> dict:
        """Parse one JSON index file, requiring a JSON object at top level."""
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            raise ValueError(
                f"{path.name}: expected a JSON object, got {type(data).__name__}"
            )
        return data

    def load(self) -> bool:
        """Load the index from shards, falling back to the legacy single file.

        The new index is assembled completely before it replaces the current
        one, so a failure part-way through leaves the previous state intact
        and a repeated ``load`` never keeps entries from shards that no
        longer exist.

        Returns:
            True if an index was loaded. False if no index file exists or if
            reading/parsing failed; a message is printed in both cases.
        """
        try:
            # Try sharded index first
            shard_files = (
                sorted(self._shard_dir.glob("index_*.json"))
                if self._shard_dir.exists()
                else []
            )
            if shard_files:
                merged: dict = {}
                for shard_path in shard_files:
                    # On duplicate keys the later shard (sorted order) wins.
                    merged.update(self._read_index_file(shard_path))
                self._idx = merged
                self._loaded = True
                print(f"  📚 Local DB: {len(self._idx):,} entries loaded ({len(shard_files)} shards).")
                return True

            # Fallback: legacy single file
            if self._legacy_path.exists():
                self._idx = self._read_index_file(self._legacy_path)
                self._loaded = True
                print(f"  📚 Local DB: {len(self._idx):,} entries loaded.")
                return True

            print("  ⚠ Local DB not found. Run: python scripts/update_db.py && python scripts/build_index.py")
            return False
        except Exception as e:
            # Best-effort boundary: a broken index must not crash the caller.
            print(f"  ⚠ Failed to load local DB: {e}")
            return False

    @property
    def is_loaded(self) -> bool:
        """True once a load has succeeded and the index holds at least one entry."""
        return self._loaded and len(self._idx) > 0

    def lookup(self, title: str) -> Optional["LocalMatch"]:
        """Look up an entry by title.

        Args:
            title: Raw title text; it is normalized before the index lookup.

        Returns:
            A ``LocalMatch`` built from the stored record (missing fields
            become ``""``, a missing ``_type`` becomes ``"inproceedings"``),
            or ``None`` when the index is not loaded, the title is not a
            non-blank string, or no usable record is stored under its key.
        """
        if not self.is_loaded:
            return None
        if not isinstance(title, str) or not title.strip():
            return None

        key = _normalize(title)
        if not key:
            # Punctuation-only titles normalize to "", which must never match.
            return None

        data = self._idx.get(key)
        if not isinstance(data, dict) or not data:
            return None

        return LocalMatch(
            title=data.get("title", ""),
            author=data.get("author", ""),
            year=data.get("year", ""),
            booktitle=data.get("booktitle", ""),
            journal=data.get("journal", ""),
            doi=data.get("doi", ""),
            url=data.get("url", ""),
            pages=data.get("pages", ""),
            volume=data.get("volume", ""),
            entry_type=data.get("_type", "inproceedings"),
            source_file=data.get("_source", ""),
        )