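"""Build or incrementally refresh the HUST RAG ChromaDB from markdown files.

Workflow: scan data/data_process for *.md files, compare each file's content hash
against the hash stored in its existing chunks, re-chunk and re-embed only the
files that changed, and remove chunks whose source files no longer exist on disk.

Flags:
    --force      re-chunk and re-embed every file, ignoring stored hashes
    --no-delete  keep chunks whose source markdown files were deleted from disk
"""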
import sys
import argparse
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

# Load environment variables from the nearest .env, searched from the current working directory
load_dotenv(find_dotenv(usecwd=True))

# Make the repo root importable so the `core.*` packages resolve when the script is run directly
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from core.rag.chunk import chunk_markdown_file
from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
from core.rag.vector_store import ChromaConfig, ChromaVectorDB
from core.hash_file.hash_file import HashProcessor

# Shared hasher used to fingerprint markdown files for change detection
_hasher = HashProcessor(verbose=False)


def get_db_file_info(db: ChromaVectorDB) -> dict:
    """Map each source file already in the DB to its chunk IDs and stored content hash.

    Returns {"ids": {filename: set of chunk IDs}, "hashes": {filename: content_hash}}.
    """
    docs = db.get_all_documents()
    file_to_ids = {}
    file_to_hash = {}
    
    for d in docs:
        meta = d.get("metadata", {})
        source = meta.get("source_basename") or meta.get("source_file")
        doc_id = d.get("id")
        content_hash = meta.get("content_hash", "")
        
        if source and doc_id:
            if source not in file_to_ids:
                file_to_ids[source] = set()
            file_to_ids[source].add(doc_id)
            
            # Store first hash found for file
            if source not in file_to_hash and content_hash:
                file_to_hash[source] = content_hash
    
    return {"ids": file_to_ids, "hashes": file_to_hash}


def main():
    parser = argparse.ArgumentParser(description="Build ChromaDB from markdown files")
    parser.add_argument("--force", action="store_true", help="Rebuild all files")
    parser.add_argument("--no-delete", action="store_true", help="Don't delete orphaned docs")
    args = parser.parse_args()
    
    print("=" * 60)
    print("BUILD HUST RAG DATABASE")
    print("=" * 60)
    
    # Step 1: Initialize embedder
    print("\n[1/5] Initializing embedder...")
    emb_cfg = EmbeddingConfig()
    emb = QwenEmbeddings(emb_cfg)
    print(f"  Model: {emb_cfg.model}")
    print(f"  API: {emb_cfg.api_base_url}")
    
    # Step 2: Initialize ChromaDB
    print("\n[2/5] Initializing ChromaDB...")
    db_cfg = ChromaConfig()
    db = ChromaVectorDB(embedder=emb, config=db_cfg)
    old_count = db.count()
    print(f"  Collection: {db_cfg.collection_name}")
    print(f"  Current docs: {old_count}")
    
    # Get current DB state
    db_info = {"ids": {}, "hashes": {}}
    if not args.force and old_count > 0:
        print("\n  Scanning documents in DB...")
        db_info = get_db_file_info(db)
        print(f"  Found {len(db_info['ids'])} source files in DB")
    
    # Step 3: Scan markdown files
    print("\n[3/5] Scanning markdown files...")
    root = REPO_ROOT / "data" / "data_process"
    md_files = sorted(root.rglob("*.md"))
    print(f"  Found {len(md_files)} markdown files on disk")
    
    # Compare files on disk vs in DB
    current_files = {f.name for f in md_files}
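    # NOTE: matching is by basename, so two markdown files with the same name in
    # different folders would collide in this mapping.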
    db_files = set(db_info["ids"].keys())
    
    # Find files to delete (in DB but not on disk)
    files_to_delete = db_files - current_files
    
    # Step 4: Delete orphaned docs
    deleted_count = 0
    if files_to_delete and not args.no_delete:
        print(f"\n[4/5] Cleaning up {len(files_to_delete)} deleted files...")
        for filename in files_to_delete:
            doc_ids = list(db_info["ids"].get(filename, []))
            if doc_ids:
                db.delete_documents(doc_ids)
                deleted_count += len(doc_ids)
                print(f"  Deleted: {filename} ({len(doc_ids)} chunks)")
    elif files_to_delete:
        print(f"\n[4/5] Skipping deletion of {len(files_to_delete)} orphaned files (--no-delete)")
    else:
        print("\n[4/5] No files to delete")
    
    # Step 5: Process markdown files (add new, update)
    print("\n[5/5] Processing markdown files...")
    total_added = 0
    total_updated = 0
    skipped = 0
    
    for i, f in enumerate(md_files, 1):
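        # Fingerprint the file on disk and look up what the DB already stores for it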
        file_hash = _hasher.get_file_hash(str(f))
        db_hash = db_info["hashes"].get(f.name, "")
        existing_ids = db_info["ids"].get(f.name, set())
        
        # Skip if hash matches (file unchanged)
        if not args.force and db_hash == file_hash:
            print(f"  [{i}/{len(md_files)}] {f.name}: SKIPPED (unchanged)")
            skipped += 1
            continue
        
        # If the file changed, delete its old chunks first so stale chunks are not left behind
        if existing_ids and not args.force:
            db.delete_documents(list(existing_ids))
            print(f"  [{i}/{len(md_files)}] {f.name}: UPDATED (deleted {len(existing_ids)} old chunks)")
            is_update = True
        else:
            is_update = False
            
        try:
            docs = chunk_markdown_file(f)
            if docs:
                # Add hash to metadata for change detection
                for doc in docs:
                    if hasattr(doc, 'metadata'):
                        doc.metadata["content_hash"] = file_hash
                    elif isinstance(doc, dict) and "metadata" in doc:
                        doc["metadata"]["content_hash"] = file_hash
                
                n = db.upsert_documents(docs)
                if is_update:
                    total_updated += n
                    print(f"  [{i}/{len(md_files)}] {f.name}: +{n} new chunks")
                else:
                    total_added += n
                    print(f"  [{i}/{len(md_files)}] {f.name}: {n} chunks")
            else:
                print(f"  [{i}/{len(md_files)}] {f.name}: SKIPPED (no chunks)")
        except Exception as e:
            print(f"  [{i}/{len(md_files)}] {f.name}: ERROR - {e}")
    
    # Summary
    new_count = db.count()
    has_changes = deleted_count > 0 or total_updated > 0 or total_added > 0
    
    # Delete BM25 cache if changes detected (BM25 doesn't support incremental update)
    if has_changes:
        bm25_cache = REPO_ROOT / "data" / "chroma" / "bm25_cache.pkl"
        if bm25_cache.exists():
            bm25_cache.unlink()
            print("\n[!] Deleted BM25 cache (will auto-rebuild on next query)")
    
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print("=" * 60)
    print(f"  Deleted (orphaned): {deleted_count} chunks")
    print(f"  Updated: {total_updated} chunks") 
    print(f"  Added: {total_added} chunks")
    print(f"  Skipped: {skipped} files")
    print(f"  DB docs: {old_count} -> {new_count} ({new_count - old_count:+d})")
    
    print("\nDONE!")


if __name__ == "__main__":
    main()