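"""Build or incrementally refresh the HUST RAG ChromaDB from markdown files.

Workflow: scan data/data_process for *.md files, compare each file's content hash
against the hash stored in its existing chunks, re-chunk and re-embed only the
files that changed, and remove chunks whose source files no longer exist on disk.

Flags:
    --force      re-chunk and re-embed every file, ignoring stored hashes
    --no-delete  keep chunks whose source markdown files were deleted from disk
"""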
import sys
import argparse
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

# Load environment variables from the nearest .env, searched from the current working directory
load_dotenv(find_dotenv(usecwd=True))

# Make the repo root importable so the `core.*` packages resolve when the script is run directly
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from core.rag.chunk import chunk_markdown_file
from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
from core.rag.vector_store import ChromaConfig, ChromaVectorDB
from core.hash_file.hash_file import HashProcessor

# Shared hasher used to fingerprint markdown files for change detection
_hasher = HashProcessor(verbose=False)


def get_db_file_info(db: ChromaVectorDB) -> dict:
    """Map each source file already in the DB to its chunk IDs and stored content hash.

    Returns {"ids": {filename: set of chunk IDs}, "hashes": {filename: content_hash}}.
    """
    docs = db.get_all_documents()
    file_to_ids = {}
    file_to_hash = {}
    
    for d in docs:
        meta = d.get("metadata", {})
        source = meta.get("source_basename") or meta.get("source_file")
        doc_id = d.get("id")
        content_hash = meta.get("content_hash", "")
        
        if source and doc_id:
            if source not in file_to_ids:
                file_to_ids[source] = set()
            file_to_ids[source].add(doc_id)
            
            # Store first hash found for file
            if source not in file_to_hash and content_hash:
                file_to_hash[source] = content_hash
    
    return {"ids": file_to_ids, "hashes": file_to_hash}


def main():
    parser = argparse.ArgumentParser(description="Build ChromaDB from markdown files")
    parser.add_argument("--force", action="store_true", help="Rebuild all files")
    parser.add_argument("--no-delete", action="store_true", help="Don't delete orphaned docs")
    args = parser.parse_args()
    
    print("=" * 60)
    print("BUILD HUST RAG DATABASE")
    print("=" * 60)
    
    # Step 1: Initialize embedder
    print("\n[1/5] Initializing embedder...")
    emb_cfg = EmbeddingConfig()
    emb = QwenEmbeddings(emb_cfg)
    print(f"  Model: {emb_cfg.model}")
    print(f"  API: {emb_cfg.api_base_url}")
    
    # Step 2: Initialize ChromaDB
    print("\n[2/5] Initializing ChromaDB...")
    db_cfg = ChromaConfig()
    db = ChromaVectorDB(embedder=emb, config=db_cfg)
    old_count = db.count()
    print(f"  Collection: {db_cfg.collection_name}")
    print(f"  Current docs: {old_count}")
    
    # Get current DB state
    db_info = {"ids": {}, "hashes": {}}
    if not args.force and old_count > 0:
        print("\n  Scanning documents in DB...")
        db_info = get_db_file_info(db)
        print(f"  Found {len(db_info['ids'])} source files in DB")
    
    # Step 3: Scan markdown files
    print("\n[3/5] Scanning markdown files...")
    root = REPO_ROOT / "data" / "data_process"
    md_files = sorted(root.rglob("*.md"))
    print(f"  Found {len(md_files)} markdown files on disk")
    
    # Compare files on disk vs in DB
    current_files = {f.name for f in md_files}
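    # NOTE: matching is by basename, so two markdown files with the same name in
    # different folders would collide in this mapping.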
    db_files = set(db_info["ids"].keys())
    
    # Find files to delete (in DB but not on disk)
    files_to_delete = db_files - current_files
    
    # Step 4: Delete orphaned docs
    deleted_count = 0
    if files_to_delete and not args.no_delete:
        print(f"\n[4/5] Cleaning up {len(files_to_delete)} deleted files...")
        for filename in files_to_delete:
            doc_ids = list(db_info["ids"].get(filename, []))
            if doc_ids:
                db.delete_documents(doc_ids)
                deleted_count += len(doc_ids)
                print(f"  Deleted: {filename} ({len(doc_ids)} chunks)")
    elif files_to_delete:
        print(f"\n[4/5] Skipping deletion of {len(files_to_delete)} orphaned files (--no-delete)")
    else:
        print("\n[4/5] No files to delete")
    
    # Step 5: Process markdown files (add new, update)
    print("\n[5/5] Processing markdown files...")
    total_added = 0
    total_updated = 0
    skipped = 0
    
    for i, f in enumerate(md_files, 1):
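        # Fingerprint the file on disk and look up what the DB already stores for it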
        file_hash = _hasher.get_file_hash(str(f))
        db_hash = db_info["hashes"].get(f.name, "")
        existing_ids = db_info["ids"].get(f.name, set())
        
        # Skip if hash matches (file unchanged)
        if not args.force and db_hash == file_hash:
            print(f"  [{i}/{len(md_files)}] {f.name}: SKIPPED (unchanged)")
            skipped += 1
            continue
        
        # If the file changed, delete its old chunks first so stale chunks are not left behind
        if existing_ids and not args.force:
            db.delete_documents(list(existing_ids))
            print(f"  [{i}/{len(md_files)}] {f.name}: UPDATED (deleted {len(existing_ids)} old chunks)")
            is_update = True
        else:
            is_update = False
            
        try:
            docs = chunk_markdown_file(f)
            if docs:
                # Add hash to metadata for change detection
                for doc in docs:
                    if hasattr(doc, 'metadata'):
                        doc.metadata["content_hash"] = file_hash
                    elif isinstance(doc, dict) and "metadata" in doc:
                        doc["metadata"]["content_hash"] = file_hash
                
                n = db.upsert_documents(docs)
                if is_update:
                    total_updated += n
                    print(f"  [{i}/{len(md_files)}] {f.name}: +{n} new chunks")
                else:
                    total_added += n
                    print(f"  [{i}/{len(md_files)}] {f.name}: {n} chunks")
            else:
                print(f"  [{i}/{len(md_files)}] {f.name}: SKIPPED (no chunks)")
        except Exception as e:
            print(f"  [{i}/{len(md_files)}] {f.name}: ERROR - {e}")
    
    # Summary
    new_count = db.count()
    has_changes = deleted_count > 0 or total_updated > 0 or total_added > 0
    
    # Delete BM25 cache if changes detected (BM25 doesn't support incremental update)
    if has_changes:
        bm25_cache = REPO_ROOT / "data" / "chroma" / "bm25_cache.pkl"
        if bm25_cache.exists():
            bm25_cache.unlink()
            print("\n[!] Deleted BM25 cache (will auto-rebuild on next query)")
    
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print("=" * 60)
    print(f"  Deleted (orphaned): {deleted_count} chunks")
    print(f"  Updated: {total_updated} chunks") 
    print(f"  Added: {total_added} chunks")
    print(f"  Skipped: {skipped} files")
    print(f"  DB docs: {old_count} -> {new_count} ({new_count - old_count:+d})")
    
    print("\nDONE!")


if __name__ == "__main__":
    main()