import sys
import argparse
from pathlib import Path

from dotenv import find_dotenv, load_dotenv

# Load variables from a .env file (searched for starting at the current
# working directory) before any project module below is imported.
load_dotenv(find_dotenv(usecwd=True))

# REPO_ROOT is the directory one level above this file's own directory.
# It is put on sys.path so the `core.*` imports below resolve when this file
# is run directly as a script.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from core.rag.chunk import chunk_markdown_file
from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
from core.rag.vector_store import ChromaConfig, ChromaVectorDB
from core.hash_file.hash_file import HashProcessor

# Shared hasher used to fingerprint markdown files for change detection.
_hasher = HashProcessor(verbose=False)
def get_db_file_info(db: "ChromaVectorDB") -> dict:
    """Group the documents currently stored in ``db`` by their source file.

    Each document returned by ``db.get_all_documents()`` is read as a mapping
    with an ``"id"`` entry and an optional ``"metadata"`` mapping. The source
    name is taken from ``metadata["source_basename"]``, falling back to
    ``metadata["source_file"]``. Documents without a source name or without
    an id are ignored.

    Args:
        db: Vector store object exposing ``get_all_documents()``.

    Returns:
        A dict with two entries:

        * ``"ids"``: source name -> set of document ids for that source.
        * ``"hashes"``: source name -> the first non-empty ``content_hash``
          found among that source's documents.
    """
    file_to_ids = {}
    file_to_hash = {}

    for d in db.get_all_documents() or []:
        # ``metadata`` may be present but null; ``d.get("metadata", {})``
        # would hand back None in that case and crash on the next lookup.
        meta = d.get("metadata") or {}
        source = meta.get("source_basename") or meta.get("source_file")
        doc_id = d.get("id")
        if not (source and doc_id):
            continue

        file_to_ids.setdefault(source, set()).add(doc_id)

        # Keep only the first hash seen for each source file.
        content_hash = meta.get("content_hash", "")
        if content_hash and source not in file_to_hash:
            file_to_hash[source] = content_hash

    return {"ids": file_to_ids, "hashes": file_to_hash}
def _attach_content_hash(docs, file_hash):
    """Store ``file_hash`` under ``content_hash`` in every chunk's metadata."""
    for doc in docs:
        if hasattr(doc, "metadata"):
            doc.metadata["content_hash"] = file_hash
        elif isinstance(doc, dict) and "metadata" in doc:
            doc["metadata"]["content_hash"] = file_hash


def main():
    """Build or incrementally refresh the vector DB from markdown files.

    Command-line flags:
        --force      Re-process every markdown file. The existing DB contents
                     are not scanned in this mode, so nothing is skipped and
                     no previously stored chunks are deleted.
        --no-delete  Keep chunks whose source file no longer exists on disk.

    Steps:
        1. Create the embedder.
        2. Open the vector DB and (unless --force) index its documents by
           source file name.
        3. Collect ``*.md`` files under ``REPO_ROOT/data/data_process``.
        4. Delete chunks of files that are in the DB but not on disk.
        5. For each file: skip it when its hash matches the stored one,
           otherwise chunk it, replace its old chunks and upsert the new ones.

    Afterwards the BM25 cache file is removed if the DB was modified, and a
    summary is printed. Per-file failures are reported and counted but do not
    stop the run.
    """
    parser = argparse.ArgumentParser(description="Build ChromaDB from markdown files")
    parser.add_argument("--force", action="store_true", help="Rebuild all files")
    parser.add_argument("--no-delete", action="store_true", help="Don't delete orphaned docs")
    args = parser.parse_args()

    print("=" * 60)
    print("BUILD HUST RAG DATABASE")
    print("=" * 60)

    # Step 1: Initialize embedder
    print("\n[1/5] Initializing embedder...")
    emb_cfg = EmbeddingConfig()
    emb = QwenEmbeddings(emb_cfg)
    print(f" Model: {emb_cfg.model}")
    print(f" API: {emb_cfg.api_base_url}")

    # Step 2: Initialize ChromaDB
    print("\n[2/5] Initializing ChromaDB...")
    db_cfg = ChromaConfig()
    db = ChromaVectorDB(embedder=emb, config=db_cfg)
    old_count = db.count()
    print(f" Collection: {db_cfg.collection_name}")
    print(f" Current docs: {old_count}")

    # Current DB state, keyed by source file name. Left empty under --force,
    # which is what disables skipping and deletion further down.
    db_info = {"ids": {}, "hashes": {}}
    if not args.force and old_count > 0:
        print("\n Scanning documents in DB...")
        db_info = get_db_file_info(db)
        print(f" Found {len(db_info['ids'])} source files in DB")

    # Step 3: Scan markdown files
    print("\n[3/5] Scanning markdown files...")
    root = REPO_ROOT / "data" / "data_process"
    md_files = sorted(root.rglob("*.md"))
    print(f" Found {len(md_files)} markdown files on disk")

    # Files are matched between disk and DB by base name only.
    current_files = {f.name for f in md_files}
    db_files = set(db_info["ids"])
    files_to_delete = db_files - current_files  # in DB but no longer on disk

    # Step 4: Delete orphaned docs
    deleted_count = 0
    if files_to_delete and not args.no_delete:
        print(f"\n[4/5] Cleaning up {len(files_to_delete)} deleted files...")
        for filename in sorted(files_to_delete):
            doc_ids = list(db_info["ids"].get(filename, []))
            if doc_ids:
                db.delete_documents(doc_ids)
                deleted_count += len(doc_ids)
                print(f" Deleted: {filename} ({len(doc_ids)} chunks)")
    elif files_to_delete:
        print(f"\n[4/5] Keeping {len(files_to_delete)} orphaned files (--no-delete)")
    else:
        print("\n[4/5] No files to delete")

    # Step 5: Process markdown files (add new, update)
    print("\n[5/5] Processing markdown files...")
    total_added = 0
    total_updated = 0
    stale_removed = 0  # old chunks removed while re-indexing changed files
    skipped = 0
    failed = 0
    total_files = len(md_files)

    for i, f in enumerate(md_files, 1):
        existing_ids = db_info["ids"].get(f.name, set())
        try:
            file_hash = _hasher.get_file_hash(str(f))
            db_hash = db_info["hashes"].get(f.name, "")

            # Skip if hash matches (file unchanged). Require a non-empty hash
            # so an empty result can never match a missing stored hash.
            if not args.force and file_hash and db_hash == file_hash:
                print(f" [{i}/{total_files}] {f.name}: SKIPPED (unchanged)")
                skipped += 1
                continue

            # Chunk BEFORE touching the DB: if chunking fails, the file's
            # previously indexed chunks are left in place.
            docs = chunk_markdown_file(f)

            # existing_ids is always empty under --force (DB was not scanned).
            is_update = bool(existing_ids)
            if is_update:
                db.delete_documents(list(existing_ids))
                stale_removed += len(existing_ids)
                print(f" [{i}/{total_files}] {f.name}: UPDATED (deleted {len(existing_ids)} old chunks)")

            if not docs:
                print(f" [{i}/{total_files}] {f.name}: SKIPPED (no chunks)")
                continue

            # Add hash to metadata for change detection on the next run.
            _attach_content_hash(docs, file_hash)
            n = db.upsert_documents(docs)
            if is_update:
                total_updated += n
                print(f" [{i}/{total_files}] {f.name}: +{n} new chunks")
            else:
                total_added += n
                print(f" [{i}/{total_files}] {f.name}: {n} chunks")
        except Exception as e:
            # Best-effort per file: report, count, and move on to the next one.
            failed += 1
            print(f" [{i}/{total_files}] {f.name}: ERROR - {e}")

    # Summary
    new_count = db.count()
    has_changes = (
        deleted_count > 0
        or stale_removed > 0
        or total_updated > 0
        or total_added > 0
    )

    # Delete BM25 cache if changes detected (BM25 doesn't support incremental update)
    if has_changes:
        bm25_cache = REPO_ROOT / "data" / "chroma" / "bm25_cache.pkl"
        if bm25_cache.exists():
            bm25_cache.unlink()
            print("\n[!] Deleted BM25 cache (will auto-rebuild on next query)")

    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print("=" * 60)
    print(f" Deleted (orphaned): {deleted_count} chunks")
    print(f" Updated: {total_updated} chunks")
    print(f" Added: {total_added} chunks")
    print(f" Skipped: {skipped} files")
    print(f" Failed: {failed} files")
    print(f" DB docs: {old_count} -> {new_count} ({new_count - old_count:+d})")
    print("\nDONE!")
# Run the build only when this file is executed as a script.
if __name__ == "__main__":
    main()