"""Build the HUST RAG ChromaDB from markdown files.

Incremental by default: unchanged files (matched by content hash) are skipped,
changed files are re-chunked, and chunks belonging to files that no longer
exist on disk are removed. Use --force to rebuild everything.
"""

import sys
import argparse
from pathlib import Path

from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv(usecwd=True))

REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from core.rag.chunk import chunk_markdown_file
from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
from core.rag.vector_store import ChromaConfig, ChromaVectorDB
from core.hash_file.hash_file import HashProcessor

_hasher = HashProcessor(verbose=False)


def get_db_file_info(db: ChromaVectorDB) -> dict:
    """Map each source file in the DB to its document IDs and content hash."""
    docs = db.get_all_documents()
    file_to_ids = {}
    file_to_hash = {}
    for d in docs:
        meta = d.get("metadata", {})
        source = meta.get("source_basename") or meta.get("source_file")
        doc_id = d.get("id")
        content_hash = meta.get("content_hash", "")
        if source and doc_id:
            file_to_ids.setdefault(source, set()).add(doc_id)
            # Store the first hash found for the file
            if source not in file_to_hash and content_hash:
                file_to_hash[source] = content_hash
    return {"ids": file_to_ids, "hashes": file_to_hash}


def main():
    parser = argparse.ArgumentParser(description="Build ChromaDB from markdown files")
    parser.add_argument("--force", action="store_true", help="Rebuild all files")
    parser.add_argument("--no-delete", action="store_true", help="Don't delete orphaned docs")
    args = parser.parse_args()

    print("=" * 60)
    print("BUILD HUST RAG DATABASE")
    print("=" * 60)

    # Step 1: Initialize embedder
    print("\n[1/5] Initializing embedder...")
    emb_cfg = EmbeddingConfig()
    emb = QwenEmbeddings(emb_cfg)
    print(f"  Model: {emb_cfg.model}")
    print(f"  API: {emb_cfg.api_base_url}")

    # Step 2: Initialize ChromaDB
    print("\n[2/5] Initializing ChromaDB...")
    db_cfg = ChromaConfig()
    db = ChromaVectorDB(embedder=emb, config=db_cfg)
    old_count = db.count()
    print(f"  Collection: {db_cfg.collection_name}")
    print(f"  Current docs: {old_count}")

    # Get current DB state (skipped with --force: everything is rebuilt anyway)
    db_info = {"ids": {}, "hashes": {}}
    if not args.force and old_count > 0:
        print("\n  Scanning documents in DB...")
        db_info = get_db_file_info(db)
        print(f"  Found {len(db_info['ids'])} source files in DB")

    # Step 3: Scan markdown files
    print("\n[3/5] Scanning markdown files...")
    root = REPO_ROOT / "data" / "data_process"
    md_files = sorted(root.rglob("*.md"))
    print(f"  Found {len(md_files)} markdown files on disk")

    # Compare files on disk vs in DB; anything in the DB but not on disk
    # is an orphan and should be deleted
    current_files = {f.name for f in md_files}
    db_files = set(db_info["ids"].keys())
    files_to_delete = db_files - current_files

    # Step 4: Delete orphaned docs
    deleted_count = 0
    if files_to_delete and not args.no_delete:
        print(f"\n[4/5] Cleaning up {len(files_to_delete)} deleted files...")
        for filename in files_to_delete:
            doc_ids = list(db_info["ids"].get(filename, []))
            if doc_ids:
                db.delete_documents(doc_ids)
                deleted_count += len(doc_ids)
                print(f"  Deleted: {filename} ({len(doc_ids)} chunks)")
    elif files_to_delete:
        print(f"\n[4/5] Skipping {len(files_to_delete)} orphaned files (--no-delete)")
    else:
        print("\n[4/5] No files to delete")

    # Step 5: Process markdown files (add new, update changed)
    print("\n[5/5] Processing markdown files...")
    total_added = 0
    total_updated = 0
    skipped = 0

    for i, f in enumerate(md_files, 1):
        file_hash = _hasher.get_file_hash(str(f))
        db_hash = db_info["hashes"].get(f.name, "")
        existing_ids = db_info["ids"].get(f.name, set())

        # Skip if hash matches (file unchanged)
        if not args.force and db_hash == file_hash:
            print(f"  [{i}/{len(md_files)}] {f.name}: SKIPPED (unchanged)")
            skipped += 1
            continue

        # If the file changed, delete its old chunks first
        if existing_ids and not args.force:
            db.delete_documents(list(existing_ids))
            print(f"  [{i}/{len(md_files)}] {f.name}: UPDATED (deleted {len(existing_ids)} old chunks)")
            is_update = True
        else:
            is_update = False

        try:
            docs = chunk_markdown_file(f)
            if docs:
                # Add hash to metadata for change detection on later runs
                for doc in docs:
                    if hasattr(doc, "metadata"):
                        doc.metadata["content_hash"] = file_hash
                    elif isinstance(doc, dict) and "metadata" in doc:
                        doc["metadata"]["content_hash"] = file_hash
                n = db.upsert_documents(docs)
                if is_update:
                    total_updated += n
                    print(f"  [{i}/{len(md_files)}] {f.name}: +{n} new chunks")
                else:
                    total_added += n
                    print(f"  [{i}/{len(md_files)}] {f.name}: {n} chunks")
            else:
                print(f"  [{i}/{len(md_files)}] {f.name}: SKIPPED (no chunks)")
        except Exception as e:
            print(f"  [{i}/{len(md_files)}] {f.name}: ERROR - {e}")

    # Summary
    new_count = db.count()
    has_changes = deleted_count > 0 or total_updated > 0 or total_added > 0

    # Delete BM25 cache if changes were made (BM25 doesn't support incremental updates)
    if has_changes:
        bm25_cache = REPO_ROOT / "data" / "chroma" / "bm25_cache.pkl"
        if bm25_cache.exists():
            bm25_cache.unlink()
            print("\n[!] Deleted BM25 cache (will auto-rebuild on next query)")

    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print("=" * 60)
    print(f"  Deleted (orphaned): {deleted_count} chunks")
    print(f"  Updated: {total_updated} chunks")
    print(f"  Added: {total_added} chunks")
    print(f"  Skipped: {skipped} files")
    print(f"  DB docs: {old_count} -> {new_count} ({new_count - old_count:+d})")
    print("\nDONE!")


if __name__ == "__main__":
    main()
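
# Usage sketch. The script filename below is hypothetical; substitute the
# actual name. Whatever it is called, the file must live one directory below
# the repo root so that Path(__file__).resolve().parents[1] points at the
# repo root (where data/data_process/ and data/chroma/ are expected).
#
#   python scripts/build_db.py              # incremental: add/update/clean
#   python scripts/build_db.py --force      # re-embed every file from scratch
#   python scripts/build_db.py --no-delete  # keep orphaned chunks in the DB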