"""
Ingest ONLY PDFs from the hackathon_data folder.

Parallel processing with 4 workers using ThreadPoolExecutor
(better suited than processes for I/O-bound tasks such as API calls).
"""
import os
import sys
import time
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
# Load environment first (before any imports that need env vars)
load_dotenv()
# Project paths: PDFs are read from <repo>/data/hackathon_data,
# run summaries are written to <repo>/output/ingestion.
PROJECT_ROOT = Path(__file__).parent.parent  # this script lives one level below the repo root
PDFS_DIR = PROJECT_ROOT / "data" / "hackathon_data" # Changed to hackathon_data
OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"
# Add this script's directory to sys.path so workers can `import ingest_pdfs` lazily.
sys.path.insert(0, str(Path(__file__).parent))
def worker_ingest(pdf_path: str):
    """Ingest a single PDF and return the result dict from ingest_pdfs.

    Imports ``ingest_pdfs`` lazily inside the worker to avoid shared global
    state when running under a thread/process pool. Any exception (including
    a failed import) is converted into an error-result dict rather than
    propagating, so one bad PDF never kills the whole run.

    Args:
        pdf_path: filesystem path to the PDF to ingest.

    Returns:
        The dict produced by ``ingest_pdfs.ingest_pdf`` on success, or an
        error dict with keys ``pdf_name``, ``status``, ``error`` and
        ``traceback`` on failure.
    """
    try:
        # Lazy import: keeps module import side effects out of parallel workers.
        import ingest_pdfs
        return ingest_pdfs.ingest_pdf(str(pdf_path))
    except Exception as exc:
        import traceback
        failure = {
            "pdf_name": Path(pdf_path).name,
            "status": "error",
            "error": str(exc),
            "traceback": traceback.format_exc(),
        }
        return failure
def main(max_workers: int = 4):
    """Run the parallel ingestion pipeline over every PDF in PDFS_DIR.

    Steps:
      1. Validate required environment variables (fail fast if missing).
      2. Discover ``*.pdf`` files in PDFS_DIR.
      3. Ingest them concurrently with a ThreadPoolExecutor.
      4. Write a JSON run summary to OUTPUT_DIR.
      5. Best-effort: print final Pinecone index stats.

    Args:
        max_workers: number of concurrent ingestion workers (default 4,
            matching the original hard-coded behavior).

    NOTE(review): the original source had mojibake-corrupted emoji in its
    status strings and two print() calls whose f-strings were broken across
    lines (syntax errors); those are replaced with plain ASCII markers here.
    """
    banner = "=" * 70
    print("\n" + banner)
    print(f"HACKATHON DATA INGESTION ({max_workers}x PARALLEL)")
    print(banner)
    print(f"PDF Directory: {PDFS_DIR}")
    print(f"Workers: {max_workers} PDFs at once")
    print(f"Vector Database: Pinecone ({os.getenv('PINECONE_INDEX_NAME', 'hackathon')})")
    print(banner)

    # Validate required environment variables before doing any work.
    required_env_vars = [
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "PINECONE_API_KEY",
        "PINECONE_INDEX_NAME",
    ]
    missing_vars = [var for var in required_env_vars if not os.getenv(var)]
    if missing_vars:
        print("\n[ERROR] Missing required environment variables:")
        for var in missing_vars:
            print(f"   - {var}")
        print("\nPlease set these in your .env file.")
        return

    # Check that the input directory exists.
    if not PDFS_DIR.exists():
        print(f"\n[ERROR] Directory not found: {PDFS_DIR}")
        print("   Please create the directory and add PDFs to it.")
        return

    # Collect all PDFs (sorted for deterministic ordering of the run).
    all_pdfs = sorted(PDFS_DIR.glob("*.pdf"))
    print(f"\nFound {len(all_pdfs)} PDFs in hackathon_data folder")
    if not all_pdfs:
        print("\n[ERROR] No PDFs found in hackathon_data folder!")
        print(f"   Please add PDF files to: {PDFS_DIR}")
        return
    for pdf in all_pdfs:
        print(f"   - {pdf.name}")

    print(f"\nStarting parallel processing with {max_workers} workers...")
    # Rough ETA: assumes ~80s per PDF, divided across the workers.
    print(f"Estimated time: ~{len(all_pdfs) * 80 / max_workers / 60:.1f} minutes\n")

    results = []
    completed = 0
    start_time = time.time()

    # ThreadPoolExecutor over processes: ingestion is I/O-bound
    # (network calls to Azure OpenAI and Pinecone release the GIL).
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all jobs, keeping a future -> pdf map for progress reporting.
        future_to_pdf = {
            executor.submit(worker_ingest, str(pdf)): pdf
            for pdf in all_pdfs
        }
        # Collect results as they complete (not in submission order).
        for future in as_completed(future_to_pdf):
            pdf = future_to_pdf[future]
            completed += 1
            try:
                result = future.result()
                results.append(result)
                if result.get("status") == "success":
                    # Running-average ETA based on elapsed wall time.
                    elapsed = time.time() - start_time
                    avg_time = elapsed / completed
                    remaining = len(all_pdfs) - completed
                    eta = remaining * avg_time / 60
                    print(f"[OK] [{completed}/{len(all_pdfs)}] {pdf.name}")
                    print(f"   {result['num_vectors']} vectors, {result['time_total']:.1f}s")
                    print(f"   ETA: {eta:.1f} minutes remaining\n")
                else:
                    print(f"[FAIL] [{completed}/{len(all_pdfs)}] {pdf.name} - {result.get('error', 'Unknown error')}\n")
            except Exception as e:
                # Worker raised outside worker_ingest's own try/except
                # (e.g. the future itself was cancelled or crashed).
                print(f"[FAIL] [{completed}/{len(all_pdfs)}] {pdf.name} - Error: {e}\n")
                results.append({
                    "pdf_name": pdf.name,
                    "status": "error",
                    "error": str(e),
                })

    total_time = time.time() - start_time

    # Summary
    print("\n" + banner)
    print("INGESTION COMPLETE")
    print(banner)
    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]
    print(f"\nSuccessful: {len(successful)}/{len(all_pdfs)}")
    print(f"Failed: {len(failed)}")
    print(f"Total Time: {total_time / 60:.1f} minutes")
    if successful:
        total_vectors = sum(r["num_vectors"] for r in successful)
        avg_time = sum(r["time_total"] for r in successful) / len(successful)
        print(f"\nTotal Vectors Uploaded: {total_vectors}")
        print(f"Average Time per PDF: {avg_time:.1f}s")

    # Persist a machine-readable summary of the run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    results_file = OUTPUT_DIR / "hackathon_data_ingestion.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "source_folder": "hackathon_data",
            "total_pdfs": len(all_pdfs),
            "successful": len(successful),
            "failed": len(failed),
            "total_time_seconds": round(total_time, 2),
            "results": results,
        }, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {results_file}")

    # Final Pinecone stats — best effort only; failure here is non-fatal.
    try:
        from pinecone import Pinecone
        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
        stats = index.describe_index_stats()
        print("\nFinal Pinecone Stats:")
        # Pinecone client versions differ: stats may expose attributes
        # or behave like a dict, so try both access styles.
        total_vectors = getattr(stats, 'total_vector_count', None) or stats.get('total_vector_count', 0)
        dimension = getattr(stats, 'dimension', None) or stats.get('dimension', 0)
        print(f"   Total Vectors: {total_vectors}")
        print(f"   Dimensions: {dimension}")
        namespaces = getattr(stats, 'namespaces', None) or stats.get('namespaces', {})
        if namespaces:
            print(f"   Namespaces: {len(namespaces)}")
    except Exception as e:
        print(f"\n[WARN] Could not fetch Pinecone stats: {e}")
        print("   (This is non-fatal - ingestion was still successful)")

    print("\n" + banner)
    print("HACKATHON DATA INGESTION COMPLETE!")
    print(banner)
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|