# SOCAR_Hackathon/scripts/ingest_hackathon_data.py
"""
Ingest ONLY PDFs from hackathon_data folder
Parallel processing with 4 workers using ThreadPoolExecutor (better for I/O-bound tasks)
"""
import os
import sys
import time
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
# Load environment first (before any imports that need env vars)
load_dotenv()
# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
PDFS_DIR = PROJECT_ROOT / "data" / "hackathon_data"
OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
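# worker_ingest (below) lazily imports ingest_pdfs, which is expected to sit
# next to this script in scripts/ -- hence the sys.path insert above.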

def worker_ingest(pdf_path: str):
    """
    Worker function to ingest a single PDF.
    Uses lazy imports to avoid issues with multiprocessing/threading.
    """
    try:
        # Import here to avoid global state issues in parallel execution
        import ingest_pdfs

        # Call the ingestion function
        result = ingest_pdfs.ingest_pdf(str(pdf_path))
        return result
    except Exception as e:
        import traceback
        return {
            "pdf_name": Path(pdf_path).name,
            "status": "error",
            "error": str(e),
            "traceback": traceback.format_exc()
        }
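
# Quick single-file smoke test (hypothetical file name; any PDF under
# data/hackathon_data works):
#   result = worker_ingest("data/hackathon_data/sample.pdf")
#   print(result.get("status"), result.get("num_vectors"))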

def main():
    """Main parallel ingestion pipeline"""
    print("\n" + "="*70)
    print("πŸš€ HACKATHON DATA INGESTION (4x PARALLEL)")
    print("="*70)
    print(f"πŸ“‚ PDF Directory: {PDFS_DIR}")
    print("⚑ Workers: 4 PDFs at once")
    print(f"🎯 Vector Database: Pinecone ({os.getenv('PINECONE_INDEX_NAME', 'hackathon')})")
    print("="*70)
    # Validate required environment variables
    required_env_vars = [
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "PINECONE_API_KEY",
        "PINECONE_INDEX_NAME"
    ]
    missing_vars = [var for var in required_env_vars if not os.getenv(var)]
    if missing_vars:
        print("\n❌ Missing required environment variables:")
        for var in missing_vars:
            print(f" - {var}")
        print("\nPlease set these in your .env file.")
        return
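
    # Example .env (placeholder values only):
    #   AZURE_OPENAI_API_KEY=<your-azure-openai-key>
    #   AZURE_OPENAI_ENDPOINT=https://<resource>.openai.azure.com/
    #   PINECONE_API_KEY=<your-pinecone-key>
    #   PINECONE_INDEX_NAME=hackathon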
    # Check if directory exists
    if not PDFS_DIR.exists():
        print(f"\n❌ Directory not found: {PDFS_DIR}")
        print(" Please create the directory and add PDFs to it.")
        return

    # Get all PDFs
    all_pdfs = sorted(PDFS_DIR.glob("*.pdf"))
    print(f"\nπŸ“š Found {len(all_pdfs)} PDFs in hackathon_data folder")
    if not all_pdfs:
        print("\n❌ No PDFs found in hackathon_data folder!")
        print(f" Please add PDF files to: {PDFS_DIR}")
        return

    for pdf in all_pdfs:
        print(f" β†’ {pdf.name}")

    print("\n⚑ Starting parallel processing with 4 workers...")
    print(f"⏱️ Estimated time: ~{len(all_pdfs) * 80 / 4 / 60:.1f} minutes\n")
    # Process in parallel using ThreadPoolExecutor
    # (Better for I/O-bound tasks like API calls to Azure and Pinecone)
    results = []
    completed = 0
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit all jobs
        future_to_pdf = {
            executor.submit(worker_ingest, str(pdf)): pdf
            for pdf in all_pdfs
        }

        # Collect results as they complete
        for future in as_completed(future_to_pdf):
            pdf = future_to_pdf[future]
            completed += 1
            try:
                result = future.result()
                results.append(result)

                if result.get("status") == "success":
                    elapsed = time.time() - start_time
                    avg_time = elapsed / completed
                    remaining = len(all_pdfs) - completed
                    eta = remaining * avg_time / 60
                    print(f"βœ… [{completed}/{len(all_pdfs)}] {pdf.name}")
                    print(f" πŸ“Š {result['num_vectors']} vectors, {result['time_total']:.1f}s")
                    print(f" ⏱️ ETA: {eta:.1f} minutes remaining\n")
                else:
                    print(f"❌ [{completed}/{len(all_pdfs)}] {pdf.name} - {result.get('error', 'Unknown error')}\n")
            except Exception as e:
                print(f"❌ [{completed}/{len(all_pdfs)}] {pdf.name} - Error: {e}\n")
                results.append({
                    "pdf_name": pdf.name,
                    "status": "error",
                    "error": str(e)
                })
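
    # Note: results is in completion order (as_completed), not in the
    # original submission order of the PDFs.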
    total_time = time.time() - start_time

    # Summary
    print("\n" + "="*70)
    print("πŸ“Š INGESTION COMPLETE")
    print("="*70)

    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]

    print(f"\nβœ… Successful: {len(successful)}/{len(all_pdfs)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏱️ Total Time: {total_time/60:.1f} minutes")

    if successful:
        total_vectors = sum(r["num_vectors"] for r in successful)
        avg_time = sum(r["time_total"] for r in successful) / len(successful)
        print(f"\nπŸ“¦ Total Vectors Uploaded: {total_vectors}")
        print(f"⏱️ Average Time per PDF: {avg_time:.1f}s")
    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    results_file = OUTPUT_DIR / "hackathon_data_ingestion.json"
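
    # Rough shape of the saved report (illustrative values only):
    #   {"timestamp": "2025-01-01 12:00:00", "source_folder": "hackathon_data",
    #    "total_pdfs": 10, "successful": 9, "failed": 1,
    #    "total_time_seconds": 812.34, "results": [...]}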
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "source_folder": "hackathon_data",
            "total_pdfs": len(all_pdfs),
            "successful": len(successful),
            "failed": len(failed),
            "total_time_seconds": round(total_time, 2),
            "results": results
        }, f, indent=2, ensure_ascii=False)
    print(f"\nπŸ“„ Results saved to: {results_file}")
    # Final Pinecone stats
    try:
        from pinecone import Pinecone
        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
        stats = index.describe_index_stats()

        print("\nπŸ“Š Final Pinecone Stats:")
        # Handle both dict-like and object attribute access
        total_vectors = getattr(stats, 'total_vector_count', None) or stats.get('total_vector_count', 0)
        dimension = getattr(stats, 'dimension', None) or stats.get('dimension', 0)
        print(f" Total Vectors: {total_vectors}")
        print(f" Dimensions: {dimension}")

        # Show namespaces if available
        namespaces = getattr(stats, 'namespaces', None) or stats.get('namespaces', {})
        if namespaces:
            print(f" Namespaces: {len(namespaces)}")
    except Exception as e:
        print(f"\n⚠️ Could not fetch Pinecone stats: {e}")
        print(" (This is non-fatal - ingestion was still successful)")

    print("\n" + "="*70)
    print("πŸŽ‰ HACKATHON DATA INGESTION COMPLETE!")
    print("="*70)

if __name__ == "__main__":
    main()