| | """ |
| | Example: RAG Pipeline |
| | |
| | Demonstrates: |
| | 1. Indexing documents into vector store |
| | 2. Semantic search |
| | 3. Question answering with citations |
| | """ |
| |
|
| | from pathlib import Path |
| | from loguru import logger |
| |
|
| | |
| | from src.rag import ( |
| | VectorStoreConfig, |
| | EmbeddingConfig, |
| | RetrieverConfig, |
| | GeneratorConfig, |
| | get_document_indexer, |
| | get_document_retriever, |
| | get_grounded_generator, |
| | ) |
| |
|
| |
|
def example_indexing():
    """Index the sample PDF into the vector store and print index stats.

    Looks for ``./data/sample.pdf``; if present, indexes it via the
    project's document indexer and reports per-document and whole-index
    statistics.

    Returns:
        bool: True if the document was indexed successfully, False if the
        sample file is missing or indexing reported a failure.
    """
    print("=" * 50)
    print("Document Indexing")
    print("=" * 50)

    indexer = get_document_indexer()

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf")
        return False

    result = indexer.index_document(sample_doc)

    if result.success:
        print(f"\nIndexed: {result.source_path}")
        print(f"  Document ID: {result.document_id}")
        print(f"  Chunks indexed: {result.num_chunks_indexed}")
        print(f"  Chunks skipped: {result.num_chunks_skipped}")
    else:
        print(f"Indexing failed: {result.error}")
        return False

    stats = indexer.get_index_stats()
    # Plain literal: was an f-string with no placeholders (ruff F541).
    print("\nIndex Stats:")
    print(f"  Total chunks: {stats['total_chunks']}")
    print(f"  Documents: {stats['num_documents']}")
    print(f"  Embedding model: {stats['embedding_model']}")

    return True
| |
|
| |
|
def example_search():
    """Run a few semantic-search queries against the indexed documents."""
    bar = "=" * 50
    print("\n" + bar)
    print("Semantic Search")
    print(bar)

    retriever = get_document_retriever()

    sample_queries = (
        "What is the main topic?",
        "key findings",
        "conclusions and recommendations",
    )

    for q in sample_queries:
        print(f"\nQuery: '{q}'")

        hits = retriever.retrieve(q, top_k=3)
        if not hits:
            print("  No results found")
            continue

        rank = 1
        for hit in hits:
            print(f"\n  [{rank}] Similarity: {hit.similarity:.3f}")
            # Page metadata is optional; only print it when present.
            if hit.page is not None:
                print(f"  Page: {hit.page + 1}")
            print(f"  Text: {hit.text[:150]}...")
            rank += 1
|
| |
|
def example_question_answering():
    """Ask a few questions and print grounded answers with citations."""
    bar = "=" * 50
    print("\n" + bar)
    print("Question Answering with Citations")
    print(bar)

    generator = get_grounded_generator()

    for question in (
        "What is the main purpose of this document?",
        "What are the key findings?",
        "What recommendations are made?",
    ):
        print(f"\nQuestion: {question}")
        print("-" * 40)

        result = generator.answer_question(question, top_k=5)

        print(f"\nAnswer: {result.answer}")
        print(f"\nConfidence: {result.confidence:.2f}")

        # When the generator abstains, surface its reason to the user.
        if result.abstained:
            print(f"Note: {result.abstain_reason}")

        citations = result.citations
        if citations:
            print(f"\nCitations ({len(citations)}):")
            for cite in citations:
                # Page metadata is optional; omit the label when absent.
                page = "" if cite.page is None else f"Page {cite.page + 1}"
                print(f"  [{cite.index}] {page}: {cite.text_snippet[:60]}...")
| |
|
| |
|
def example_filtered_search():
    """Search with metadata filters: table-only chunks, then a page range.

    Demonstrates the retriever's filtered entry points
    (``retrieve_tables`` and ``retrieve_by_page``) and prints the results.
    """
    print("\n" + "=" * 50)
    print("Filtered Search")
    print("=" * 50)

    retriever = get_document_retriever()

    print("\nSearching for tables only...")
    table_chunks = retriever.retrieve_tables("data values", top_k=3)

    if table_chunks:
        print(f"Found {len(table_chunks)} table chunks:")
        for chunk in table_chunks:
            # Guard against missing page metadata; the other examples in
            # this file check `page is not None`, and `None + 1` would
            # raise TypeError here.
            page = chunk.page + 1 if chunk.page is not None else "?"
            print(f"  - Page {page}: {chunk.text[:100]}...")
    else:
        print("No table chunks found")

    print("\nSearching pages 1-3...")
    page_chunks = retriever.retrieve_by_page(
        "introduction",
        page_range=(0, 2),  # zero-based internally; shown as pages 1-3
        top_k=3,
    )

    if page_chunks:
        print(f"Found {len(page_chunks)} chunks in pages 1-3:")
        for chunk in page_chunks:
            # Same None-page guard as above.
            page = chunk.page + 1 if chunk.page is not None else "?"
            print(f"  - Page {page}: {chunk.text[:100]}...")
    else:
        print("No chunks found in specified pages")
| |
|
| |
|
def example_full_pipeline():
    """Run the complete RAG pipeline demo end to end.

    Steps: index the sample document, then exercise semantic search,
    question answering, and filtered search. Aborts early if indexing
    fails, since the later steps need a populated index.
    """
    print("\n" + "=" * 50)
    print("Full RAG Pipeline Demo")
    print("=" * 50)

    print("\n[Step 1] Indexing documents...")
    if not example_indexing():
        # Nothing indexed -> searching/answering would be meaningless.
        return

    print("\n[Step 2] Testing search...")
    example_search()

    print("\n[Step 3] Question answering...")
    example_question_answering()

    # example_filtered_search was defined but never invoked anywhere in
    # this script; run it so the demo actually covers filtered retrieval.
    print("\n[Step 4] Filtered search...")
    example_filtered_search()

    print("\n" + "=" * 50)
    print("Pipeline demo complete!")
    print("=" * 50)
| |
|
| |
|
# Script entry point: run the full demo when executed directly
# (not when imported as a module).
if __name__ == "__main__":
    example_full_pipeline()
| |
|