""" Unit Tests for RAG Integration with Document Intelligence Tests the bridge between document_intelligence and RAG subsystems: - DocIntIndexer: Indexing ParseResult into vector store - DocIntRetriever: Semantic retrieval with evidence - RAG Tools: IndexDocumentTool, RetrieveChunksTool, RAGAnswerTool """ import pytest from unittest.mock import Mock, MagicMock, patch from typing import List class TestDocIntBridge: """Tests for the document intelligence RAG bridge.""" def test_bridge_imports(self): """Test that bridge module imports correctly.""" from src.rag.docint_bridge import ( DocIntIndexer, DocIntRetriever, get_docint_indexer, get_docint_retriever, ) assert DocIntIndexer is not None assert DocIntRetriever is not None def test_indexer_creation(self): """Test DocIntIndexer creation.""" from src.rag.docint_bridge import DocIntIndexer from src.rag.indexer import IndexerConfig config = IndexerConfig( batch_size=16, include_bbox=True, min_chunk_length=5, ) # Create with mock store and embedder mock_store = Mock() mock_embedder = Mock() mock_embedder.embed_batch = Mock(return_value=[[0.1] * 768]) indexer = DocIntIndexer( config=config, vector_store=mock_store, embedding_adapter=mock_embedder, ) assert indexer.config.batch_size == 16 assert indexer.config.include_bbox is True def test_retriever_creation(self): """Test DocIntRetriever creation.""" from src.rag.docint_bridge import DocIntRetriever mock_store = Mock() mock_embedder = Mock() retriever = DocIntRetriever( vector_store=mock_store, embedding_adapter=mock_embedder, similarity_threshold=0.6, ) assert retriever.similarity_threshold == 0.6 class TestDocIntIndexer: """Tests for DocIntIndexer functionality.""" @pytest.fixture def mock_parse_result(self): """Create a mock ParseResult for testing.""" from src.document_intelligence.chunks import ( ParseResult, DocumentChunk, ChunkType, BoundingBox, ) chunks = [ DocumentChunk( chunk_id="chunk_001", doc_id="test_doc", chunk_type=ChunkType.PARAGRAPH, text="This is a test paragraph with enough content to index.", page=1, bbox=BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.2), confidence=0.9, sequence_index=0, ), DocumentChunk( chunk_id="chunk_002", doc_id="test_doc", chunk_type=ChunkType.PARAGRAPH, text="Second paragraph with different content for testing.", page=1, bbox=BoundingBox(x_min=0.1, y_min=0.3, x_max=0.9, y_max=0.4), confidence=0.85, sequence_index=1, ), DocumentChunk( chunk_id="chunk_003", doc_id="test_doc", chunk_type=ChunkType.TABLE, text="| Header | Value |\n| --- | --- |\n| A | 100 |", page=2, bbox=BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.5), confidence=0.95, sequence_index=2, ), ] return ParseResult( doc_id="test_doc", filename="test.pdf", chunks=chunks, num_pages=2, processing_time_ms=100, markdown_full="# Test Document\n\nContent here.", ) def test_index_parse_result(self, mock_parse_result): """Test indexing a ParseResult.""" from src.rag.docint_bridge import DocIntIndexer mock_store = Mock() mock_store.add_chunks = Mock() mock_embedder = Mock() # Return embeddings for each chunk mock_embedder.embed_batch = Mock(return_value=[ [0.1] * 768, [0.2] * 768, [0.3] * 768, ]) indexer = DocIntIndexer( vector_store=mock_store, embedding_adapter=mock_embedder, ) result = indexer.index_parse_result(mock_parse_result) assert result.success is True assert result.document_id == "test_doc" assert result.num_chunks_indexed == 3 assert result.num_chunks_skipped == 0 # Verify store was called mock_store.add_chunks.assert_called_once() def test_index_skips_short_chunks(self, mock_parse_result): """Test that short chunks are skipped.""" from src.rag.docint_bridge import DocIntIndexer from src.rag.indexer import IndexerConfig # Add a short chunk from src.document_intelligence.chunks import ( DocumentChunk, ChunkType, BoundingBox, ) mock_parse_result.chunks.append( DocumentChunk( chunk_id="chunk_short", doc_id="test_doc", chunk_type=ChunkType.PARAGRAPH, text="Short", # Too short page=1, bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1), confidence=0.9, sequence_index=3, ) ) config = IndexerConfig(min_chunk_length=10) mock_store = Mock() mock_store.add_chunks = Mock() mock_embedder = Mock() mock_embedder.embed_batch = Mock(return_value=[ [0.1] * 768, [0.2] * 768, [0.3] * 768, ]) indexer = DocIntIndexer( config=config, vector_store=mock_store, embedding_adapter=mock_embedder, ) result = indexer.index_parse_result(mock_parse_result) assert result.success is True assert result.num_chunks_indexed == 3 assert result.num_chunks_skipped == 1 # Short chunk skipped def test_delete_document(self): """Test deleting a document from index.""" from src.rag.docint_bridge import DocIntIndexer mock_store = Mock() mock_store.delete_document = Mock(return_value=5) indexer = DocIntIndexer(vector_store=mock_store) deleted = indexer.delete_document("test_doc") assert deleted == 5 mock_store.delete_document.assert_called_once_with("test_doc") class TestDocIntRetriever: """Tests for DocIntRetriever functionality.""" def test_retrieve_chunks(self): """Test basic chunk retrieval.""" from src.rag.docint_bridge import DocIntRetriever from src.rag.store import VectorSearchResult # Mock search results mock_results = [ VectorSearchResult( chunk_id="chunk_001", document_id="test_doc", text="Relevant content about the query.", similarity=0.85, page=1, chunk_type="paragraph", bbox={"x_min": 0.1, "y_min": 0.1, "x_max": 0.9, "y_max": 0.2}, metadata={"source_path": "test.pdf", "confidence": 0.9}, ), VectorSearchResult( chunk_id="chunk_002", document_id="test_doc", text="Another relevant chunk.", similarity=0.75, page=2, chunk_type="paragraph", bbox={"x_min": 0.1, "y_min": 0.3, "x_max": 0.9, "y_max": 0.4}, metadata={"source_path": "test.pdf", "confidence": 0.85}, ), ] mock_store = Mock() mock_store.search = Mock(return_value=mock_results) mock_embedder = Mock() mock_embedder.embed_text = Mock(return_value=[0.1] * 768) retriever = DocIntRetriever( vector_store=mock_store, embedding_adapter=mock_embedder, similarity_threshold=0.5, ) chunks = retriever.retrieve("test query", top_k=5) assert len(chunks) == 2 assert chunks[0]["chunk_id"] == "chunk_001" assert chunks[0]["similarity"] == 0.85 def test_retrieve_with_evidence(self): """Test retrieval with evidence references.""" from src.rag.docint_bridge import DocIntRetriever from src.rag.store import VectorSearchResult mock_results = [ VectorSearchResult( chunk_id="chunk_001", document_id="test_doc", text="Content with evidence.", similarity=0.9, page=1, chunk_type="paragraph", bbox={"x_min": 0.1, "y_min": 0.1, "x_max": 0.9, "y_max": 0.2}, metadata={}, ), ] mock_store = Mock() mock_store.search = Mock(return_value=mock_results) mock_embedder = Mock() mock_embedder.embed_text = Mock(return_value=[0.1] * 768) retriever = DocIntRetriever( vector_store=mock_store, embedding_adapter=mock_embedder, ) chunks, evidence_refs = retriever.retrieve_with_evidence("query") assert len(chunks) == 1 assert len(evidence_refs) == 1 assert evidence_refs[0].chunk_id == "chunk_001" assert evidence_refs[0].page == 1 def test_retrieve_with_filters(self): """Test retrieval with filters.""" from src.rag.docint_bridge import DocIntRetriever mock_store = Mock() mock_store.search = Mock(return_value=[]) mock_embedder = Mock() mock_embedder.embed_text = Mock(return_value=[0.1] * 768) retriever = DocIntRetriever( vector_store=mock_store, embedding_adapter=mock_embedder, ) # Retrieve with document filter chunks = retriever.retrieve( "query", document_id="specific_doc", chunk_types=["paragraph", "table"], page_range=(1, 5), ) # Verify filters were passed to store call_args = mock_store.search.call_args filters = call_args.kwargs.get("filters") assert filters["document_id"] == "specific_doc" assert filters["chunk_type"] == ["paragraph", "table"] assert filters["page"] == {"min": 1, "max": 5} def test_build_context(self): """Test context building from chunks.""" from src.rag.docint_bridge import DocIntRetriever retriever = DocIntRetriever() chunks = [ { "chunk_id": "c1", "text": "First chunk content.", "page": 1, "chunk_type": "paragraph", "similarity": 0.9, }, { "chunk_id": "c2", "text": "Second chunk content.", "page": 2, "chunk_type": "table", "similarity": 0.8, }, ] context = retriever.build_context(chunks) assert "[1]" in context assert "[2]" in context assert "Page 1" in context assert "Page 2" in context assert "First chunk content" in context assert "Second chunk content" in context class TestRAGTools: """Tests for RAG tools in document_intelligence.""" def test_tool_imports(self): """Test that RAG tools import correctly.""" from src.document_intelligence.tools import ( IndexDocumentTool, RetrieveChunksTool, RAGAnswerTool, DeleteDocumentTool, GetIndexStatsTool, get_rag_tool, list_rag_tools, ) assert IndexDocumentTool is not None assert RetrieveChunksTool is not None assert RAGAnswerTool is not None def test_list_rag_tools(self): """Test listing RAG tools.""" from src.document_intelligence.tools import list_rag_tools tools = list_rag_tools() assert len(tools) >= 3 tool_names = [t["name"] for t in tools] assert "index_document" in tool_names assert "retrieve_chunks" in tool_names assert "rag_answer" in tool_names def test_get_rag_tool(self): """Test getting RAG tool by name.""" from src.document_intelligence.tools import get_rag_tool tool = get_rag_tool("index_document") assert tool.name == "index_document" tool = get_rag_tool("retrieve_chunks") assert tool.name == "retrieve_chunks" @patch("src.document_intelligence.tools.rag_tools.RAG_AVAILABLE", False) def test_tool_graceful_degradation(self): """Test that tools handle missing RAG gracefully.""" from src.document_intelligence.tools.rag_tools import IndexDocumentTool tool = IndexDocumentTool() result = tool.execute(path="test.pdf") assert result.success is False assert "not available" in result.error.lower() class TestAnswerQuestionRAGMode: """Tests for AnswerQuestionTool with RAG mode.""" def test_answer_with_keywords(self): """Test keyword-based answering (use_rag=False).""" from src.document_intelligence.tools import get_tool from src.document_intelligence.chunks import ( ParseResult, DocumentChunk, ChunkType, BoundingBox, ) # Create mock parse result chunks = [ DocumentChunk( chunk_id="chunk_001", doc_id="test_doc", chunk_type=ChunkType.PARAGRAPH, text="The total amount due is $500.00 as shown on page one.", page=1, bbox=BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.2), confidence=0.9, sequence_index=0, ), ] parse_result = ParseResult( doc_id="test_doc", filename="test.pdf", chunks=chunks, num_pages=1, processing_time_ms=100, markdown_full="# Test", ) tool = get_tool("answer_question") result = tool.execute( parse_result=parse_result, question="What is the total amount?", use_rag=False, ) assert result.success is True assert "500" in result.data.get("answer", "") class TestAbstentionPolicy: """Tests for abstention behavior.""" def test_abstains_on_no_results(self): """Test that system abstains when no relevant chunks found.""" from src.document_intelligence.tools import get_tool from src.document_intelligence.chunks import ( ParseResult, DocumentChunk, ChunkType, BoundingBox, ) # Create parse result with unrelated content chunks = [ DocumentChunk( chunk_id="chunk_001", doc_id="test_doc", chunk_type=ChunkType.PARAGRAPH, text="This document discusses weather patterns in Antarctica.", page=1, bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1), confidence=0.9, sequence_index=0, ), ] parse_result = ParseResult( doc_id="test_doc", filename="test.pdf", chunks=chunks, num_pages=1, processing_time_ms=100, markdown_full="# Test", ) tool = get_tool("answer_question") result = tool.execute( parse_result=parse_result, question="What is the invoice number?", use_rag=False, ) assert result.success is True assert result.data.get("abstained") is True assert result.data.get("confidence", 1.0) == 0.0 class TestEvidenceGeneration: """Tests for evidence reference generation.""" def test_evidence_from_retrieval(self): """Test evidence refs are generated from retrieval.""" from src.rag.docint_bridge import DocIntRetriever from src.rag.store import VectorSearchResult mock_results = [ VectorSearchResult( chunk_id="chunk_001", document_id="doc_001", text="Evidence text here.", similarity=0.9, page=1, chunk_type="paragraph", bbox={"x_min": 0.1, "y_min": 0.2, "x_max": 0.9, "y_max": 0.3}, metadata={"confidence": 0.95}, ), ] mock_store = Mock() mock_store.search = Mock(return_value=mock_results) mock_embedder = Mock() mock_embedder.embed_text = Mock(return_value=[0.1] * 768) retriever = DocIntRetriever( vector_store=mock_store, embedding_adapter=mock_embedder, ) chunks, evidence = retriever.retrieve_with_evidence("query") assert len(evidence) == 1 ev = evidence[0] assert ev.chunk_id == "chunk_001" assert ev.page == 1 assert ev.bbox.x_min == 0.1 assert ev.bbox.y_max == 0.3 assert "Evidence text" in ev.snippet if __name__ == "__main__": pytest.main([__file__, "-v"])