"""
Test data validation fixes for MCP paper parsing and PDF processing.
This test verifies that malformed data (dicts instead of lists) is handled correctly.
"""
| | import sys |
| | from datetime import datetime |
| | from utils.schemas import Paper |
| | from utils.pdf_processor import PDFProcessor |
| |
|
| |
|
def test_paper_schema_validators():
    """Exercise the Paper schema with wrongly-shaped field values.

    Three scenarios run in order, each reporting progress on stdout:

    1. ``authors`` passed as a dict.
    2. ``categories`` passed as a dict.
    3. ``title``, ``authors``, ``abstract``, ``pdf_url`` and ``categories``
       all passed as dicts.

    Returns:
        bool: True when every scenario completes. False as soon as one
        scenario raises any exception (a failed assertion included); later
        scenarios are then not run.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 1: Paper Schema Validators")
    print(rule)

    # Scenario 1: authors arrives as a dict instead of a list.
    print("\n1. Testing authors as dict (malformed data)...")
    try:
        record = Paper(
            arxiv_id="test.001",
            title="Test Paper",
            authors={"author1": "John Doe", "author2": "Jane Smith"},
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.001.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )
        print(" β Paper created successfully")
        authors = record.authors
        print(f" Authors type: {type(authors)}")
        print(f" Authors value: {authors}")
        assert isinstance(authors, list), "Authors should be normalized to list"
        print(" β Authors correctly normalized to list")
    except Exception as exc:
        # Any error, assertion failures included, is reported as a failed run.
        print(" β Failed: " + str(exc))
        return False

    # Scenario 2: categories arrives as a dict instead of a list.
    print("\n2. Testing categories as dict (malformed data)...")
    try:
        record = Paper(
            arxiv_id="test.002",
            title="Test Paper 2",
            authors=["John Doe"],
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.002.pdf",
            published=datetime.now(),
            categories={"cat1": "cs.AI", "cat2": "cs.LG"},
        )
        print(" β Paper created successfully")
        categories = record.categories
        print(f" Categories type: {type(categories)}")
        print(f" Categories value: {categories}")
        assert isinstance(categories, list), "Categories should be normalized to list"
        print(" β Categories correctly normalized to list")
    except Exception as exc:
        print(" β Failed: " + str(exc))
        return False

    # Scenario 3: several fields are dicts at the same time.
    print("\n3. Testing multiple fields malformed...")
    try:
        record = Paper(
            arxiv_id="test.003",
            title={"title": "Test Paper 3"},
            authors={"names": ["John Doe", "Jane Smith"]},
            abstract={"summary": "Test abstract"},
            pdf_url={"url": "https://arxiv.org/pdf/test.003.pdf"},
            published=datetime.now(),
            categories={"categories": ["cs.AI"]},
        )
        print(" β Paper created successfully")
        print(f" Title type: {type(record.title)}, value: {record.title}")
        print(f" Authors type: {type(record.authors)}, value: {record.authors}")
        print(f" Abstract type: {type(record.abstract)}, value: {record.abstract[:50]}...")
        print(f" PDF URL type: {type(record.pdf_url)}, value: {record.pdf_url}")
        print(f" Categories type: {type(record.categories)}, value: {record.categories}")

        # Checked in this order so the first failing field decides the message.
        expectations = (
            (record.title, str, "Title should be normalized to string"),
            (record.authors, list, "Authors should be normalized to list"),
            (record.abstract, str, "Abstract should be normalized to string"),
            (record.pdf_url, str, "PDF URL should be normalized to string"),
            (record.categories, list, "Categories should be normalized to list"),
        )
        for value, expected_type, message in expectations:
            assert isinstance(value, expected_type), message
        print(" β All fields correctly normalized")
    except Exception as exc:
        print(" β Failed: " + str(exc))
        return False

    print("\n" + rule)
    print("β ALL PAPER SCHEMA VALIDATION TESTS PASSED")
    print(rule)
    return True
| |
|
| |
|
def test_pdf_processor_resilience():
    """Chunk sample text for a Paper whose ``authors`` was supplied as a dict.

    Builds a ``PDFProcessor`` with ``chunk_size=100`` and ``chunk_overlap=10``,
    creates a Paper with dict-shaped ``authors``, chunks a repeated sample
    sentence, and checks that the first chunk's ``metadata['authors']`` is a
    list. Progress is reported on stdout.

    Returns:
        bool: True when the check passes. False when Paper creation, chunking
        or the assertion raises; the traceback is printed in that case.
        An error raised while constructing the processor is not caught here.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 2: PDFProcessor Resilience")
    print(rule)

    # Built outside the try block: a constructor failure propagates to the caller.
    processor = PDFProcessor(chunk_size=100, chunk_overlap=10)

    print("\n1. Testing PDF processor with validated Paper object...")
    try:
        record = Paper(
            arxiv_id="test.004",
            title="Test Paper",
            authors={"author1": "John Doe"},
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.004.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )

        # Repeated sentence, long enough to span several chunks (presumably;
        # depends on how chunk_size is measured by the processor).
        sample_text = "This is a test document. " * 100

        pieces = processor.chunk_text(sample_text, record)
        print(f" β Created {len(pieces)} chunks successfully")

        first_authors = pieces[0].metadata['authors']
        print(f" First chunk metadata authors type: {type(first_authors)}")
        print(f" First chunk metadata authors: {first_authors}")

        assert isinstance(first_authors, list), "Chunk metadata authors should be list"
        print(" β Chunk metadata correctly contains list for authors")

    except Exception as exc:
        print(" β Failed: " + str(exc))
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
        return False

    print("\n" + rule)
    print("β PDF PROCESSOR RESILIENCE TESTS PASSED")
    print(rule)
    return True
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: run both checks, print a summary, and exit with
    # status 0 when both pass or 1 otherwise.
    banner = "=" * 80

    print("\n" + banner)
    print("DATA VALIDATION FIX VERIFICATION TESTS")
    print(banner)
    print("\nThese tests verify that the fixes for malformed MCP data work correctly:")
    print("- Paper schema validators normalize dict fields to proper types")
    print("- PDF processor handles validated Paper objects without errors")
    print(banner)

    # Both checks always run, even if the first one fails.
    test1_pass = test_paper_schema_validators()
    test2_pass = test_pdf_processor_resilience()

    schema_status = 'β PASS' if test1_pass else 'β FAIL'
    processor_status = 'β PASS' if test2_pass else 'β FAIL'

    print("\n" + banner)
    print("FINAL RESULTS")
    print(banner)
    print(f"Paper Schema Validators: {schema_status}")
    print(f"PDF Processor Resilience: {processor_status}")
    print(banner)

    if not (test1_pass and test2_pass):
        print("\nβ SOME TESTS FAILED - Please review the errors above")
        sys.exit(1)

    print("\nβ ALL TESTS PASSED - The data validation fixes are working correctly!")
    print("\nThe system should now handle malformed MCP responses gracefully.")
    sys.exit(0)
| |
|