"""Debug script to test document analysis extraction.

Picks one patent PDF from ``uploads/patents`` (resolved against the current
working directory), extracts its text with ``DocumentAnalysisAgent`` and then
runs the agent's structure-extraction chain on the beginning of that text,
logging what came back at every step.
"""
import asyncio
import sys
import traceback
from pathlib import Path

# Add src to path (must happen before the ``src.*`` imports below).
sys.path.insert(0, str(Path(__file__).parent))

from src.llm.langchain_ollama_client import get_langchain_client
from src.agents.scenario1.document_analysis_agent import DocumentAnalysisAgent
from langchain_core.output_parsers import JsonOutputParser
from loguru import logger

# Directory scanned for test patents, relative to the current working directory.
PATENT_DIR = "uploads/patents"
# Number of leading characters of the patent text sent to the structure chain.
MAX_STRUCTURE_INPUT_CHARS = 8000
# Number of characters of extracted text / abstract echoed to the log.
TEXT_PREVIEW_CHARS = 500
ABSTRACT_PREVIEW_CHARS = 200


def _log_structure(structure):
    """Log a summary of the dict produced by the structure-extraction chain.

    Fields that are missing are reported as ``NOT FOUND``. Claim lists that are
    missing *or* explicitly ``None`` (a JSON ``null`` from the model) are
    counted as empty instead of raising ``TypeError`` in ``len()``.
    """
    abstract = structure.get('abstract')
    abstract_preview = abstract[:ABSTRACT_PREVIEW_CHARS] if abstract else 'NOT FOUND'
    # ``dict.get(key, [])`` only falls back when the key is absent, not when
    # its value is None, hence the explicit ``or []``.
    independent_claims = structure.get('independent_claims') or []
    dependent_claims = structure.get('dependent_claims') or []

    logger.info("\nExtracted structure:")
    logger.info(f"  Title: {structure.get('title', 'NOT FOUND')}")
    logger.info(f"  Abstract: {abstract_preview}")
    logger.info(f"  Patent ID: {structure.get('patent_id', 'NOT FOUND')}")
    logger.info(f"  Independent claims: {len(independent_claims)}")
    logger.info(f"  Dependent claims: {len(dependent_claims)}")
    logger.info(f"\nFull structure keys: {structure.keys()}")


async def main():
    """Run text extraction and structure extraction on one patent PDF.

    Logs an error and returns early when no PDF is found in ``PATENT_DIR``.
    Any exception raised while invoking the structure chain or reporting its
    result is logged together with its traceback rather than propagated.
    """
    # Sort so the same file is picked on every run; glob order depends on the
    # underlying filesystem.
    patent_files = sorted(Path(PATENT_DIR).glob("*.pdf"))
    if not patent_files:
        logger.error(f"No patent PDFs found in {PATENT_DIR}")
        return

    test_patent = str(patent_files[0])
    logger.info(f"Testing with patent: {test_patent}")

    # Initialize LLM client and agent
    llm_client = get_langchain_client(default_complexity='standard')
    agent = DocumentAnalysisAgent(llm_client)

    # Extract text
    logger.info("Step 1: Extracting text...")
    patent_text = await agent._extract_patent_text(test_patent)
    logger.info(f"Extracted text length: {len(patent_text)} characters")
    logger.info(f"First 500 chars: {patent_text[:TEXT_PREVIEW_CHARS]}")

    # Test structure extraction
    logger.info("\nStep 2: Extracting structure...")
    parser = JsonOutputParser()

    try:
        structure = await agent.structure_chain.ainvoke({
            "patent_text": patent_text[:MAX_STRUCTURE_INPUT_CHARS],
            "format_instructions": parser.get_format_instructions(),
        })

        # A JSON parser may yield a list or None; report that clearly instead
        # of failing later with an opaque AttributeError on ``.get``.
        if not isinstance(structure, dict):
            logger.error(
                "Structure extraction failed: expected a JSON object, "
                f"got {type(structure).__name__}: {structure!r}"
            )
            return

        _log_structure(structure)
    except Exception as e:
        # Top-level boundary of a debug script: report everything, never crash.
        logger.error(f"Structure extraction failed: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())