File size: 2,320 Bytes
a9dc537 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
"""
Debug script to test document analysis extraction
"""
import asyncio
import sys
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent))
from src.llm.langchain_ollama_client import get_langchain_client
from src.agents.scenario1.document_analysis_agent import DocumentAnalysisAgent
from loguru import logger
async def main():
    """Run document-analysis extraction on one patent PDF and log the results.

    Steps:
      1. Pick a PDF from ``uploads/patents`` (resolved against the current
         working directory, so run the script from the project root).
      2. Extract its text with ``DocumentAnalysisAgent._extract_patent_text``.
      3. Send the first 8000 characters to the agent's ``structure_chain``
         and log the main fields of the returned structure.

    Returns:
        None. Everything is reported through the loguru ``logger``; failures
        are logged rather than raised.
    """
    patent_path = "uploads/patents"

    # glob() yields entries in arbitrary order; sort so that repeated debug
    # runs always exercise the same file.
    patent_files = sorted(Path(patent_path).glob("*.pdf"))
    if not patent_files:
        logger.error(f"No patent PDFs found in {patent_path}")
        return

    test_patent = str(patent_files[0])
    logger.info(f"Testing with patent: {test_patent}")

    # Initialize LLM client and agent
    llm_client = get_langchain_client(default_complexity='standard')
    agent = DocumentAnalysisAgent(llm_client)

    # Step 1: raw text extraction
    logger.info("Step 1: Extracting text...")
    patent_text = await agent._extract_patent_text(test_patent)
    if not patent_text:
        # Nothing to analyse; avoid sending an empty prompt to the LLM
        # (also covers a None result, which would crash on len()).
        logger.error(f"No text could be extracted from {test_patent}")
        return

    logger.info(f"Extracted text length: {len(patent_text)} characters")
    logger.info(f"First 500 chars: {patent_text[:500]}")

    # Step 2: structure extraction
    logger.info("\nStep 2: Extracting structure...")
    # Imported here, as in the original script, so it is only loaded once
    # this step is actually reached.
    from langchain_core.output_parsers import JsonOutputParser

    parser = JsonOutputParser()

    try:
        structure = await agent.structure_chain.ainvoke({
            # Only the leading 8000 characters are sent to the chain.
            "patent_text": patent_text[:8000],
            "format_instructions": parser.get_format_instructions(),
        })

        abstract = structure.get('abstract')
        abstract_preview = abstract[:200] if abstract else 'NOT FOUND'
        # `or []` rather than a .get() default: the key may be present with
        # an explicit null value, which would make len() raise.
        independent_claims = structure.get('independent_claims') or []
        dependent_claims = structure.get('dependent_claims') or []

        logger.info("\nExtracted structure:")
        logger.info(f"  Title: {structure.get('title', 'NOT FOUND')}")
        logger.info(f"  Abstract: {abstract_preview}")
        logger.info(f"  Patent ID: {structure.get('patent_id', 'NOT FOUND')}")
        logger.info(f"  Independent claims: {len(independent_claims)}")
        logger.info(f"  Dependent claims: {len(dependent_claims)}")
        logger.info(f"\nFull structure keys: {structure.keys()}")
    except Exception as e:
        # Top-level boundary of a debug script: report the failure together
        # with its traceback through the logger instead of crashing.
        logger.exception(f"Structure extraction failed: {e}")
if __name__ == "__main__":
    # Script entry point: run the async debug routine to completion on a
    # fresh event loop.
    debug_run = main()
    asyncio.run(debug_run)
|