#!/usr/bin/env python3
"""
Test script for /generate/pdf endpoint (sync API with tracking & GDrive upload).

Tests the complete flow with all features enabled:
- Handwriting insertion
- Visual elements (stamps, logos, figures, barcodes, photos)
- OCR processing
- Ground truth verification
- Analysis and debug visualization
- Dataset export
- Google Drive upload

Usage:
    export GOOGLE_DRIVE_TOKEN=<oauth access token>
    export GOOGLE_DRIVE_REFRESH_TOKEN=<oauth refresh token>
    python test_sync_pdf_api.py

The Google Drive tokens are read from the environment (never commit them to
source control). After the ZIP is received, the script polls the job status
endpoint until the job reaches a terminal state, the user interrupts, or the
endpoint fails too many times in a row.
"""

import io
import os
import sys
import time
import traceback
import zipfile

import requests

# Configuration
BASE_URL = "http://localhost:8000"
POLL_INTERVAL = 10  # seconds between status checks
# Abort polling after this many back-to-back failed polls so a dead or
# misbehaving status endpoint cannot hang the script forever.
MAX_CONSECUTIVE_POLL_ERRORS = 6

# Maps each credential key in PAYLOAD to the environment variable supplying it.
# SECURITY: credentials must never be hardcoded in this file; any token that
# was ever committed to version control should be treated as compromised and
# revoked.
_TOKEN_ENV_VARS = {
    "google_drive_token": "GOOGLE_DRIVE_TOKEN",
    "google_drive_refresh_token": "GOOGLE_DRIVE_REFRESH_TOKEN",
}

# Outcomes reported by _poll_job_status().
_POLL_COMPLETED = "completed"
_POLL_FAILED = "failed"
_POLL_INTERRUPTED = "interrupted"
_POLL_UNREACHABLE = "unreachable"

# Test payload with all features enabled
PAYLOAD = {
    "user_id": 123,
    "google_drive_token": os.environ.get(
        _TOKEN_ENV_VARS["google_drive_token"], ""
    ),
    "google_drive_refresh_token": os.environ.get(
        _TOKEN_ENV_VARS["google_drive_refresh_token"], ""
    ),
    "seed_images": [
        "https://ocr.space/Content/Images/receipt-ocr-original.webp"
    ],
    "prompt_params": {
        "language": "English",
        "doc_type": "business and administrative",
        "gt_type": "Multiple questions about each document, with their answers taken **verbatim** from the document.",
        # NOTE(review): the keys/values in this template are empty strings;
        # they look like placeholders that may have been lost — confirm
        # against the API's expected gt_format.
        "gt_format": "{\"\": \"\", \"\": \"\", ...}",
        "num_solutions": 1,
        "enable_handwriting": True,
        "handwriting_ratio": 0.3,
        "enable_visual_elements": True,
        "visual_element_types": [
            "stamp",
            "logo",
            "figure",
            "barcode",
            "photo"
        ],
        "seed": None,  # Use None for random behavior, or set to integer for reproducibility
        "enable_ocr": True,
        "ocr_language": "en",
        "enable_bbox_normalization": True,
        "enable_gt_verification": True,
        "enable_analysis": True,
        "enable_debug_visualization": True,
        "enable_dataset_export": True,
        "dataset_export_format": "msgpack",
        "output_detail": "dataset"
    }
}


def test_health():
    """Test API health endpoint.

    Returns:
        True when GET {BASE_URL}/health answers with a success status and a
        JSON body, False on any error (the error is printed).
    """
    print("=" * 80)
    print("TESTING API HEALTH")
    print("=" * 80)

    try:
        response = requests.get(f"{BASE_URL}/health", timeout=5)
        response.raise_for_status()
        print(f"✓ API is healthy: {response.json()}\n")
        return True
    except Exception as e:
        print(f"✗ Health check failed: {e}\n")
        return False


def _print_configuration():
    """Print a summary of the request payload that is about to be sent."""
    params = PAYLOAD["prompt_params"]
    print("\nConfiguration:")
    print(f" User ID: {PAYLOAD['user_id']}")
    print(f" Seed Images: {len(PAYLOAD['seed_images'])}")
    print(f" Num Solutions: {params['num_solutions']}")
    print(f" Handwriting: {params['enable_handwriting']} (ratio: {params['handwriting_ratio']})")
    print(f" Visual Elements: {params['enable_visual_elements']} (types: {len(params['visual_element_types'])})")
    print(f" OCR: {params['enable_ocr']}")
    print(f" GT Verification: {params['enable_gt_verification']}")
    print(f" Analysis: {params['enable_analysis']}")
    print(f" Debug Viz: {params['enable_debug_visualization']}")
    print(f" Dataset Export: {params['enable_dataset_export']}")
    print(" Google Drive Upload: Yes")
    print()


def _validate_zip(zip_data):
    """Print the structure of the ZIP archive in *zip_data*.

    Returns:
        True when the bytes form a readable ZIP archive, False otherwise.
    """
    try:
        zip_buffer = io.BytesIO(zip_data)
        with zipfile.ZipFile(zip_buffer, 'r') as zip_file:
            file_list = zip_file.namelist()
            print(f"✓ ZIP contains {len(file_list)} files")

            # Show directory structure: group entries by their first path
            # component, or by the first two when nested deeper than that.
            print("\nDataset Structure:")
            dirs = set()
            for filepath in file_list:
                parts = filepath.split('/')
                if len(parts) > 1:
                    dirs.add(parts[0] + '/' + parts[1] if len(parts) > 2 else parts[0])

            for dir_name in sorted(dirs):
                prefix = dir_name + '/'
                # Do not count the directory's own entry as a file.
                file_count = sum(
                    1 for f in file_list if f.startswith(prefix) and f != prefix
                )
                if file_count > 0:
                    print(f" 📁 {dir_name}/ ({file_count} files)")

            # Check for essential files (informational only; absence is not
            # treated as a failure).
            if 'docgenie_documents/metadata.json' in file_list:
                print("\n ✓ metadata.json present")
            if 'docgenie_documents/README.md' in file_list:
                print(" ✓ README.md present")
    except zipfile.BadZipFile as e:
        print(f"✗ Invalid ZIP file: {e}")
        return False
    return True


def _print_completion(status_data):
    """Print the result details of a job whose status is "completed"."""
    print("\n" + "=" * 80)
    print("✓ JOB COMPLETED!")
    print("=" * 80)

    # `or {}` also covers a payload where "results" is present but null.
    results = status_data.get('results') or {}
    download_url = results.get('download_url')

    if download_url:
        print(f" ✓ Google Drive URL: {download_url}")
    else:
        print(" ⏳ Google Drive upload may still be in progress")

    if results.get('file_size_mb'):
        print(f" File Size: {results['file_size_mb']:.2f} MB")
    print(f" Document Count: {results.get('document_count', 'N/A')}")
    print(f" Created: {status_data.get('created_at')}")
    print(f" Completed: {status_data.get('updated_at')}")


def _poll_job_status(request_id):
    """Poll GET {BASE_URL}/jobs/{request_id}/status until a terminal outcome.

    Returns:
        _POLL_COMPLETED   - the job reported status "completed"
        _POLL_FAILED      - the job reported status "failed"
        _POLL_INTERRUPTED - the user pressed Ctrl-C while polling
        _POLL_UNREACHABLE - MAX_CONSECUTIVE_POLL_ERRORS polls failed in a row
    """
    print("\n" + "=" * 80)
    print("CONTINUOUS STATUS POLLING")
    print("=" * 80)
    print(f"Request ID: {request_id}")
    print(f"Polling every {POLL_INTERVAL} seconds...\n")

    poll_count = 0
    consecutive_errors = 0
    last_status = None
    last_progress = None

    while True:
        poll_count += 1
        timestamp = time.strftime("%H:%M:%S")

        try:
            status_response = requests.get(
                f"{BASE_URL}/jobs/{request_id}/status",
                timeout=10
            )
            status_response.raise_for_status()
            status_data = status_response.json()

            current_status = status_data.get('status')
            current_progress = status_data.get('progress')

            # Only print if status or progress changed
            if current_status != last_status or current_progress != last_progress:
                # The status may be missing from the payload; never call
                # .upper() on None.
                status_label = str(current_status or "unknown").upper()
                print(f"[{timestamp}] Poll #{poll_count}: {status_label}", end="")
                if current_progress:
                    print(f" - {current_progress}", end="")
                print()
                last_status = current_status
                last_progress = current_progress

            # Check for terminal states
            if current_status == "completed":
                _print_completion(status_data)
                return _POLL_COMPLETED

            if current_status == "failed":
                print("\n" + "=" * 80)
                print("✗ JOB FAILED!")
                print("=" * 80)
                print(f" Error: {status_data.get('error_message', 'Unknown error')}")
                return _POLL_FAILED

            # The whole iteration succeeded; reset only here so that errors
            # while processing a fetched payload also count toward the cap.
            consecutive_errors = 0

            # Wait before next poll
            time.sleep(POLL_INTERVAL)

        except KeyboardInterrupt:
            print("\n\n⚠ Polling interrupted by user")
            print("You can continue polling manually:")
            print(f" GET {BASE_URL}/jobs/{request_id}/status")
            return _POLL_INTERRUPTED

        except Exception as e:
            # Best-effort: transient errors are retried, but only a bounded
            # number of times in a row.
            consecutive_errors += 1
            print(f"\n⚠ Error polling status: {e}")
            if consecutive_errors >= MAX_CONSECUTIVE_POLL_ERRORS:
                print(f"✗ Giving up after {consecutive_errors} consecutive polling errors")
                return _POLL_UNREACHABLE
            time.sleep(POLL_INTERVAL)


def test_sync_endpoint():
    """Test sync /generate/pdf endpoint with continuous polling.

    Posts PAYLOAD to {BASE_URL}/generate/pdf, validates that the response
    body is a ZIP archive, then (when the response carries an X-Request-ID
    header) polls the job status endpoint.

    Returns:
        True when the ZIP was received and validated and the job did not
        report failure; False on HTTP errors, timeout, an invalid ZIP, a
        failed job, or a status endpoint that keeps erroring.
    """
    print("=" * 80)
    print("TESTING SYNC /generate/pdf ENDPOINT")
    print("=" * 80)
    _print_configuration()

    try:
        print("⏳ Calling /generate/pdf...")
        print(" (This will return immediately, then we'll poll for status)\n")

        start_time = time.time()

        # stream=True makes post() return once the headers arrive, so the
        # elapsed time below excludes downloading the ZIP body.
        response = requests.post(
            f"{BASE_URL}/generate/pdf",
            json=PAYLOAD,
            timeout=180,  # 3 minutes max for initial response
            stream=True
        )
        response.raise_for_status()

        elapsed_time = time.time() - start_time

        # Check response headers
        print(f"✓ Response received in {elapsed_time:.1f} seconds")
        print("\nResponse Headers:")

        request_id = response.headers.get('X-Request-ID')
        status_url = response.headers.get('X-Status-URL')

        if request_id:
            print(f" ✓ X-Request-ID: {request_id}")
        else:
            print(" ⚠ X-Request-ID: NOT SET")

        if status_url:
            print(f" ✓ X-Status-URL: {status_url}")
        else:
            print(" ⚠ X-Status-URL: NOT SET")

        # Verify ZIP file
        zip_data = response.content
        zip_size_mb = len(zip_data) / (1024 * 1024)
        print(f"\n✓ ZIP file size: {zip_size_mb:.2f} MB")

        # Validate ZIP structure
        if not _validate_zip(zip_data):
            return False

        # Continuous polling if we have request_id
        upload_tracked = False
        if request_id:
            outcome = _poll_job_status(request_id)
            if outcome in (_POLL_FAILED, _POLL_UNREACHABLE):
                return False
            upload_tracked = outcome == _POLL_COMPLETED

        print("\n" + "=" * 80)
        print("✅ TEST COMPLETED SUCCESSFULLY")
        print("=" * 80)
        print(f"✓ ZIP received in {elapsed_time:.1f} seconds")
        print(f"✓ ZIP size: {zip_size_mb:.2f} MB")
        print("✓ Dataset structure validated")
        # Only claim the upload was tracked when the job was actually
        # observed reaching "completed" (not when polling was skipped or
        # interrupted).
        if upload_tracked:
            print("✓ Google Drive upload tracked")
        else:
            print("⚠ Google Drive upload not tracked to completion")

        return True

    except requests.exceptions.Timeout:
        print("✗ Request timed out")
        return False
    except Exception as e:
        print(f"✗ Test failed: {e}")
        traceback.print_exc()
        return False


def main():
    """Run the health check and the sync endpoint test, then exit.

    Exits with status 0 when the sync endpoint test passes and 1 otherwise
    (including missing credentials or an unreachable API).
    """
    print("\n" + "=" * 80)
    print(" " * 15 + "SYNC PDF API TEST - FULL FEATURE SET")
    print("=" * 80)
    print(f"Base URL: {BASE_URL}")
    print("=" * 80)
    print()

    # Step 0: credentials must come from the environment
    missing = [env for key, env in _TOKEN_ENV_VARS.items() if not PAYLOAD[key]]
    if missing:
        print(f"❌ Missing required environment variable(s): {', '.join(missing)}")
        sys.exit(1)

    # Step 1: Health check
    if not test_health():
        print("\n❌ API is not accessible. Make sure the server is running.")
        print(f" Expected URL: {BASE_URL}")
        sys.exit(1)

    # Step 2: Test sync endpoint
    success = test_sync_endpoint()

    # Summary
    print("\n" + "=" * 80)
    print(" " * 30 + "SUMMARY")
    print("=" * 80)

    if success:
        print("✅ ALL TESTS PASSED!")
        print("\nFeatures tested:")
        print(" ✓ Handwriting insertion")
        print(" ✓ Visual elements (5 types)")
        print(" ✓ OCR processing")
        print(" ✓ Ground truth verification")
        print(" ✓ Analysis & debug visualization")
        print(" ✓ Dataset export")
        print(" ✓ Google Drive upload")
        print(" ✓ Continuous status polling")
    else:
        print("❌ TEST FAILED")

    print("=" * 80)

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()