"""
Configuration settings for DocGenie API.

Every setting is read from an environment variable once, when this module
is imported, and falls back to the default shown next to it.
"""

import os
from typing import Optional, List


def _env_bool(name: str, default: bool) -> bool:
    """Return True only when the variable is set to "true" (case-insensitive).

    Any other value is False; *default* is used when the variable is unset.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.lower() == "true"


def _env_int(name: str, default: int) -> int:
    """Read an integer setting, naming the variable if its value is malformed.

    Raises:
        ValueError: if the variable is set but cannot be parsed by ``int``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"{name} environment variable must be an integer, got {raw!r}"
        ) from None


def _env_float(name: str, default: float) -> float:
    """Read a float setting, naming the variable if its value is malformed.

    Raises:
        ValueError: if the variable is set but cannot be parsed by ``float``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return float(raw)
    except ValueError:
        raise ValueError(
            f"{name} environment variable must be a number, got {raw!r}"
        ) from None


class Settings:
    """API configuration settings.

    All values are class attributes evaluated at import time, so they can be
    read either from the class or from the module-level ``settings`` instance.
    """

    # ==================== LLM Configuration ====================
    ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")
    CLAUDE_MODEL: str = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-5-20250929")

    # Backward compatibility: LLM_MODEL defaults to whatever CLAUDE_MODEL is
    LLM_MODEL: str = os.getenv("LLM_MODEL", CLAUDE_MODEL)

    # ==================== Handwriting Service (Stage 3) ====================
    HANDWRITING_SERVICE_URL: str = os.getenv(
        "HANDWRITING_SERVICE_URL",
        "http://localhost:8080",
    )
    RUNPOD_API_KEY: str = os.getenv("RUNPOD_API_KEY", "")
    HANDWRITING_SERVICE_TIMEOUT: int = _env_int("HANDWRITING_SERVICE_TIMEOUT", 300)
    HANDWRITING_SERVICE_MAX_RETRIES: int = _env_int("HANDWRITING_SERVICE_MAX_RETRIES", 3)
    HANDWRITING_SERVICE_ENABLED: bool = _env_bool("HANDWRITING_SERVICE_ENABLED", False)
    HANDWRITING_SERVICE_SUPPORTS_BATCH: bool = _env_bool(
        "HANDWRITING_SERVICE_SUPPORTS_BATCH", True
    )

    # ==================== OCR Service (Stage 4) ====================
    OCR_SERVICE_URL: str = os.getenv("OCR_SERVICE_URL", "http://localhost:8000")
    OCR_SERVICE_TIMEOUT: int = _env_int("OCR_SERVICE_TIMEOUT", 30)
    OCR_SERVICE_ENABLED: bool = _env_bool("OCR_SERVICE_ENABLED", False)
    OCR_ENGINE: str = os.getenv("OCR_ENGINE", "microsoft_di")
    OCR_DPI: int = _env_int("OCR_DPI", 300)  # DPI for PDF to image conversion

    # Local Tesseract OCR (alternative to remote service)
    OCR_USE_LOCAL: bool = _env_bool("OCR_USE_LOCAL", False)
    OCR_TESSERACT_LANG: str = os.getenv("OCR_TESSERACT_LANG", "eng")  # Tesseract language
    OCR_TESSERACT_CONFIG: str = os.getenv("OCR_TESSERACT_CONFIG", "--psm 3")  # Tesseract config

    # ==================== Stage 5: Dataset Packaging ====================
    # Stage 16: BBox normalization
    BBOX_NORMALIZATION_ENABLED: bool = _env_bool("BBOX_NORMALIZATION_ENABLED", False)
    BBOX_NORMALIZATION_SCALE: str = os.getenv("BBOX_NORMALIZATION_SCALE", "0-1")  # "0-1" or "0-1000"

    # Stage 17: GT verification
    GT_VERIFICATION_ENABLED: bool = _env_bool("GT_VERIFICATION_ENABLED", False)
    GT_VERIFICATION_SIMILARITY_CUTOFF: float = _env_float(
        "GT_VERIFICATION_SIMILARITY_CUTOFF", 0.8
    )
    GT_VERIFICATION_OVERLAP_THRESHOLD: float = _env_float(
        "GT_VERIFICATION_OVERLAP_THRESHOLD", 0.5
    )

    # Stage 18: Analysis
    ANALYSIS_ENABLED: bool = _env_bool("ANALYSIS_ENABLED", False)
    ANALYSIS_MIN_ANNOTATION_COUNT: int = _env_int("ANALYSIS_MIN_ANNOTATION_COUNT", 1)

    # Stage 19: Debug visualization
    DEBUG_VISUALIZATION_ENABLED: bool = _env_bool("DEBUG_VISUALIZATION_ENABLED", False)
    DEBUG_SHOW_TEXT_IN_BBOX: bool = _env_bool("DEBUG_SHOW_TEXT_IN_BBOX", True)
    DEBUG_BBOX_COLOR_RGB: str = os.getenv("DEBUG_BBOX_COLOR_RGB", "255,0,0")  # Red default

    # Dataset export
    DATASET_EXPORT_ENABLED: bool = _env_bool("DATASET_EXPORT_ENABLED", False)
    DATASET_EXPORT_FORMAT: str = os.getenv("DATASET_EXPORT_FORMAT", "msgpack")  # msgpack, coco, huggingface
    DATASET_EXPORT_DIR: str = os.getenv("DATASET_EXPORT_DIR", "/tmp/docgenie_datasets")
    DATASET_RESIZE_IMAGES: bool = _env_bool("DATASET_RESIZE_IMAGES", False)
    DATASET_CLIP_BBOXES_TO_FOREGROUND: bool = _env_bool(
        "DATASET_CLIP_BBOXES_TO_FOREGROUND", False
    )

    # ==================== API Server Configuration ====================
    API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
    API_PORT: int = _env_int("API_PORT", 8000)
    DEBUG_MODE: bool = _env_bool("DEBUG_MODE", False)

    # ==================== CORS Configuration ====================
    # Comma-separated list; blank entries are dropped, and an empty result
    # falls back to the wildcard.
    CORS_ORIGINS: List[str] = [
        origin.strip()
        for origin in os.getenv("CORS_ORIGINS", "*").split(",")
        if origin.strip()
    ] or ["*"]

    # ==================== File Storage ====================
    TEMP_DIR: str = os.getenv("TEMP_DIR", "/tmp/docgenie_api")

    # ==================== Logging ====================
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")

    # ==================== Database (Optional) ====================
    DATABASE_URL: Optional[str] = os.getenv("DATABASE_URL", None)
    REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")

    # ==================== Supabase ====================
    SUPABASE_URL: str = os.getenv("SUPABASE_URL", "")
    SUPABASE_KEY: str = os.getenv("SUPABASE_KEY", "")

    # ==================== Background Jobs ====================
    RQ_QUEUE_NAME: str = os.getenv("RQ_QUEUE_NAME", "docgenie")
    BATCH_POLL_INTERVAL: int = _env_int("BATCH_POLL_INTERVAL", 30)  # seconds
    BATCH_PROMPT_CHUNK_SIZE: int = _env_int("BATCH_PROMPT_CHUNK_SIZE", 4)  # documents per prompt
    BATCH_DATA_DIR: str = os.getenv("BATCH_DATA_DIR", "/tmp/docgenie_batches")
    MESSAGE_DATA_DIR: str = os.getenv("MESSAGE_DATA_DIR", "/tmp/docgenie_messages")

    # ==================== Google Drive ====================
    GOOGLE_DRIVE_FOLDER_NAME: str = os.getenv("GOOGLE_DRIVE_FOLDER_NAME", "DocGenie Documents")
    GOOGLE_CLIENT_ID: Optional[str] = os.getenv("GOOGLE_CLIENT_ID", None)  # For token refresh only
    GOOGLE_CLIENT_SECRET: Optional[str] = os.getenv("GOOGLE_CLIENT_SECRET", None)  # For token refresh only

    # ==================== Monitoring ====================
    SENTRY_DSN: Optional[str] = os.getenv("SENTRY_DSN", None)
    ENABLE_METRICS: bool = _env_bool("ENABLE_METRICS", False)
    METRICS_PORT: int = _env_int("METRICS_PORT", 9090)

    # ==================== AWS (Optional) ====================
    AWS_ACCESS_KEY_ID: Optional[str] = os.getenv("AWS_ACCESS_KEY_ID", None)
    AWS_SECRET_ACCESS_KEY: Optional[str] = os.getenv("AWS_SECRET_ACCESS_KEY", None)
    AWS_REGION: str = os.getenv("AWS_REGION", "us-east-1")
    S3_BUCKET: Optional[str] = os.getenv("S3_BUCKET", None)

    @classmethod
    def validate(cls) -> bool:
        """Validate required settings.

        Returns:
            True when every required setting is present.

        Raises:
            ValueError: if ANTHROPIC_API_KEY is empty or only whitespace.
        """
        # A whitespace-only key is as unusable as a missing one.
        if not cls.ANTHROPIC_API_KEY.strip():
            raise ValueError("ANTHROPIC_API_KEY environment variable is required")
        return True

    @classmethod
    def get_cors_origins(cls) -> List[str]:
        """Get CORS origins list.

        Returns a fresh copy so callers that mutate the result cannot alter
        the shared class-level ``CORS_ORIGINS``.
        """
        return list(cls.CORS_ORIGINS)


settings = Settings()