Docgenie-API / api /config.py
Ahadhassan-2003
deploy: update HF Space
dc4e6da
"""
Configuration settings for DocGenie API
"""
import os
from typing import Optional, List
def _env_bool(name: str, default: str = "false") -> bool:
    """Read a boolean flag from the environment.

    A flag is enabled only by the literal ``true`` (case-insensitive).
    Surrounding whitespace is ignored so values such as ``"true\\n"`` or
    ``" True "`` are not silently treated as disabled.

    Args:
        name: Environment variable to read.
        default: Raw string used when the variable is unset.

    Returns:
        True when the (stripped, lower-cased) value equals ``"true"``.
    """
    return os.getenv(name, default).strip().lower() == "true"


def _env_int(name: str, default: str) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Raw string used when the variable is unset.

    Raises:
        ValueError: If the value is not a valid integer; the message names
            the offending variable so misconfiguration is easy to locate.
    """
    raw = os.getenv(name, default)
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


def _env_float(name: str, default: str) -> float:
    """Read a floating-point setting from the environment.

    Args:
        name: Environment variable to read.
        default: Raw string used when the variable is unset.

    Raises:
        ValueError: If the value is not a valid number; the message names
            the offending variable.
    """
    raw = os.getenv(name, default)
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be a number, got {raw!r}"
        ) from err


class Settings:
    """API configuration settings.

    Every setting is a class attribute read from the process environment
    once, when this class body is executed. Changing an environment
    variable afterwards has no effect on these values.
    """

    # ==================== LLM Configuration ====================
    ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")
    CLAUDE_MODEL: str = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-5-20250929")
    # Backward compatibility: LLM_MODEL falls back to CLAUDE_MODEL when unset
    LLM_MODEL: str = os.getenv("LLM_MODEL", CLAUDE_MODEL)

    # ==================== Handwriting Service (Stage 3) ====================
    HANDWRITING_SERVICE_URL: str = os.getenv(
        "HANDWRITING_SERVICE_URL",
        "http://localhost:8080",
    )
    RUNPOD_API_KEY: str = os.getenv("RUNPOD_API_KEY", "")
    HANDWRITING_SERVICE_TIMEOUT: int = _env_int("HANDWRITING_SERVICE_TIMEOUT", "300")
    HANDWRITING_SERVICE_MAX_RETRIES: int = _env_int("HANDWRITING_SERVICE_MAX_RETRIES", "3")
    HANDWRITING_SERVICE_ENABLED: bool = _env_bool("HANDWRITING_SERVICE_ENABLED", "false")
    HANDWRITING_SERVICE_SUPPORTS_BATCH: bool = _env_bool("HANDWRITING_SERVICE_SUPPORTS_BATCH", "true")

    # ==================== OCR Service (Stage 4) ====================
    OCR_SERVICE_URL: str = os.getenv("OCR_SERVICE_URL", "http://localhost:8000")
    OCR_SERVICE_TIMEOUT: int = _env_int("OCR_SERVICE_TIMEOUT", "30")
    OCR_SERVICE_ENABLED: bool = _env_bool("OCR_SERVICE_ENABLED", "false")
    OCR_ENGINE: str = os.getenv("OCR_ENGINE", "microsoft_di")
    OCR_DPI: int = _env_int("OCR_DPI", "300")  # DPI for PDF to image conversion
    # Local Tesseract OCR (alternative to remote service)
    OCR_USE_LOCAL: bool = _env_bool("OCR_USE_LOCAL", "false")
    OCR_TESSERACT_LANG: str = os.getenv("OCR_TESSERACT_LANG", "eng")  # Tesseract language
    OCR_TESSERACT_CONFIG: str = os.getenv("OCR_TESSERACT_CONFIG", "--psm 3")  # Tesseract config

    # ==================== Stage 5: Dataset Packaging ====================
    # Stage 16: BBox normalization
    BBOX_NORMALIZATION_ENABLED: bool = _env_bool("BBOX_NORMALIZATION_ENABLED", "false")
    BBOX_NORMALIZATION_SCALE: str = os.getenv("BBOX_NORMALIZATION_SCALE", "0-1")  # "0-1" or "0-1000"
    # Stage 17: GT verification
    GT_VERIFICATION_ENABLED: bool = _env_bool("GT_VERIFICATION_ENABLED", "false")
    GT_VERIFICATION_SIMILARITY_CUTOFF: float = _env_float("GT_VERIFICATION_SIMILARITY_CUTOFF", "0.8")
    GT_VERIFICATION_OVERLAP_THRESHOLD: float = _env_float("GT_VERIFICATION_OVERLAP_THRESHOLD", "0.5")
    # Stage 18: Analysis
    ANALYSIS_ENABLED: bool = _env_bool("ANALYSIS_ENABLED", "false")
    ANALYSIS_MIN_ANNOTATION_COUNT: int = _env_int("ANALYSIS_MIN_ANNOTATION_COUNT", "1")
    # Stage 19: Debug visualization
    DEBUG_VISUALIZATION_ENABLED: bool = _env_bool("DEBUG_VISUALIZATION_ENABLED", "false")
    DEBUG_SHOW_TEXT_IN_BBOX: bool = _env_bool("DEBUG_SHOW_TEXT_IN_BBOX", "true")
    DEBUG_BBOX_COLOR_RGB: str = os.getenv("DEBUG_BBOX_COLOR_RGB", "255,0,0")  # Red default
    # Dataset export
    DATASET_EXPORT_ENABLED: bool = _env_bool("DATASET_EXPORT_ENABLED", "false")
    DATASET_EXPORT_FORMAT: str = os.getenv("DATASET_EXPORT_FORMAT", "msgpack")  # msgpack, coco, huggingface
    DATASET_EXPORT_DIR: str = os.getenv("DATASET_EXPORT_DIR", "/tmp/docgenie_datasets")
    DATASET_RESIZE_IMAGES: bool = _env_bool("DATASET_RESIZE_IMAGES", "false")
    DATASET_CLIP_BBOXES_TO_FOREGROUND: bool = _env_bool("DATASET_CLIP_BBOXES_TO_FOREGROUND", "false")

    # ==================== API Server Configuration ====================
    API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
    API_PORT: int = _env_int("API_PORT", "8000")
    DEBUG_MODE: bool = _env_bool("DEBUG_MODE", "false")

    # ==================== CORS Configuration ====================
    # Comma-separated list; blank entries are dropped and an empty result
    # falls back to the wildcard.
    CORS_ORIGINS: List[str] = [
        origin.strip()
        for origin in os.getenv("CORS_ORIGINS", "*").split(",")
        if origin.strip()
    ] or ["*"]

    # ==================== File Storage ====================
    TEMP_DIR: str = os.getenv("TEMP_DIR", "/tmp/docgenie_api")

    # ==================== Logging ====================
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")

    # ==================== Database (Optional) ====================
    DATABASE_URL: Optional[str] = os.getenv("DATABASE_URL", None)
    REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")

    # ==================== Supabase ====================
    SUPABASE_URL: str = os.getenv("SUPABASE_URL", "")
    SUPABASE_KEY: str = os.getenv("SUPABASE_KEY", "")

    # ==================== Background Jobs ====================
    RQ_QUEUE_NAME: str = os.getenv("RQ_QUEUE_NAME", "docgenie")
    BATCH_POLL_INTERVAL: int = _env_int("BATCH_POLL_INTERVAL", "30")  # seconds
    BATCH_PROMPT_CHUNK_SIZE: int = _env_int("BATCH_PROMPT_CHUNK_SIZE", "4")  # documents per prompt
    BATCH_DATA_DIR: str = os.getenv("BATCH_DATA_DIR", "/tmp/docgenie_batches")
    MESSAGE_DATA_DIR: str = os.getenv("MESSAGE_DATA_DIR", "/tmp/docgenie_messages")

    # ==================== Google Drive ====================
    GOOGLE_DRIVE_FOLDER_NAME: str = os.getenv("GOOGLE_DRIVE_FOLDER_NAME", "DocGenie Documents")
    GOOGLE_CLIENT_ID: Optional[str] = os.getenv("GOOGLE_CLIENT_ID", None)  # For token refresh only
    GOOGLE_CLIENT_SECRET: Optional[str] = os.getenv("GOOGLE_CLIENT_SECRET", None)  # For token refresh only

    # ==================== Monitoring ====================
    SENTRY_DSN: Optional[str] = os.getenv("SENTRY_DSN", None)
    ENABLE_METRICS: bool = _env_bool("ENABLE_METRICS", "false")
    METRICS_PORT: int = _env_int("METRICS_PORT", "9090")

    # ==================== AWS (Optional) ====================
    AWS_ACCESS_KEY_ID: Optional[str] = os.getenv("AWS_ACCESS_KEY_ID", None)
    AWS_SECRET_ACCESS_KEY: Optional[str] = os.getenv("AWS_SECRET_ACCESS_KEY", None)
    AWS_REGION: str = os.getenv("AWS_REGION", "us-east-1")
    S3_BUCKET: Optional[str] = os.getenv("S3_BUCKET", None)

    @classmethod
    def validate(cls) -> bool:
        """Validate required settings.

        Returns:
            True when all required settings are present.

        Raises:
            ValueError: If ANTHROPIC_API_KEY is empty or whitespace-only.
        """
        if not (cls.ANTHROPIC_API_KEY or "").strip():
            raise ValueError("ANTHROPIC_API_KEY environment variable is required")
        return True

    @classmethod
    def get_cors_origins(cls) -> List[str]:
        """Get CORS origins list.

        Returns:
            A copy of CORS_ORIGINS, so callers cannot mutate the class-level
            list by accident.
        """
        return list(cls.CORS_ORIGINS)
# Module-level Settings instance, created when this module is imported.
settings = Settings()