# USF00's picture
# Initial deployment setup for Recommendation_Deploy
# 4b5c25d
"""
LITVISION Recommendation API — Utility Module
===============================================
Production logging, device management, CUDA OOM handling,
and temp/cache cleanup helpers.
"""
import os
import gc
import logging
import shutil
from typing import Optional
import torch
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Layout of each log record: timestamp | level name (left-aligned, padded to
# 8 characters) | logger name | message.
LOG_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
# strftime pattern passed as ``datefmt`` to render %(asctime)s.
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
def setup_logging(level: int = logging.INFO) -> None:
    """Install the root logging configuration for the service.

    Handlers already attached to the root logger are replaced
    (``force=True``), and a fixed set of verbose third-party loggers is
    raised to WARNING.

    Args:
        level: Threshold for the root logger; defaults to ``logging.INFO``.
    """
    root_config = {
        "level": level,
        "format": LOG_FORMAT,
        "datefmt": LOG_DATE_FORMAT,
        "force": True,
    }
    logging.basicConfig(**root_config)

    # Keep library chatter out of the log stream.
    chatty_loggers = ("transformers", "sentence_transformers", "faiss", "urllib3")
    for logger_name in chatty_loggers:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
# Module-level logger used by the helper functions in this file.
logger = logging.getLogger("litvision.recommendation")
# ---------------------------------------------------------------------------
# Device helpers
# ---------------------------------------------------------------------------
def get_device() -> str:
    """Return the best available torch device string.

    Returns:
        ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise
        ``"cpu"``. The choice is logged at INFO level; for CUDA the name and
        total memory of device 0 are included.
    """
    if not torch.cuda.is_available():
        logger.info("No CUDA device — running on CPU")
        return "cpu"

    gpu_name = torch.cuda.get_device_name(0)
    # The device-properties object exposes its capacity as ``total_memory``
    # (bytes). The previous ``total_mem`` attribute does not exist and raised
    # AttributeError on every GPU host.
    total_gib = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    logger.info(f"CUDA device detected: {gpu_name} ({total_gib:.1f} GB)")
    return "cuda"
def safe_cuda_empty_cache() -> None:
    """Release cached CUDA memory if a GPU is available; silent no-op on CPU.

    When CUDA is available this runs a Python garbage collection, asks torch
    to release its unoccupied cached memory, and logs the action at INFO
    level. Without CUDA nothing is done and nothing is logged.
    """
    if not torch.cuda.is_available():
        return

    # Collect first: tensors that are only kept alive by reference cycles are
    # still "occupied" from the allocator's point of view, so emptying the
    # cache before collecting them would leave their memory cached.
    gc.collect()
    torch.cuda.empty_cache()
    logger.info("CUDA cache cleared")
def handle_cuda_oom(exc: Exception) -> str:
    """Respond to a CUDA out-of-memory error.

    Clears the CUDA cache via ``safe_cuda_empty_cache()``, logs the exception
    at ERROR level, and returns a fixed message suitable for the caller to
    surface to the user.

    Args:
        exc: The exception that was caught; only used in the log line.

    Returns:
        A human-readable explanation asking the user to retry with a
        smaller request.
    """
    # Cache clearing happens before the error is logged.
    safe_cuda_empty_cache()
    logger.error(f"CUDA OOM: {exc}")
    return (
        "GPU out of memory during recommendation generation. "
        "The CUDA cache has been cleared. Please retry with a smaller request."
    )
# ---------------------------------------------------------------------------
# Temp / cache cleanup
# ---------------------------------------------------------------------------
# Cache roots scanned by cleanup_temp_files(). HF_HOME is read once, when this
# module is imported; "/tmp/huggingface" is used when the variable is unset.
_TEMP_DIRS = [
os.environ.get("HF_HOME", "/tmp/huggingface"),
]
def cleanup_temp_files() -> None:
    """Remove transient cache artefacts that are safe to delete.

    For every root in ``_TEMP_DIRS`` the ``hub/.locks`` directory is deleted
    if it exists. The operation is best-effort: a filesystem error on one
    root is logged as a warning and the remaining roots are still processed.
    """
    for root in _TEMP_DIRS:
        lock_dir = os.path.join(root, "hub", ".locks")
        if not os.path.isdir(lock_dir):
            continue
        try:
            # Deliberately no ``ignore_errors=True``: with it rmtree swallows
            # every failure, the except branch can never run, and success
            # would be logged even when nothing was removed.
            shutil.rmtree(lock_dir)
        except OSError as e:
            logger.warning(f"Could not clean {lock_dir}: {e}")
        else:
            logger.info(f"Cleaned lock dir: {lock_dir}")
# ---------------------------------------------------------------------------
# Validation helpers
# ---------------------------------------------------------------------------
def validate_positive_int(value: int, name: str, max_val: Optional[int] = None) -> int:
    """Ensure *value* is a positive integer, optionally capped at *max_val*.

    Args:
        value: Candidate value to check.
        name: Label used for the value in error messages.
        max_val: Inclusive upper bound; ``None`` disables the check.

    Returns:
        *value* unchanged when it passes validation.

    Raises:
        ValueError: If *value* is not an ``int`` (booleans are rejected), is
            smaller than 1, or exceeds *max_val*.
    """
    # ``bool`` is a subclass of ``int``, so ``True`` would otherwise slip
    # through as the "positive integer" 1.
    is_real_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_real_int or value < 1:
        raise ValueError(f"{name} must be a positive integer, got {value!r}")
    if max_val is not None and value > max_val:
        raise ValueError(f"{name} must be ≤ {max_val}, got {value}")
    return value