Spaces:

USF00
/

Recommendation_Deploy

Running

App Files Files Community

Recommendation_Deploy / utils.py

USF00

Initial deployment setup for Recommendation_Deploy

4b5c25d 12 days ago

raw

history blame contribute delete

3.59 kB

	"""
	LITVISION Recommendation API — Utility Module
	===============================================
	Production logging, device management, CUDA OOM handling,
	and temp/cache cleanup helpers.
	"""

	import os
	import gc
	import logging
	import shutil
	from typing import Optional

	import torch

	# ---------------------------------------------------------------------------
	# Logging
	# ---------------------------------------------------------------------------

	# Pipe-separated record layout: timestamp | level (left-aligned, width 8) | logger name | message.
	# The separators are plain "|" characters; "\|" is not a valid Python escape
	# and would leak literal backslashes into every log line.
	LOG_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
	# strftime-style pattern for the %(asctime)s field.
	LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


	def setup_logging(level: int = logging.INFO) -> None:
	    """Install the root logging configuration for the service.

	    Any handlers already attached to the root logger are replaced
	    (``force=True``) by one using ``LOG_FORMAT`` and ``LOG_DATE_FORMAT``.
	    A fixed set of third-party loggers is then raised to WARNING.

	    Args:
	        level: Threshold for the root logger; defaults to ``logging.INFO``.
	    """
	    root_config = {
	        "level": level,
	        "format": LOG_FORMAT,
	        "datefmt": LOG_DATE_FORMAT,
	        "force": True,
	    }
	    logging.basicConfig(**root_config)

	    # Chatty third-party libraries: only let warnings and above through.
	    quiet_loggers = ("transformers", "sentence_transformers", "faiss", "urllib3")
	    for logger_name in quiet_loggers:
	        third_party = logging.getLogger(logger_name)
	        third_party.setLevel(logging.WARNING)


	# Shared module-level logger; the helper functions in this module log through it.
	logger = logging.getLogger("litvision.recommendation")

	# ---------------------------------------------------------------------------
	# Device helpers
	# ---------------------------------------------------------------------------


	def get_device() -> str:
	    """Return the best available torch device string.

	    Returns:
	        ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise
	        ``"cpu"``. The decision is logged either way; for CUDA the log line
	        includes the name and total memory of device 0.
	    """
	    if not torch.cuda.is_available():
	        logger.info("No CUDA device — running on CPU")
	        return "cpu"

	    gpu_name = torch.cuda.get_device_name(0)
	    # The device-properties attribute is ``total_memory`` (in bytes);
	    # ``total_mem`` does not exist and raised AttributeError on CUDA hosts.
	    total_bytes = torch.cuda.get_device_properties(0).total_memory
	    mem = total_bytes / (1024 ** 3)
	    logger.info(f"CUDA device detected: {gpu_name} ({mem:.1f} GB)")
	    return "cuda"


	def safe_cuda_empty_cache() -> None:
	    """Clear CUDA cache if available; silently no-op on CPU.

	    When CUDA is available, a garbage-collection pass runs first and the
	    CUDA caching allocator is then asked to release its unused blocks.
	    Nothing is collected, cleared, or logged on a CPU-only host.
	    """
	    if not torch.cuda.is_available():
	        return

	    # Collect before emptying the cache: tensors that are only kept alive by
	    # reference cycles still occupy allocator blocks until the GC frees them,
	    # and empty_cache() can only release blocks that are already unoccupied.
	    gc.collect()
	    torch.cuda.empty_cache()
	    logger.info("CUDA cache cleared")


	def handle_cuda_oom(exc: Exception) -> str:
	    """Recover from a CUDA out-of-memory error and build the user-facing reply.

	    Args:
	        exc: The exception that signalled the OOM; it is only written to
	            the error log.

	    Returns:
	        A fixed message stating that the cache was cleared and asking the
	        caller to retry with a smaller request.
	    """
	    # Release memory first so the message returned below is accurate.
	    safe_cuda_empty_cache()
	    logger.error(f"CUDA OOM: {exc}")
	    return (
	        "GPU out of memory during recommendation generation. "
	        "The CUDA cache has been cleared. Please retry with a smaller request."
	    )

	# ---------------------------------------------------------------------------
	# Temp / cache cleanup
	# ---------------------------------------------------------------------------

	# Cache roots to inspect: $HF_HOME when set, otherwise /tmp/huggingface.
	_TEMP_DIRS = [os.getenv("HF_HOME", "/tmp/huggingface")]


	def cleanup_temp_files() -> None:
	    """Remove transient cache artefacts that are safe to delete.

	    For every root in ``_TEMP_DIRS`` the ``hub/.locks`` subdirectory is
	    deleted when it exists. Cleanup is best-effort: a failed deletion is
	    logged as a warning and never raised to the caller.
	    """
	    for root in _TEMP_DIRS:
	        lock_dir = os.path.join(root, "hub", ".locks")
	        if not os.path.isdir(lock_dir):
	            continue
	        try:
	            # Deliberately no ignore_errors=True: it swallowed every failure,
	            # which made the except branch unreachable and logged "Cleaned"
	            # even when the directory could not be removed.
	            shutil.rmtree(lock_dir)
	        except OSError as e:
	            logger.warning(f"Could not clean {lock_dir}: {e}")
	        else:
	            logger.info(f"Cleaned lock dir: {lock_dir}")

	# ---------------------------------------------------------------------------
	# Validation helpers
	# ---------------------------------------------------------------------------


	def validate_positive_int(value: int, name: str, max_val: Optional[int] = None) -> int:
	    """Ensure value is a positive integer, optionally capped at max_val.

	    Args:
	        value: Candidate value to check.
	        name: Label for the value, used in the error message.
	        max_val: Inclusive upper bound; ``None`` disables the cap.

	    Returns:
	        ``value`` unchanged when it passes validation.

	    Raises:
	        ValueError: If ``value`` is not an ``int`` (booleans are rejected),
	            is smaller than 1, or exceeds ``max_val``.
	    """
	    # bool is a subclass of int, so True would otherwise be accepted as 1.
	    is_real_int = isinstance(value, int) and not isinstance(value, bool)
	    if not is_real_int or value < 1:
	        raise ValueError(f"{name} must be a positive integer, got {value!r}")
	    if max_val is not None and value > max_val:
	        raise ValueError(f"{name} must be ≤ {max_val}, got {value}")
	    return value