""" Input validation and sanitization for backend services. Provides defensive validation layers for all external inputs to ensure system security and data integrity. """ from __future__ import annotations import re from pathlib import Path from typing import Optional # YouTube video ID format: 11 alphanumeric characters plus _ and - # This has been stable since 2006 but could theoretically change VIDEO_ID_PATTERN = re.compile(r'^[a-zA-Z0-9_-]{11}$') VIDEO_ID_LENGTH = 11 ALLOWED_AUDIO_EXTENSIONS = {'.mp3', '.wav', '.flac', '.ogg', '.m4a', '.webm', '.opus'} def validate_video_id(video_id: str) -> bool: """ Validate YouTube video ID format. YouTube video IDs are currently 11 characters long, consisting of alphanumeric characters, underscores, and hyphens. This format has been stable since 2006. Args: video_id: Video identifier to validate Returns: True if valid format, False otherwise """ if not video_id or not isinstance(video_id, str): return False if len(video_id) != VIDEO_ID_LENGTH: return False return bool(VIDEO_ID_PATTERN.match(video_id)) def validate_url(url: str) -> bool: """ Validate URL format and allowed domains. Args: url: URL string to validate Returns: True if valid and safe, False otherwise """ if not url or not isinstance(url, str): return False url = url.strip() if not url.startswith(('http://', 'https://')): return False if len(url) > 2048: return False dangerous_chars = ['<', '>', '"', "'", '`', '{', '}'] if any(char in url for char in dangerous_chars): return False from urllib.parse import urlparse try: parsed = urlparse(url) host = (parsed.hostname or '').lower() except Exception: return False allowed_hosts = { 'youtube.com', 'www.youtube.com', 'm.youtube.com', 'music.youtube.com', 'youtu.be', 'www.youtu.be', 'spotify.com', 'www.spotify.com', 'open.spotify.com', } if host not in allowed_hosts: return False return True def validate_audio_path(path: Path) -> tuple[bool, Optional[str]]: """ Validate audio file path for security and format. Args: path: File path to validate Returns: Tuple of (is_valid, error_message) """ if not path.exists(): return False, "file_not_found" if not path.is_file(): return False, "not_a_file" try: # resolve with strict=True validates path and prevents traversal resolved = path.resolve(strict=True) except (OSError, RuntimeError): return False, "invalid_path" extension = path.suffix.lower() if extension not in ALLOWED_AUDIO_EXTENSIONS: return False, f"unsupported_format_{extension}" try: file_size = path.stat().st_size if file_size < 1024: return False, "file_too_small" if file_size > 100 * 1024 * 1024: return False, "file_too_large" except OSError: return False, "cannot_read_file" return True, None def sanitize_filename(filename: str) -> str: """ Sanitize filename to prevent directory traversal and injection. Args: filename: Raw filename from user input Returns: Sanitized filename safe for use """ if not filename: return "unnamed" filename = filename.strip() dangerous_patterns = ['..', '/', '\\', '\x00', '\n', '\r'] for pattern in dangerous_patterns: filename = filename.replace(pattern, '_') filename = re.sub(r'[<>:"|?*]', '_', filename) if len(filename) > 255: name_part = filename[:200] ext_part = Path(filename).suffix[:55] filename = name_part + ext_part if not filename or filename in {'.', '..'}: filename = "unnamed" return filename def validate_threshold(value: float) -> bool: """ Validate threshold value is in acceptable range. Args: value: Threshold value to validate Returns: True if valid, False otherwise """ if not isinstance(value, (int, float)): return False return 0.0 <= value <= 1.0 def validate_timeout(seconds: float) -> bool: """ Validate timeout value is reasonable. Args: seconds: Timeout value in seconds Returns: True if valid, False otherwise """ if not isinstance(seconds, (int, float)): return False return 1.0 <= seconds <= 300.0