Spaces:
Running
Running
| """ | |
| YouTube URL parsing helpers with enhanced validation. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| import re | |
| from typing import Optional | |
| from urllib.parse import parse_qs, urlparse | |
| from .validation import validate_video_id, validate_url | |
@dataclass
class ParsedYouTubeUrl:
    """Result of parsing a YouTube URL.

    Attributes:
        video_id: The video identifier extracted from the URL.
        normalized_url: A watch-page URL for the video.
        start_time_sec: Playback start offset in seconds, or ``None`` when
            the URL carried no usable offset.
    """

    # The @dataclass decorator generates the keyword-accepting __init__
    # (plus __repr__/__eq__); without it the annotated fields below are
    # only annotations and the class cannot be built with keyword arguments.
    video_id: str
    normalized_url: str
    start_time_sec: Optional[int] = None
| def _parse_time_offset(raw: str) -> Optional[int]: | |
| if not raw: | |
| return None | |
| value = raw.strip().lower() | |
| if value.isdigit(): | |
| return int(value) | |
| total = 0 | |
| matches = re.findall(r"(\d+)(h|m|s)", value) | |
| if not matches: | |
| return None | |
| for amount, unit in matches: | |
| amount_int = int(amount) | |
| if unit == "h": | |
| total += amount_int * 3600 | |
| elif unit == "m": | |
| total += amount_int * 60 | |
| elif unit == "s": | |
| total += amount_int | |
| return total | |
| def _extract_video_id(parsed_url) -> Optional[str]: | |
| host = parsed_url.netloc.lower() | |
| path = parsed_url.path or "" | |
| query = parse_qs(parsed_url.query) | |
| if host in {"youtu.be", "www.youtu.be"}: | |
| candidate = path.strip("/").split("/")[0] | |
| return candidate or None | |
| if host in {"youtube.com", "www.youtube.com", "m.youtube.com", "music.youtube.com"}: | |
| if path == "/watch": | |
| return query.get("v", [None])[0] | |
| if path.startswith("/shorts/") or path.startswith("/live/") or path.startswith("/embed/"): | |
| parts = path.strip("/").split("/") | |
| return parts[1] if len(parts) > 1 else None | |
| return None | |
def parse_youtube_url(url: str) -> ParsedYouTubeUrl:
    """Parse a YouTube URL into its video ID, a watch URL and a start offset.

    The start offset is read from the first of the ``t``, ``start`` and
    ``time_continue`` query parameters that has a value. A non-zero offset
    is appended to the normalized URL as ``&t=<seconds>``.

    Args:
        url: The URL text supplied by the caller.

    Returns:
        A ``ParsedYouTubeUrl`` describing the video.

    Raises:
        ValueError: If the URL is empty or blank, is rejected by
            ``validate_url``, does not use http/https, yields no video ID,
            or yields an ID rejected by ``validate_video_id``.
    """
    if not (url and url.strip()):
        raise ValueError("URL is empty.")
    if not validate_url(url):
        raise ValueError("Invalid or unsafe URL format.")

    parts = urlparse(url.strip())
    if parts.scheme not in {"http", "https"}:
        raise ValueError("URL must start with http:// or https://")

    video_id = _extract_video_id(parts)
    if not video_id:
        raise ValueError("Invalid or missing YouTube video ID.")
    if not validate_video_id(video_id):
        raise ValueError("Invalid video ID format.")

    # Take the first offset parameter that carries a value.
    params = parse_qs(parts.query)
    start_raw = None
    for key in ("t", "start", "time_continue"):
        start_raw = params.get(key, [None])[0]
        if start_raw:
            break

    start_time_sec = None
    if start_raw:
        start_time_sec = _parse_time_offset(start_raw)

    normalized_url = f"https://www.youtube.com/watch?v={video_id}"
    if start_time_sec:
        # A zero or missing offset leaves the watch URL without a t parameter.
        normalized_url = f"{normalized_url}&t={start_time_sec}"

    return ParsedYouTubeUrl(
        video_id=video_id,
        normalized_url=normalized_url,
        start_time_sec=start_time_sec,
    )