Spaces:

Rthur2003
/

crowncode-backend

Sleeping

File size: 2,747 Bytes

"""
YouTube URL parsing helpers with enhanced validation.
"""

from __future__ import annotations

from dataclasses import dataclass
import re
from typing import Optional
from urllib.parse import parse_qs, urlparse

from .validation import validate_video_id, validate_url


@dataclass(frozen=True)
class ParsedYouTubeUrl:
    """Immutable record describing a parsed YouTube URL.

    Attributes:
        video_id: Identifier of the video the URL points at.
        normalized_url: URL string stored alongside ``video_id``.
        start_time_sec: Playback start offset in seconds; ``None`` when
            no offset is recorded (the default).
    """

    # Required fields come first; only the start offset is optional.
    video_id: str
    normalized_url: str
    start_time_sec: Optional[int] = None


def _parse_time_offset(raw: str) -> Optional[int]:
    if not raw:
        return None
    value = raw.strip().lower()
    if value.isdigit():
        return int(value)

    total = 0
    matches = re.findall(r"(\d+)(h|m|s)", value)
    if not matches:
        return None

    for amount, unit in matches:
        amount_int = int(amount)
        if unit == "h":
            total += amount_int * 3600
        elif unit == "m":
            total += amount_int * 60
        elif unit == "s":
            total += amount_int
    return total


def _extract_video_id(parsed_url) -> Optional[str]:
    host = parsed_url.netloc.lower()
    path = parsed_url.path or ""
    query = parse_qs(parsed_url.query)

    if host in {"youtu.be", "www.youtu.be"}:
        candidate = path.strip("/").split("/")[0]
        return candidate or None

    if host in {"youtube.com", "www.youtube.com", "m.youtube.com", "music.youtube.com"}:
        if path == "/watch":
            return query.get("v", [None])[0]
        if path.startswith("/shorts/") or path.startswith("/live/") or path.startswith("/embed/"):
            parts = path.strip("/").split("/")
            return parts[1] if len(parts) > 1 else None

    return None


def parse_youtube_url(url: str) -> ParsedYouTubeUrl:
    """Parse a YouTube URL into its video ID, canonical URL and start offset.

    The URL is checked with ``validate_url``, parsed, and the extracted
    video ID is checked with ``validate_video_id``. The start offset is
    read from the first non-empty of the ``t``, ``start`` and
    ``time_continue`` query parameters.

    Args:
        url: The URL text supplied by the caller.

    Returns:
        A ``ParsedYouTubeUrl`` whose ``normalized_url`` has the form
        ``https://www.youtube.com/watch?v=<id>``, with ``&t=<seconds>``
        appended when a non-zero start offset was parsed.

    Raises:
        ValueError: If the URL is empty or blank, is rejected by
            ``validate_url``, does not use http/https, has no extractable
            video ID, or the ID is rejected by ``validate_video_id``.
    """
    if not (url and url.strip()):
        raise ValueError("URL is empty.")

    if not validate_url(url):
        raise ValueError("Invalid or unsafe URL format.")

    parts = urlparse(url.strip())
    if parts.scheme not in {"http", "https"}:
        raise ValueError("URL must start with http:// or https://")

    video_id = _extract_video_id(parts)
    if not video_id:
        raise ValueError("Invalid or missing YouTube video ID.")

    if not validate_video_id(video_id):
        raise ValueError("Invalid video ID format.")

    params = parse_qs(parts.query)
    # Offset parameters in priority order; the first non-empty value wins.
    offset_values = (
        params.get(key, [None])[0] for key in ("t", "start", "time_continue")
    )
    offset_text = next((value for value in offset_values if value), None)
    start_time_sec = _parse_time_offset(offset_text) if offset_text else None

    # A zero or missing offset leaves the canonical URL without a "t" part.
    time_suffix = f"&t={start_time_sec}" if start_time_sec else ""
    canonical = f"https://www.youtube.com/watch?v={video_id}{time_suffix}"

    return ParsedYouTubeUrl(
        video_id=video_id,
        normalized_url=canonical,
        start_time_sec=start_time_sec,
    )