crowncode-backend/app/services/url_parser.py
Rthur2003's picture
fix: enhance CORS origin handling and update validation for YouTube URLs
cbff5e0
"""
YouTube URL parsing helpers with enhanced validation.
"""
from __future__ import annotations
from dataclasses import dataclass
import re
from typing import Optional
from urllib.parse import parse_qs, urlparse
from .validation import validate_video_id, validate_url
@dataclass(frozen=True)
class ParsedYouTubeUrl:
    """Immutable result of parsing a YouTube URL.

    Instances are built by ``parse_youtube_url`` once the URL and the
    extracted video ID have passed validation.
    """

    # Video identifier extracted from the URL.
    video_id: str
    # Canonical ``https://www.youtube.com/watch?v=<video_id>`` form, with
    # ``&t=<seconds>`` appended when a non-zero start offset was parsed.
    normalized_url: str
    # Start offset in seconds; None when the URL carries no usable offset.
    start_time_sec: Optional[int] = None
def _parse_time_offset(raw: str) -> Optional[int]:
    """Convert a YouTube time offset string into a number of seconds.

    Two spellings are understood:

    * a bare number, taken as seconds (``"90"`` -> ``90``);
    * one or more ``<number><unit>`` tokens with unit ``h``, ``m`` or ``s``,
      which are summed (``"1h2m3s"`` -> ``3723``). Text outside those
      tokens is ignored.

    Args:
        raw: The raw query-string value; surrounding whitespace and letter
            case are ignored.

    Returns:
        The offset in seconds, or None when the value is empty, contains no
        recognizable token, or holds a number ``int()`` refuses to convert.
    """
    if not raw:
        return None

    value = raw.strip().lower()
    seconds_per_unit = {"h": 3600, "m": 60, "s": 1}

    try:
        # isdecimal(), not isdigit(): the latter is also true for characters
        # such as "²" that int() rejects, which used to raise ValueError on
        # attacker-controlled input.
        if value.isdecimal():
            return int(value)

        matches = re.findall(r"(\d+)(h|m|s)", value)
        if not matches:
            return None
        return sum(int(amount) * seconds_per_unit[unit] for amount, unit in matches)
    except ValueError:
        # int() can still refuse extremely long digit strings (the
        # interpreter's integer string conversion limit); treat as unparsable.
        return None
def _extract_video_id(parsed_url) -> Optional[str]:
    """Pull the video ID candidate out of an already-parsed YouTube URL.

    Recognized shapes:

    * ``youtu.be/<id>`` (also ``www.youtu.be``);
    * ``/watch?v=<id>`` on youtube.com, www., m. and music. hosts;
    * ``/shorts/<id>``, ``/live/<id>`` and ``/embed/<id>`` on those hosts.

    Args:
        parsed_url: The result of ``urllib.parse.urlparse``.

    Returns:
        The ID exactly as it appears in the URL (its format is not checked
        here), or None when the host or path is not a recognized shape.
    """
    # Use hostname rather than netloc: netloc still contains any ":port" or
    # "user@" part, so "www.youtube.com:443" would never match the host sets.
    # hostname is already lowercased and is None when the URL has no host.
    host = parsed_url.hostname or ""
    path = parsed_url.path or ""

    if host in {"youtu.be", "www.youtu.be"}:
        # Short links carry the ID as the first path segment.
        candidate = path.strip("/").split("/")[0]
        return candidate or None

    if host in {"youtube.com", "www.youtube.com", "m.youtube.com", "music.youtube.com"}:
        if path == "/watch":
            return parse_qs(parsed_url.query).get("v", [None])[0]
        if path.startswith(("/shorts/", "/live/", "/embed/")):
            # The ID is the segment right after the shorts/live/embed prefix.
            parts = path.strip("/").split("/")
            return parts[1] if len(parts) > 1 else None

    return None
def parse_youtube_url(url: str) -> ParsedYouTubeUrl:
    """Validate a YouTube URL and reduce it to its canonical watch form.

    Args:
        url: The URL as supplied by the caller; surrounding whitespace is
            ignored.

    Returns:
        A ``ParsedYouTubeUrl`` holding the video ID, the normalized
        ``https://www.youtube.com/watch?v=<id>`` URL and the start offset in
        seconds (None when absent or unparsable).

    Raises:
        ValueError: If the URL is empty, fails ``validate_url``, does not use
            http/https, contains no recognizable video ID, or the ID fails
            ``validate_video_id``.
    """
    # Strip once and use the same text everywhere. Previously the emptiness
    # check and urlparse() saw the stripped URL while validate_url() received
    # the raw one, so padded input could be rejected by validation alone.
    cleaned = url.strip() if url else ""
    if not cleaned:
        raise ValueError("URL is empty.")

    if not validate_url(cleaned):
        raise ValueError("Invalid or unsafe URL format.")

    parsed = urlparse(cleaned)
    if parsed.scheme not in {"http", "https"}:
        raise ValueError("URL must start with http:// or https://")

    video_id = _extract_video_id(parsed)
    if not video_id:
        raise ValueError("Invalid or missing YouTube video ID.")

    if not validate_video_id(video_id):
        raise ValueError("Invalid video ID format.")

    # The first of t / start / time_continue that is present wins.
    query = parse_qs(parsed.query)
    start_raw = query.get("t", [None])[0] or query.get("start", [None])[0] or query.get("time_continue", [None])[0]
    start_time_sec = _parse_time_offset(start_raw) if start_raw else None

    # Rebuild the URL from the validated ID only, so nothing else from the
    # input survives into the normalized form. A zero or missing offset adds
    # no "t" parameter.
    normalized_url = f"https://www.youtube.com/watch?v={video_id}"
    if start_time_sec:
        normalized_url = f"{normalized_url}&t={start_time_sec}"

    return ParsedYouTubeUrl(
        video_id=video_id,
        normalized_url=normalized_url,
        start_time_sec=start_time_sec,
    )