Spaces:

eshameo045
/

LectureLens_AI

Running

App Files Files Community

LectureLens_AI / utils /transcript_handler.py

eshameo045

update

339760f 3 days ago

raw

history blame contribute delete

3.62 kB

	from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
	import re
	import requests
	import os

	def extract_video_id(url: str) -> "str | None":
	    """Extract the 11-character YouTube video ID from *url*.

	    Recognised forms:
	      * ``...watch?v=<id>`` (``v`` anywhere in the query string)
	      * ``youtu.be/<id>``
	      * ``embed/<id>``, ``shorts/<id>`` and ``live/<id>`` paths

	    Args:
	        url: The URL (or any text containing one) to inspect.

	    Returns:
	        The video ID, or ``None`` when *url* is not a string or contains
	        no recognisable ID.
	    """
	    if not isinstance(url, str):
	        # re.search would raise TypeError on None / non-text input.
	        return None

	    video_id = r"([0-9A-Za-z_-]{11})"
	    patterns = (
	        # Query-string form. The bar of an alternation must never be written
	        # as "\|": that matches a literal "|" and no real watch URL has one.
	        r"[?&]v=" + video_id,
	        r"youtu\.be/" + video_id,
	        r"(?:embed|shorts|live)/" + video_id,
	    )
	    for pattern in patterns:
	        match = re.search(pattern, url)
	        if match:
	            return match.group(1)
	    return None

	def _transcript_error_message(response) -> str:
	    """Turn a non-200 transcript response into a user-facing error message."""
	    try:
	        error_data = response.json()
	    except ValueError:
	        # Error pages are not guaranteed to be JSON (proxies, gateways, ...).
	        error_data = None

	    if isinstance(error_data, dict):
	        details = error_data.get("details", error_data.get("message", "Unknown error"))
	    else:
	        details = f"HTTP {response.status_code}"

	    lowered = str(details).lower()
	    if "unavailable" in lowered:
	        return "⚠️ No transcript found. Please use a lecture video with captions!"
	    if "live" in lowered:
	        return "⚠️ Live streams not supported!"
	    return f"⚠️ {details}"


	def _fetch_video_title(watch_url: str, headers: dict, fallback: str) -> str:
	    """Look up the video title, returning *fallback* on any failure."""
	    try:
	        title_response = requests.get(
	            "https://api.supadata.ai/v1/youtube/video",
	            params={"url": watch_url},
	            headers=headers,
	            timeout=15,
	        )
	        if title_response.status_code != 200:
	            return fallback
	        payload = title_response.json()
	    except Exception:
	        # The title is cosmetic: never let its lookup fail the whole request.
	        # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate.)
	        return fallback

	    title = payload.get("title") if isinstance(payload, dict) else None
	    # A missing, null or empty title falls back to the generic label.
	    return str(title) if title else fallback


	def get_transcript(url: str) -> dict:
	    """Fetch the transcript and title of a YouTube video via the Supadata API.

	    The API key is read from the ``SUPADATA_API_KEY`` environment variable.

	    Args:
	        url: A YouTube video URL; the video ID is taken from it with
	            ``extract_video_id``.

	    Returns:
	        On success: ``{"success": True, "transcript": str, "title": str,
	        "video_id": str}``.
	        On failure: ``{"success": False, "error": str}``. Request and
	        parsing errors are caught and reported through ``error`` rather
	        than raised.
	    """
	    video_id = extract_video_id(url)
	    if not video_id:
	        return {"success": False, "error": "Invalid YouTube URL."}

	    try:
	        api_key = os.environ.get("SUPADATA_API_KEY")
	        if not api_key:
	            return {"success": False, "error": "SUPADATA_API_KEY not set!"}

	        watch_url = f"https://www.youtube.com/watch?v={video_id}"
	        headers = {"x-api-key": api_key}

	        # Fetch the transcript from Supadata.
	        # A Python bool would be serialised as "True" in the query string;
	        # send the lowercase form explicitly.
	        response = requests.get(
	            "https://api.supadata.ai/v1/youtube/transcript",
	            params={"url": watch_url, "text": "true"},
	            headers=headers,
	            timeout=30,
	        )
	        if response.status_code != 200:
	            return {"success": False, "error": _transcript_error_message(response)}

	        data = response.json()

	        # Join the transcript text. "content" is handled both as one string
	        # and as a list of segments; a null/missing value counts as empty
	        # instead of being stringified to "None".
	        content = (data.get("content") if isinstance(data, dict) else None) or ""
	        if isinstance(content, list):
	            full_transcript = " ".join(
	                str(item.get("text") or "") if isinstance(item, dict) else str(item)
	                for item in content
	            )
	        else:
	            full_transcript = str(content)

	        full_transcript = clean_transcript(full_transcript)
	        if not full_transcript.strip():
	            return {"success": False, "error": "⚠️ Transcript empty or not available."}

	        # Fetch the video title (best-effort).
	        video_title = _fetch_video_title(watch_url, headers, f"Video {video_id}")

	        return {
	            "success": True,
	            "transcript": full_transcript,
	            "title": video_title,
	            "video_id": video_id,
	        }

	    except Exception as e:
	        # Boundary handler: callers expect a result dict, never an exception.
	        return {"success": False, "error": f"⚠️ Error: {str(e)}"}


	def clean_transcript(text: str) -> str:
	    """Strip caption annotations from a transcript and normalise whitespace.

	    Removes ``[...]`` and ``(...)`` spans (within a single line), removes
	    the music-note symbols ``♪`` and ``♫``, then collapses every run of
	    whitespace to one space and trims both ends.

	    Args:
	        text: Raw transcript text. ``None`` or an empty string yields ``""``.

	    Returns:
	        The cleaned, single-spaced transcript.
	    """
	    if not text:
	        return ""

	    text = re.sub(r'\[.*?\]', '', text)
	    text = re.sub(r'\(.*?\)', '', text)
	    text = text.replace('♪', '').replace('♫', '')
	    # Whitespace is normalised last: every removal above can leave a gap
	    # (double space, leading/trailing blank) that must be collapsed.
	    return re.sub(r'\s+', ' ', text).strip()


	def chunk_transcript(transcript: str, chunk_size: int = 500, overlap: int = 50) -> list:
	    """Split a transcript into overlapping chunks of whitespace-separated words.

	    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
	    chunk repeats the last ``overlap`` words of the previous one. Chunking
	    stops as soon as a chunk reaches the final word, so no trailing chunk
	    that merely repeats the tail of the previous chunk is produced.

	    Args:
	        transcript: Text to split. ``None`` or an empty string yields ``[]``.
	        chunk_size: Maximum number of words per chunk; must be positive.
	        overlap: Number of words shared by neighbouring chunks; must
	            satisfy ``0 <= overlap < chunk_size``.

	    Returns:
	        A list of chunk strings, each with its words joined by single spaces.

	    Raises:
	        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if not 0 <= overlap < chunk_size:
	        # overlap >= chunk_size would give a zero/negative stride; a negative
	        # overlap would silently skip words between chunks.
	        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

	    words = transcript.split() if transcript else []
	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        chunks.append(" ".join(words[start:start + chunk_size]))
	        if start + chunk_size >= len(words):
	            # This chunk already contains the final word.
	            break
	    return chunks