File size: 3,618 Bytes
339760f
66d10f7
339760f
 
7c38093
66d10f7
 
 
 
 
 
 
 
 
 
 
 
 
 
339760f
66d10f7
 
339760f
66d10f7
339760f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65203a5
66d10f7
 
339760f
 
 
 
66d10f7
339760f
 
 
 
 
 
 
 
 
 
66d10f7
339760f
66d10f7
 
 
 
7c38093
66d10f7
 
 
 
339760f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
import re
import requests
import os

def extract_video_id(url: str) -> "str | None":
    """Extract the 11-character YouTube video ID from *url*.

    Patterns are tried from most to least specific:

    * ``v=<id>`` (as in ``watch?v=<id>``)
    * ``youtu.be/<id>``
    * ``embed/<id>``
    * a ``/`` followed by exactly 11 ID characters (e.g. ``/shorts/<id>``)

    The host is not validated, so a non-YouTube URL containing an
    ID-shaped path segment still yields that segment.

    Args:
        url: The URL (or URL fragment) to inspect.

    Returns:
        The video ID, or ``None`` when *url* is empty, not a string, or
        contains no ID-shaped token.
    """
    if not isinstance(url, str) or not url:
        return None

    id_char = "[0-9A-Za-z_-]"
    # Exactly 11 ID characters: the negative lookahead stops a longer token
    # (e.g. a 24-character channel ID) from being truncated into a bogus ID.
    video_id = "(" + id_char + "{11})(?!" + id_char + ")"
    patterns = (
        "v=" + video_id,
        r"youtu\.be\/" + video_id,
        r"embed\/" + video_id,
        # Generic fallback goes last; placed first it would shadow the
        # specific patterns above and make them unreachable.
        r"\/" + video_id,
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def _transcript_error_message(response) -> str:
    """Build the user-facing error text for a non-200 transcript response."""
    try:
        error_data = response.json()
    except ValueError:
        # Error body was not JSON (e.g. an HTML gateway page).
        error_data = None

    if isinstance(error_data, dict):
        details = error_data.get("details", error_data.get("message", "Unknown error"))
    else:
        details = f"Unknown error (HTTP {response.status_code})"

    lowered = str(details).lower()
    if "unavailable" in lowered:
        return "⚠️ No transcript found. Please use a lecture video with captions!"
    if "live" in lowered:
        return "⚠️ Live streams not supported!"
    return f"⚠️ {details}"


def _fetch_video_title(video_id: str, api_key: str) -> str:
    """Return the video title from Supadata, or a ``Video <id>`` placeholder.

    The title is optional extra information, so any failure here (network
    error, non-200 status, unexpected payload, missing title) falls back to
    the placeholder instead of failing the whole transcript request.
    """
    fallback = f"Video {video_id}"
    try:
        title_response = requests.get(
            "https://api.supadata.ai/v1/youtube/video",
            params={"url": f"https://www.youtube.com/watch?v={video_id}"},
            headers={"x-api-key": api_key},
            timeout=15,
        )
        if title_response.status_code != 200:
            return fallback
        return title_response.json().get("title") or fallback
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so
        # KeyboardInterrupt / SystemExit still propagate.
        return fallback


def get_transcript(url: str) -> dict:
    """Fetch and clean the transcript of a YouTube video via the Supadata API.

    Args:
        url: A YouTube URL from which a video ID can be extracted.

    Returns:
        On success: ``{"success": True, "transcript": str, "title": str,
        "video_id": str}``.
        On failure: ``{"success": False, "error": str}``. Failures during
        the API calls are reported through this dict rather than raised.

    The API key is read from the ``SUPADATA_API_KEY`` environment variable.
    """
    video_id = extract_video_id(url)

    if not video_id:
        return {"success": False, "error": "Invalid YouTube URL."}

    try:
        api_key = os.environ.get("SUPADATA_API_KEY")
        if not api_key:
            return {"success": False, "error": "SUPADATA_API_KEY not set!"}

        # Fetch the transcript from Supadata.
        response = requests.get(
            "https://api.supadata.ai/v1/youtube/transcript",
            params={"url": f"https://www.youtube.com/watch?v={video_id}", "text": True},
            headers={"x-api-key": api_key},
            timeout=30,
        )

        if response.status_code != 200:
            return {"success": False, "error": _transcript_error_message(response)}

        data = response.json()

        # Join the transcript text: "content" is handled both as a list of
        # segments (dicts carrying "text", or plain values) and as one string.
        content = data.get("content", "")
        if isinstance(content, list):
            full_transcript = " ".join(
                item.get("text", "") if isinstance(item, dict) else str(item)
                for item in content
            )
        else:
            full_transcript = str(content)

        full_transcript = clean_transcript(full_transcript)

        if not full_transcript.strip():
            return {"success": False, "error": "⚠️ Transcript empty or not available."}

        # Fetch the video title (best-effort).
        video_title = _fetch_video_title(video_id, api_key)

        return {
            "success": True,
            "transcript": full_transcript,
            "title": video_title,
            "video_id": video_id,
        }

    except Exception as e:
        # Boundary handler: callers expect an error dict, never an exception.
        return {"success": False, "error": f"⚠️ Error: {str(e)}"}


def clean_transcript(text: str) -> str:
    """Strip caption annotations from *text* and normalise its whitespace.

    Removes, in order:

    * square-bracketed spans such as ``[Music]``
    * parenthesised spans such as ``(applause)``
    * the music-note symbols ``♪`` and ``♫``

    Whitespace is collapsed last, so the gaps left behind by the removals
    above never survive as double, leading, or trailing spaces.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned transcript with single spaces between words.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = text.replace('♪', '').replace('♫', '')
    # Must run after every removal; otherwise "a ♪ b" would become "a  b".
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def chunk_transcript(transcript: str, chunk_size: int = 500, overlap: int = 50) -> list:
    """Split *transcript* into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    chunk reaches the final word, so no trailing chunk is produced that
    would consist solely of words already present in the previous chunk.

    Args:
        transcript: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when *transcript* has no words).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently
        # produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = transcript.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already covers the tail of the transcript.
            break
    return chunks