# Spaces: joemartis / Video2Guide (Running)
# File size: 14,421 Bytes

from app.pipeline.transcript import align_segments, cue_bounds_per_segment, parse_transcript

# Three-cue WebVTT fixture with numeric cue identifiers; the third cue
# carries inline <b> markup.
VTT_SAMPLE = (
    "WEBVTT\n"
    "\n"
    "1\n"
    "00:00:00.000 --> 00:00:04.500\n"
    "Welcome to the lecture on photosynthesis.\n"
    "\n"
    "2\n"
    "00:00:04.500 --> 00:00:09.000\n"
    "Today we will cover the light-dependent reactions.\n"
    "\n"
    "3\n"
    "00:00:09.000 --> 00:00:14.000\n"
    "And then we will <b>discuss</b> the Calvin cycle.\n"
)

# Three-cue SubRip fixture: no header line, comma as the millisecond
# separator in timestamps.
SRT_SAMPLE = (
    "1\n"
    "00:00:00,000 --> 00:00:03,000\n"
    "First slide.\n"
    "\n"
    "2\n"
    "00:00:03,000 --> 00:00:07,500\n"
    "Second slide content here.\n"
    "\n"
    "3\n"
    "00:00:07,500 --> 00:00:12,000\n"
    "Third slide content here.\n"
)


def test_parse_vtt_basic():
    """The WebVTT fixture parses into three cues with timings in seconds
    and text free of inline markup.
    """
    parsed = parse_transcript(VTT_SAMPLE)
    assert len(parsed) == 3

    opening, closing = parsed[0], parsed[2]
    assert opening.start == 0.0
    assert opening.end == 4.5
    assert "photosynthesis" in opening.text
    # The <b> tag itself is removed while the word it wrapped is kept.
    assert "<b>" not in closing.text
    assert "discuss" in closing.text


def test_parse_srt_basic():
    """The SubRip fixture parses into three cues with timings in seconds."""
    parsed = parse_transcript(SRT_SAMPLE)
    assert len(parsed) == 3

    middle = parsed[1]
    assert middle.start == 3.0
    assert middle.end == 7.5
    assert parsed[2].text == "Third slide content here."


def test_parse_handles_crlf():
    """Windows (CRLF) line endings must not change the number of cues found."""
    crlf_source = "\r\n".join(VTT_SAMPLE.split("\n"))
    assert len(parse_transcript(crlf_source)) == 3


def test_align_segments_buckets_by_midpoint():
    """Each cue is placed in the segment that contains its temporal midpoint."""
    # Segment starts at 0 s and 7 s split the 14 s clip into [0, 7) and [7, 14).
    segment_starts = [0.0, 7.0]
    per_segment = align_segments(
        parse_transcript(VTT_SAMPLE), segment_starts, total_duration=14.0
    )
    assert len(per_segment) == 2

    first_text, second_text = (" ".join(paragraphs) for paragraphs in per_segment)
    # Cues 1 and 2 have midpoints 2.25 s and 6.75 s, both before the 7 s cut.
    assert "photosynthesis" in first_text
    assert "light-dependent" in first_text
    # Cue 3's midpoint is 11.5 s, past the 7 s cut.
    assert "Calvin cycle" in second_text


def test_align_segments_no_boundaries_returns_empty():
    """With no segment boundaries the aligner returns an empty list."""
    result = align_segments(parse_transcript(VTT_SAMPLE), [], total_duration=14.0)
    assert result == []


def test_align_segments_handles_cues_outside_range():
    """Cues that finish before the only segment opens are assigned nowhere."""
    # The fixture's last cue ends at 14 s, long before the single segment
    # opens at 100 s, so that segment's narration stays empty.
    result = align_segments(
        parse_transcript(VTT_SAMPLE), [100.0], total_duration=200.0
    )
    assert result == [[]]


def test_align_segments_keeps_trailing_cues_when_total_duration_extends_past_last_boundary():
    """Cues spoken after the final kept visual still attach to a segment.

    The last visible segment can end before the video does — for example
    when the instructor-frame filter discards the closing wrap-up shot.
    The orchestrator supplies the real video length as `total_duration`,
    so transcript cues from that tail must attach to the last kept
    segment rather than being silently dropped.
    """
    cue_blocks = [
        "1\n00:00:00.000 --> 00:00:05.000\nIntro to topic.\n\n",
        "2\n00:00:10.000 --> 00:00:15.000\nMain point of the lecture.\n\n",
        "3\n00:00:50.000 --> 00:00:55.000\nWrap-up after last visual.\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    # Segments open at 0 s and 10 s. The second visual's own scene ends at
    # 30 s, yet the video runs to 60 s with a wrap-up cue around 52 s;
    # passing total_duration=60 must route that cue to the 10 s segment.
    per_segment = align_segments(cues, [0.0, 10.0], total_duration=60.0)
    assert "Intro to topic." in " ".join(per_segment[0])

    tail_text = " ".join(per_segment[1])
    assert "Main point" in tail_text
    assert "Wrap-up after last visual." in tail_text


def test_align_segments_breaks_paragraphs_on_long_gap():
    """A five-second silence between two cues starts a new paragraph."""
    cue_blocks = (
        "1\n00:00:00.000 --> 00:00:03.000\nFirst point ends here.\n\n",
        "2\n00:00:08.000 --> 00:00:11.000\nSecond point starts now.\n",
    )
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    paragraphs = align_segments(cues, [0.0], total_duration=15.0)[0]
    assert len(paragraphs) == 2

    before_gap, after_gap = paragraphs
    assert before_gap.startswith("First point")
    assert after_gap.startswith("Second point")


def test_align_segments_breaks_on_speaker_change():
    """Adjacent cues with different speaker labels form separate paragraphs."""
    cue_blocks = (
        "1\n00:00:00.000 --> 00:00:02.000\nPROFESSOR: Welcome to class.\n\n",
        "2\n00:00:02.000 --> 00:00:04.000\nSTUDENT: Thank you.\n",
    )
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    paragraphs = align_segments(cues, [0.0], total_duration=5.0)[0]
    assert len(paragraphs) == 2

    professor_turn, student_turn = paragraphs
    assert professor_turn.startswith("PROFESSOR:")
    assert student_turn.startswith("STUDENT:")


def test_align_segments_breaks_on_length_at_sentence_boundary():
    """An over-long run of text is divided into more than one paragraph.

    Only the paragraph count is checked here; where exactly the break
    falls is not asserted.
    """
    # 20 repetitions of a 42-character sentence: ~840 chars in one cue.
    filler = "This is a sentence that fills the buffer. " * 20
    cue_blocks = [
        f"1\n00:00:00.000 --> 00:00:30.000\n{filler.strip()}\n\n",
        "2\n00:00:30.000 --> 00:00:32.000\nFinal short cue.\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    paragraphs = align_segments(cues, [0.0], total_duration=35.0)[0]
    # Exceeding the length threshold should yield two or more paragraphs.
    assert len(paragraphs) >= 2


def test_cue_bounds_snap_audio_outward_to_full_cues():
    """Audio bounds widen to whole cues instead of raw scene edges.

    Scene cuts seldom coincide with sentence ends, so slicing audio at a
    segment's exact [start, end) would clip speech mid-sentence.
    cue_bounds_per_segment instead reports, for each segment, the start of
    its first bucketed cue and the end of its last, letting the
    orchestrator stretch the audio slice outward to whole cues. Here cue 1
    begins before segment 0 opens and cue 4 finishes after segment 1's
    visual ends; both overhangs must appear in the returned bounds.
    """
    cue_blocks = [
        "1\n00:00:09.000 --> 00:00:13.000\nFirst sentence crosses boundary.\n\n",
        "2\n00:00:13.500 --> 00:00:18.000\nMiddle sentence in segment one.\n\n",
        "3\n00:00:19.500 --> 00:00:22.500\nSecond segment opener.\n\n",
        "4\n00:00:28.500 --> 00:00:33.000\nFinal sentence runs past end.\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    # Visual segments open at 10 s and 20 s. The last frame's scene ends at
    # 30 s while cue 4 keeps running until 33 s.
    bounds = cue_bounds_per_segment(cues, [10.0, 20.0], total_duration=33.0)
    assert len(bounds) == 2
    assert all(span is not None for span in bounds)

    first_span, second_span = bounds
    # Segment 0 runs from cue 1's start (9.0) to cue 2's end (18.0): it
    # reaches back before the 10 s boundary so speech that began at 9.0 is
    # kept — this is the fix for mid-sentence cuts.
    assert first_span[0] == 9.0
    assert first_span[1] == 18.0
    # Segment 1 reaches forward past the 30 s visual end to 33.0 so the
    # closing sentence is not truncated.
    assert second_span[0] == 19.5
    assert second_span[1] == 33.0


def test_cue_bounds_returns_none_for_segments_with_no_cues():
    """A segment that receives no cue midpoint is reported as None, the
    signal for the orchestrator to fall back to the scene's own bounds.
    """
    single_cue_vtt = "WEBVTT\n\n" + "1\n00:00:00.000 --> 00:00:05.000\nIntro.\n"
    bounds = cue_bounds_per_segment(
        parse_transcript(single_cue_vtt), [0.0, 100.0], total_duration=200.0
    )
    with_cue, without_cue = bounds[0], bounds[1]
    assert with_cue is not None
    assert without_cue is None


def test_align_segments_stitches_sentence_across_scene_boundary():
    """A sentence split over two cues stays whole when a scene cut divides them.

    Midpoint bucketing alone would put the two cues in different segments,
    leaving A trailing off mid-clause and B opening mid-sentence. The
    stitcher moves the completing cue into A so the sentence is rendered
    intact in the segment where it began.
    """
    cue_blocks = [
        # Cue 1 (midpoint 9.5 s) lands in A and opens a sentence without
        # a terminator.
        "1\n00:00:08.000 --> 00:00:11.000\nNow the question is how do we\n\n",
        # Cue 2 (midpoint 12.75 s) is bucketed into B and finishes the sentence.
        "2\n00:00:11.500 --> 00:00:14.000\nincrease heat pump installation rates.\n\n",
        # Cues 3 and 4 are new sentences that belong to B.
        "3\n00:00:15.000 --> 00:00:18.000\nLet's look at three approaches.\n\n",
        "4\n00:00:18.500 --> 00:00:21.000\nFirst, financial incentives.\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    # The 12 s boundary separates cue 1 from cue 2 by midpoint. Unstitched,
    # A would end with "...how do we" and B would open with
    # "increase heat pump...".
    per_segment = align_segments(cues, [0.0, 12.0], total_duration=22.0)
    seg_a, seg_b = (" ".join(per_segment[index]) for index in (0, 1))
    # The finished sentence has to sit wholly inside segment A.
    assert "increase heat pump installation rates." in seg_a, seg_a
    assert "increase heat pump installation rates" not in seg_b, seg_b
    # Sentences that start after the stitched cue remain in segment B.
    assert "Let's look at three approaches." in seg_b
    assert "First, financial incentives." in seg_b


def test_align_segments_splits_continuation_cue_at_first_sentence_end():
    """Only the sentence-completing fragment of B's first cue moves to A.

    B's cue opens with the last word of A's unfinished sentence and then
    begins a new sentence; that new sentence must remain in B.
    """
    vtt = "".join(
        [
            "WEBVTT\n\n",
            "1\n00:00:00.000 --> 00:00:18.000\n",
            "Welcome to climate education and how it can impact your\n\n",
            "2\n00:00:19.000 --> 00:00:24.000\n",
            "community. Heat pumps can heat and cool your home affordably.\n",
        ]
    )
    per_segment = align_segments(
        parse_transcript(vtt), [0.0, 19.0], total_duration=25.0
    )
    seg_a, seg_b = (" ".join(per_segment[index]) for index in (0, 1))
    assert seg_a.endswith("impact your community.")
    assert "Heat pumps can heat and cool your home affordably." not in seg_a
    assert seg_b == "Heat pumps can heat and cool your home affordably."


def test_align_segments_leaves_clean_sentence_boundaries_unchanged():
    """When A already ends on a full sentence, no text moves between segments."""
    cue_blocks = (
        "1\n00:00:00.000 --> 00:00:08.000\nFirst sentence ends cleanly.\n\n",
        "2\n00:00:11.000 --> 00:00:14.000\nSecond segment starts cleanly.\n",
    )
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    per_segment = align_segments(cues, [0.0, 10.0], total_duration=15.0)
    assert per_segment[0] == ["First sentence ends cleanly."]
    assert per_segment[1] == ["Second segment starts cleanly."]


def test_align_segments_does_not_split_abbreviation_in_continuation():
    """The period in "Dr." is not treated as the end of the sentence, so the
    whole continuation cue moves into the first segment.
    """
    cue_blocks = (
        "1\n00:00:00.000 --> 00:00:08.000\nWe talked with\n\n",
        "2\n00:00:11.000 --> 00:00:14.000\nDr. Smith about heat pumps.\n",
    )
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    per_segment = align_segments(cues, [0.0, 10.0], total_duration=15.0)
    joined_first = " ".join(per_segment[0])
    assert joined_first == "We talked with Dr. Smith about heat pumps."
    assert per_segment[1] == []


def test_align_segments_does_not_split_decimal_in_continuation():
    """The decimal point in "2.5" is not treated as the end of the sentence,
    so the whole continuation cue moves into the first segment.
    """
    cue_blocks = (
        "1\n00:00:00.000 --> 00:00:08.000\nCosts can be\n\n",
        "2\n00:00:11.000 --> 00:00:14.000\n2.5 times lower after rebates.\n",
    )
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    per_segment = align_segments(cues, [0.0, 10.0], total_duration=15.0)
    joined_first = " ".join(per_segment[0])
    assert joined_first == "Costs can be 2.5 times lower after rebates."
    assert per_segment[1] == []


def test_align_segments_stitcher_skips_when_no_terminator_in_reach():
    """The stitcher leaves both segments untouched when it cannot finish
    the sentence.

    If none of B's leading cues contains a sentence terminator, absorbing
    them would not complete anything — it would only move the
    mid-sentence cut to a different word.
    """
    cue_blocks = [
        # Neither cue carries sentence-ending punctuation.
        "1\n00:00:00.000 --> 00:00:03.000\nThis sentence keeps going\n\n",
        "2\n00:00:11.000 --> 00:00:14.000\nand still keeps going\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    per_segment = align_segments(cues, [0.0, 10.0], total_duration=15.0)
    # With no terminator available nothing is absorbed, so the midpoint
    # bucketing is left as it was.
    assert per_segment[0] != [] and per_segment[1] != []
    first_text = " ".join(per_segment[0])
    assert "This sentence keeps going" in first_text
    second_text = " ".join(per_segment[1])
    assert "and still keeps going" in second_text


def test_align_segments_stitcher_absorbs_lone_terminator_cue_from_b():
    """B's only cue is absorbed into A when it completes A's open sentence.

    Modelled on a real heat-pump lecture: A's last cue stops at "...so here
    in the Boston" with no terminator and B's single cue finishes the
    sentence. The stitcher must move that cue into A even though B is left
    empty, because the speaker began the sentence on A's slide.
    """
    cue_blocks = [
        "1\n00:00:00.000 --> 00:00:05.000\nIntro sentence finishes here.\n\n",
        # Last cue of A: stops in the middle of a sentence.
        "2\n00:00:08.000 --> 00:00:11.000\nSo here in the Boston\n\n",
        # Sole cue of B: supplies the rest of that sentence.
        "3\n00:00:11.500 --> 00:00:14.500\narea, heat pumps stay viable.\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    per_segment = align_segments(cues, [0.0, 11.0], total_duration=15.0)
    seg_a, seg_b = (" ".join(per_segment[index]) for index in (0, 1))
    assert "area, heat pumps stay viable." in seg_a, seg_a
    assert "Boston" in seg_a and seg_a.rstrip().endswith("viable.")
    # An emptied B is the accepted trade-off: the editor can still show
    # the slide, just without auto-generated narration.
    assert seg_b == ""


def test_cue_bounds_reflect_stitched_bucketing():
    """Audio bounds follow the stitched buckets, not the raw midpoint ones.

    Bounds are derived from the same buckets as the narration text, so a
    cue the stitcher moves across a boundary takes its audio with it.
    """
    cue_blocks = [
        "1\n00:00:08.000 --> 00:00:11.000\nNow the question is how do we\n\n",
        "2\n00:00:11.500 --> 00:00:14.000\nincrease heat pump installation rates.\n\n",
        "3\n00:00:15.000 --> 00:00:18.000\nLet's look at three approaches.\n\n",
        "4\n00:00:18.500 --> 00:00:21.000\nFirst, financial incentives.\n",
    ]
    cues = parse_transcript("WEBVTT\n\n" + "".join(cue_blocks))
    bounds = cue_bounds_per_segment(cues, [0.0, 12.0], total_duration=22.0)
    # A's audio has to run to 14.0 so it includes the absorbed cue 2.
    span_a = bounds[0]
    assert span_a is not None and span_a[1] == 14.0
    # B's audio begins with the next sentence at 15.0 rather than at 11.5.
    span_b = bounds[1]
    assert span_b is not None and span_b[0] == 15.0