# Tests for app.pipeline.transcript: VTT/SRT parsing and segment alignment.
from app.pipeline.transcript import align_segments, cue_bounds_per_segment, parse_transcript
| VTT_SAMPLE = """WEBVTT | |
| 1 | |
| 00:00:00.000 --> 00:00:04.500 | |
| Welcome to the lecture on photosynthesis. | |
| 2 | |
| 00:00:04.500 --> 00:00:09.000 | |
| Today we will cover the light-dependent reactions. | |
| 3 | |
| 00:00:09.000 --> 00:00:14.000 | |
| And then we will <b>discuss</b> the Calvin cycle. | |
| """ | |
| SRT_SAMPLE = """1 | |
| 00:00:00,000 --> 00:00:03,000 | |
| First slide. | |
| 2 | |
| 00:00:03,000 --> 00:00:07,500 | |
| Second slide content here. | |
| 3 | |
| 00:00:07,500 --> 00:00:12,000 | |
| Third slide content here. | |
| """ | |
def test_parse_vtt_basic():
    """A well-formed WEBVTT document yields one cue per block, tags stripped."""
    parsed = parse_transcript(VTT_SAMPLE)
    assert len(parsed) == 3
    first, _, third = parsed
    assert first.start == 0.0
    assert first.end == 4.5
    assert "photosynthesis" in first.text
    # Inline markup such as <b>...</b> must not survive parsing.
    assert "<b>" not in third.text
    assert "discuss" in third.text
def test_parse_srt_basic():
    """SRT comma-millisecond timestamps parse the same way VTT ones do."""
    parsed = parse_transcript(SRT_SAMPLE)
    assert len(parsed) == 3
    middle = parsed[1]
    assert middle.start == 3.0
    assert middle.end == 7.5
    assert parsed[2].text == "Third slide content here."
def test_parse_handles_crlf():
    """Windows-style line endings must not change the cue count."""
    crlf_doc = VTT_SAMPLE.replace("\n", "\r\n")
    assert len(parse_transcript(crlf_doc)) == 3
def test_align_segments_buckets_by_midpoint():
    """Each cue lands in the visual segment that contains its midpoint."""
    cues = parse_transcript(VTT_SAMPLE)
    # Two visual segments spanning [0, 7) and [7, 14).
    narrations = align_segments(cues, [0.0, 7.0], total_duration=14.0)
    assert len(narrations) == 2
    first_text, second_text = (" ".join(paras) for paras in narrations)
    # Cue midpoints 2.25 and 6.75 fall inside the first segment...
    assert "photosynthesis" in first_text
    assert "light-dependent" in first_text
    # ...while the third cue's midpoint (11.5) falls in the second.
    assert "Calvin cycle" in second_text
def test_align_segments_no_boundaries_returns_empty():
    """With no segments to fill, alignment yields an empty result."""
    cues = parse_transcript(VTT_SAMPLE)
    result = align_segments(cues, [], total_duration=14.0)
    assert result == []
def test_align_segments_handles_cues_outside_range():
    """Cues that end before the first segment begins align to nothing."""
    cues = parse_transcript(VTT_SAMPLE)  # every cue ends by 14s
    # A single segment starting at 100s can capture none of them.
    result = align_segments(cues, [100.0], total_duration=200.0)
    assert result == [[]]
def test_align_segments_keeps_trailing_cues_when_total_duration_extends_past_last_boundary():
    """Trailing transcript must attach to the last kept segment.

    When the final visible segment ends before the video does -- e.g. the
    instructor-frame filter drops the closing wrap-up shot -- cues from
    that tail portion may not be silently dropped; they belong on the
    last kept segment. The orchestrator passes the real video duration
    as ``total_duration`` to make that work.
    """
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:05.000\nIntro to topic.\n\n"
        "2\n00:00:10.000 --> 00:00:15.000\nMain point of the lecture.\n\n"
        "3\n00:00:50.000 --> 00:00:55.000\nWrap-up after last visual.\n"
    )
    cues = parse_transcript(doc)
    # Segments start at 0s and 10s. The last kept frame's scene boundary
    # is 30s, but the video runs to 60s with a wrap-up cue near 52s;
    # with total_duration=60 that cue must land in the 10s segment.
    narrations = align_segments(cues, [0.0, 10.0], total_duration=60.0)
    assert "Intro to topic." in " ".join(narrations[0])
    tail = " ".join(narrations[1])
    assert "Main point" in tail
    assert "Wrap-up after last visual." in tail
def test_align_segments_breaks_paragraphs_on_long_gap():
    """A 5-second silence between cues splits narration into two paragraphs."""
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:03.000\nFirst point ends here.\n\n"
        "2\n00:00:08.000 --> 00:00:11.000\nSecond point starts now.\n"
    )
    paragraphs = align_segments(parse_transcript(doc), [0.0], total_duration=15.0)[0]
    assert len(paragraphs) == 2
    assert paragraphs[0].startswith("First point")
    assert paragraphs[1].startswith("Second point")
def test_align_segments_breaks_on_speaker_change():
    """A change of speaker label forces a paragraph break."""
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:02.000\nPROFESSOR: Welcome to class.\n\n"
        "2\n00:00:02.000 --> 00:00:04.000\nSTUDENT: Thank you.\n"
    )
    paragraphs = align_segments(parse_transcript(doc), [0.0], total_duration=5.0)[0]
    assert len(paragraphs) == 2
    assert paragraphs[0].startswith("PROFESSOR:")
    assert paragraphs[1].startswith("STUDENT:")
def test_align_segments_breaks_on_length_at_sentence_boundary():
    """Once a cue's text outgrows the length threshold, it splits on a sentence end."""
    filler = "This is a sentence that fills the buffer. " * 20  # roughly 840 chars
    doc = (
        "WEBVTT\n\n"
        f"1\n00:00:00.000 --> 00:00:30.000\n{filler.strip()}\n\n"
        "2\n00:00:30.000 --> 00:00:32.000\nFinal short cue.\n"
    )
    paragraphs = align_segments(parse_transcript(doc), [0.0], total_duration=35.0)[0]
    # The length threshold must have produced at least one break.
    assert len(paragraphs) >= 2
def test_cue_bounds_snap_audio_outward_to_full_cues():
    """Per-segment cue bounds let audio slices widen to whole cues.

    Scene-change boundaries rarely coincide with sentence boundaries, so
    slicing audio at exactly each segment's [start, end) cuts
    mid-sentence. cue_bounds_per_segment reports the first-cue-start /
    last-cue-end of each segment's bucketed cues so the orchestrator can
    extend the slice outward. Here the first cue straddles segment 0's
    start and the last cue runs past segment 1's end -- both must be
    reflected in the bounds.
    """
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:09.000 --> 00:00:13.000\nFirst sentence crosses boundary.\n\n"
        "2\n00:00:13.500 --> 00:00:18.000\nMiddle sentence in segment one.\n\n"
        "3\n00:00:19.500 --> 00:00:22.500\nSecond segment opener.\n\n"
        "4\n00:00:28.500 --> 00:00:33.000\nFinal sentence runs past end.\n"
    )
    # Visual boundaries sit at 10s and 20s; the last visual frame ends
    # at 30s but the final cue extends to 33s.
    bounds = cue_bounds_per_segment(parse_transcript(doc), [10.0, 20.0], total_duration=33.0)
    assert len(bounds) == 2
    first, second = bounds
    assert first is not None and second is not None
    # Segment 0 stretches backward past its 10s start to cue 1's start
    # (9.0) and runs through cue 2's end (18.0) -- the mid-sentence fix.
    assert first[0] == 9.0
    assert first[1] == 18.0
    # Segment 1 stretches forward past its 30s end to 33.0 so the
    # trailing sentence isn't truncated.
    assert second[0] == 19.5
    assert second[1] == 33.0
def test_cue_bounds_returns_none_for_segments_with_no_cues():
    """A segment whose bucket is empty reports None.

    That signals the orchestrator to fall back to the scene's own bounds
    when no cue's midpoint lands inside the segment.
    """
    doc = "WEBVTT\n\n" "1\n00:00:00.000 --> 00:00:05.000\nIntro.\n"
    bounds = cue_bounds_per_segment(parse_transcript(doc), [0.0, 100.0], total_duration=200.0)
    assert bounds[0] is not None
    assert bounds[1] is None
def test_align_segments_stitches_sentence_across_scene_boundary():
    """A sentence split across a scene boundary is stitched back into A.

    A single sentence often spans two cues. When a scene boundary falls
    between them, midpoint bucketing puts the cues in different segments
    and the rendered narration shows the split: A ends mid-clause and B
    opens mid-sentence. The stitcher absorbs the trailing cue into A so
    the sentence stays whole in the segment where it began.
    """
    doc = (
        "WEBVTT\n\n"
        # Cue 1 starts in segment A and opens an unterminated sentence.
        "1\n00:00:08.000 --> 00:00:11.000\nNow the question is how do we\n\n"
        # Cue 2 starts in segment B and completes that sentence.
        "2\n00:00:11.500 --> 00:00:14.000\nincrease heat pump installation rates.\n\n"
        # Cues 3 and 4 are genuinely new sentences belonging to B.
        "3\n00:00:15.000 --> 00:00:18.000\nLet's look at three approaches.\n\n"
        "4\n00:00:18.500 --> 00:00:21.000\nFirst, financial incentives.\n"
    )
    # The 12s boundary separates cue 1 (mid 9.5 -> A) from cue 2
    # (mid 12.75 -> B); without stitching, A would end "...how do we"
    # and B would open "increase heat pump...".
    narrations = align_segments(parse_transcript(doc), [0.0, 12.0], total_duration=22.0)
    text_a, text_b = (" ".join(paras) for paras in narrations)
    # The completed sentence must live entirely in segment A.
    assert "increase heat pump installation rates." in text_a, text_a
    assert "increase heat pump installation rates" not in text_b, text_b
    # Segment B keeps the sentences that genuinely started there.
    assert "Let's look at three approaches." in text_b
    assert "First, financial incentives." in text_b
def test_align_segments_splits_continuation_cue_at_first_sentence_end():
    """Only the completion fragment of B's first cue moves back to A.

    When B opens with the final word of A's sentence followed by a brand
    new sentence, the new sentence must stay in B.
    """
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:18.000\n"
        "Welcome to climate education and how it can impact your\n\n"
        "2\n00:00:19.000 --> 00:00:24.000\n"
        "community. Heat pumps can heat and cool your home affordably.\n"
    )
    narrations = align_segments(parse_transcript(doc), [0.0, 19.0], total_duration=25.0)
    text_a = " ".join(narrations[0])
    text_b = " ".join(narrations[1])
    assert text_a.endswith("impact your community.")
    assert "Heat pumps can heat and cool your home affordably." not in text_a
    assert text_b == "Heat pumps can heat and cool your home affordably."
def test_align_segments_leaves_clean_sentence_boundaries_unchanged():
    """No stitching occurs when both segments already break on full sentences."""
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:08.000\nFirst sentence ends cleanly.\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\nSecond segment starts cleanly.\n"
    )
    result = align_segments(parse_transcript(doc), [0.0, 10.0], total_duration=15.0)
    assert result[0] == ["First sentence ends cleanly."]
    assert result[1] == ["Second segment starts cleanly."]
def test_align_segments_does_not_split_abbreviation_in_continuation():
    """The period in "Dr." must not be mistaken for a sentence terminator."""
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:08.000\nWe talked with\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\nDr. Smith about heat pumps.\n"
    )
    narrations = align_segments(parse_transcript(doc), [0.0, 10.0], total_duration=15.0)
    assert " ".join(narrations[0]) == "We talked with Dr. Smith about heat pumps."
    assert narrations[1] == []
def test_align_segments_does_not_split_decimal_in_continuation():
    """The period in a decimal like "2.5" must not end the sentence."""
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:08.000\nCosts can be\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\n2.5 times lower after rebates.\n"
    )
    narrations = align_segments(parse_transcript(doc), [0.0, 10.0], total_duration=15.0)
    assert " ".join(narrations[0]) == "Costs can be 2.5 times lower after rebates."
    assert narrations[1] == []
def test_align_segments_stitcher_skips_when_no_terminator_in_reach():
    """No absorption happens when it could not complete a sentence.

    If none of B's first few cues ends with a sentence terminator,
    absorbing cues would merely relocate the same mid-sentence cut to a
    different word with no benefit, so the stitcher must leave both
    segments untouched.
    """
    doc = (
        "WEBVTT\n\n"
        # Neither cue carries sentence-terminating punctuation.
        "1\n00:00:00.000 --> 00:00:03.000\nThis sentence keeps going\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\nand still keeps going\n"
    )
    narrations = align_segments(parse_transcript(doc), [0.0, 10.0], total_duration=15.0)
    # With no terminator anywhere, the original midpoint bucketing stands.
    assert narrations[0] != [] and narrations[1] != []
    assert "This sentence keeps going" in " ".join(narrations[0])
    assert "and still keeps going" in " ".join(narrations[1])
def test_align_segments_stitcher_absorbs_lone_terminator_cue_from_b():
    """B's only cue is absorbed when it completes A's open sentence.

    Real-world heat-pump case: A's last cue ends "...so here in the
    Boston" with no terminator, and B's sole cue completes the sentence
    with "area, you can install a heat pump...freezing." The stitcher
    must pull that cue into A -- even though that empties B -- because
    the sentence belongs to the slide where the speaker started it.
    """
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:05.000\nIntro sentence finishes here.\n\n"
        # A's final cue stops mid-sentence...
        "2\n00:00:08.000 --> 00:00:11.000\nSo here in the Boston\n\n"
        # ...and B's only cue finishes it.
        "3\n00:00:11.500 --> 00:00:14.500\narea, heat pumps stay viable.\n"
    )
    narrations = align_segments(parse_transcript(doc), [0.0, 11.0], total_duration=15.0)
    text_a = " ".join(narrations[0])
    text_b = " ".join(narrations[1])
    assert "area, heat pumps stay viable." in text_a, text_a
    assert "Boston" in text_a and text_a.rstrip().endswith("viable.")
    # An emptied B is the accepted trade-off; the editor can still show
    # that slide with no auto-narration.
    assert text_b == ""
def test_cue_bounds_reflect_stitched_bucketing():
    """Audio bounds derive from the same buckets as the text.

    When the stitcher moves a cue across a boundary, the audio slice
    must extend along with it.
    """
    doc = (
        "WEBVTT\n\n"
        "1\n00:00:08.000 --> 00:00:11.000\nNow the question is how do we\n\n"
        "2\n00:00:11.500 --> 00:00:14.000\nincrease heat pump installation rates.\n\n"
        "3\n00:00:15.000 --> 00:00:18.000\nLet's look at three approaches.\n\n"
        "4\n00:00:18.500 --> 00:00:21.000\nFirst, financial incentives.\n"
    )
    bounds = cue_bounds_per_segment(parse_transcript(doc), [0.0, 12.0], total_duration=22.0)
    # Segment A's audio stretches to 14.0 to cover the absorbed cue.
    assert bounds[0] is not None and bounds[0][1] == 14.0
    # Segment B's audio begins at 15.0 -- the next fresh sentence -- not 11.5.
    assert bounds[1] is not None and bounds[1][0] == 15.0