File size: 14,421 Bytes
cae500e
b6f361a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f697ee6
 
b6f361a
f697ee6
 
b6f361a
f697ee6
b6f361a
 
 
 
 
 
 
 
 
 
 
 
f697ee6
 
 
410a4b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f697ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cae500e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23524b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31770ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169b4ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816dd45
 
 
 
 
23524b2
 
 
 
 
 
 
 
 
 
816dd45
 
 
23524b2
 
 
816dd45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23524b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
from app.pipeline.transcript import align_segments, cue_bounds_per_segment, parse_transcript

# Minimal WebVTT fixture: a "WEBVTT" header followed by three numbered,
# back-to-back cues covering 0s-14s. The third cue carries inline <b> markup
# so tests can check that tags are stripped from cue text.
VTT_SAMPLE = (
    "WEBVTT\n"
    "\n"
    "1\n"
    "00:00:00.000 --> 00:00:04.500\n"
    "Welcome to the lecture on photosynthesis.\n"
    "\n"
    "2\n"
    "00:00:04.500 --> 00:00:09.000\n"
    "Today we will cover the light-dependent reactions.\n"
    "\n"
    "3\n"
    "00:00:09.000 --> 00:00:14.000\n"
    "And then we will <b>discuss</b> the Calvin cycle.\n"
)

# Minimal SubRip fixture: no header, comma as the millisecond separator,
# three back-to-back cues covering 0s-12s.
SRT_SAMPLE = (
    "1\n"
    "00:00:00,000 --> 00:00:03,000\n"
    "First slide.\n"
    "\n"
    "2\n"
    "00:00:03,000 --> 00:00:07,500\n"
    "Second slide content here.\n"
    "\n"
    "3\n"
    "00:00:07,500 --> 00:00:12,000\n"
    "Third slide content here.\n"
)


def test_parse_vtt_basic():
    """The WebVTT sample parses into three cues with second-based times and tag-free text."""
    parsed = parse_transcript(VTT_SAMPLE)
    assert len(parsed) == 3

    opening = parsed[0]
    assert opening.start == 0.0
    assert opening.end == 4.5
    assert "photosynthesis" in opening.text

    # The <b>...</b> markup must be removed while the enclosed word survives.
    closing_text = parsed[2].text
    assert "<b>" not in closing_text
    assert "discuss" in closing_text


def test_parse_srt_basic():
    """The SRT sample (comma millisecond separators, no header) parses into three cues."""
    parsed = parse_transcript(SRT_SAMPLE)
    assert len(parsed) == 3

    middle = parsed[1]
    assert middle.start == 3.0
    assert middle.end == 7.5

    final = parsed[2]
    assert final.text == "Third slide content here."


def test_parse_handles_crlf():
    """CRLF line endings must not change how many cues are parsed."""
    crlf_source = VTT_SAMPLE.replace("\n", "\r\n")
    parsed = parse_transcript(crlf_source)
    assert len(parsed) == 3


def test_align_segments_buckets_by_midpoint():
    """Cues are grouped into the segment that contains their temporal midpoint."""
    parsed = parse_transcript(VTT_SAMPLE)
    # Segment starts at 0s and 7s give two visual segments: [0, 7) and [7, 14).
    per_segment = align_segments(parsed, [0.0, 7.0], total_duration=14.0)
    assert len(per_segment) == 2

    first_text = " ".join(per_segment[0])
    second_text = " ".join(per_segment[1])
    # Cues 1 and 2 have midpoints 2.25s and 6.75s, both inside the first segment.
    assert "photosynthesis" in first_text
    assert "light-dependent" in first_text
    # Cue 3 has midpoint 11.5s, which lies inside the second segment.
    assert "Calvin cycle" in second_text


def test_align_segments_no_boundaries_returns_empty():
    """With an empty boundary list there are no segments, so the result is an empty list."""
    parsed = parse_transcript(VTT_SAMPLE)
    result = align_segments(parsed, [], total_duration=14.0)
    assert result == []


def test_align_segments_handles_cues_outside_range():
    """Cues that all end before the only segment starts leave that segment empty."""
    parsed = parse_transcript(VTT_SAMPLE)
    # Every sample cue has ended by 14s; the single segment only starts at 100s.
    per_segment = align_segments(parsed, [100.0], total_duration=200.0)
    assert per_segment == [[]]


def test_align_segments_keeps_trailing_cues_when_total_duration_extends_past_last_boundary():
    """Cues spoken after the last kept visual segment still attach to that segment.

    The last visible segment can end before the video does, e.g. when the
    instructor-frame filter drops the final wrap-up shot. Transcript cues
    from that trailing portion must land in the last kept segment rather
    than being silently dropped. The orchestrator passes the actual video
    duration as `total_duration` so this works.
    """
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:05.000\nIntro to topic.\n\n"
        "2\n00:00:10.000 --> 00:00:15.000\nMain point of the lecture.\n\n"
        "3\n00:00:50.000 --> 00:00:55.000\nWrap-up after last visual.\n"
    )
    parsed = parse_transcript(source)
    # Visual segments start at 0s and 10s. The last kept frame's scene ends
    # at 30s, but the video runs to 60s and a wrap-up cue sits at ~52s.
    # Because total_duration is 60, that cue must fall into the 10s segment.
    per_segment = align_segments(parsed, [0.0, 10.0], total_duration=60.0)

    intro_text = " ".join(per_segment[0])
    assert "Intro to topic." in intro_text

    tail_text = " ".join(per_segment[1])
    assert "Main point" in tail_text
    assert "Wrap-up after last visual." in tail_text


def test_align_segments_breaks_paragraphs_on_long_gap():
    """A long silence between two cues of one segment starts a new paragraph."""
    # The cues are separated by a 5-second silence (3s to 8s), so they
    # should come back as two paragraphs.
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:03.000\nFirst point ends here.\n\n"
        "2\n00:00:08.000 --> 00:00:11.000\nSecond point starts now.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0], total_duration=15.0)
    paragraphs = per_segment[0]
    assert len(paragraphs) == 2
    assert paragraphs[0].startswith("First point")
    assert paragraphs[1].startswith("Second point")


def test_align_segments_breaks_on_speaker_change():
    """Adjacent cues with different speaker labels become separate paragraphs."""
    # The cues are contiguous in time (no gap), so only the change of
    # "NAME:" prefix distinguishes them.
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:02.000\nPROFESSOR: Welcome to class.\n\n"
        "2\n00:00:02.000 --> 00:00:04.000\nSTUDENT: Thank you.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0], total_duration=5.0)
    paragraphs = per_segment[0]
    assert len(paragraphs) == 2
    assert paragraphs[0].startswith("PROFESSOR:")
    assert paragraphs[1].startswith("STUDENT:")


def test_align_segments_breaks_on_length_at_sentence_boundary():
    """A very long run of text in one segment is split into several paragraphs."""
    # 20 repetitions of a 42-character sentence: about 840 characters.
    body = "This is a sentence that fills the buffer. " * 20
    source = (
        "WEBVTT\n\n"
        f"1\n00:00:00.000 --> 00:00:30.000\n{body.strip()}\n\n"
        "2\n00:00:30.000 --> 00:00:32.000\nFinal short cue.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0], total_duration=35.0)
    paragraphs = per_segment[0]
    # Once the length threshold is exceeded there must be at least two paragraphs.
    assert len(paragraphs) >= 2


def test_cue_bounds_snap_audio_outward_to_full_cues():
    """Per-segment audio bounds widen to whole cues instead of cutting at scene changes.

    Scene-change boundaries rarely coincide with sentence boundaries, so
    slicing audio at exactly each segment's [start, end) cuts mid-sentence.
    cue_bounds_per_segment reports, for each segment, the start of the first
    cue and the end of the last cue bucketed into it, so the orchestrator can
    extend the audio slice outward to whole cues. Here the first cue
    straddles segment 0's start and the last cue runs past segment 1's end;
    both must be reflected in the bounds.
    """
    source = (
        "WEBVTT\n\n"
        "1\n00:00:09.000 --> 00:00:13.000\nFirst sentence crosses boundary.\n\n"
        "2\n00:00:13.500 --> 00:00:18.000\nMiddle sentence in segment one.\n\n"
        "3\n00:00:19.500 --> 00:00:22.500\nSecond segment opener.\n\n"
        "4\n00:00:28.500 --> 00:00:33.000\nFinal sentence runs past end.\n"
    )
    parsed = parse_transcript(source)
    # Visual segments start at 10s and 20s; the last visual frame ends at
    # 30s, but the final cue runs on to 33s.
    spans = cue_bounds_per_segment(parsed, [10.0, 20.0], total_duration=33.0)
    assert len(spans) == 2
    assert spans[0] is not None
    assert spans[1] is not None

    # Segment 0 covers cue 1's start (9.0) through cue 2's end (18.0): it
    # reaches back before the segment start (10) to pick up the words that
    # began at 9.0. That is the "mid-sentence cut" fix.
    assert spans[0][0] == 9.0
    assert spans[0][1] == 18.0

    # Segment 1 reaches forward past the segment end (30) to 33.0 so the
    # trailing sentence is not truncated.
    assert spans[1][0] == 19.5
    assert spans[1][1] == 33.0


def test_cue_bounds_returns_none_for_segments_with_no_cues():
    """A segment that receives no cues reports None instead of a bounds pair.

    None tells the orchestrator to fall back to the scene's own bounds.
    """
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:05.000\nIntro.\n"
    )
    parsed = parse_transcript(source)
    # The only cue (0s-5s) belongs to the first segment; nothing reaches
    # the segment that starts at 100s.
    spans = cue_bounds_per_segment(parsed, [0.0, 100.0], total_duration=200.0)
    assert spans[0] is not None
    assert spans[1] is None


def test_align_segments_stitches_sentence_across_scene_boundary():
    """A sentence split across a scene boundary is reunited in the earlier segment.

    A single sentence often spans two cues. When a scene boundary falls
    between them, the cues bucket into different segments by midpoint and
    the sentence shows up split across the rendered narration: A ends
    mid-clause and B opens mid-sentence. The stitcher absorbs the trailing
    cue into A so the sentence lands intact in the segment where it began.
    """
    source = (
        "WEBVTT\n\n"
        # Cue 1: bucketed into segment A; opens a sentence (no terminator).
        "1\n00:00:08.000 --> 00:00:11.000\nNow the question is how do we\n\n"
        # Cue 2: midpoint falls in segment B; finishes the sentence.
        "2\n00:00:11.500 --> 00:00:14.000\nincrease heat pump installation rates.\n\n"
        # Cue 3: clearly a new sentence; belongs to segment B.
        "3\n00:00:15.000 --> 00:00:18.000\nLet's look at three approaches.\n\n"
        "4\n00:00:18.500 --> 00:00:21.000\nFirst, financial incentives.\n"
    )
    parsed = parse_transcript(source)
    # The boundary at 12s separates cue 1 (midpoint 9.5, segment A) from
    # cue 2 (midpoint 12.75, segment B). Without stitching, segment A would
    # end with "...how do we" and segment B would begin with
    # "increase heat pump...".
    per_segment = align_segments(parsed, [0.0, 12.0], total_duration=22.0)
    text_a = " ".join(per_segment[0])
    text_b = " ".join(per_segment[1])

    # The completed sentence must live entirely in segment A.
    assert "increase heat pump installation rates." in text_a, text_a
    assert "increase heat pump installation rates" not in text_b, text_b

    # Segment B keeps the genuinely new sentences that follow the stitched cue.
    assert "Let's look at three approaches." in text_b
    assert "First, financial incentives." in text_b


def test_align_segments_splits_continuation_cue_at_first_sentence_end():
    """Only the sentence-completing fragment of B's first cue moves back to A.

    B's first cue starts with the final word of A's sentence and then opens
    a new sentence; the new sentence must stay in B.
    """
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:18.000\n"
        "Welcome to climate education and how it can impact your\n\n"
        "2\n00:00:19.000 --> 00:00:24.000\n"
        "community. Heat pumps can heat and cool your home affordably.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0, 19.0], total_duration=25.0)
    text_a = " ".join(per_segment[0])
    text_b = " ".join(per_segment[1])

    # A gains just "community." and nothing of the following sentence.
    assert text_a.endswith("impact your community.")
    assert "Heat pumps can heat and cool your home affordably." not in text_a
    # B is left with exactly the new sentence.
    assert text_b == "Heat pumps can heat and cool your home affordably."


def test_align_segments_leaves_clean_sentence_boundaries_unchanged():
    """When A already ends on a full sentence, nothing is moved between segments."""
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:08.000\nFirst sentence ends cleanly.\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\nSecond segment starts cleanly.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0, 10.0], total_duration=15.0)
    # Each segment holds exactly its own single-sentence paragraph.
    assert per_segment[0] == ["First sentence ends cleanly."]
    assert per_segment[1] == ["Second segment starts cleanly."]


def test_align_segments_does_not_split_abbreviation_in_continuation():
    """The period in "Dr." is not treated as the end of the continued sentence."""
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:08.000\nWe talked with\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\nDr. Smith about heat pumps.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0, 10.0], total_duration=15.0)
    # The whole second cue joins segment A, which leaves segment B empty.
    text_a = " ".join(per_segment[0])
    assert text_a == "We talked with Dr. Smith about heat pumps."
    assert per_segment[1] == []


def test_align_segments_does_not_split_decimal_in_continuation():
    """The decimal point in "2.5" is not treated as the end of the continued sentence."""
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:08.000\nCosts can be\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\n2.5 times lower after rebates.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0, 10.0], total_duration=15.0)
    # The whole second cue joins segment A, which leaves segment B empty.
    text_a = " ".join(per_segment[0])
    assert text_a == "Costs can be 2.5 times lower after rebates."
    assert per_segment[1] == []


def test_align_segments_stitcher_skips_when_no_terminator_in_reach():
    """Without a reachable sentence terminator the stitcher changes nothing.

    If absorbing would not actually complete a sentence (none of B's first
    few cues supplies a sentence terminator), both segments must be left
    alone. Partial absorption would only move the same mid-sentence cut to
    a different word, with no benefit.
    """
    source = (
        "WEBVTT\n\n"
        # Neither cue ends with sentence-terminating punctuation.
        "1\n00:00:00.000 --> 00:00:03.000\nThis sentence keeps going\n\n"
        "2\n00:00:11.000 --> 00:00:14.000\nand still keeps going\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0, 10.0], total_duration=15.0)
    # No terminator anywhere, so no absorption: the original bucketing stands.
    assert per_segment[0] != []
    assert per_segment[1] != []
    assert "This sentence keeps going" in " ".join(per_segment[0])
    assert "and still keeps going" in " ".join(per_segment[1])


def test_align_segments_stitcher_absorbs_lone_terminator_cue_from_b():
    """B's only cue moves into A when it completes A's unfinished sentence.

    Real-world heat-pump case: A's last cue ends "...so here in the Boston"
    with no terminator, and B's only cue completes the sentence with
    "area, you can install a heat pump...freezing." The stitcher must absorb
    B's cue into A, even though that empties B, because the sentence belongs
    to A's slide (the speaker started it there).
    """
    source = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:05.000\nIntro sentence finishes here.\n\n"
        # A's last cue stops mid-sentence.
        "2\n00:00:08.000 --> 00:00:11.000\nSo here in the Boston\n\n"
        # B's only cue completes that sentence.
        "3\n00:00:11.500 --> 00:00:14.500\narea, heat pumps stay viable.\n"
    )
    parsed = parse_transcript(source)
    per_segment = align_segments(parsed, [0.0, 11.0], total_duration=15.0)
    text_a = " ".join(per_segment[0])
    text_b = " ".join(per_segment[1])

    assert "area, heat pumps stay viable." in text_a, text_a
    assert "Boston" in text_a
    assert text_a.rstrip().endswith("viable.")
    # B ends up empty. That trade-off is acceptable: the editor can still
    # show the slide with no auto-narration.
    assert text_b == ""


def test_cue_bounds_reflect_stitched_bucketing():
    """Audio bounds follow a cue that the stitcher moved across a boundary.

    Audio bounds derive from the same buckets as the text, so when the
    stitcher moves a cue into the previous segment, the audio extends with it.
    """
    source = (
        "WEBVTT\n\n"
        "1\n00:00:08.000 --> 00:00:11.000\nNow the question is how do we\n\n"
        "2\n00:00:11.500 --> 00:00:14.000\nincrease heat pump installation rates.\n\n"
        "3\n00:00:15.000 --> 00:00:18.000\nLet's look at three approaches.\n\n"
        "4\n00:00:18.500 --> 00:00:21.000\nFirst, financial incentives.\n"
    )
    parsed = parse_transcript(source)
    spans = cue_bounds_per_segment(parsed, [0.0, 12.0], total_duration=22.0)

    # Segment A's audio must reach 14.0 so it covers the absorbed cue.
    assert spans[0] is not None
    assert spans[0][1] == 14.0

    # Segment B's audio begins at 15.0 (the next sentence), not at 11.5.
    assert spans[1] is not None
    assert spans[1][0] == 15.0