Claude committed on
Commit
cae500e
·
1 Parent(s): b3c6e2c

Snap per-segment audio bounds to whole cues so clips don't end mid-sentence

Browse files

Audio slicing used each scene's [start_seconds, end_seconds) verbatim,
but scene-change timestamps rarely fall on sentence boundaries —
clips routinely cut mid-word at the start and mid-sentence at the end.

Add cue_bounds_per_segment in transcript.py: for each segment, return
the first-cue-start / last-cue-end of the cues align_segments already
bucketed into it. The orchestrator now extends each slice outward to
those bounds so the clip contains the full sentences whose text is
shown under the segment, falling back to the scene bounds when no cue
landed in the bucket.

app/pipeline/orchestrator.py CHANGED
@@ -18,7 +18,7 @@ from .llm_extract import LLMToggles, extract_all, is_enabled as llm_enabled
18
  from .metadata import PageMetadata, SourceInfo, page_metadata_from_segments
19
  from .ocr import ocr_frame
20
  from .scenes import detect_and_extract, video_duration
21
- from .transcript import Cue, align_segments, parse_transcript
22
  from .types import SceneFrame
23
  from .html_gen import Segment, build_segments
24
 
@@ -196,9 +196,22 @@ def run_pipeline(
196
  # Audio shares the static/ folder with frames so the published bundle
197
  # has one asset directory. The two namespaces don't collide because
198
  # frames are scene_*.jpg and clips are segment_*.mp3.
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  audio_paths = slice_segment_audio(
200
  inputs.video_path,
201
- [(i, f.start_seconds, f.end_seconds) for i, f in enumerate(kept)],
202
  inputs.frames_dir,
203
  )
204
  log.info("Sliced %d/%d narration audio clips", len(audio_paths), len(kept))
 
18
  from .metadata import PageMetadata, SourceInfo, page_metadata_from_segments
19
  from .ocr import ocr_frame
20
  from .scenes import detect_and_extract, video_duration
21
+ from .transcript import Cue, align_segments, cue_bounds_per_segment, parse_transcript
22
  from .types import SceneFrame
23
  from .html_gen import Segment, build_segments
24
 
 
196
  # Audio shares the static/ folder with frames so the published bundle
197
  # has one asset directory. The two namespaces don't collide because
198
  # frames are scene_*.jpg and clips are segment_*.mp3.
199
+ # Snap each clip's bounds outward to whole cues so it doesn't end
200
+ # mid-sentence — scene-change boundaries rarely align with sentence
201
+ # boundaries. Falls back to scene bounds for segments with no cues.
202
+ cue_bounds = cue_bounds_per_segment(cues, boundaries, full_duration)
203
+ audio_segments: list[tuple[int, float, float]] = []
204
+ for i, f in enumerate(kept):
205
+ cb = cue_bounds[i] if i < len(cue_bounds) else None
206
+ if cb is None:
207
+ audio_segments.append((i, f.start_seconds, f.end_seconds))
208
+ else:
209
+ audio_segments.append(
210
+ (i, min(f.start_seconds, cb[0]), max(f.end_seconds, cb[1]))
211
+ )
212
  audio_paths = slice_segment_audio(
213
  inputs.video_path,
214
+ audio_segments,
215
  inputs.frames_dir,
216
  )
217
  log.info("Sliced %d/%d narration audio clips", len(audio_paths), len(kept))
app/pipeline/transcript.py CHANGED
@@ -121,14 +121,12 @@ def _group_into_paragraphs(
121
  return paragraphs
122
 
123
 
124
- def align_segments(
125
  cues: list[Cue], boundaries: list[float], total_duration: float
126
- ) -> list[list[str]]:
127
- """Assign cue text to each visual segment defined by `boundaries`.
128
-
129
- `boundaries` is an ascending list of segment-start timestamps (seconds).
130
- A cue belongs to the segment whose [start, next_start) window contains
131
- the cue's midpoint. Returns a list of paragraphs per segment.
132
  """
133
  if not boundaries:
134
  return []
@@ -140,4 +138,35 @@ def align_segments(
140
  if edges[i] <= mid < edges[i + 1]:
141
  buckets[i].append(cue)
142
  break
143
- return [_group_into_paragraphs(b) for b in buckets]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  return paragraphs
122
 
123
 
124
+ def _bucket_cues(
125
  cues: list[Cue], boundaries: list[float], total_duration: float
126
+ ) -> list[list[Cue]]:
127
+ """Bucket cues into the segment whose [start, next_start) window
128
+ contains the cue's midpoint. Shared by align_segments (text) and
129
+ cue_bounds_per_segment (audio).
 
 
130
  """
131
  if not boundaries:
132
  return []
 
138
  if edges[i] <= mid < edges[i + 1]:
139
  buckets[i].append(cue)
140
  break
141
+ return buckets
142
+
143
+
144
def align_segments(
    cues: list[Cue], boundaries: list[float], total_duration: float
) -> list[list[str]]:
    """Return the paragraph text for every visual segment.

    `boundaries` lists the segment-start timestamps (seconds) in ascending
    order. Cue-to-segment assignment is delegated to `_bucket_cues`, which
    places a cue in the segment whose [start, next_start) window contains
    the cue's midpoint; each resulting bucket is then turned into
    paragraphs by `_group_into_paragraphs`.

    The result has one entry per segment, each a list of paragraph strings.
    """
    per_segment_cues = _bucket_cues(cues, boundaries, total_duration)
    return list(map(_group_into_paragraphs, per_segment_cues))
154
+
155
+
156
def cue_bounds_per_segment(
    cues: list[Cue], boundaries: list[float], total_duration: float
) -> list[tuple[float, float] | None]:
    """Return the time span covered by the cues bucketed into each segment.

    Cues are assigned to segments by `_bucket_cues`. For every segment the
    result holds `(earliest_cue_start, latest_cue_end)` of its cues, or
    `None` when no cue landed in that segment.

    Used to snap per-segment audio slice bounds outward to whole cues so
    clips don't end mid-sentence — scene-change boundaries rarely align
    with sentence boundaries.

    The span is taken with min/max over the whole bucket rather than from
    its first and last elements, so it still covers every cue when cues
    overlap (an earlier cue may end after the last one) or arrive out of
    chronological order. For ordered, non-overlapping cues the result is
    the same as (first.start, last.end).
    """
    return [
        (min(c.start for c in bucket), max(c.end for c in bucket))
        if bucket
        else None
        for bucket in _bucket_cues(cues, boundaries, total_duration)
    ]
tests/test_transcript.py CHANGED
@@ -1,4 +1,4 @@
1
- from app.pipeline.transcript import align_segments, parse_transcript
2
 
3
  VTT_SAMPLE = """WEBVTT
4
 
@@ -145,3 +145,50 @@ def test_align_segments_breaks_on_length_at_sentence_boundary():
145
  paras = align_segments(cues, [0.0], total_duration=35.0)[0]
146
  # Expect at least two paragraphs once length threshold is exceeded.
147
  assert len(paras) >= 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.pipeline.transcript import align_segments, cue_bounds_per_segment, parse_transcript
2
 
3
  VTT_SAMPLE = """WEBVTT
4
 
 
145
  paras = align_segments(cues, [0.0], total_duration=35.0)[0]
146
  # Expect at least two paragraphs once length threshold is exceeded.
147
  assert len(paras) >= 2
148
+
149
+
150
def test_cue_bounds_snap_audio_outward_to_full_cues():
    """cue_bounds_per_segment reports whole-cue spans, not segment edges.

    Segment starts are 10s and 20s. Cue 1 begins at 9.0s, before segment
    0's start, yet is bucketed into segment 0; cue 4 is the last cue in
    segment 1 and ends at 33.0s. The reported bounds must carry both of
    those outer timestamps so a caller can widen its audio slice to full
    cues instead of cutting mid-sentence.
    """
    transcript = (
        "WEBVTT\n\n"
        "1\n00:00:09.000 --> 00:00:13.000\nFirst sentence crosses boundary.\n\n"
        "2\n00:00:13.500 --> 00:00:18.000\nMiddle sentence in segment one.\n\n"
        "3\n00:00:19.500 --> 00:00:22.500\nSecond segment opener.\n\n"
        "4\n00:00:28.500 --> 00:00:33.000\nFinal sentence runs past end.\n"
    )
    parsed = parse_transcript(transcript)
    segment_starts = [10.0, 20.0]

    bounds = cue_bounds_per_segment(parsed, segment_starts, total_duration=33.0)

    assert len(bounds) == 2
    assert all(span is not None for span in bounds)

    # Segment 0: cue 1 starts at 9.0 (earlier than the 10.0 segment start)
    # and cue 2 ends at 18.0.
    seg0_start, seg0_end = bounds[0]
    assert seg0_start == 9.0
    assert seg0_end == 18.0

    # Segment 1: cue 3 starts at 19.5 and cue 4 ends at 33.0.
    seg1_start, seg1_end = bounds[1]
    assert seg1_start == 19.5
    assert seg1_end == 33.0
181
+
182
+
183
def test_cue_bounds_returns_none_for_segments_with_no_cues():
    """A segment that received no cue is reported as None.

    The only cue spans 0-5s, so it belongs to the segment starting at 0s;
    the segment starting at 100s gets nothing and must be signalled with
    None so the caller can fall back to the scene's own bounds.
    """
    transcript = (
        "WEBVTT\n\n"
        "1\n00:00:00.000 --> 00:00:05.000\nIntro.\n"
    )
    parsed = parse_transcript(transcript)

    bounds = cue_bounds_per_segment(parsed, [0.0, 100.0], total_duration=200.0)

    populated, empty = bounds[0], bounds[1]
    assert populated is not None
    assert empty is None