Spaces:
Running
Running
Claude committed on
Commit ·
cae500e
1
Parent(s): b3c6e2c
Snap per-segment audio bounds to whole cues so clips don't end mid-sentence
Browse files
Audio slicing used each scene's [start_seconds, end_seconds) verbatim,
but scene-change timestamps rarely fall on sentence boundaries —
clips routinely cut mid-word at the start and mid-sentence at the end.
Add cue_bounds_per_segment in transcript.py: for each segment, return
the first-cue-start / last-cue-end of the cues align_segments already
bucketed into it. The orchestrator now extends each slice outward to
those bounds so the clip contains the full sentences whose text is
shown under the segment, falling back to the scene bounds when no cue
landed in the bucket.
- app/pipeline/orchestrator.py +15 -2
- app/pipeline/transcript.py +37 -8
- tests/test_transcript.py +48 -1
app/pipeline/orchestrator.py
CHANGED
|
@@ -18,7 +18,7 @@ from .llm_extract import LLMToggles, extract_all, is_enabled as llm_enabled
|
|
| 18 |
from .metadata import PageMetadata, SourceInfo, page_metadata_from_segments
|
| 19 |
from .ocr import ocr_frame
|
| 20 |
from .scenes import detect_and_extract, video_duration
|
| 21 |
-
from .transcript import Cue, align_segments, parse_transcript
|
| 22 |
from .types import SceneFrame
|
| 23 |
from .html_gen import Segment, build_segments
|
| 24 |
|
|
@@ -196,9 +196,22 @@ def run_pipeline(
|
|
| 196 |
# Audio shares the static/ folder with frames so the published bundle
|
| 197 |
# has one asset directory. The two namespaces don't collide because
|
| 198 |
# frames are scene_*.jpg and clips are segment_*.mp3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
audio_paths = slice_segment_audio(
|
| 200 |
inputs.video_path,
|
| 201 |
-
|
| 202 |
inputs.frames_dir,
|
| 203 |
)
|
| 204 |
log.info("Sliced %d/%d narration audio clips", len(audio_paths), len(kept))
|
|
|
|
| 18 |
from .metadata import PageMetadata, SourceInfo, page_metadata_from_segments
|
| 19 |
from .ocr import ocr_frame
|
| 20 |
from .scenes import detect_and_extract, video_duration
|
| 21 |
+
from .transcript import Cue, align_segments, cue_bounds_per_segment, parse_transcript
|
| 22 |
from .types import SceneFrame
|
| 23 |
from .html_gen import Segment, build_segments
|
| 24 |
|
|
|
|
| 196 |
# Audio shares the static/ folder with frames so the published bundle
|
| 197 |
# has one asset directory. The two namespaces don't collide because
|
| 198 |
# frames are scene_*.jpg and clips are segment_*.mp3.
|
| 199 |
+
# Snap each clip's bounds outward to whole cues so it doesn't end
|
| 200 |
+
# mid-sentence — scene-change boundaries rarely align with sentence
|
| 201 |
+
# boundaries. Falls back to scene bounds for segments with no cues.
|
| 202 |
+
cue_bounds = cue_bounds_per_segment(cues, boundaries, full_duration)
|
| 203 |
+
audio_segments: list[tuple[int, float, float]] = []
|
| 204 |
+
for i, f in enumerate(kept):
|
| 205 |
+
cb = cue_bounds[i] if i < len(cue_bounds) else None
|
| 206 |
+
if cb is None:
|
| 207 |
+
audio_segments.append((i, f.start_seconds, f.end_seconds))
|
| 208 |
+
else:
|
| 209 |
+
audio_segments.append(
|
| 210 |
+
(i, min(f.start_seconds, cb[0]), max(f.end_seconds, cb[1]))
|
| 211 |
+
)
|
| 212 |
audio_paths = slice_segment_audio(
|
| 213 |
inputs.video_path,
|
| 214 |
+
audio_segments,
|
| 215 |
inputs.frames_dir,
|
| 216 |
)
|
| 217 |
log.info("Sliced %d/%d narration audio clips", len(audio_paths), len(kept))
|
app/pipeline/transcript.py
CHANGED
|
@@ -121,14 +121,12 @@ def _group_into_paragraphs(
|
|
| 121 |
return paragraphs
|
| 122 |
|
| 123 |
|
| 124 |
-
def align_segments(
|
| 125 |
cues: list[Cue], boundaries: list[float], total_duration: float
|
| 126 |
-
) -> list[list[str]]:
|
| 127 |
-
"""
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
A cue belongs to the segment whose [start, next_start) window contains
|
| 131 |
-
the cue's midpoint. Returns a list of paragraphs per segment.
|
| 132 |
"""
|
| 133 |
if not boundaries:
|
| 134 |
return []
|
|
@@ -140,4 +138,35 @@ def align_segments(
|
|
| 140 |
if edges[i] <= mid < edges[i + 1]:
|
| 141 |
buckets[i].append(cue)
|
| 142 |
break
|
| 143 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
return paragraphs
|
| 122 |
|
| 123 |
|
| 124 |
+
def _bucket_cues(
|
| 125 |
cues: list[Cue], boundaries: list[float], total_duration: float
|
| 126 |
+
) -> list[list[Cue]]:
|
| 127 |
+
"""Bucket cues into the segment whose [start, next_start) window
|
| 128 |
+
contains the cue's midpoint. Shared by align_segments (text) and
|
| 129 |
+
cue_bounds_per_segment (audio).
|
|
|
|
|
|
|
| 130 |
"""
|
| 131 |
if not boundaries:
|
| 132 |
return []
|
|
|
|
| 138 |
if edges[i] <= mid < edges[i + 1]:
|
| 139 |
buckets[i].append(cue)
|
| 140 |
break
|
| 141 |
+
return buckets
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def align_segments(
|
| 145 |
+
cues: list[Cue], boundaries: list[float], total_duration: float
|
| 146 |
+
) -> list[list[str]]:
|
| 147 |
+
"""Assign cue text to each visual segment defined by `boundaries`.
|
| 148 |
+
|
| 149 |
+
`boundaries` is an ascending list of segment-start timestamps (seconds).
|
| 150 |
+
A cue belongs to the segment whose [start, next_start) window contains
|
| 151 |
+
the cue's midpoint. Returns a list of paragraphs per segment.
|
| 152 |
+
"""
|
| 153 |
+
return [_group_into_paragraphs(b) for b in _bucket_cues(cues, boundaries, total_duration)]
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def cue_bounds_per_segment(
|
| 157 |
+
cues: list[Cue], boundaries: list[float], total_duration: float
|
| 158 |
+
) -> list[tuple[float, float] | None]:
|
| 159 |
+
"""For each segment, return (first_cue_start, last_cue_end) of the cues
|
| 160 |
+
bucketed into it, or None if no cues landed there.
|
| 161 |
+
|
| 162 |
+
Used to snap per-segment audio slice bounds outward to whole cues so
|
| 163 |
+
clips don't end mid-sentence — scene-change boundaries rarely align
|
| 164 |
+
with sentence boundaries.
|
| 165 |
+
"""
|
| 166 |
+
out: list[tuple[float, float] | None] = []
|
| 167 |
+
for b in _bucket_cues(cues, boundaries, total_duration):
|
| 168 |
+
if not b:
|
| 169 |
+
out.append(None)
|
| 170 |
+
else:
|
| 171 |
+
out.append((b[0].start, b[-1].end))
|
| 172 |
+
return out
|
tests/test_transcript.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from app.pipeline.transcript import align_segments, parse_transcript
|
| 2 |
|
| 3 |
VTT_SAMPLE = """WEBVTT
|
| 4 |
|
|
@@ -145,3 +145,50 @@ def test_align_segments_breaks_on_length_at_sentence_boundary():
|
|
| 145 |
paras = align_segments(cues, [0.0], total_duration=35.0)[0]
|
| 146 |
# Expect at least two paragraphs once length threshold is exceeded.
|
| 147 |
assert len(paras) >= 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.pipeline.transcript import align_segments, cue_bounds_per_segment, parse_transcript
|
| 2 |
|
| 3 |
VTT_SAMPLE = """WEBVTT
|
| 4 |
|
|
|
|
| 145 |
paras = align_segments(cues, [0.0], total_duration=35.0)[0]
|
| 146 |
# Expect at least two paragraphs once length threshold is exceeded.
|
| 147 |
assert len(paras) >= 2
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def test_cue_bounds_snap_audio_outward_to_full_cues():
|
| 151 |
+
"""Scene-change boundaries rarely align with sentence boundaries, so
|
| 152 |
+
slicing audio at exactly each segment's [start, end) cuts mid-sentence.
|
| 153 |
+
cue_bounds_per_segment returns the first-cue-start / last-cue-end of
|
| 154 |
+
the cues bucketed into each segment so the orchestrator can extend
|
| 155 |
+
the audio slice outward to whole cues. The first cue here straddles
|
| 156 |
+
segment 0's start and the last cue extends past segment 1's end —
|
| 157 |
+
both must be reflected in the bounds.
|
| 158 |
+
"""
|
| 159 |
+
vtt = (
|
| 160 |
+
"WEBVTT\n\n"
|
| 161 |
+
"1\n00:00:09.000 --> 00:00:13.000\nFirst sentence crosses boundary.\n\n"
|
| 162 |
+
"2\n00:00:13.500 --> 00:00:18.000\nMiddle sentence in segment one.\n\n"
|
| 163 |
+
"3\n00:00:19.500 --> 00:00:22.500\nSecond segment opener.\n\n"
|
| 164 |
+
"4\n00:00:28.500 --> 00:00:33.000\nFinal sentence runs past end.\n"
|
| 165 |
+
)
|
| 166 |
+
cues = parse_transcript(vtt)
|
| 167 |
+
# Visual segment boundaries at 10 and 20; the last visual frame ends
|
| 168 |
+
# at 30 but the final cue extends to 33.
|
| 169 |
+
bounds = cue_bounds_per_segment(cues, [10.0, 20.0], total_duration=33.0)
|
| 170 |
+
assert len(bounds) == 2
|
| 171 |
+
assert bounds[0] is not None and bounds[1] is not None
|
| 172 |
+
# Segment 0 must cover from cue 1 start (9.0) to cue 2 end (18.0),
|
| 173 |
+
# i.e. extend backward past the segment start (10) to capture the
|
| 174 |
+
# word that began at 9.0 — that's the "mid-sentence cut" fix.
|
| 175 |
+
assert bounds[0][0] == 9.0
|
| 176 |
+
assert bounds[0][1] == 18.0
|
| 177 |
+
# Segment 1 must extend forward past the segment end (30) to 33.0
|
| 178 |
+
# so the trailing sentence isn't truncated.
|
| 179 |
+
assert bounds[1][0] == 19.5
|
| 180 |
+
assert bounds[1][1] == 33.0
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def test_cue_bounds_returns_none_for_segments_with_no_cues():
|
| 184 |
+
"""When no cue's midpoint lands in a segment, the orchestrator must
|
| 185 |
+
fall back to the scene's own bounds; signal that with None.
|
| 186 |
+
"""
|
| 187 |
+
vtt = (
|
| 188 |
+
"WEBVTT\n\n"
|
| 189 |
+
"1\n00:00:00.000 --> 00:00:05.000\nIntro.\n"
|
| 190 |
+
)
|
| 191 |
+
cues = parse_transcript(vtt)
|
| 192 |
+
bounds = cue_bounds_per_segment(cues, [0.0, 100.0], total_duration=200.0)
|
| 193 |
+
assert bounds[0] is not None
|
| 194 |
+
assert bounds[1] is None
|