| """FFmpeg video stitching, clip splitting/shuffling, lyrics overlay. |
| |
| Takes generated video clips (one per 4-beat segment), splits each into |
| two halves, shuffles them with a distance constraint, builds a timeline |
| with dynamic pacing (4-beat cuts before the drop, 2-beat after), overlays |
| audio and lyrics text. |
| """ |
|
|
| import json |
| import random |
| import subprocess |
| import tempfile |
| from pathlib import Path |
|
|
|
|
| def _get_audio_path(run_dir: Path) -> Path: |
| """Find the original audio file one level above the run directory.""" |
| song_dir = run_dir.parent |
| for ext in [".wav", ".mp3", ".flac", ".m4a"]: |
| candidates = list(song_dir.glob(f"*{ext}")) |
| if candidates: |
| return candidates[0] |
| raise FileNotFoundError(f"No audio file found in {song_dir}") |
|
|
|
|
def _get_clip_duration(clip_path: Path) -> float:
    """Return the duration of *clip_path* in seconds, as reported by ffprobe."""
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "csv=p=0",
        str(clip_path),
    ]
    probe = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return float(probe.stdout.strip())
|
|
|
|
def _get_clip_fps(clip_path: Path) -> float:
    """Return the frame rate of the first video stream in *clip_path*.

    ffprobe reports r_frame_rate as a rational "num/den" string
    (e.g. "30000/1001"); it is converted to a float here.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=r_frame_rate",
        "-of", "csv=p=0",
        str(clip_path),
    ]
    probe = subprocess.run(cmd, capture_output=True, text=True, check=True)
    numerator, denominator = probe.stdout.strip().split("/")
    return int(numerator) / int(denominator)
|
|
|
|
def _trim_clip(clip_path: Path, start: float, duration: float, output_path: Path):
    """Re-encode a window of *clip_path* into *output_path*.

    Seeks to *start* seconds, keeps *duration* seconds, re-encodes with
    libx264, and strips the audio track (-an).
    """
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-ss", f"{start:.3f}",
            "-i", str(clip_path),
            "-t", f"{duration:.3f}",
            "-c:v", "libx264", "-preset", "fast",
            "-an",
            str(output_path),
        ],
        check=True,
        capture_output=True,
    )
|
|
|
|
| |
| |
| |
|
|
| |
| _KB_ZOOM = 0.45 |
|
|
| KEN_BURNS_EFFECTS = [ |
| "zoom_in", |
| "zoom_out", |
| ] |
|
|
|
|
| def _ken_burns_filter( |
| effect: str, n_frames: int, width: int, height: int, |
| ) -> str: |
| """Build an FFmpeg filter for a smooth Ken Burns zoom effect on video. |
| |
| Upscales the video 4x before applying zoompan with d=1 (one output |
| frame per input frame), then scales back to original size. The 4x |
| upscale makes integer rounding in zoompan negligible, eliminating |
| visible jitter. |
| """ |
| z = _KB_ZOOM |
| N = max(n_frames, 1) |
| W, H = width, height |
| |
| UP = 8 |
| UW, UH = W * UP, H * UP |
|
|
| if effect == "zoom_in": |
| zoom_expr = f"1+{z}*on/{N}" |
| elif effect == "zoom_out": |
| zoom_expr = f"1+{z}-{z}*on/{N}" |
| else: |
| return f"scale={W}:{H}" |
|
|
| return ( |
| f"scale={UW}:{UH}:flags=lanczos," |
| f"zoompan=z='{zoom_expr}':" |
| f"x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':" |
| f"d=1:s={UW}x{UH}," |
| f"scale={W}:{H}:flags=lanczos" |
| ) |
|
|
|
|
def _get_clip_dimensions(clip_path: Path) -> tuple[int, int]:
    """Return (width, height) in pixels of the first video stream."""
    cmd = [
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream=width,height",
        "-of", "csv=s=x:p=0", str(clip_path),
    ]
    probe = subprocess.run(cmd, capture_output=True, text=True, check=True)
    width, height = probe.stdout.strip().split("x")
    return int(width), int(height)
|
|
|
|
def _split_clip(clip_path: Path, clip_id: int) -> dict:
    """Register a clip's two halves without pre-splitting.

    The "first" half plays from the start, the "second" half plays from
    the end (offset back by the slot duration at trim time). This makes
    the two halves maximally different — no fixed midpoint split.

    Returns dict with the original path and full duration for each half.
    """
    full_duration = _get_clip_duration(clip_path)
    return dict(
        clip_id=clip_id,
        first=clip_path,
        second=clip_path,
        first_duration=full_duration,
        second_duration=full_duration,
    )
|
|
|
|
| def _build_sub_segments(segments: list[dict], drop_time: float | None) -> list[dict]: |
| """Build the final timeline of sub-segments. |
| |
| Before the drop: one slot per 4-beat segment. |
| After the drop: each 4-beat segment splits into two 2-beat slots |
| using the beat timestamps stored in the segment. |
| """ |
| sub_segments = [] |
|
|
| for seg in segments: |
| beats = seg.get("beats", [seg["start"], seg["end"]]) |
| is_after_drop = drop_time is not None and seg["start"] >= drop_time |
|
|
| if is_after_drop and len(beats) >= 3: |
| |
| mid_idx = len(beats) // 2 |
| mid_time = beats[mid_idx] |
|
|
| sub_segments.append({ |
| "start": seg["start"], |
| "end": mid_time, |
| "duration": round(mid_time - seg["start"], 3), |
| "lyrics": seg.get("lyrics", ""), |
| "parent_segment": seg["segment"], |
| }) |
| sub_segments.append({ |
| "start": mid_time, |
| "end": seg["end"], |
| "duration": round(seg["end"] - mid_time, 3), |
| "lyrics": "", |
| "parent_segment": seg["segment"], |
| }) |
| else: |
| |
| sub_segments.append({ |
| "start": seg["start"], |
| "end": seg["end"], |
| "duration": seg["duration"], |
| "lyrics": seg.get("lyrics", ""), |
| "parent_segment": seg["segment"], |
| }) |
|
|
| return sub_segments |
|
|
|
|
| def _shuffle_with_distance(pool: list[tuple], n_slots: int) -> list[tuple]: |
| """Select n_slots sub-clips maximising clip diversity and spacing. |
| |
| Shuffles clip IDs once, then repeats that order to fill all slots. |
| First pass uses "first" halves, second pass uses "second" halves. |
| Same clip is always exactly n_clips positions apart — maximum spacing. |
| |
| Each item is (clip_id, half_label, path, duration). |
| """ |
| by_clip: dict[int, list[tuple]] = {} |
| for item in pool: |
| by_clip.setdefault(item[0], []).append(item) |
|
|
| clip_ids = list(by_clip.keys()) |
| random.shuffle(clip_ids) |
|
|
| |
| result = [] |
| cycle = 0 |
| while len(result) < n_slots: |
| for cid in clip_ids: |
| if len(result) >= n_slots: |
| break |
| halves = by_clip[cid] |
| |
| half_idx = cycle % len(halves) |
| result.append(halves[half_idx]) |
| cycle += 1 |
|
|
| return result |
|
|
|
|
| |
# Font display name → bundled .ttf filename for the lyrics overlay.
FONTS = {
    "Bebas Neue": "BebasNeue-Regular.ttf",
    "Teko": "Teko-Bold.ttf",
    "Russo One": "RussoOne-Regular.ttf",
    "Staatliches": "Staatliches-Regular.ttf",
}

# Defaults used when the caller does not pick a font/color.
DEFAULT_FONT = "Bebas Neue"
DEFAULT_FONT_COLOR = "#FFF7D4"

# fonts/ directory one level above this module's directory.
_FONTS_DIR = Path(__file__).resolve().parent.parent / "fonts"
|
|
|
|
def font_names() -> list[str]:
    """Return the display names of all bundled fonts."""
    return [*FONTS]
|
|
|
|
def _get_font_path(font_name: str) -> Path:
    """Map a font display name to its bundled .ttf file path.

    Unknown names silently fall back to the default font.
    """
    if font_name not in FONTS:
        font_name = DEFAULT_FONT
    return _FONTS_DIR / FONTS[font_name]
|
|
|
|
| _SPOTIFY_BADGE = Path(__file__).resolve().parent.parent / "assets" / "spotify_badge.png" |
|
|
|
|
def _add_lyrics_overlay(
    video_path: Path,
    segments: list[dict],
    output_path: Path,
    audio_offset: float,
    font_name: str = DEFAULT_FONT,
    font_color: str = DEFAULT_FONT_COLOR,
    cover_art: Path | None = None,
    drop_time: float | None = None,
    song_name: str = "",
):
    """Add lyrics text and optional cover art overlay using FFmpeg filters.

    Args:
        video_path: Input video (already muxed with audio).
        segments: Segment dicts; per-word timings are read from each
            segment's "words" list (dicts with "word", "start", "end").
        output_path: Destination file for the rendered video.
        audio_offset: Song-time of the video's first frame; word and drop
            timestamps are shifted by this amount into video time.
        font_name: Display name resolved via _get_font_path.
        font_color: FFmpeg drawtext color string.
        cover_art: Optional cover image; only used together with drop_time.
        drop_time: Song-time of the drop; lyrics stop there and the
            cover/badge/title overlay starts there.
        song_name: Text for the '"<name>" out now!' title.
    """
    font_path = _get_font_path(font_name)

    # Lyrics are suppressed from the drop onwards when a cover overlay
    # will take over the screen.
    lyrics_cutoff = None
    if cover_art is not None and drop_time is not None:
        lyrics_cutoff = drop_time

    # Flatten per-segment word timings into one list, dropping empty words
    # and anything at/after the cutoff (words spanning it are clamped).
    all_words = []
    for seg in segments:
        for word_info in seg.get("words", []):
            word = word_info["word"].strip().lower()
            if not word:
                continue
            w_start = word_info["start"]
            w_end = word_info["end"]
            if lyrics_cutoff is not None and w_start >= lyrics_cutoff:
                continue
            if lyrics_cutoff is not None and w_end > lyrics_cutoff:
                w_end = lyrics_cutoff
            all_words.append({"word": word, "start": w_start, "end": w_end})

    # Close small gaps (< 0.5s) between consecutive words by meeting in the
    # middle, so the text doesn't flicker off between words.
    gap_threshold = 0.5
    for i in range(len(all_words) - 1):
        gap = all_words[i + 1]["start"] - all_words[i]["end"]
        if 0 < gap < gap_threshold:
            mid = all_words[i]["end"] + gap / 2
            all_words[i]["end"] = mid
            all_words[i + 1]["start"] = mid

    # One centered drawtext filter per word, enabled only during its window.
    # Apostrophes are swapped for the typographic U+2019 — presumably to
    # sidestep FFmpeg's single-quote escaping inside filter strings.
    drawtext_filters = []
    for w in all_words:
        escaped = (w["word"]
                   .replace("\\", "\\\\")
                   .replace("'", "\u2019")
                   .replace('"', '\\"')
                   .replace(":", "\\:")
                   .replace("%", "%%")
                   .replace("[", "\\[")
                   .replace("]", "\\]"))

        # Shift song-time word timings into video-relative time.
        start = w["start"] - audio_offset
        end = w["end"] - audio_offset

        drawtext_filters.append(
            f"drawtext=text='{escaped}'"
            f":fontfile='{font_path}'"
            f":fontsize=36"
            f":fontcolor={font_color}"
            f":x=(w-text_w)/2:y=(h-text_h)/2"
            f":enable='between(t,{start:.3f},{end:.3f})'"
        )

    has_cover = cover_art is not None and drop_time is not None
    has_lyrics = len(drawtext_filters) > 0

    # Nothing to draw: pass the input through with a stream copy.
    if not has_cover and not has_lyrics:
        subprocess.run([
            "ffmpeg", "-y", "-i", str(video_path),
            "-c", "copy", str(output_path),
        ], check=True, capture_output=True)
        return

    if has_cover:
        # All cover-related elements appear from the drop onwards.
        drop_start = drop_time - audio_offset
        enable = f"enable='gte(t,{drop_start:.3f})'"

        # Overlay geometry in pixels: cover art height, vertical nudge of
        # the art off true center, and Spotify badge height.
        art_h = 270
        art_y_offset = 10
        badge_h = 56

        # Probe the video height to position the art block vertically.
        vid_h = int(subprocess.run([
            "ffprobe", "-v", "error", "-select_streams", "v:0",
            "-show_entries", "stream=height", "-of", "csv=p=0",
            str(video_path),
        ], capture_output=True, text=True, check=True).stdout.strip())
        art_center = vid_h / 2 + art_y_offset
        art_top = art_center - art_h / 2
        art_bottom = art_center + art_h / 2

        # The final render is center-cropped to 9:16 (see assemble), so
        # compute that visible region to keep badge and title inside it.
        sq_side = vid_h * 9 / 16
        sq_top = (vid_h - sq_side) / 2
        sq_bottom = (vid_h + sq_side) / 2

        # Badge centered between the top of the visible region and the art.
        badge_center_y = (sq_top + art_top) / 2
        badge_y = int(badge_center_y - badge_h / 2)

        # Title centered between the art bottom and the visible-region bottom.
        title_center_y = int((art_bottom + sq_bottom) / 2)

        art_overlay_y = int(art_center - art_h / 2)

        # Filter graph: scale art (input 1) and badge (input 2), then
        # overlay both onto the video, gated by the drop enable.
        parts = [
            f"[1:v]scale=-2:{art_h}:flags=lanczos[art]",
            f"[2:v]scale=-2:{badge_h}:flags=lanczos[badge]",
            f"[0:v][art]overlay=(W-w)/2:{art_overlay_y}:{enable}[v1]",
            f"[v1][badge]overlay=(W-w)/2:{badge_y}:{enable}",
        ]

        # Title text, escaped the same way as the lyric words.
        title_escaped = (song_name
                         .replace("\\", "\\\\")
                         .replace("'", "\u2019")
                         .replace('"', '\\"')
                         .replace(":", "\\:")
                         .replace("%", "%%"))
        title_text = f'\\"{title_escaped}\\" out now!'.lower()
        parts[-1] += (
            f",drawtext=text='{title_text}'"
            f":fontfile='{font_path}'"
            f":fontsize=40"
            f":fontcolor={font_color}"
            f":x=(w-text_w)/2:y={title_center_y}-text_h/2"
            f":{enable}"
        )

        # Lyric drawtexts chain onto the final filter stage.
        if has_lyrics:
            parts[-1] += "," + ",".join(drawtext_filters)
        filter_chain = ";".join(parts)

        cmd = [
            "ffmpeg", "-y",
            "-i", str(video_path),
            "-i", str(cover_art),
            "-i", str(_SPOTIFY_BADGE),
            "-filter_complex", filter_chain,
            "-c:v", "libx264", "-preset", "fast",
            "-c:a", "copy",
            str(output_path),
        ]
        subprocess.run(cmd, check=True, capture_output=True)
    else:
        # Lyrics only: a simple -vf drawtext chain, no extra inputs.
        filter_chain = ",".join(drawtext_filters)
        subprocess.run([
            "ffmpeg", "-y",
            "-i", str(video_path),
            "-vf", filter_chain,
            "-c:v", "libx264", "-preset", "fast",
            "-c:a", "copy",
            str(output_path),
        ], check=True, capture_output=True)
|
|
|
|
def assemble(
    run_dir: str | Path,
    audio_path: str | Path | None = None,
    font_name: str = DEFAULT_FONT,
    font_color: str = DEFAULT_FONT_COLOR,
    cover_art: str | Path | None = None,
) -> Path:
    """Assemble final video with dynamic pacing, clip shuffling, and lyrics.

    Args:
        run_dir: Run directory containing clips/, segments.json, drop.json.
        audio_path: Path to the original audio. Auto-detected if None.
        font_name: Display name of the font for lyrics overlay.
        font_color: Hex color for lyrics text (e.g. '#FFF7D4').
        cover_art: Path to cover art image. Overlayed from the drop onwards.

    Returns:
        Path to the final video file.
    """
    run_dir = Path(run_dir)
    clips_dir = run_dir / "clips"
    output_dir = run_dir / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(run_dir / "segments.json") as f:
        segments = json.load(f)

    # Optional drop marker: switches pacing from 4-beat to 2-beat slots.
    # NOTE(review): assumes drop.json always carries a non-null "drop_time";
    # a null value would make the format call below raise — confirm upstream.
    drop_time = None
    drop_path = run_dir / "drop.json"
    if drop_path.exists():
        with open(drop_path) as f:
            drop_time = json.load(f).get("drop_time")
        print(f" Drop at {drop_time:.3f}s")
    else:
        print(" No drop detected — using uniform pacing")

    if audio_path is None:
        audio_path = _get_audio_path(run_dir)
    audio_path = Path(audio_path)

    # Register each segment's clip as two usable halves (no file splitting).
    sub_clips = []
    for seg in segments:
        idx = seg["segment"]
        clip_path = clips_dir / f"clip_{idx:03d}.mp4"
        if not clip_path.exists():
            print(f" Warning: {clip_path.name} not found, skipping")
            continue

        halves = _split_clip(clip_path, idx)
        sub_clips.append((idx, "first", halves["first"], halves["first_duration"]))
        sub_clips.append((idx, "second", halves["second"], halves["second_duration"]))
        print(f" Registered {clip_path.name} ({halves['first_duration']:.1f}s)")

    if not sub_clips:
        raise FileNotFoundError(f"No clips found in {clips_dir}")

    # Build the slot timeline: 4-beat slots pre-drop, 2-beat post-drop.
    sub_segments = _build_sub_segments(segments, drop_time)
    print(f" Timeline: {len(sub_segments)} slots "
          f"({len([s for s in sub_segments if s['duration'] < 1.5])} fast cuts)")

    # Assign a sub-clip to every slot, spacing out reuse of the same clip.
    assigned = _shuffle_with_distance(sub_clips.copy(), n_slots=len(sub_segments))

    # FPS/resolution are probed from the first assigned clip only —
    # presumably all generated clips share them; verify upstream.
    fps = _get_clip_fps(assigned[0][2])
    print(f" Source FPS: {fps}")

    trimmed_dir = run_dir / "clips_trimmed"
    trimmed_dir.mkdir(exist_ok=True)
    trimmed_paths = []

    clip_width, clip_height = _get_clip_dimensions(assigned[0][2])
    print(f" Clip resolution: {clip_width}x{clip_height}")

    # Frame-accurate pacing: track cumulative frames against the cumulative
    # target time so per-slot rounding errors never accumulate into drift.
    cumulative_frames = 0
    cumulative_target = 0.0

    for i, (sub_seg, (clip_id, half, clip_path, clip_dur)) in enumerate(
        zip(sub_segments, assigned)
    ):
        slot_dur = sub_seg["duration"]
        cumulative_target += min(slot_dur, clip_dur)
        target_frame = round(cumulative_target * fps)
        n_frames = max(1, target_frame - cumulative_frames)
        cumulative_frames = target_frame

        # "second" halves play from the end of the clip (seek back by the
        # slot duration); "first" halves play from the start.
        if half == "second":
            ss = max(0, clip_dur - slot_dur)
        else:
            ss = 0

        # Alternate Ken Burns zoom directions across consecutive slots.
        effect = KEN_BURNS_EFFECTS[i % len(KEN_BURNS_EFFECTS)]
        vf = _ken_burns_filter(effect, n_frames, clip_width, clip_height)

        # Render the slot: seek, take exactly n_frames, apply the zoom,
        # force a constant frame rate, drop audio.
        trimmed_path = trimmed_dir / f"slot_{i:03d}.mp4"
        cmd = [
            "ffmpeg", "-y",
            "-ss", f"{ss:.3f}",
            "-i", str(clip_path),
            "-frames:v", str(n_frames),
            "-vf", vf,
            "-c:v", "libx264", "-preset", "fast",
            "-r", str(int(fps)),
            "-an",
            str(trimmed_path),
        ]
        subprocess.run(cmd, check=True, capture_output=True)
        trimmed_paths.append(trimmed_path)
        actual_dur = n_frames / fps
        print(f" Slot {i}: clip {clip_id} ({half}, ss={ss:.1f}s, {effect}) → "
              f"{n_frames}f/{actual_dur:.3f}s (target {slot_dur:.3f}s)")

    # Concatenate all slot files losslessly via the concat demuxer.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, dir=str(run_dir)
    ) as f:
        for p in trimmed_paths:
            f.write(f"file '{p.resolve()}'\n")
        concat_list = f.name

    concat_path = output_dir / "video_only.mp4"
    subprocess.run([
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0",
        "-i", concat_list,
        "-c", "copy",
        str(concat_path),
    ], check=True, capture_output=True)

    # Mux in the song audio starting at the first segment's timestamp,
    # trimmed to the rendered video's exact duration.
    audio_start = segments[0]["start"]
    video_duration = cumulative_frames / fps

    with_audio_path = output_dir / "with_audio.mp4"
    subprocess.run([
        "ffmpeg", "-y",
        "-i", str(concat_path),
        "-ss", f"{audio_start:.3f}",
        "-i", str(audio_path),
        "-t", f"{video_duration:.3f}",
        "-c:v", "copy",
        "-c:a", "aac", "-b:a", "192k",
        "-map", "0:v:0", "-map", "1:a:0",
        "-shortest",
        str(with_audio_path),
    ], check=True, capture_output=True)

    # Burn in lyrics and (optionally) the cover art / badge / title overlay.
    overlay_path = output_dir / "with_overlay.mp4"
    cover_path = Path(cover_art) if cover_art else None
    song_name = run_dir.parent.name
    _add_lyrics_overlay(with_audio_path, segments, overlay_path, audio_start,
                        font_name=font_name, font_color=font_color,
                        cover_art=cover_path, drop_time=drop_time,
                        song_name=song_name)

    # Center-crop to 9:16 portrait; width forced even for libx264.
    final_path = output_dir / "final.mp4"
    subprocess.run([
        "ffmpeg", "-y",
        "-i", str(overlay_path),
        "-vf", "crop=2*floor(ih*9/16/2):ih:(iw-2*floor(ih*9/16/2))/2:0",
        "-c:v", "libx264", "-preset", "fast",
        "-c:a", "copy",
        str(final_path),
    ], check=True, capture_output=True)

    # Remove the temporary concat list.
    Path(concat_list).unlink(missing_ok=True)

    print(f"\nFinal video: {final_path}")
    print(f" Duration: {video_duration:.2f}s")
    print(f" Slots: {len(sub_segments)} ({len(segments)} original segments)")
    return final_path
|
|
|
|
def run(
    run_dir: str | Path,
    font_name: str = DEFAULT_FONT,
    font_color: str = DEFAULT_FONT_COLOR,
    cover_art: str | Path | None = None,
) -> Path:
    """Assemble final video from clips + audio.

    Thin wrapper around assemble() that announces progress first.

    Args:
        run_dir: Run directory (e.g. data/Gone/run_001/).
        font_name: Display name of the font for lyrics overlay.
        font_color: Hex color for lyrics text.
        cover_art: Path to cover art image (optional).

    Returns:
        Path to final video.
    """
    print("Assembling final video...")
    return assemble(
        run_dir,
        font_name=font_name,
        font_color=font_color,
        cover_art=cover_art,
    )
|
|
|
|
| if __name__ == "__main__": |
| import sys |
|
|
| if len(sys.argv) < 2: |
| print("Usage: python -m src.assembler <run_dir>") |
| print(" e.g. python -m src.assembler data/Gone/run_001") |
| sys.exit(1) |
|
|
| run(sys.argv[1]) |
|
|