| """Lyrics-to-beat mapping: group beats into segments and assign lyrics.""" |
|
|
| import json |
| from pathlib import Path |
| from typing import Optional |
|
|
|
|
def segment_lyrics(
    beats: list[dict],
    lyrics: list[dict],
    beats_per_segment: int = 4,
) -> list[dict]:
    """Map timestamped lyrics onto beat-grouped segments.

    Groups consecutive beats into segments (e.g. 4 beats = 1 bar in 4/4 time)
    and assigns each word to the segment in which it starts. Adjacent segments
    share their boundary beat: the end of one segment is the start of the
    next. A trailing group with fewer than ``beats_per_segment`` beats still
    forms a (shorter) final segment.

    Words that start outside every segment are kept rather than dropped: a
    word starting before the first segment is attached to the first segment,
    and a word starting at or after the end of the last segment is attached
    to the last segment.

    Args:
        beats: List of beat dicts with "beat" and "time" keys; only "time" is
            read here. Assumed to be in ascending time order (not checked).
        lyrics: List of word dicts with "word", "start", "end" keys; "start"
            decides the segment and "word" builds the segment text.
        beats_per_segment: Number of beats per segment. 4 = one bar in 4/4
            time. Must be at least 1.

    Returns:
        List of segment dicts with keys:
            - segment: 1-indexed segment number
            - start: start time in seconds (rounded to 3 decimals)
            - end: end time in seconds (rounded to 3 decimals)
            - duration: segment duration in seconds (rounded to 3 decimals)
            - beats: rounded times of every beat in the segment, including
              the closing boundary beat
            - lyrics: raw lyrics text for this segment (may be empty)
            - words: the word dicts assigned to this segment (the same
              objects that were passed in, not copies)
        The list is empty when fewer than two beats are given, because a
        segment needs both a start and an end beat.

    Raises:
        ValueError: If ``beats_per_segment`` is less than 1.
    """
    if beats_per_segment < 1:
        raise ValueError(
            f"beats_per_segment must be at least 1, got {beats_per_segment}"
        )

    beat_times = [b["time"] for b in beats]
    last_idx = len(beat_times) - 1

    segments = []
    # Stop before the final beat: it can only close a segment, never open one.
    for seg_num, i in enumerate(range(0, last_idx, beats_per_segment), start=1):
        # Clamp so a short trailing group ends on the final beat.
        end_idx = min(i + beats_per_segment, last_idx)
        start = beat_times[i]
        end = beat_times[end_idx]
        segments.append({
            "segment": seg_num,
            "start": round(start, 3),
            "end": round(end, 3),
            "duration": round(end - start, 3),
            "beats": [round(t, 3) for t in beat_times[i:end_idx + 1]],
            "lyrics": "",
            "words": [],
        })

    if not segments:
        return segments

    first_seg, last_seg = segments[0], segments[-1]
    for word in lyrics:
        word_start = word["start"]
        for seg in segments:
            # Half-open interval so a word on a shared boundary beat lands in
            # the later segment only.
            if seg["start"] <= word_start < seg["end"]:
                seg["words"].append(word)
                break
        else:
            # No segment contains the word: clamp it to the nearest edge so
            # pickup words before the first beat and trailing words after the
            # last beat are both preserved.
            target = first_seg if word_start < first_seg["start"] else last_seg
            target["words"].append(word)

    for seg in segments:
        seg["lyrics"] = " ".join(w["word"] for w in seg["words"])

    return segments
|
|
|
|
| def save_segments( |
| segments: list[dict], |
| output_path: str | Path, |
| ) -> Path: |
| """Save segments to a JSON file. |
| |
| Args: |
| segments: List of segment dicts. |
| output_path: Path to save the JSON file. |
| |
| Returns: |
| Path to the saved JSON file. |
| """ |
| output_path = Path(output_path) |
| output_path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| with open(output_path, "w") as f: |
| json.dump(segments, f, indent=2) |
|
|
| return output_path |
|
|
|
|
def run(
    data_dir: str | Path,
    beats_per_segment: int = 4,
) -> list[dict]:
    """Full segmentation pipeline: load beats + lyrics, segment, and save.

    Reads ``beats.json`` and ``lyrics.json`` from ``data_dir``, passes them
    to ``segment_lyrics``, and writes the result to ``segments.json`` in the
    same directory via ``save_segments``.

    Args:
        data_dir: Song data directory containing beats.json and lyrics.json
            (e.g. data/Gone/).
        beats_per_segment: Number of beats per segment (4 = one bar).

    Returns:
        List of segment dicts, as returned by ``segment_lyrics``.
    """
    data_dir = Path(data_dir)

    # JSON text is UTF-8; pin the encoding instead of relying on the
    # platform's locale default, which would mis-decode non-ASCII lyrics
    # on systems where that default is not UTF-8.
    with open(data_dir / "beats.json", encoding="utf-8") as f:
        beats = json.load(f)

    with open(data_dir / "lyrics.json", encoding="utf-8") as f:
        lyrics = json.load(f)

    segments = segment_lyrics(beats, lyrics, beats_per_segment=beats_per_segment)
    save_segments(segments, data_dir / "segments.json")

    return segments
|
|
|
|
if __name__ == "__main__":
    import sys

    # The single required positional argument is the song data directory.
    if len(sys.argv) <= 1:
        for usage_line in (
            "Usage: python -m src.segmenter <data_dir>",
            " e.g. python -m src.segmenter data/Gone",
        ):
            print(usage_line)
        sys.exit(1)

    results = run(sys.argv[1])
    print(f"Created {len(results)} segments:\n")

    # One summary line per segment; segments without words are shown as
    # instrumental.
    for entry in results:
        text = entry["lyrics"]
        if text:
            label = f'"{text}"'
        else:
            label = "(instrumental)"
        number, begin, finish, length = (
            entry["segment"],
            entry["start"],
            entry["end"],
            entry["duration"],
        )
        print(f" Seg {number}: {begin:.3f}s - {finish:.3f}s ({length:.3f}s) {label}")
|
|