"""
Audio synthesis utilities for beat tracking evaluation.

This module provides functions to:
- Generate click sounds for beats and downbeats
- Mix click tracks with original audio
- Save audio files with beat annotations

Example usage:
    from exp.data.audio import create_click_track, mix_audio, save_audio

    # Create click track
    clicks = create_click_track(
        beat_times=pred_beats,
        downbeat_times=pred_downbeats,
        duration=30.0,
        sr=16000
    )

    # Mix with original audio
    mixed = mix_audio(original_audio, clicks, click_volume=0.5)

    # Save to file
    save_audio(mixed, "output.wav", sr=16000)
"""
| |
|
| | import numpy as np |
| | from pathlib import Path |
| |
|
| |
|
def generate_click(
    frequency: float = 1000.0,
    duration: float = 0.02,
    sr: int = 16000,
    attack: float = 0.002,
    decay: float = 0.018,
) -> np.ndarray:
    """
    Generate a single click sound: a sine tone shaped by a linear envelope.

    The envelope ramps linearly from 0 to 1 over the first ``attack`` seconds
    and from 1 to 0 over the last ``decay`` seconds. A ramp longer than the
    click itself is clamped to the click length, so very short clicks still
    start and end at zero instead of raising or ending with a hard cut. When
    the two ramps overlap they are multiplied together.

    Args:
        frequency: Frequency of the click tone in Hz
        duration: Duration of the click in seconds
        sr: Sample rate
        attack: Attack time in seconds
        decay: Decay time in seconds

    Returns:
        Click waveform as numpy array with ``int(duration * sr)`` samples
        (empty if that count is not positive)
    """
    n_samples = max(int(duration * sr), 0)
    t = np.arange(n_samples) / sr
    wave = np.sin(2 * np.pi * frequency * t)

    # Clamp both ramps to the click length; an unclamped ramp would not fit
    # into the envelope slice it is assigned to.
    attack_samples = min(int(attack * sr), n_samples)
    decay_samples = min(int(decay * sr), n_samples)

    envelope = np.ones_like(t)
    if attack_samples > 0:
        envelope[:attack_samples] = np.linspace(0, 1, attack_samples)
    if decay_samples > 0:
        # Multiply rather than assign so an overlapping attack ramp is kept;
        # where the ramps do not overlap the envelope is 1 and the result is
        # exactly the decay ramp.
        envelope[n_samples - decay_samples:] *= np.linspace(1, 0, decay_samples)

    return wave * envelope
| |
|
| |
|
| | def create_click_track( |
| | beat_times: list[float] | np.ndarray, |
| | downbeat_times: list[float] | np.ndarray | None = None, |
| | duration: float | None = None, |
| | sr: int = 16000, |
| | beat_freq: float = 1000.0, |
| | downbeat_freq: float = 1500.0, |
| | click_duration: float = 0.03, |
| | ) -> np.ndarray: |
| | """ |
| | Create a click track from beat and downbeat times. |
| | |
| | Args: |
| | beat_times: List of beat times in seconds |
| | downbeat_times: List of downbeat times in seconds (optional) |
| | duration: Total duration in seconds (auto-detected if None) |
| | sr: Sample rate |
| | beat_freq: Frequency for beat clicks (Hz) |
| | downbeat_freq: Frequency for downbeat clicks (Hz) |
| | click_duration: Duration of each click in seconds |
| | |
| | Returns: |
| | Click track as numpy array |
| | """ |
| | beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([]) |
| | if downbeat_times is not None: |
| | downbeat_times = ( |
| | np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([]) |
| | ) |
| | else: |
| | downbeat_times = np.array([]) |
| |
|
| | |
| | if duration is None: |
| | all_times = np.concatenate([beat_times, downbeat_times]) |
| | if len(all_times) == 0: |
| | return np.array([]) |
| | duration = float(np.max(all_times)) + 1.0 |
| |
|
| | |
| | total_samples = int(duration * sr) |
| | output = np.zeros(total_samples, dtype=np.float32) |
| |
|
| | |
| | beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr) |
| | downbeat_click = generate_click( |
| | frequency=downbeat_freq, duration=click_duration, sr=sr |
| | ) |
| |
|
| | |
| | downbeat_set = set(np.round(downbeat_times, 3)) |
| |
|
| | |
| | for t in beat_times: |
| | sample_idx = int(t * sr) |
| | if sample_idx < 0 or sample_idx >= total_samples: |
| | continue |
| |
|
| | |
| | is_downbeat = np.round(t, 3) in downbeat_set |
| | click = downbeat_click if is_downbeat else beat_click |
| |
|
| | |
| | end_idx = min(sample_idx + len(click), total_samples) |
| | click_len = end_idx - sample_idx |
| | output[sample_idx:end_idx] += click[:click_len] |
| |
|
| | |
| | beat_set = set(np.round(beat_times, 3)) |
| | for t in downbeat_times: |
| | if np.round(t, 3) in beat_set: |
| | continue |
| |
|
| | sample_idx = int(t * sr) |
| | if sample_idx < 0 or sample_idx >= total_samples: |
| | continue |
| |
|
| | end_idx = min(sample_idx + len(downbeat_click), total_samples) |
| | click_len = end_idx - sample_idx |
| | output[sample_idx:end_idx] += downbeat_click[:click_len] |
| |
|
| | return output |
| |
|
| |
|
def mix_audio(
    audio: np.ndarray,
    click_track: np.ndarray,
    click_volume: float = 0.5,
) -> np.ndarray:
    """
    Mix original audio with a click track.

    The shorter signal is zero-padded to the length of the longer one. The
    audio is peak-normalized to 0.8 and the clicks to ``0.8 * click_volume``
    before summing, so ``click_volume`` is relative to the normalized audio.
    If the sum still exceeds full scale it is rescaled to a peak of 0.95.
    All-zero signals are left untouched rather than divided by zero.

    Args:
        audio: Original audio waveform
        click_track: Click track to overlay
        click_volume: Volume of clicks relative to audio (0.0 to 1.0)

    Returns:
        Mixed audio as float32 array whose length is the longer of the two
        inputs (empty when both inputs are empty)
    """
    audio = np.asarray(audio, dtype=np.float32)
    click_track = np.asarray(click_track, dtype=np.float32)

    max_len = max(len(audio), len(click_track))
    if max_len == 0:
        # Peak detection below is undefined on zero-size arrays.
        return np.zeros(0, dtype=np.float32)

    audio_padded = np.zeros(max_len, dtype=np.float32)
    click_padded = np.zeros(max_len, dtype=np.float32)
    audio_padded[: len(audio)] = audio
    click_padded[: len(click_track)] = click_track

    audio_max = np.abs(audio_padded).max()
    if audio_max > 0:
        audio_padded = audio_padded / audio_max * 0.8

    click_max = np.abs(click_padded).max()
    if click_max > 0:
        click_padded = click_padded / click_max * click_volume * 0.8

    mixed = audio_padded + click_padded

    # Final safety rescale so the result never exceeds full scale.
    max_val = np.abs(mixed).max()
    if max_val > 1.0:
        mixed = mixed / max_val * 0.95

    return mixed.astype(np.float32)
| |
|
| |
|
def create_comparison_audio(
    audio: np.ndarray,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    sr: int = 16000,
    click_volume: float = 0.5,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Build three comparison mixes of the audio with click annotations.

    Predictions are rendered with 1000 Hz beat / 1500 Hz downbeat clicks and
    the ground truth with 800 Hz beat / 1200 Hz downbeat clicks, so the two
    can be told apart by ear in the combined mix. Every click track spans
    exactly the length of ``audio``.

    Args:
        audio: Original audio waveform
        pred_beats: Predicted beat times
        pred_downbeats: Predicted downbeat times
        gt_beats: Ground truth beat times
        gt_downbeats: Ground truth downbeat times
        sr: Sample rate
        click_volume: Volume of clicks

    Returns:
        Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both)
    """
    total_seconds = len(audio) / sr

    def render(beats, downbeats, beat_hz, downbeat_hz):
        # All tracks share the audio's duration so they can be summed.
        return create_click_track(
            beats,
            downbeats,
            duration=total_seconds,
            sr=sr,
            beat_freq=beat_hz,
            downbeat_freq=downbeat_hz,
        )

    predicted = render(pred_beats, pred_downbeats, 1000.0, 1500.0)
    reference = render(gt_beats, gt_downbeats, 800.0, 1200.0)

    with_pred = mix_audio(audio, predicted, click_volume)
    with_gt = mix_audio(audio, reference, click_volume)
    with_both = mix_audio(audio, predicted + reference, click_volume)
    return with_pred, with_gt, with_both
| |
|
| |
|
| | def save_audio( |
| | audio: np.ndarray, |
| | path: str | Path, |
| | sr: int = 16000, |
| | ) -> None: |
| | """ |
| | Save audio to a WAV file. |
| | |
| | Args: |
| | audio: Audio waveform |
| | path: Output file path |
| | sr: Sample rate |
| | """ |
| | import scipy.io.wavfile as wavfile |
| |
|
| | path = Path(path) |
| | path.parent.mkdir(parents=True, exist_ok=True) |
| |
|
| | |
| | audio_int16 = (audio * 32767).astype(np.int16) |
| | wavfile.write(str(path), sr, audio_int16) |
| |
|
| |
|
if __name__ == "__main__":
    # Self-contained demo: overlay a synthetic click track on a plain sine
    # tone and write the result to a WAV file.
    print("Audio synthesis demo...")

    sr = 16000
    duration = 10.0

    # Stand-in "music": a 220 Hz sine at 0.3 amplitude.
    timeline = np.arange(int(duration * sr)) / sr
    music = 0.3 * np.sin(2 * np.pi * 220 * timeline)

    # A beat every 0.5 s, with every fourth beat marked as a downbeat.
    beats = np.arange(0, duration, 0.5).tolist()
    downbeats = np.arange(0, duration, 2.0).tolist()

    mixed = mix_audio(
        music,
        create_click_track(beats, downbeats, duration=duration, sr=sr),
        click_volume=0.6,
    )

    print(f"Created mixed audio: {len(mixed)} samples ({len(mixed) / sr:.2f}s)")
    print(f"Beats: {len(beats)}, Downbeats: {len(downbeats)}")

    save_audio(mixed, "/tmp/beat_click_demo.wav", sr=sr)
    print("Saved demo to /tmp/beat_click_demo.wav")
| |
|