project / autotune_script.py

Upload autotune_script.py

4d1a0a6 verified over 1 year ago

5.89 kB

	#!/usr/bin/python3
	from functools import partial
	from pathlib import Path
	import argparse
	import librosa
	import librosa.display
	import numpy as np
	import matplotlib.pyplot as plt
	import soundfile as sf
	import scipy.signal as sig
	import psola


	# Number of semitones in one octave. MIDI note numbers are reduced modulo this value to obtain
	# a pitch class, and scale degrees are shifted by it when wrapping around the octave.
	SEMITONES_IN_OCTAVE = 12


	def degrees_from(scale: str):
	    """Return the pitch classes (degrees) of the given scale, padded for octave wrap-around.

	    Parameters
	    ----------
	    scale : str
	        Key specification understood by ``librosa.key_to_degrees`` (e.g. ``'C:maj'``).

	    Returns
	    -------
	    np.ndarray
	        The degrees returned by librosa, followed by two extra entries: the lowest degree
	        raised by an octave and the highest degree lowered by an octave.
	    """
	    degrees = librosa.key_to_degrees(scale)
	    # Rounding a real-valued pitch class in [0, 12) to the nearest degree must also consider
	    # the scale's neighbours across the octave boundary. librosa lists the degrees starting
	    # from the tonic (e.g. 'A:min' -> 9, 11, 0, ...), so the first entry is not necessarily
	    # the lowest one; padding with ``degrees[0] + 12`` is therefore only right for keys whose
	    # tonic is the lowest pitch class. Use the actual extremes instead.
	    wrap_above = np.min(degrees) + SEMITONES_IN_OCTAVE
	    wrap_below = np.max(degrees) - SEMITONES_IN_OCTAVE
	    return np.concatenate((degrees, [wrap_above, wrap_below]))


	def closest_pitch(f0):
	    """Snap every pitch in ``f0`` (Hz) to the frequency of the nearest MIDI note.

	    NaN entries of ``f0`` stay NaN in the result.
	    """
	    rounded_midi = np.round(librosa.hz_to_midi(f0))
	    # Explicitly carry the NaN entries of the input over to the rounded notes.
	    rounded_midi[np.isnan(f0)] = np.nan
	    # Back from MIDI note numbers to frequencies in Hz.
	    return librosa.midi_to_hz(rounded_midi)


	def closest_pitch_from_scale(f0, scale):
	    """Move a single pitch ``f0`` (Hz) onto the nearest note of ``scale``.

	    A NaN input is returned as NaN. Otherwise the pitch is converted to a real-valued MIDI
	    note, compared against the scale degrees from ``degrees_from`` and shifted by its distance
	    to the nearest one before being converted back to Hz.
	    """
	    # NaN in, NaN out.
	    if np.isnan(f0):
	        return np.nan
	    scale_degrees = degrees_from(scale)
	    midi_pitch = librosa.hz_to_midi(f0)
	    # Real-valued pitch class: the MIDI note with whole octaves removed.
	    pitch_class = midi_pitch % SEMITONES_IN_OCTAVE
	    # Signed distance from every candidate degree to the input pitch class.
	    offsets = pitch_class - scale_degrees
	    nearest = np.argmin(np.abs(offsets))
	    # Removing the offset to the nearest degree places the note on the scale.
	    return librosa.midi_to_hz(midi_pitch - offsets[nearest])


	def aclosest_pitch_from_scale(f0, scale):
	    """Apply ``closest_pitch_from_scale`` to every entry of ``f0`` and smooth the result.

	    The corrected track is median-filtered with an 11-sample kernel; wherever the filtered
	    track is NaN, the unfiltered corrected value is used instead.
	    """
	    corrected = np.zeros_like(f0)
	    for idx, pitch in enumerate(f0):
	        corrected[idx] = closest_pitch_from_scale(pitch, scale)
	    # Median filtering additionally smooths the corrected pitch track.
	    smoothed = sig.medfilt(corrected, kernel_size=11)
	    # Fall back to the unfiltered values where filtering produced NaN.
	    nan_mask = np.isnan(smoothed)
	    smoothed[nan_mask] = corrected[nan_mask]
	    return smoothed


	def _plot_pitch_correction(audio, sr, f0, corrected_f0, frame_length, hop_length, fmin, fmax):
	    """Save a spectrogram of ``audio`` overlaid with both pitch tracks to 'pitch_correction.png'."""
	    stft = librosa.stft(audio, n_fft=frame_length, hop_length=hop_length)
	    time_points = librosa.times_like(stft, sr=sr, hop_length=hop_length)
	    log_stft = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
	    fig, ax = plt.subplots()
	    try:
	        img = librosa.display.specshow(log_stft, x_axis='time', y_axis='log', ax=ax, sr=sr,
	                                       hop_length=hop_length, fmin=fmin, fmax=fmax)
	        fig.colorbar(img, ax=ax, format="%+2.f dB")
	        ax.plot(time_points, f0, label='original pitch', color='cyan', linewidth=2)
	        ax.plot(time_points, corrected_f0, label='corrected pitch', color='orange', linewidth=1)
	        ax.legend(loc='upper right')
	        # Label and save through the explicit axes/figure objects rather than pyplot's
	        # implicit "current" state.
	        ax.set_ylabel('Frequency [Hz]')
	        ax.set_xlabel('Time [M:SS]')
	        fig.savefig('pitch_correction.png', dpi=300, bbox_inches='tight')
	    finally:
	        # Release the figure so repeated calls do not accumulate open figures.
	        plt.close(fig)


	def autotune(audio, sr, correction_function, plot=False):
	    """Pitch-correct ``audio`` and return the re-synthesized signal.

	    Parameters
	    ----------
	    audio : array
	        Audio samples; passed unchanged to ``librosa.pyin`` and ``psola.vocode``.
	    sr : number
	        Sampling rate of ``audio``; converted to ``int`` for ``psola.vocode``.
	    correction_function : callable
	        Takes the tracked pitch array and returns the target pitch array.
	    plot : bool, optional
	        If true, a spectrogram with the original and corrected pitch tracks is saved to
	        'pitch_correction.png' in the current working directory.

	    Returns
	    -------
	    The output of ``psola.vocode`` for the corrected pitch track.
	    """
	    # Analysis parameters: the hop is a quarter of the frame, and the pitch search range is
	    # limited to C2..C7.
	    frame_length = 2048
	    hop_length = frame_length // 4
	    fmin = librosa.note_to_hz('C2')
	    fmax = librosa.note_to_hz('C7')

	    # Pitch tracking using the PYIN algorithm; only the pitch track itself is used.
	    f0, _voiced_flag, _voiced_probabilities = librosa.pyin(audio,
	                                                           frame_length=frame_length,
	                                                           hop_length=hop_length,
	                                                           sr=sr,
	                                                           fmin=fmin,
	                                                           fmax=fmax)

	    # Apply the chosen adjustment strategy to the pitch.
	    corrected_f0 = correction_function(f0)

	    if plot:
	        _plot_pitch_correction(audio, sr, f0, corrected_f0, frame_length, hop_length, fmin, fmax)

	    # Pitch-shifting using the PSOLA algorithm.
	    return psola.vocode(audio, sample_rate=int(sr), target_pitch=corrected_f0, fmin=fmin, fmax=fmax)


	def main():
	    """Command-line entry point: auto-tune a vocals file and write the corrected copy.

	    The output is written next to the input file with '_pitch_corrected' appended to the
	    file stem. Exits with a usage error when the "scale" correction method is selected
	    without ``--scale``.
	    """
	    # Parse the command line arguments.
	    ap = argparse.ArgumentParser()
	    ap.add_argument('vocals_file')
	    ap.add_argument('--plot', '-p', action='store_true', default=False,
	                    help='if set, will produce a plot of the results')
	    ap.add_argument('--correction-method', '-c', choices=['closest', 'scale'], default='closest')
	    ap.add_argument('--scale', '-s', type=str, help='see librosa.key_to_degrees;'
	                                                    ' used only for the "scale" correction'
	                                                    ' method')
	    args = ap.parse_args()

	    # Without this check a missing scale would be passed on as None and only fail deep inside
	    # the correction step, after the audio has been loaded and pitch-tracked.
	    if args.correction_method == 'scale' and not args.scale:
	        ap.error('--scale is required when --correction-method is "scale"')

	    filepath = Path(args.vocals_file)

	    # Load the audio file.
	    y, sr = librosa.load(str(filepath), sr=None, mono=False)

	    # Only mono-files are handled. If stereo files are supplied, only the first channel is used.
	    if y.ndim > 1:
	        y = y[0, :]

	    # Pick the pitch adjustment strategy according to the arguments.
	    if args.correction_method == 'closest':
	        correction_function = closest_pitch
	    else:
	        correction_function = partial(aclosest_pitch_from_scale, scale=args.scale)

	    # Perform the auto-tuning.
	    pitch_corrected_y = autotune(y, sr, correction_function, args.plot)

	    # Write the corrected audio to an output file.
	    filepath = filepath.parent / (filepath.stem + '_pitch_corrected' + filepath.suffix)
	    sf.write(str(filepath), pitch_corrected_y, sr)


	# Run the command-line interface only when this file is executed as a script.
	if __name__ == '__main__':
	    main()