In [1]:
import numpy, scipy, librosa, IPython.display
Load an audio file.
In [2]:
x, fs = librosa.load('zigeunerweisen.wav')
Play the audio file.
In [3]:
IPython.display.Audio(x, rate=fs)
Out[3]:
Goal: to identify the pitch of each note and replace each note with a pure tone of that pitch.
Detect onsets:
In [4]:
def get_onset_times(x, fs):
    # Detect onset frames and convert them to onset times in seconds.
    onset_frames = librosa.onset.onset_detect(y=x, sr=fs)
    return librosa.frames_to_time(onset_frames, sr=fs)
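For a quick sanity check, we might print the first few detected onset times in seconds (the variable name onset_times is just for illustration):
onset_times = get_onset_times(x, fs)
print(onset_times[:5])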
Estimate pitch using the autocorrelation method. The autocorrelation of a periodic segment peaks at a lag equal to the fundamental period, so we only search lags between fs/fmax and fs/fmin samples:
In [5]:
def estimate_pitch(segment, fs, fmin=50.0, fmax=2000.0):
    # Restrict the lag search range to periods between 1/fmax and 1/fmin seconds.
    i_min = int(fs/fmax)
    i_max = int(fs/fmin)
    r = librosa.autocorrelate(segment)
    r[:i_min] = 0
    r[i_max:] = 0
    # The lag of the largest remaining peak estimates the fundamental period.
    i = r.argmax()
    f0 = float(fs)/i
    return f0
Try it out on one frame:
In [6]:
f0 = estimate_pitch(x[:2048], fs)
print(f0)
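If desired, librosa.hz_to_note can map this frequency to the nearest note name:
print(librosa.hz_to_note(f0))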
Create a function to generate a pure tone at the specified frequency:
In [7]:
def generate_sine(f0, fs, n_duration):
    # Synthesize n_duration samples of a pure tone at frequency f0.
    n = numpy.arange(n_duration)
    return 0.2*numpy.sin(2*numpy.pi*f0*n/float(fs))
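As a quick check, we might audition one second of a 440 Hz tone (both values chosen arbitrarily here):
tone = generate_sine(440.0, fs, fs)
IPython.display.Audio(tone, rate=fs)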
Finally, write a function that puts it all together:
In [8]:
def transcribe_pitch(signal_in, fs):
    # Initialize the output signal.
    signal_out = numpy.zeros(len(signal_in))
    # Get onset times.
    onsets = get_onset_times(signal_in, fs)
    # For each inter-onset segment, estimate the pitch and synthesize a pure tone.
    for i in range(len(onsets)-1):
        n0 = int(onsets[i]*fs)
        n1 = int(onsets[i+1]*fs)
        pitch = estimate_pitch(signal_in[n0:n1], fs, fmin=60, fmax=4000)
        signal_out[n0:n1] = generate_sine(pitch, fs, n1-n0)
    return signal_out
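Note that anything before the first onset or after the last onset is left as silence. One possible tweak, sketched here rather than taken from the original, is to append the end of the signal as an extra boundary so the final segment is transcribed too:
# Sketch: append the signal's end time (in seconds) as a final boundary
# before looping over onset pairs.
onsets = numpy.append(get_onset_times(x, fs), len(x)/float(fs))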
Try it out on the input signal:
In [9]:
signal_out = transcribe_pitch(x, fs)
Play the synthesized transcription.
In [10]:
IPython.display.Audio(signal_out, rate=fs)
Out[10]:
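To keep the result, the synthesized signal can be written to disk, for example with the soundfile package (assumed to be installed; the filename is illustrative):
import soundfile
soundfile.write('transcription.wav', signal_out, fs)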