In [ ]:
# Do all of the imports and set up inline plotting
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.interpolate import InterpolatedUnivariateSpline
from ripser import ripser
from persim import plot_diagrams
import scipy.io.wavfile
from IPython.display import clear_output
def getSlidingWindow(x, dim, Tau, dT):
    """
    Return a sliding window embedding of a time series,
    using arbitrary sampling.  Use spline interpolation
    to fill in values in windows not on the original grid.
    Parameters
    ----------
    x: ndarray(N)
        The original time series
    dim: int
        Dimension of sliding window (number of lags+1)
    Tau: float
        Length between lags, in units of time series samples
    dT: float
        Length between windows, in units of time series samples
    Returns
    -------
    X: ndarray(NWindows, dim)
        All sliding windows stacked up
    """
    N = len(x)
    NWindows = int(np.floor((N-dim*Tau)/dT)) #The number of windows
    if NWindows <= 0:
        print("Error: Tau too large for signal extent")
        return np.zeros((3, dim))
    X = np.zeros((NWindows, dim))
    spl = InterpolatedUnivariateSpline(np.arange(N), x)
    for i in range(NWindows):
        #The (possibly fractional) sample locations in this window
        idxx = dT*i + Tau*np.arange(dim)
        end = int(np.ceil(idxx[-1])) + 2
        #Only take windows that are within range
        if end >= len(x):
            X = X[0:i, :]
            break
        X[i, :] = spl(idxx)
    return X
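Before moving on, here is a quick usage sketch (illustrative only; the tone frequency, sample rate, and window parameters are made up for this demo): embedding a pure sinusoid with a window spanning roughly one period should produce a point cloud lying on a circle, which we can see with PCA.
In [ ]:
#Illustrative usage of getSlidingWindow on a made-up pure tone.
#A window of dim*Tau = 20 samples covers about one period of this tone.
fs_demo = 8000 #Assumed sample rate for the demo
t = np.arange(fs_demo)/float(fs_demo)
tone = np.cos(2*np.pi*440*t) #440 Hz sinusoid
Y_demo = getSlidingWindow(tone, dim=20, Tau=1, dT=5)
Z_demo = PCA(n_components=2).fit_transform(Y_demo)
plt.figure()
plt.scatter(Z_demo[:, 0], Z_demo[:, 1])
plt.title("Sliding window of a pure tone: a circle")
plt.show()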
Biphonation refers to the presence of two or more simultaneous frequencies in a signal which are "incommensurate"; that is, their frequencies are linearly independent over the rational numbers. In other words, the frequencies are "inharmonic." We saw a synthetic example in the first class of the form cos(t) + cos(πt). Today, we will examine how this manifests itself in biology with horse whinnies, which occur during states of high emotional valence. Biphonation is found during the steady state of a horse whinny.
Figure 1: Audio of a horse whinny. Courtesy of http://www.nature.com/articles/srep09989#s1
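As a warm-up before the recording, here is a minimal sketch (not part of the original notebook; the frequencies and window parameters are chosen purely for illustration) reproducing that class example: a signal with two incommensurate frequencies traces out a curve on a torus, and its 1D persistence diagram shows two prominent classes.
In [ ]:
#Minimal sketch of synthetic biphonation; parameters are illustrative
n = np.arange(1000)
x_syn = np.cos(0.2*n) + np.cos(0.2*np.pi*n) #Incommensurate frequencies
Y_syn = getSlidingWindow(x_syn, dim=40, Tau=1, dT=2)
#Mean-center and normalize, as we will do throughout
Y_syn = Y_syn - np.mean(Y_syn, 1)[:, None]
Y_syn = Y_syn/np.sqrt(np.sum(Y_syn**2, 1))[:, None]
dgms_syn = ripser(Y_syn, maxdim=1)['dgms']
plt.figure()
plot_diagrams(dgms_syn)
plt.title("Synthetic biphonation: two prominent H1 classes")
plt.show()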
Let's now load the audio from the horse whinny example, interactively plot the audio waveform, and listen to it.
In [ ]:
#Read in the audio file. Fs is the sample rate, and
#X is the audio signal
Fs, X = scipy.io.wavfile.read("horsewhinnie.wav")
plt.figure()
plt.plot(np.arange(len(X))/float(Fs), X)
plt.xlabel("Time (Seconds)")
plt.title("Horse Whinnie Waveform")
plt.show()
from IPython.display import Audio
# Load the local WAV file into an audio player
Audio('horsewhinnie.wav')
The code below extracts a subsection of the signal and performs a sliding window embedding + 1D persistent homology. Using the interactive plot of the audio waveform above, find two different time ranges to examine: one where the sound is a single tone, and one where biphonation occurs. Biphonation shows up as two prominent 1D persistence classes, which is why the code reports the second largest persistence.
In [ ]:
#These variables are used to adjust the window size
F0 = 493 #First fundamental frequency
G0 = 1433 #Second fundamental frequency
###TODO: Modify this variable (time in seconds)
time = 0.91
#Step 1: Extract an audio snippet starting at the chosen time
SigLen = 512 #The number of samples to take after the start time
iStart = int(round(time*Fs))
x = X[iStart:iStart + SigLen]
W = int(round(Fs/G0))
#Step 2: Get the sliding window embedding
Y = getSlidingWindow(x, W, 2, 2)
#Mean-center and normalize
Y = Y - np.mean(Y, 1)[:, None]
Y = Y/np.sqrt(np.sum(Y**2, 1))[:, None]
#Step 3: Do the 1D rips filtration
PDs = ripser(Y, maxdim=1)['dgms']
PD = PDs[1]
#Step 4: Figure out the second largest persistence
sP = 0
sPIdx = 0
if PD.shape[0] > 1:
    Pers = PD[:, 1] - PD[:, 0]
    sPIdx = np.argsort(-Pers)[1] #Index of the second most persistent class
    sP = Pers[sPIdx]
#Step 5: Plot the results
plt.figure(figsize=(8, 4))
plt.subplot(121)
plt.title("Starting At %g Seconds"%time)
plt.plot(time + np.arange(SigLen)/Fs, x)
plt.xlabel("Time")
plt.subplot(122)
plot_diagrams(PDs)
plt.plot([PD[sPIdx, 0]]*2, PD[sPIdx, :], 'r')
plt.scatter(PD[sPIdx, 0], PD[sPIdx, 1], 20, 'r')
plt.title("Second Largest Persistence: %g"%sP)
Music is full of repetition. For instance, there is usually a hierarchy of rhythm which determines how the music "pulses," or repeats itself in beat patterns. Often, a dominant rhythm level is deemed the "tempo" of the music. Typical tempos range from about 50 beats per minute to 200 beats per minute. Let's take a moderate tempo of 120 beats per minute, for instance, which occurs in the song "Don't Stop Believin'" by Journey. This corresponds to a beat period of 0.5 seconds. As we saw in the horse example, the audio is sampled at 44100 samples per second, so this corresponds to an ideal sliding window interval length of 22050 samples. Let's try to compute the sliding window embedding of the raw audio to see if the tempo manifests itself with TDA. First, we will load in "Don't Stop Believin'" below:
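To make the arithmetic in that paragraph explicit (a small illustrative computation, using only the numbers quoted above):
In [ ]:
#120 beats per minute -> a beat period of 0.5 seconds
tempo = 120.0 #Beats per minute
beat_period = 60.0/tempo #0.5 seconds per beat
Fs_cd = 44100 #Samples per second
print(beat_period, int(Fs_cd*beat_period)) #0.5 seconds, 22050 samples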
In [ ]:
Fs, X = scipy.io.wavfile.read("journey.wav") #Don't Stop Believing
X = X/(2.0**15) #Loaded in as 16 bit shorts, convert to float
plt.figure()
plt.plot(np.arange(len(X))/float(Fs), X)
plt.xlabel("Time (Seconds)")
plt.title("Don't Stop Believin")
plt.show()
Audio('journey.wav')
Now let's do a sliding window with a window length equal to the sample rate over 2, corresponding to the fact that a beat period is half a second in this song. We will need a large Tau and dT, since the sampling rate is so high; otherwise the TDA code will grind to a halt with far too many points. We will also need to set up special sliding window code that skips the spline interpolation step, which would be prohibitively slow at this sampling rate. In other words, we will assume that Tau and dT are integers. Here's the code that does all of this on the first three seconds of audio:
In [ ]:
#Sliding window code here assumes integer Tau and dT, so no interpolation
#is needed (for computational efficiency)
def getSlidingWindowInteger(x, dim, Tau, dT):
    N = len(x)
    NWindows = int(np.floor((N-dim*Tau)/dT)) #The number of windows
    if NWindows <= 0:
        print("Error: Tau too large for signal extent")
        return np.zeros((3, dim))
    X = np.zeros((NWindows, dim)) #Create a 2D array which will store all windows
    for i in range(NWindows):
        #Figure out the indices of the samples in this window
        idxx = np.array(dT*i + Tau*np.arange(dim), dtype=np.int32)
        X[i, :] = x[idxx]
    return X
#Note that dim*Tau here spans half a second of audio,
#since Fs is the sample rate
dim = round(Fs/200)
Tau = 100
dT = Fs//100 #Integer division so that dT stays an integer
Y = getSlidingWindowInteger(X[0:Fs*3], dim, Tau, dT)
print("Y.shape = ", Y.shape)
#Mean-center and normalize
Y = Y - np.mean(Y, 1)[:, None]
Y = Y/np.sqrt(np.sum(Y**2, 1))[:, None]
PDs = ripser(Y, maxdim=1)['dgms']
pca = PCA()
Z = pca.fit_transform(Y)
plt.figure(figsize=(8, 4))
plt.subplot(121)
plt.title("2D PCA")
plt.scatter(Z[:, 0], Z[:, 1])
plt.subplot(122)
plot_diagrams(PDs)
plt.title("Persistence Diagram")
plt.show()
Unfortunately, the sample rate is simply too high and the raw signal too messy for this approach to work. We will have to do some more sophisticated preprocessing before applying the algorithm.
In [ ]:
from MusicFeatures import *
#Compute the power spectrogram and audio novelty function
winSize = 512
hopSize = 256
plt.figure()
(S, novFn) = getAudioNoveltyFn(X, Fs, winSize, hopSize)
plt.imshow(np.log(S.T), cmap = 'afmhot', aspect = 'auto')
plt.title('Log-frequency power spectrogram')
plt.show()
You might notice that there are vertical streaks in a semi-periodic pattern. These correspond to "broadband percussive events," or, in other words, likely onsets for beats when drums occur. An audio novelty function is derived from a spectrogram by looking at the difference between successive frames to try to pick up on this (see the sketch after the next cell). The code below extracts the audio novelty function and displays it for the same audio snippet.
In [ ]:
plt.figure(figsize=(8, 4))
#Plot the spectrogram again
plt.subplot(211)
plt.imshow(np.log(S.T), cmap = 'afmhot', aspect = 'auto')
plt.ylabel('Frequency Bin')
plt.title('Log-frequency power spectrogram')
#Plot the audio novelty function
plt.subplot(212)
plt.plot(np.arange(len(novFn))*hopSize/float(Fs), novFn)
plt.xlabel("Time (Seconds)")
plt.ylabel('Audio Novelty')
plt.xlim([0, len(novFn)*float(hopSize)/Fs])
plt.show()
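As promised, here is a rough sketch of the idea behind an audio novelty function (an assumption about the general "spectral flux" technique, not necessarily how getAudioNoveltyFn in MusicFeatures is implemented): half-wave rectify the frame-to-frame difference of the spectrogram, then sum over frequency.
In [ ]:
#Hedged sketch of a spectral-flux novelty function; NOT necessarily
#what getAudioNoveltyFn does internally
def spectralFluxNovelty(S):
    #S: spectrogram, one row per time frame
    diff = S[1:, :] - S[:-1, :] #Change between successive frames
    diff[diff < 0] = 0 #Keep only increases in energy
    return np.sum(diff, axis=1) #Total increase across all frequency bins

novSketch = spectralFluxNovelty(S)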
Not only is the audio novelty function a cleaner signal, but it is also at a much lower sample rate. Since the "hop size" between each spectrogram window is 256 samples, the temporal resolution is coarser by that factor.
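Concretely (a small illustrative computation): with hopSize = 256, the novelty function's effective sample rate is Fs/hopSize, so a half-second beat period spans only about 86 novelty samples instead of 22050 audio samples.
In [ ]:
novFs = Fs/float(hopSize) #Effective sample rate of the novelty function
print(novFs) #~172.3 novelty samples per second
print(0.5*novFs) #A half-second beat spans only ~86 samples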
Let's now try our sliding window with a snippet of the audio novelty function of the previous example instead of the raw audio:
In [ ]:
(S, novFn) = getAudioNoveltyFn(X, Fs, winSize, hopSize)
#Take 3 seconds of the novelty function, starting 4 seconds in
fac = int(Fs/hopSize)
novFn = novFn[fac*4:fac*7]
#Make sure the window size is half of a second, noting that
#the audio novelty function has been downsampled by a "hopSize" factor
dim = 20
Tau = (Fs/2)/(float(hopSize)*dim)
dT = 1
Y = getSlidingWindowInteger(novFn, dim, Tau, dT)
print("Y.shape = ", Y.shape)
#Mean-center and normalize
Y = Y - np.mean(Y, 1)[:, None]
Y = Y/np.sqrt(np.sum(Y**2, 1))[:, None]
PDs = ripser(Y, maxdim=1)['dgms']
pca = PCA()
Z = pca.fit_transform(Y)
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.title("2D PCA")
plt.scatter(Z[:, 0], Z[:, 1])
plt.subplot(122)
plot_diagrams(PDs)
plt.title("Persistence Diagram")
plt.show()
In [ ]:
#Read in the audio file. Fs is the sample rate, and
#X is the audio signal
Fs, X = scipy.io.wavfile.read("speech.wav")
X = X/(2.0**15) #Loaded in as 16 bit shorts, convert to float
(S, novFn) = getAudioNoveltyFn(X, Fs, winSize, hopSize)
plt.figure()
plt.plot(np.arange(len(novFn))*hopSize/float(Fs), novFn)
plt.xlabel("Time (Seconds)")
plt.title("Audio Novelty Function for Speech")
plt.show()
#Audio must be the last expression in the cell for the player to render
Audio('speech.wav')
In [ ]:
#Get the novelty function for the first three seconds, and use the
#exact same parameters as before
novFn = novFn[0:int((Fs/hopSize)*3)]
dim = 20
Tau = (Fs/2)/(float(hopSize)*dim)
dT = 1
Y = getSlidingWindowInteger(novFn, dim, Tau, dT)
print("Y.shape = ", Y.shape)
#Mean-center and normalize
Y = Y - np.mean(Y, 1)[:, None]
Y = Y/np.sqrt(np.sum(Y**2, 1))[:, None]
PDs = ripser(Y, maxdim=1)['dgms']
pca = PCA()
Z = pca.fit_transform(Y)
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.title("2D PCA")
plt.scatter(Z[:, 0], Z[:, 1])
plt.subplot(122)
plot_diagrams(PDs[1], labels=['H1'])
plt.title("Persistence Diagram")
plt.show()