In [1]:
from essentia.standard import *
from pylab import plot, show, figure, imshow, axis, subplot
from numpy import *
# --- constants ---
sr = 44100  # sample rate in Hz, handed to the loader below
inputDir = 'input/'
outputDir = 'output/'
#inputFile = 'Safari_20141129_1443_CommercialDetection_irregular_short.aiff'
inputFile = 'Safari_20141212_1103_CommercialDetection_pandora.aiff'
# Parenthesised so the line is valid under both Python 2 and Python 3.
print('processing: ' + inputFile)

# --- setup modules ---
# Load the recording as a single-channel signal; the sample rate is passed
# explicitly so that `sr` is the one place it is configured.
audio = MonoLoader(filename = inputDir+inputFile, sampleRate = sr)()
w = Windowing(type = 'hann')
spectrum = Spectrum() # FFT() would return the complex FFT, here we just want the magnitude spectrum
mfcc = MFCC()
rms = RMS()
# Level extraction over 20480-sample frames advanced in 5120-sample hops.
levels = LevelExtractor(frameSize = 20480, hopSize = 5120)
In [2]:
levels
Out[2]:
In [3]:
loudness = Loudness()
In [3]:
pool = essentia.Pool()
In [4]:
pool.add('lowlevel.levels',levels(audio))
In [60]:
(pool['lowlevel.levels'][0][5400:5600] < 1.0e-5)
Out[60]:
In [62]:
(4*9+8)*.05
Out[62]:
In [5]:
%pylab
In [6]:
# Frame-wise low-level descriptors: step through the signal in 2048-sample
# frames with a 512-sample hop and append, for every frame, the MFCC
# coefficients and the RMS of the windowed frame to the pool.
for frame in FrameGenerator(audio, frameSize = 2048, hopSize = 512):
    windowed = w(frame)  # apply the window once and reuse it for both descriptors
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(windowed))
    pool.add('lowlevel.mfcc',mfcc_coeffs)
    rms512 = rms(windowed)
    pool.add('lowlevel.rms512',rms512)
In [164]:
# Four stacked panels for a visual comparison of the descriptors held in the pool.
subplot(4,1,1)
# Panel 1: RMS values stored under 'lowlevel.rms512'.
plot(pool['lowlevel.rms512'])
subplot(4,1,2)
# Panel 2: first row of the stored LevelExtractor output.
plot(pool['lowlevel.levels'][0])
subplot(4,1,3)
# Panel 3: silence mask stored under 'silenceDetected' (set in the silenceThreshold cell).
plot(pool['silenceDetected'])
subplot(4,1,4)
# NOTE(review): no visible cell adds 'lowlevel.variance' to the pool -- this panel
# depends on kernel state from a cell that is not shown; confirm before re-running.
plot(pool['lowlevel.variance'][0])
Out[164]:
In [41]:
plot(pool['lowlevel.mfcc'].T[3])
Out[41]:
In [42]:
sil = StartStopSilence()
In [43]:
# NOTE(review): `frame` here is whatever the last iteration of the FrameGenerator
# loop left behind, so only that single windowed frame is passed to `sil`.
pool.add('lowlevel.silence',sil(w(frame)))
In [48]:
pool['lowlevel.silence'].shape
Out[48]:
In [64]:
(pool['lowlevel.levels'][0][5400:5600] < 9.0e-5)
Out[64]:
In [65]:
(pool['lowlevel.levels'][0][5400:5600] < 1.0e-5)
Out[65]:
In [8]:
silenceThreshold = 1.0e-4
In [10]:
# Discard the mask stored by a previous run of this cell before writing a new one.
# NOTE(review): on a fresh kernel the key does not exist yet -- confirm that
# Pool.remove tolerates a missing descriptor.
pool.remove('silenceDetected')
# Store the comparison "level below silenceThreshold" for the first row of the
# levels, converted to an essentia array (presumably 1.0 / 0.0 floats -- the
# later edge-marking cell compares the values against 1.0 and 0).
pool.set('silenceDetected',essentia.array(pool['lowlevel.levels'][0] < silenceThreshold))
In [11]:
essentia.array(pool['lowlevel.levels'][0][5400:5600] < silenceThreshold)
Out[11]:
In [13]:
plot(pool['silenceDetected'])
Out[13]:
In [141]:
pool['lowlevel.levels'].shape
Out[141]:
In [84]:
15595*0.0019349961666666668
Out[84]:
In [78]:
# NOTE: this notebook is Python 2 (see the print statement in the first cell), where
# int / int truncates; the following cell repeats the division with a float operand.
1810/60
Out[78]:
In [79]:
1810.0/60
Out[79]:
In [80]:
1/60
Out[80]:
In [82]:
# Minutes per level frame. 0.11609977 is presumably the LevelExtractor hop in
# seconds (hopSize 5120 / sr 44100 ~= 0.11609977) -- TODO confirm and derive it
# from those constants instead of hard-coding it.
.11609977/60
Out[82]:
In [85]:
0.0019349961666666668 * 5450
Out[85]:
In [86]:
15595*.11609977
Out[86]:
In [87]:
5450*.11609977
Out[87]:
In [142]:
pool['lowlevel.levels'][0][5454]
Out[142]:
In [92]:
pool['silenceDetected'][0][5454]
Out[92]:
In [143]:
pool['silenceDetected'].shape
Out[143]:
In [133]:
pool['silenceDetected'].T.shape
Out[133]:
In [145]:
pool['silenceDetected'].T[5454]
Out[145]:
In [99]:
30/.11609977
Out[99]:
In [100]:
15/.11609977
Out[100]:
In [101]:
16/.11609977
Out[101]:
In [102]:
18/.11609977
Out[102]:
In [103]:
17/.11609977
Out[103]:
In [108]:
16.75/.11609977
Out[108]:
In [109]:
# Threshold of 20% for 15s commercials, i.e. +/-3s (12-18s)
# Threshold of 15% for 30s commercials, i.e. +/-4.5s (25.5-34.5s)
In [110]:
# Commercial from 14715-14977 (~30s)
# Commercial from 14990-15142 (~15s)
In [111]:
14715*.11609977
Out[111]:
In [113]:
1708.0/60
Out[113]:
In [114]:
1702.5/.11609977
Out[114]:
In [151]:
pool['silenceDetected'][14677:14750]
Out[151]:
In [152]:
14678*.11609977
Out[152]:
In [16]:
s = pool['silenceDetected']
In [17]:
s
Out[17]:
In [45]:
# NOTE(review): `ss` is not assigned by any cell above this one; this only works
# after one of the later cells that assign `ss` has been executed.
ss.shape
In [156]:
var = Variance()
In [42]:
# Independent copy of the silence mask. The call parentheses matter: a bare
# `s.copy` binds the method object, so `ss.shape` and indexing would fail.
ss = s.copy()
In [96]:
# Mark the edges of silent runs in the mask `s` (1.0 = silent, 0 = not silent).
# The result is written to the iterator's allocated second operand, which later
# cells read back as `it.operands[1]`:
#   1 -> silent frame whose predecessor is not silent (beginning of silence)
#   2 -> silent frame whose successor is not silent   (end of silence)
#   0 -> everything else
# A frame that qualifies for both markers is reported as a beginning only.
it = nditer([s, None],
            flags=['c_index', 'refs_ok'],
            op_flags=[['readonly'],['writeonly','allocate']])
lastIdx = len(s) - 1
# The first frame has no predecessor to compare against, so it never gets a marker.
if (it.index == 0):
    it[1] = 0
    it.iternext()
while not it.finished:
    idx = it.index
    if (s[idx] == 1.0):
        if (s[idx-1] == 0):
            it[1] = 1 # beginning of silence
        elif (idx < lastIdx and s[idx+1] == 0):
            # The bounds check keeps the look-ahead from running past the end
            # of the mask when a silent run reaches the final frame.
            it[1] = 2 # end of silence
        else:
            it[1] = 0 # inside a silent run, or silent up to the last frame
    else:
        it[1] = 0
    it.iternext()
In [64]:
it.iternext()
Out[64]:
In [100]:
plot(it.operands[1])
Out[100]:
In [97]:
ss = it.operands[1]
In [53]:
s
Out[53]:
In [87]:
ss[0]
Out[87]:
In [55]:
s[0]
Out[55]:
In [98]:
s[14670:14750]
Out[98]:
In [76]:
ss = it.operands[1]
In [99]:
ss[14670:14750]
Out[99]:
In [ ]: