Imports
In [2]:
from __future__ import print_function
import os
import numpy as np
import sys
import subprocess
import tarfile
import xml.etree.ElementTree as ET
from IPython.display import display, Image
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from six.moves import range
from yaafelib import *
TODO:
In [8]:
class IsmisFeatures(object):
    """Audio feature set for a single audio file.

    The MPEG-7 based statistics are read from an MPEG-7 XML description file
    and the MFCCs are computed with yaafe.  Every attribute starts out empty
    (``None`` or ``[]``) and is filled in by :meth:`extract_features`.  The
    numbers in the attribute comments are the original author's feature
    indices.
    """

    # Namespace prefixes used by the XPath queries in this class.
    _MPEG7_NS = {
        'mpeg7': 'urn:mpeg:mpeg7:schema:2001',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    }

    def __init__(self, audiofile):
        """Remember the audio file path and reset all features.

        :param audiofile: path of the audio file to analyse.
        """
        self.audiofile = audiofile

        self.temporal_centroid = None  # 1: Temporal Centroid
        self.spectral_centroid = None  # 2: Spectral Centroid average value

        self.ase_per_band_avg = []  # 4-37: Audio Spectrum Envelope (ASE) average values in 34 frequency bands
        self.ase_avg = None  # 38: ASE average value (averaged for all frequency bands)
        self.ase_per_band_var = []  # 39-72: ASE variance values in 34 frequency bands
        self.ase_var_avg = None  # 73: averaged ASE variance parameters

        self.centroid_avg = None  # 74: Audio Spectrum Centroid - average
        self.centroid_var = None  # 75: Audio Spectrum Centroid - variance
        self.spread_avg = None  # 76: Audio Spectrum Spread - average
        self.spread_var = None  # 77: Audio Spectrum Spread - variance

        self.sfm_per_band_avg = []  # 78-101: Spectral Flatness Measure (SFM) average values for 24 frequency bands
        self.sfm_avg = None  # 102: SFM average value (averaged for all frequency bands)
        self.sfm_per_band_var = []  # 103-126: Spectral Flatness Measure (SFM) variance values for 24 frequency bands
        self.sfm_var_avg = None  # 127: averaged SFM variance parameters

        self.mfcc = []  # 128-147: 20 first mel cepstral coefficients average values

    def extract_features(self):
        """Fill in every feature attribute for ``self.audiofile``.

        :raises IOError: if ``self.audiofile`` does not exist.
        """
        if not os.path.exists(self.audiofile):
            raise IOError('File ' + self.audiofile + ' not found')
        print('Getting features from ' + self.audiofile)

        self.extract_mpeg7_features()
        self.extract_mfcc()

    def extract_mpeg7_features(self, desc_path='desc.xml'):
        """Read the MPEG-7 descriptors from ``desc_path`` and store statistics.

        :param desc_path: path of the MPEG-7 XML description to parse.
        :raises ValueError: if a required descriptor or its data is missing.
        """
        ns = self._MPEG7_NS

        # TODO: generate the description by running MPEG7AudioEnc.jar on
        # self.audiofile with mpeg7config.xml (no Java in the Docker image at
        # the moment).  Until then the file at desc_path must already exist
        # and is NOT derived from self.audiofile here.
        root = ET.parse(desc_path).getroot()

        self.temporal_centroid = self.parse_xml_scalar(
            self._find_descriptor(root, 'TemporalCentroidType', ns), ns)
        self.spectral_centroid = self.parse_xml_scalar(
            self._find_descriptor(root, 'SpectralCentroidType', ns), ns)

        envelope_values = self.parse_2d_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumEnvelopeType', ns), ns)
        self.ase_per_band_avg = [np.mean(band) for band in envelope_values]
        self.ase_avg = np.mean(self.ase_per_band_avg)
        self.ase_per_band_var = [np.var(band) for band in envelope_values]
        self.ase_var_avg = np.mean(self.ase_per_band_var)

        centroid_values = self.parse_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumCentroidType', ns), ns)
        self.centroid_avg = np.mean(centroid_values)
        self.centroid_var = np.var(centroid_values)

        spread_values = self.parse_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumSpreadType', ns), ns)
        self.spread_avg = np.mean(spread_values)
        self.spread_var = np.var(spread_values)

        flatness_values = self.parse_2d_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumFlatnessType', ns), ns)
        self.sfm_per_band_avg = [np.mean(band) for band in flatness_values]
        self.sfm_avg = np.mean(self.sfm_per_band_avg)
        self.sfm_per_band_var = [np.var(band) for band in flatness_values]
        # Average the SFM variances (this previously averaged the ASE ones).
        self.sfm_var_avg = np.mean(self.sfm_per_band_var)

    @staticmethod
    def _find_descriptor(root, type_name, ns):
        """Return the AudioDescriptor whose ``xsi:type`` equals ``type_name``."""
        xpath = ".//mpeg7:AudioDescriptor[@xsi:type='" + type_name + "']"
        descriptor = root.find(xpath, ns)
        if descriptor is None:
            raise ValueError('MPEG-7 descriptor ' + type_name + ' not found')
        return descriptor

    @staticmethod
    def _child_text(element, tag, ns):
        """Return the text of the first ``mpeg7:<tag>`` descendant of ``element``."""
        if element is None:
            raise ValueError('MPEG-7 descriptor element is missing')
        child = element.find('.//mpeg7:' + tag, ns)
        if child is None or child.text is None:
            raise ValueError('MPEG-7 descriptor has no <' + tag + '> content')
        return child.text

    def parse_xml_scalar(self, element, ns):
        """Return the ``mpeg7:Scalar`` value below ``element`` as a float."""
        return float(self._child_text(element, 'Scalar', ns))

    def parse_xml_vector(self, element, ns):
        """Return the whitespace-separated ``mpeg7:Raw`` values as a list of floats.

        A real list (not a lazy ``map``) is returned so the result can be
        consumed more than once and passed to numpy on Python 3.
        """
        return [float(token) for token in self._child_text(element, 'Raw', ns).split()]

    def parse_2d_xml_vector(self, element, ns):
        """Return the ``mpeg7:Raw`` matrix transposed to one list per column.

        Each non-blank text line is one row of whitespace-separated numbers;
        the result holds one list of floats per column (i.e. per band).
        Blank lines are skipped because an empty row would make ``zip``
        discard every column.  Rows of unequal length are truncated to the
        shortest row by ``zip``.
        """
        rows = [
            [float(token) for token in line.split()]
            for line in self._child_text(element, 'Raw', ns).splitlines()
            if line.strip()
        ]
        return [list(column) for column in zip(*rows)]

    def extract_mfcc(self):
        """Compute 20 MFCCs for ``self.audiofile`` with yaafe and store them.

        NOTE(review): the raw ``feats['mfcc']`` output is stored as returned by
        yaafe, while the attribute comment in ``__init__`` speaks of average
        values - confirm whether the result should be averaged.
        NOTE(review): an output format is configured on the file processor;
        confirm that ``readAllOutputs()`` still returns data in that mode.
        """
        fp = FeaturePlan(sample_rate=22050, normalize=1)
        fp.addFeature('mfcc: MFCC CepsNbCoeffs=20')
        df = fp.getDataFlow()

        engine = Engine()
        engine.load(df)

        afp = AudioFileProcessor()
        afp.setOutputFormat('csv', 'features', {'Precision': '8', 'Metadata': 'False'})
        afp.processFile(engine, self.audiofile)
        engine.flush()

        feats = engine.readAllOutputs()
        self.mfcc = feats['mfcc']
In [9]:
# Run the extractor on one sample track and dump every attribute it filled in.
test_track = 'data/genres/hiphop/hiphop.00049.au'

ismisFeature = IsmisFeatures(audiofile=test_track)
ismisFeature.extract_features()

# The instance __dict__ is the same mapping vars() would return.
print(ismisFeature.__dict__)
In [ ]: