Imports


In [2]:
from __future__ import print_function
import os
import numpy as np
import sys
import subprocess
import tarfile
import xml.etree.ElementTree as ET
from IPython.display import display, Image
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from six.moves import range
from yaafelib import *

TODO:

  • install java on docker image

In [8]:
class IsmisFeatures:
    
    def __init__(self, audiofile):
        self.audiofile = audiofile
        
        self.temporal_centroid = None      #1: Temporal Centroid
        self.spectral_centroid = None      #2: Spectral Centroid average value
        self.ase_per_band_avg = []         #4-37: Audio Spectrum Envelope (ASE) average values in 34 frequency bands
        self.ase_avg = None                #38: ASE average value (averaged for all frequency bands)
        self.ase_per_band_var = []         #39-72: ASE variance values in 34 frequency bands
        self.ase_var_avg = None            #73: averaged ASE variance parameters
        self.centroid_avg = None           #74: Audio Spectrum Centroid – average
        self.centroid_var = None           #75: Audio Spectrum Centroid – variance
        self.spread_avg = None             #76: Audio Spectrum Spread – average
        self.spread_var = None             #77: Audio Spectrum Spread – variance
        self.sfm_per_band_avg = []         #78-101: Spectral Flatness Measure (SFM) average values for 24 frequency bands
        self.sfm_avg = None                #102: SFM average value (averaged for all frequency bands)
        self.sfm_per_band_var = []         #103-126: Spectral Flatness Measure (SFM) variance values for 24 frequency bands
        self.sfm_var_avg = None            #127: averaged SFM variance parameters
        self.mfcc = []                     #128-147: 20 first mel cepstral coefficients average values
        
    def extract_features(self):
        if os.path.exists(self.audiofile):
            print('Getting features from ' + self.audiofile)
        else:
            raise Exception('File ' + self.audiofile + ' not found')
            
        self.extract_mpeg7_features()
        self.extract_mfcc()
        
    def extract_mpeg7_features(self):
        ns = {'xmlns' : 'urn:mpeg:mpeg7:schema:2001',
              'mpeg7' : 'urn:mpeg:mpeg7:schema:2001',
              'xsi' : 'http://www.w3.org/2001/XMLSchema-instance',
              'xsi:schemaLocation' : 'urn:mpeg:mpeg7:schema:2001 http://mpeg7audioenc.sourceforge.net/mpeg7audioenc.xsd'} 

        # TODO: make this work - no java at the docker image atm
        #subprocess.call(['java', '-jar', 'MPEG7AudioEnc.jar', self.audiofile, 'mpeg7config.xml'], stdout='desc.xml')
        tree = ET.parse('desc.xml')
        root = tree.getroot()

        temporal_centroid_xml = root.find(".//mpeg7:AudioDescriptor[@xsi:type='TemporalCentroidType']", ns)
        spectral_cetroid_xml = root.find(".//mpeg7:AudioDescriptor[@xsi:type='SpectralCentroidType']", ns)
        audio_spectrum_centroid = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumCentroidType']", ns)
        audio_spectrum_spread = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumSpreadType']", ns)
        audio_spectrum_envelope = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumEnvelopeType']", ns)
        audio_spectrum_flatness = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumFlatnessType']", ns)

        self.temporal_centroid = self.parse_xml_scalar(temporal_centroid_xml, ns)
        self.spectral_centroid = self.parse_xml_scalar(spectral_cetroid_xml, ns)
        
        envelope_values = self.parse_2d_xml_vector(audio_spectrum_envelope, ns)          
        self.ase_per_band_avg = [np.mean(band) for band in envelope_values]
        self.ase_avg = np.mean(self.ase_per_band_avg)
        self.ase_per_band_var = [np.var(band) for band in envelope_values]
        self.ase_var_avg = np.mean(self.ase_per_band_var)
        
        centroid_values = self.parse_xml_vector(audio_spectrum_centroid, ns)
        self.centroid_avg = np.mean(centroid_values)
        self.centroid_var = np.var(centroid_values)
        
        spread_values = self.parse_xml_vector(audio_spectrum_spread, ns)
        self.spread_avg = np.mean(spread_values)
        self.spread_var = np.var(spread_values)
        
        flatness_values = self.parse_2d_xml_vector(audio_spectrum_flatness, ns)
        self.sfm_per_band_avg = [np.mean(band) for band in flatness_values]
        self.sfm_avg = np.mean(self.sfm_per_band_avg)
        self.sfm_per_band_var = [np.var(band) for band in flatness_values]
        self.sfm_var_avg = np.mean(self.ase_per_band_var)
    
    def parse_xml_scalar(self, element, ns):
        return float(element.find(".//mpeg7:Scalar", ns).text)
    
    def parse_xml_vector(self, element, ns):
        values_splitted = element.find(".//mpeg7:Raw", ns).text.split()
        return map((lambda x: float(x)), values_splitted)
    
    def parse_2d_xml_vector(self, element, ns):
        values_string = element.find(".//mpeg7:Raw", ns).text
        values_splitted = [s.strip().split() for s in values_string.splitlines()]
        values = [map((lambda x: float(x)), value) for value in values_splitted] #cast to float
        transposed_values = list(map(list, zip(*values))) #transpose matrix to have 1 long vector per 1 band
        return transposed_values 
    
    def extract_mfcc(self):
        fp = FeaturePlan(sample_rate=22050, normalize=1)
        fp.addFeature('mfcc: MFCC CepsNbCoeffs=20')
        df = fp.getDataFlow()
        engine = Engine()
        engine.load(df)
        afp = AudioFileProcessor()
        afp.setOutputFormat('csv', 'features', {'Precision': '8', 'Metadata': 'False'})
        afp.processFile(engine, self.audiofile)
        engine.flush()
        feats = engine.readAllOutputs()
        self.mfcc = feats['mfcc']

In [9]:
test_track = 'data/genres/hiphop/hiphop.00049.au'
ismisFeature = IsmisFeatures(test_track)
ismisFeature.extract_features()
print(vars(ismisFeature))


Getting features from data/genres/hiphop/hiphop.00049.au
{'ase_per_band_avg': [0.0010857544768678559, 0.00039213281849105998, 0.00033421980322000664, 0.00040626923972727238, 0.0005792853695078293, 0.00080933227739293432, 0.00084654510151727254, 0.0010896106786702903, 0.00087662898522397462, 0.000749538298503161, 0.00052391809180513506, 0.0014141276775084029, 0.0029324219731132846, 0.0015206782704930207, 0.0017674119847152885, 0.00085048636890920301, 0.00064789911162045687, 0.00074369662756742247, 0.00042117598722930984, 0.00029418995947052358, 0.00050997994913881293, 0.00034756748906218741, 0.00022736879393067691, 0.00014336703859843281, 7.8322788639313101e-05, 5.7067844108769591e-05, 3.5265915754944987e-05, 3.3010317372237415e-05, 4.1588755726782262e-05, 1.0612842408710569e-05, 8.5523252939203068e-08, 0.0, 0.0, 0.0], 'ase_per_band_var': [3.3738506042768895e-06, 4.2326187542293839e-07, 2.5573881917732766e-07, 3.2326409161520539e-07, 6.7865131110959754e-07, 1.3706366508553112e-06, 1.091033506687316e-06, 1.9962590413799428e-06, 1.459852536318557e-06, 1.5744995963653449e-06, 1.5653236031430118e-06, 1.3408904261005119e-05, 8.306847110185002e-05, 1.1713872019581845e-05, 2.7700827972494757e-05, 4.5387039292095166e-06, 4.8802559727357468e-06, 3.0404712812494362e-06, 9.6188236976682247e-07, 2.7615673273990605e-07, 7.986590806465434e-07, 3.9455352249673354e-07, 1.5185076931654297e-07, 4.1837735017117714e-08, 1.3151644185856916e-08, 9.7404117918481388e-09, 3.7649053154897566e-09, 3.171281972435384e-09, 7.9497417658625908e-09, 6.970306930339714e-10, 2.5998963985559434e-14, 0.0, 0.0, 0.0], 'sfm_avg': 0.58765642006129271, 'mfcc': array([[ 0.06149227,  1.59239392,  1.31847123,  0.20475314,  0.35536394,
        -0.16618184,  0.61475064,  0.66483616,  0.04542616,  0.05377903,
        -0.09706389, -0.02961214,  0.02836009,  0.07020499,  0.08470754,
        -0.01626443,  0.25405867,  0.15249978, -0.2412863 , -0.25502605]]), 'ase_var_avg': 4.8566851007701479e-06, 'spread_avg': 1.4055475127975992, 'spread_var': 0.14527359472479262, 'centroid_var': 0.53251738572863294, 'sfm_per_band_avg': [0.86543438910870285, 0.75802986507169068, 0.79211122239346432, 0.6623417045821941, 0.6813706554758252, 0.57798587836845605, 0.51608663680793598, 0.48131703259019676, 0.42596551774231417, 0.44776415093544514, 0.42908295638846283, 0.42206867141480492, 0.42709512136178718, 0.41723737562987662, 0.42282150120873618, 0.43555011409036343, 0.55714316307769263, 0.5439589901570524, 0.58891632924641546, 0.59017600955418481, 0.82617913061353787, 0.79886761425475161, 0.7612722181827275, 0.67497783321440485], 'sfm_var_avg': 4.8566851007701479e-06, 'spectral_centroid': 618.2943, 'centroid_avg': -1.5725602231954985, 'temporal_centroid': 15.086212, 'sfm_per_band_var': [0.032506848539913188, 0.041856168616402983, 0.038066156464866795, 0.053614191117236205, 0.050658153278515083, 0.052094395220570917, 0.057971248604839092, 0.050809729398631771, 0.055431714711792833, 0.052227972684423009, 0.046009545897614912, 0.039556175511269692, 0.04696200008311819, 0.046377166420900881, 0.041388748636332698, 0.046290995605446507, 0.047074091479148837, 0.042149539384548657, 0.038517426709039251, 0.035126171612995659, 0.0084725333273491538, 0.011332901738390825, 0.01296862420574597, 0.016191776380648445], 'audiofile': 'data/genres/hiphop/hiphop.00049.au', 'ase_avg': 0.00058145765763375037}

In [ ]: