Extract audio features from the Million Song Dataset (MSD)


In [ ]:
%matplotlib inline

import os, sys, time
import pickle as pkl
import numpy as np
#from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [ ]:
sys.path.append('src')
import hdf5_getters as h5getters

In [ ]:
# Root directory of the MSD data; both pickle files live directly inside it.
data_dir = 'data/msd'
# fsong2track:   pickle holding the songID -> trackIDs mapping (input)
# ffeatures_msd: pickle the extracted songID -> feature vector dict is written to (output)
fsong2track, ffeatures_msd = [
    os.path.join(data_dir, fname)
    for fname in ('songID2TrackIDs.pkl', 'songID2Features.pkl')
]

Data loading

Load playlists.


In [ ]:
#playlists_aotm = pkl.load(open(faotm, 'rb'))

In [ ]:
#print('#Playlists: %d' % len(playlists_aotm))

In [ ]:
#playlists_aotm[0]

In [ ]:
#song_set = sorted({songID for p in playlists_aotm for songID in p})

In [ ]:
#print('#Songs: %d' % len(song_set))

In [ ]:
#lengths = [len(p) for p in playlists_aotm]
#plt.hist(lengths, bins=20)
#print('Average playlist length: %.1f' % np.mean(lengths))

Song_id --> Song_name mapping.


In [ ]:
#songID2Name = {s[1]: s[0] for p in playlists_aotm for s in p['playlist']}

Load song_id --> track_id mapping: a song may correspond to multiple tracks.


In [ ]:
#song2TrackID = pkl.load(open(fmap, 'rb'))

In [ ]:
#len(song2TrackID)

In [ ]:
#{ k : song2TrackID[k] for k in list(song2TrackID.keys())[:10] }

In [ ]:
# Load the songID -> trackIDs mapping. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(fsong2track, 'rb') as fin:
    song2tracks = pkl.load(fin)

In [ ]:
# Number of song IDs present in the songID -> trackIDs mapping.
len(song2tracks)

In [ ]:
#song2tracks['SOAABRB12A58A792A3']

Load audio features

Load the audio features for a given songID. If a song has more than one trackID, simply use the first track whose HDF5 file is available.


In [ ]:
def gen_h5dir(data_dir, trackID):
    """Return the directory expected to hold the HDF5 file of ``trackID``.

    The sub-directory is built from the 3rd, 4th and 5th characters of the
    track ID (indices 2, 3, 4), e.g. ``'TRARPDM128F14AE1CC'`` -> ``A/R/P``,
    and appended to ``data_dir``.

    Raises:
        IndexError: if ``trackID`` has fewer than five characters.
    """
    first, second, third = trackID[2], trackID[3], trackID[4]
    subdir = '/'.join((first, second, third))
    return os.path.join(data_dir, subdir)

In [ ]:
# Open one example track file to explore the fields exposed by hdf5_getters.
# The commented-out getters are kept as a reference of what is available;
# "SS" in the trailing notes stands for "song specific".
trackID = 'TRARPDM128F14AE1CC'
msd_h5dir = gen_h5dir(data_dir, trackID)
h5 = h5getters.open_h5_file_read(os.path.join(msd_h5dir, trackID + '.h5'))
try:
    #print(h5getters.get_num_songs(h5))

    #print(h5getters.get_artist_mbid(h5))
    #print(h5getters.get_artist_mbtags(h5))  # SS: song specific
    #print(h5getters.get_artist_mbtags_count(h5)) # SS array
    #print(h5getters.get_artist_name(h5))
    #print(h5getters.get_artist_playmeid(h5))
    #print(h5getters.get_artist_terms(h5).shape) # SS, Y, text
    #print(h5getters.get_artist_terms_freq(h5).shape) # SS, Y, numerical
    #print(h5getters.get_artist_terms_weight(h5).shape) # SS, Y, numerical
    #print(h5getters.get_audio_md5(h5))
    #print(h5getters.get_bars_confidence(h5).shape) # SS, Y, numerical
    #print(h5getters.get_bars_start(h5).shape) # SS, Y, numerical
    #print(h5getters.get_beats_confidence(h5).shape) # SS, Y, numerical
    #print(h5getters.get_beats_start(h5).shape) # SS, Y, numerical
    #print(h5getters.get_danceability(h5)) # Y
    #print(h5getters.get_duration(h5)) # Y, seconds
    #print(h5getters.get_end_of_fade_in(h5)) # Y, seconds
    #print(h5getters.get_energy(h5)) # Y
    #print(h5getters.get_key(h5)) # Y
    #print(h5getters.get_key_confidence(h5)) # Y
    #print(h5getters.get_loudness(h5)) # Y
    #print(h5getters.get_mode(h5)) # Y
    #print(h5getters.get_mode_confidence(h5)) # Y
    #print(h5getters.get_release(h5)) # album name
    #print(h5getters.get_release_7digitalid(h5))
    #print(h5getters.get_sections_confidence(h5).shape) # SS, Y, numerical
    #print(h5getters.get_sections_start(h5).shape) # SS, Y, numerical
    #print(h5getters.get_segments_confidence(h5).shape) # SS, Y, numerical
    #print(h5getters.get_segments_loudness_max(h5).shape) # SS, Y, numerical
    #print(h5getters.get_segments_loudness_max_time(h5).shape) # SS, Y, numerical
    #print(h5getters.get_segments_loudness_start(h5).shape) # SS, Y, numerical
    print(h5getters.get_segments_pitches(h5).shape) # SS, Y, numerical, matrix with 12 cols
    #print(h5getters.get_segments_start(h5).shape) # SS, Y, numerical
    #print(h5getters.get_segments_timbre(h5).shape) # SS, Y, numerical, matrix with 12 cols
    #print(h5getters.get_similar_artists(h5).shape) # artist IDs
    #print(h5getters.get_song_hotttnesss(h5)) # Y
    #print(h5getters.get_song_id(h5)) # song ID
    #print(h5getters.get_start_of_fade_out(h5)) # Y, seconds
    #print(h5getters.get_tatums_confidence(h5).shape) # SS, Y, numerical
    #print(h5getters.get_tatums_start(h5).shape) # SS, Y, numerical
    #print(h5getters.get_tempo(h5)) # Y
    #print(h5getters.get_time_signature(h5)) # Y, usual number of beats per bar
    #print(h5getters.get_time_signature_confidence(h5)) # Y
    #print(h5getters.get_title(h5)) # song title
    #print(h5getters.get_track_7digitalid(h5))
    #print(h5getters.get_track_id(h5))
    #print(h5getters.get_year(h5)) # Y, year of release

    #print('age:', time.gmtime().tm_year - h5getters.get_year(h5))
finally:
    # Always release the HDF5 handle, even if one of the getters above raises.
    h5.close()

In [ ]:
def _percentile_summary(values):
    """Summarise an array by its min, 25th, 50th, 75th and max percentiles.

    Percentiles are taken along axis 0 using the 'nearest' method, so every
    returned number is an actual element of the input.

    Args:
        values: 1-D array, or 2-D array whose columns are summarised separately.

    Returns:
        A flat list of ``5 * ncols`` numbers (``ncols`` is 1 for 1-D input).
        For 2-D input the values are grouped per column: the five percentiles
        of column 0, then those of column 1, and so on. An empty input yields
        zeros of that same length, so the overall feature vector keeps a fixed
        size even for tracks that have no bars/segments/etc.
    """
    percentiles = [0, 25, 50, 75, 100]
    values = np.asarray(values)
    if len(values) == 0:
        # Keep the width consistent with the non-empty case: an empty
        # (0, 12) matrix must still contribute 5 * 12 entries.
        ncols = values.shape[1] if values.ndim == 2 else 1
        return np.zeros(len(percentiles) * ncols).tolist()

    assert values.ndim in [1, 2]
    try:
        # NumPy >= 1.22 spells the keyword `method`; `interpolation` is deprecated.
        res = np.percentile(values, q=percentiles, axis=0, method='nearest')
    except TypeError:
        # Older NumPy releases only know the `interpolation` keyword.
        res = np.percentile(values, q=percentiles, axis=0, interpolation='nearest')
    # Column-major flattening keeps the five percentiles of a column together.
    return res.reshape(-1, order='F').tolist()


def extract_msd_track_features(ftrack):
    """Build the fixed-length numeric feature vector of one MSD track file.

    Layout of the returned vector (202 entries):
        0-19    percentile summaries of bars/beats confidence and start
        20-29   danceability, duration, end_of_fade_in, end_of_fade_in/duration,
                energy, key, key_confidence, loudness, mode, mode_confidence
        30-59   percentile summaries of sections_* and segments_* (1-D fields)
        60-119  percentile summaries of segments_pitches (12 columns)
        120-124 percentile summary of segments_start
        125-184 percentile summaries of segments_timbre (12 columns)
        185-187 song_hotttnesss, start_of_fade_out, start_of_fade_out/duration
        188-197 percentile summaries of tatums confidence and start
        198-201 tempo, time_signature, time_signature_confidence, age

    Args:
        ftrack: path to an existing ``.h5`` / ``.H5`` track file.

    Returns:
        1-D numpy array; NaN/inf entries are replaced via ``np.nan_to_num``.

    Raises:
        AssertionError: if ``ftrack`` does not exist or lacks the .h5 extension.
    """
    assert os.path.exists(ftrack)
    assert ftrack.endswith('.h5') or ftrack.endswith('.H5')

    def ratio(value, total):
        # A zero duration would otherwise give inf, which nan_to_num turns
        # into a huge finite number (or raise for plain Python floats).
        return value / total if total != 0 else 0.0

    features = []
    h5 = h5getters.open_h5_file_read(ftrack)
    try:
        def add_stats(getter):
            features.extend(_percentile_summary(getter(h5)))

        # The artist_terms_freq / artist_terms_weight statistics that were
        # commented out in earlier versions are still not part of the vector.

        add_stats(h5getters.get_bars_confidence)    # 0-4,   can be empty
        add_stats(h5getters.get_bars_start)         # 5-9,   can be empty
        add_stats(h5getters.get_beats_confidence)   # 10-14, can be empty
        add_stats(h5getters.get_beats_start)        # 15-19, can be empty

        features.append(h5getters.get_danceability(h5))     # 20

        duration = h5getters.get_duration(h5)                # seconds
        features.append(duration)                            # 21

        end_of_fade_in = h5getters.get_end_of_fade_in(h5)    # seconds
        features.append(end_of_fade_in)                      # 22
        features.append(ratio(end_of_fade_in, duration))     # 23

        features.append(h5getters.get_energy(h5))            # 24
        features.append(h5getters.get_key(h5))               # 25
        features.append(h5getters.get_key_confidence(h5))    # 26
        features.append(h5getters.get_loudness(h5))          # 27
        features.append(h5getters.get_mode(h5))              # 28
        features.append(h5getters.get_mode_confidence(h5))   # 29

        add_stats(h5getters.get_sections_confidence)         # 30-34, can be empty
        add_stats(h5getters.get_sections_start)              # 35-39, can be empty
        add_stats(h5getters.get_segments_confidence)         # 40-44
        add_stats(h5getters.get_segments_loudness_max)       # 45-49
        add_stats(h5getters.get_segments_loudness_max_time)  # 50-54
        add_stats(h5getters.get_segments_loudness_start)     # 55-59
        add_stats(h5getters.get_segments_pitches)            # 60-119, matrix with 12 cols
        add_stats(h5getters.get_segments_start)              # 120-124
        add_stats(h5getters.get_segments_timbre)             # 125-184, matrix with 12 cols

        features.append(h5getters.get_song_hotttnesss(h5))   # 185, can be NaN

        start_of_fade_out = h5getters.get_start_of_fade_out(h5)  # seconds
        features.append(start_of_fade_out)                   # 186
        features.append(ratio(start_of_fade_out, duration))  # 187

        add_stats(h5getters.get_tatums_confidence)           # 188-192, can be empty
        add_stats(h5getters.get_tatums_start)                # 193-197, can be empty

        features.append(h5getters.get_tempo(h5))                      # 198
        features.append(h5getters.get_time_signature(h5))             # 199, usual beats per bar
        features.append(h5getters.get_time_signature_confidence(h5))  # 200

        # NOTE(review): age depends on the current year, so re-running the
        # extraction in a later year shifts this feature. A missing release
        # year is presumably stored as 0, which would give age == current
        # year -- confirm against the dataset documentation.
        year_of_release = h5getters.get_year(h5)
        features.append(time.gmtime().tm_year - year_of_release)      # 201
    finally:
        # Release the HDF5 handle even when a getter raises.
        h5.close()

    return np.nan_to_num(np.asarray(features), copy=False)

In [ ]:
#np.nan_to_num?

In [ ]:
#trackID = 'TRQVPBD128F1458060'
#trackID = 'TRZARKN128F92DE096'
#trackID = 'TRZEXLQ128F1491D17'
#gen_h5dir(data_dir, trackID)
#extract_msd_track_features(os.path.join(msd_h5dir, trackID + '.h5')).shape

In [ ]:
def gen_song_features(songID, msd_h5dir = msd_h5dir, song2TrackID = song2tracks):
    """Return the feature vector of the first available track of ``songID``.

    Args:
        songID: song ID; must be a key of ``song2TrackID``.
        msd_h5dir: kept only for backward compatibility and not used. The
            directory of each track is derived from the global ``data_dir``
            and the track ID via ``gen_h5dir``.
        song2TrackID: mapping from song ID to an iterable of track IDs.

    Returns:
        The result of ``extract_msd_track_features`` for the first track whose
        ``.h5`` file exists, or ``None`` if no track file is available.

    Raises:
        AssertionError: if ``songID`` is not in ``song2TrackID``.
    """
    assert(songID in song2TrackID)
    for trackID in song2TrackID[songID]:
        # gen_h5dir needs the data root as well as the track ID; calling it
        # with the track ID alone raised TypeError on every invocation.
        track_dir = gen_h5dir(data_dir, trackID)
        h5f = os.path.join(track_dir, trackID + '.h5')
        if os.path.exists(h5f):
            return extract_msd_track_features(h5f)

    # no track available
    return None

In [ ]:
#songID = 'SOFDPDC12A58A7D198'
#songID = 'SOKMCJK12A6D4F6105'
#songID = 'SOGTGJR12A6310E08D'
#songID = song_set_msd[139]
#songID = song_set_msd[443]
#songID = song_set_msd[518]
#gen_song_features(songID)

In [ ]:
# All song IDs of the mapping in sorted order, so the extraction loop
# visits them deterministically.
song_set_msd = list(song2tracks.keys())
song_set_msd.sort()

In [ ]:
# Number of distinct song IDs that feature extraction will be attempted for.
len(song_set_msd)

In [ ]:
#ffeatures = os.path.join(data_dir, 'features.pkl')
# Extract a feature vector for every song; songs without any available
# track file are left out of the resulting dict.
song2Feature = {}
cnt = 0
for cnt, songID in enumerate(song_set_msd, start=1):
    # Lightweight progress indicator, refreshed in place every 1000 songs.
    if cnt % 1000 == 0:
        print('\r%d / %d' % (cnt, len(song_set_msd)), end='', flush=True)

    features = gen_song_features(songID)
    if features is None:
        continue
    song2Feature[songID] = features

In [ ]:
# Number of songs for which a track file was found and features were extracted.
len(song2Feature)

In [ ]:
#pkl.dump(song2Feature, open(ffeatures, 'wb'))

In [ ]:
# Persist the songID -> feature vector dict. The context manager guarantees
# the file is flushed and closed once the dump has finished.
with open(ffeatures_msd, 'wb') as fout:
    pkl.dump(song2Feature, fout)