In [11]:
# This notebook trains logistic regression to predict tags
# of musical tracks based on latent vector inputs.
#
# The tags come from the MSD LastFM tag subset.
#
# The latent vectors are produced by running WMF on 
# user-track playcounts from LFM-1b.

In [12]:
import numpy as np
import numpy
import os
import pickle
import scipy
from scipy import sparse
import sklearn
from sklearn import preprocessing

In [14]:
# Function for getting MSD tag data.
# Function for getting MSD tag data.
def get_tag_data(dataset):
    """Load the MSD LastFM tag splits, restricted to tracks matched to LFM-1b.

    Parameters
    ----------
    dataset : str
        Currently unused; kept for interface compatibility with callers.
        TODO: either use it to select the matching directory or drop it.

    Returns
    -------
    (tr, va, te) : three lists of (msd_track_id, tag_vector) pairs for the
        train / validation / test splits.
    """
    split_dir = '../../tags/MSD_split_for_tagging/'
    # Get dataset split
    train_x_fname = os.path.join(split_dir, 'train_x_msd_id.txt')
    valid_x_fname = os.path.join(split_dir, 'valid_x_msd_id.txt')
    test_x_fname = os.path.join(split_dir, 'test_x_msd_id.txt')

    train_y_fname = os.path.join(split_dir, 'train_y.npy')
    valid_y_fname = os.path.join(split_dir, 'valid_y.npy')
    test_y_fname = os.path.join(split_dir, 'test_y.npy')

    def read_stripped_lines(fname):
        # Close each file handle deterministically (the original left the
        # handles to be reclaimed by garbage collection).
        with open(fname) as f:
            return [line.strip() for line in f]

    train_x = read_stripped_lines(train_x_fname)
    valid_x = read_stripped_lines(valid_x_fname)
    test_x = read_stripped_lines(test_x_fname)

    train_y = np.load(train_y_fname)
    valid_y = np.load(valid_y_fname)
    test_y = np.load(test_y_fname)

    print([len(x) for x in [train_x, valid_x, test_x]])

    tr = list(zip(train_x, train_y))
    va = list(zip(valid_x, valid_y))
    te = list(zip(test_x, test_y))

    print([len(x) for x in [tr, va, te]])

    # Important - match the two datasets!!!!
    # We restrict the tag data to just tracks that are in both
    # MSD and LastFM-1b, using our simple matching criteria.
    # This intersects LFM-tags with LFM-1b.
    matched_dir = '../../matchings/msd_lfm-1b/'
    matched_msd_track_ids_fname = os.path.join(matched_dir, 'matched_msd_track_ids.txt')
    matched_track_ids = set(read_stripped_lines(matched_msd_track_ids_fname))

    # Use list comprehensions instead of filter(): under Python 3 filter()
    # returns a lazy iterator, so len() below raises TypeError and the
    # returned splits could not be sliced (tr[:3]) or re-iterated by later
    # cells. Lists keep the Python 2 behavior this notebook's outputs show.
    tr = [pair for pair in tr if pair[0] in matched_track_ids]
    va = [pair for pair in va if pair[0] in matched_track_ids]
    te = [pair for pair in te if pair[0] in matched_track_ids]

    print([len(x) for x in [tr, va, te]])
    
    return tr, va, te

In [15]:
tr, va, te = get_tag_data(dataset='lfm-1b')


[201680, 12634, 28540]
[201680, 12634, 28540]
[181269, 11335, 25593]

In [16]:
tr[:3]


Out[16]:
[('TRAAAAK128F9318786',
  array([ True, False, False, False, False, False, False, False,  True,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
          True, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False], dtype=bool)),
 ('TRAAAAW128F429D538',
  array([False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False,  True, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False], dtype=bool)),
 ('TRAAABD128F429CF47',
  array([False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False,  True, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False,  True,
         False, False, False, False, False], dtype=bool))]

In [17]:
def _split_pairs(pairs):
    """Unzip a list of (track_id, tag_vector) pairs into two parallel tuples."""
    track_ids, tag_vectors = zip(*pairs)
    return track_ids, tag_vectors

tr_x, tr_y = _split_pairs(tr)
va_x, va_y = _split_pairs(va)
te_x, te_y = _split_pairs(te)

In [18]:
# Map each artist/trackname line to its row index in the WMF factor matrix:
# line i of this file corresponds to row i of the matrix.
matrix_artist_tracknames_fname = '../../matchings/both/matched_artists_tracks.txt'
# Use a context manager so the file handle is closed deterministically
# (the original relied on garbage collection).
with open(matrix_artist_tracknames_fname) as f:
    matrix_artist_tracknames = [line.strip() for line in f]
artist_trackname_to_matrix_index = {
    artist_trackname: index
    for index, artist_trackname in enumerate(matrix_artist_tracknames)
}

In [19]:
# Load the MSD <-> (artist, trackname) matching files. Line i of the
# track-ids file holds tab-separated MSD track ids matched to the
# artist/trackname on line i of the artists-tracks file.
msd_artists_tracks_fname = '../../matchings/msd_lfm-1b/matched_artists_tracks.txt'
msd_track_ids_fname = '../../matchings/msd_lfm-1b/artist_trackname_to_msd_track_ids.txt'

# Context managers close the handles deterministically (the originals leaked).
with open(msd_artists_tracks_fname) as f:
    msd_artist_tracks = [line.strip() for line in f]
with open(msd_track_ids_fname) as f:
    msd_track_ids = [line.strip().split('\t') for line in f]

# Several MSD track ids can map to the same artist/trackname. If an id were
# repeated across lines, the last occurrence would win (dict semantics).
msd_track_id_to_artists_trackname = {
    msd_track_id: artist_trackname
    for msd_track_ids_list, artist_trackname in zip(msd_track_ids, msd_artist_tracks)
    for msd_track_id in msd_track_ids_list
}

In [21]:
# Same traversal as the previous cell, but mapping each MSD track id
# straight to its row index in the factor matrix.
msd_track_id_to_matrix_index = {}
for ids_for_track, artist_trackname in zip(msd_track_ids, msd_artist_tracks):
    row_index = artist_trackname_to_matrix_index[artist_trackname]
    for track_id in ids_for_track:
        msd_track_id_to_matrix_index[track_id] = row_index

In [22]:
# verify that have matrix index for each msd track id.
all_track_ids = set([x[0] for x in tr] + [x[0] for x in va] + [x[0] for x in te])
print(len(all_track_ids))
print(len(set(msd_track_id_to_matrix_index.keys()) & all_track_ids))


218197
218197

In [23]:
# Load the WMF song-factor matrix: one latent vector per matched
# artist/trackname row (shape printed below is (n_tracks, n_factors)).
song_factors_fname = '../output/factors_merged_v.npy'
song_factors = np.load(song_factors_fname)
# Bare last expression so the notebook displays the shape.
song_factors.shape


Out[23]:
(661392, 80)

In [24]:
def _factors_for_tracks(msd_track_id_seq):
    """Stack the latent factor vector for each MSD track id into a 2-D array.

    Returns an array of shape (len(msd_track_id_seq), n_factors).
    """
    return np.array([
        song_factors[msd_track_id_to_matrix_index[msd_track_id]]
        for msd_track_id in msd_track_id_seq
    ])

# The same lookup was copy-pasted three times; factored into one helper.
tr_x_feats = _factors_for_tracks(tr_x)
va_x_feats = _factors_for_tracks(va_x)
te_x_feats = _factors_for_tracks(te_x)

In [25]:
# Convert the label tuples produced by zip(*...) into 2-D boolean arrays
# so Keras can consume them directly.
tr_y, va_y, te_y = (np.array(label_split) for label_split in (tr_y, va_y, te_y))

In [26]:
from keras.models import Sequential
from keras.layers import Dense

# 50-class logistic regression in Keras
# A single Dense layer with sigmoid activation trained with binary
# cross-entropy is one independent logistic regression per tag
# (multi-label classification, not softmax multi-class).
model = Sequential()
model.add(Dense(50, activation='sigmoid', input_dim=tr_x_feats.shape[1]))
model.compile(optimizer='rmsprop', loss='binary_crossentropy')


Using TensorFlow backend.

In [27]:
model.fit(tr_x_feats, tr_y, nb_epoch=1, validation_data=(va_x_feats, va_y))


/usr/local/lib/python2.7/dist-packages/keras/models.py:851: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.
  warnings.warn('The `nb_epoch` argument in `fit` '
Train on 181269 samples, validate on 11335 samples
Epoch 1/1
181269/181269 [==============================] - 8s - loss: 0.2006 - val_loss: 0.0997
Out[27]:
<keras.callbacks.History at 0x7fc75edd6e10>

In [28]:
model.fit(tr_x_feats, tr_y, nb_epoch=10, validation_data=(va_x_feats, va_y))


Train on 181269 samples, validate on 11335 samples
Epoch 1/10
181269/181269 [==============================] - 8s - loss: 0.0937 - val_loss: 0.0919
Epoch 2/10
181269/181269 [==============================] - 7s - loss: 0.0892 - val_loss: 0.0893
Epoch 3/10
181269/181269 [==============================] - 7s - loss: 0.0873 - val_loss: 0.0880
Epoch 4/10
181269/181269 [==============================] - 7s - loss: 0.0863 - val_loss: 0.0872
Epoch 5/10
181269/181269 [==============================] - 7s - loss: 0.0856 - val_loss: 0.0866
Epoch 6/10
181269/181269 [==============================] - 7s - loss: 0.0851 - val_loss: 0.0862
Epoch 7/10
181269/181269 [==============================] - 7s - loss: 0.0847 - val_loss: 0.0859
Epoch 8/10
181269/181269 [==============================] - 7s - loss: 0.0845 - val_loss: 0.0857
Epoch 9/10
181269/181269 [==============================] - 7s - loss: 0.0842 - val_loss: 0.0855
Epoch 10/10
181269/181269 [==============================] - 7s - loss: 0.0841 - val_loss: 0.0854
Out[28]:
<keras.callbacks.History at 0x7fc748cdf4d0>

In [29]:
# Sequential.predict_proba is deprecated (and later removed) in Keras; for a
# sigmoid output layer, predict() returns the same per-tag probabilities.
# np.round thresholds at 0.5 to get hard 0/1 tag predictions.
proba_va = model.predict(va_x_feats)
classes_va = np.round(proba_va)

proba_tr = model.predict(tr_x_feats)
classes_tr = np.round(proba_tr)

proba_te = model.predict(te_x_feats)
classes_te = np.round(proba_te)


23584/25593 [==========================>...] - ETA: 0s0s 

In [30]:
# The 50 LastFM tags, in the column order of the train/valid/test label
# matrices (train_y.npy etc.). Casing is preserved as it appears in the
# tag data ('Mellow', 'Hip-Hop', 'House', ...).
tags = ['rock', 'pop', 'alternative', 'indie', 'electronic',
        'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
        'beautiful', 'metal', 'chillout', 'male vocalists',
        'classic rock', 'soul', 'indie rock', 'Mellow', 'electronica',
        '80s', 'folk', '90s', 'chill', 'instrumental', 'punk',
        'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop',
        '70s', 'party', 'country', 'easy listening',
        'sexy', 'catchy', 'funk', 'electro', 'heavy metal',
        'Progressive rock', '60s', 'rnb', 'indie pop',
        'sad', 'House', 'happy']

In [31]:
from sklearn.metrics import roc_auc_score

# Macro-style AUC over all 50 tag columns for each split.
indices = range(50)
tr_auc = roc_auc_score(tr_y[:, indices], proba_tr[:, indices])
va_auc = roc_auc_score(va_y[:, indices], proba_va[:, indices])
te_auc = roc_auc_score(te_y[:, indices], proba_te[:, indices])
for split_name, auc in zip(('training', 'validation', 'test'),
                           (tr_auc, va_auc, te_auc)):
    print('{} auc: {}'.format(split_name, auc))


training auc: 0.901394591309
validation auc: 0.897767398628
test auc: 0.895672089585

In [49]:
# Per-tag test AUC, printed from worst tag to best.
te_aucs = [roc_auc_score(te_y[:, [tag_idx]], proba_te[:, [tag_idx]])
           for tag_idx in range(50)]
indices = np.argsort(te_aucs)
for tag_idx in indices:
    print('{0:.2f}\t'.format(te_aucs[tag_idx]) + tags[tag_idx])


0.62	happy
0.73	sad
0.76	00s
0.76	catchy
0.76	beautiful
0.78	male vocalists
0.79	sexy
0.81	party
0.83	Mellow
0.83	chill
0.84	instrumental
0.84	female vocalist
0.86	guitar
0.86	rock
0.87	90s
0.87	acoustic
0.88	easy listening
0.88	alternative
0.89	alternative rock
0.89	female vocalists
0.90	electronica
0.90	chillout
0.91	pop
0.91	experimental
0.93	dance
0.93	indie pop
0.93	funk
0.94	electronic
0.94	indie
0.94	ambient
0.94	hard rock
0.94	70s
0.95	indie rock
0.95	folk
0.95	electro
0.95	80s
0.95	soul
0.95	House
0.95	classic rock
0.96	blues
0.96	rnb
0.97	punk
0.97	60s
0.97	country
0.97	Progressive rock
0.97	oldies
0.97	jazz
0.98	Hip-Hop
0.98	metal
0.99	heavy metal

In [ ]: