In [11]:
# This notebook trains a logistic regression model to predict
# musical tags from latent vector inputs.
#
# The tags come from the MSD LastFM tag subset.
#
# The latent vectors are produced by running WMF on
# user-track playcounts from LFM-1b.
In [12]:
import numpy as np
import numpy
import os
import pickle
import scipy
from scipy import sparse
import sklearn
from sklearn import preprocessing
In [14]:
# Function for getting MSD tag data.
def get_tag_data(dataset):
    """Load the MSD tag splits, restricted to tracks matched with LFM-1b.

    Args:
        dataset: Name of the listening dataset (the notebook passes
            'lfm-1b'). It is currently unused: the matching directory is
            fixed to ``msd_lfm-1b``.

    Returns:
        A ``(train, valid, test)`` tuple. Each element is a list of
        ``(msd_track_id, label_row)`` pairs, where ``label_row`` is the
        matching row of the split's ``*_y.npy`` array. Real lists are
        returned (not lazy ``filter`` objects) so that ``len()``, slicing
        and repeated iteration work on both Python 2 and Python 3.
    """
    def read_lines(fname):
        # Use a context manager so the file handle is closed promptly
        # instead of being left to the garbage collector.
        with open(fname) as handle:
            return [line.strip() for line in handle]

    split_dir = '../../tags/MSD_split_for_tagging/'

    # Get dataset split
    train_x_fname = os.path.join(split_dir, 'train_x_msd_id.txt')
    valid_x_fname = os.path.join(split_dir, 'valid_x_msd_id.txt')
    test_x_fname = os.path.join(split_dir, 'test_x_msd_id.txt')

    train_y_fname = os.path.join(split_dir, 'train_y.npy')
    valid_y_fname = os.path.join(split_dir, 'valid_y.npy')
    test_y_fname = os.path.join(split_dir, 'test_y.npy')

    train_x = read_lines(train_x_fname)
    valid_x = read_lines(valid_x_fname)
    test_x = read_lines(test_x_fname)

    train_y = np.load(train_y_fname)
    valid_y = np.load(valid_y_fname)
    test_y = np.load(test_y_fname)

    print([len(x) for x in [train_x, valid_x, test_x]])

    # Pair every track id with its label row. zip() stops at the shorter
    # input, so compare this printout with the one above to spot a
    # mismatch between the id files and the label arrays.
    tr = list(zip(train_x, train_y))
    va = list(zip(valid_x, valid_y))
    te = list(zip(test_x, test_y))
    print([len(x) for x in [tr, va, te]])

    # Important - match the two datasets!!!!
    # We restrict the tag data to just tracks that are in both
    # MSD and LastFM-1b, using our simple matching criteria.
    # This intersects LFM-tags with LFM-1b.
    matched_dir = '../../matchings/msd_lfm-1b/'
    matched_msd_track_ids_fname = os.path.join(matched_dir, 'matched_msd_track_ids.txt')
    matched_track_ids = set(read_lines(matched_msd_track_ids_fname))

    # List comprehensions rather than filter(): on Python 3 filter() is
    # lazy, which broke the len() call below and any later slicing.
    tr = [pair for pair in tr if pair[0] in matched_track_ids]
    va = [pair for pair in va if pair[0] in matched_track_ids]
    te = [pair for pair in te if pair[0] in matched_track_ids]
    print([len(x) for x in [tr, va, te]])

    return tr, va, te
In [15]:
# Load the train / validation / test tag splits; get_tag_data keeps only
# the tracks listed in the MSD <-> LFM-1b matching file.
tr, va, te = get_tag_data('lfm-1b')
In [16]:
# Peek at the first three (msd_track_id, label_row) training pairs.
tr[0:3]
Out[16]:
In [17]:
# Split each list of (msd_track_id, label_row) pairs into two parallel
# tuples: the track ids (*_x) and their label rows (*_y).
(tr_x, tr_y), (va_x, va_y), (te_x, te_y) = (
    zip(*pairs) for pairs in (tr, va, te)
)
In [18]:
# Map every artist/track-name line of the merged matching file to its
# 0-based line number. That number is used further down as the row index
# into the song factor matrix.
matrix_artist_tracknames_fname = '../../matchings/both/matched_artists_tracks.txt'
# Read inside a context manager so the file handle is closed right away.
with open(matrix_artist_tracknames_fname) as matched_file:
    matrix_artist_tracknames = [line.strip() for line in matched_file]
# If a name appears on several lines, the last line number wins.
artist_trackname_to_matrix_index = {
    artist_trackname: index
    for index, artist_trackname in enumerate(matrix_artist_tracknames)
}
In [19]:
# Two parallel files: line i of the first holds an artist/track name and
# line i of the second holds the tab-separated MSD track ids matched to it.
msd_artists_tracks_fname = '../../matchings/msd_lfm-1b/matched_artists_tracks.txt'
msd_track_ids_fname = '../../matchings/msd_lfm-1b/artist_trackname_to_msd_track_ids.txt'
# Context managers close both handles as soon as the lines are read.
with open(msd_artists_tracks_fname) as artists_tracks_file:
    msd_artist_tracks = [line.strip() for line in artists_tracks_file]
with open(msd_track_ids_fname) as track_ids_file:
    msd_track_ids = [line.strip().split('\t') for line in track_ids_file]
# zip() would silently drop the tail of the longer file, leaving some
# track ids unmapped, so fail loudly if the two files are out of step.
assert len(msd_artist_tracks) == len(msd_track_ids), (
    'matching files differ in length: {} names vs {} id lists'.format(
        len(msd_artist_tracks), len(msd_track_ids)))
# Invert the relation: one entry per MSD track id, pointing back at its
# artist/track name.
msd_track_id_to_artists_trackname = {
    msd_track_id: artist_trackname
    for msd_track_ids_list, artist_trackname in zip(msd_track_ids, msd_artist_tracks)
    for msd_track_id in msd_track_ids_list
}
In [21]:
# Compose the two mappings: MSD track id -> artist/track name -> matrix
# index. A name missing from artist_trackname_to_matrix_index raises
# KeyError here.
msd_track_id_to_matrix_index = dict(
    (track_id, artist_trackname_to_matrix_index[name])
    for track_id_group, name in zip(msd_track_ids, msd_artist_tracks)
    for track_id in track_id_group
)
In [22]:
# Verify that we have a matrix index for each MSD track id used in any
# split: the two printed counts should be equal.
all_track_ids = {pair[0] for split in (tr, va, te) for pair in split}
print(len(all_track_ids))
print(len(all_track_ids.intersection(msd_track_id_to_matrix_index)))
In [23]:
# Load the per-track factor matrix from disk (presumably the WMF latent
# vectors mentioned in the notebook header - confirm against the code
# that writes this file). Rows are addressed by matrix index below.
song_factors_fname = '../output/factors_merged_v.npy'
song_factors = np.load(song_factors_fname)
# Display the matrix dimensions as the cell output.
np.shape(song_factors)
Out[23]:
In [24]:
def _lookup_song_factors(track_ids, factors, index_by_track_id):
    """Return the factor rows for `track_ids` stacked into one array.

    Args:
        track_ids: Iterable of MSD track ids.
        factors: 2-D array with one factor row per matrix index.
        index_by_track_id: Dict mapping an MSD track id to its row index
            in `factors`.

    Returns:
        Array of shape (len(track_ids), factors.shape[1]). An empty
        `track_ids` yields shape (0, factors.shape[1]) instead of a 1-D
        empty array, so `.shape[1]` stays valid downstream.

    Raises:
        KeyError: if a track id has no entry in `index_by_track_id`.
    """
    rows = np.array(
        [index_by_track_id[track_id] for track_id in track_ids],
        dtype=np.intp,
    )
    # One fancy-indexing gather instead of a Python list of row views.
    return factors[rows]


# Feature matrices for the three splits, row-aligned with tr_x / va_x / te_x.
tr_x_feats = _lookup_song_factors(tr_x, song_factors, msd_track_id_to_matrix_index)
va_x_feats = _lookup_song_factors(va_x, song_factors, msd_track_id_to_matrix_index)
te_x_feats = _lookup_song_factors(te_x, song_factors, msd_track_id_to_matrix_index)
In [25]:
# Turn each tuple of label rows into a single array (one row per track).
tr_y, va_y, te_y = (np.array(label_rows) for label_rows in (tr_y, va_y, te_y))
In [26]:
from keras.models import Sequential
from keras.layers import Dense

# Multi-label logistic regression in Keras: a single dense layer with one
# independent sigmoid output per tag, trained with binary cross-entropy.
# The output width is read from the label matrix (50 tags in this data)
# rather than hard-coded, so the model cannot drift out of sync with the
# labels it is trained on.
n_tags = tr_y.shape[1]
model = Sequential()
model.add(Dense(n_tags, activation='sigmoid', input_dim=tr_x_feats.shape[1]))
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
In [27]:
# First pass: a single epoch, with the validation split scored at the end
# of the epoch.
model.fit(
    tr_x_feats,
    tr_y,
    nb_epoch=1,
    validation_data=(va_x_feats, va_y),
)
Out[27]:
In [28]:
# Ten more epochs on the same model object. NOTE(review): fit() is
# expected to continue from the current weights, so together with the
# previous cell this is 11 epochs in total - confirm for the Keras
# version in use.
model.fit(
    tr_x_feats,
    tr_y,
    nb_epoch=10,
    validation_data=(va_x_feats, va_y),
)
Out[28]:
In [29]:
# Per-tag probabilities for each split. The network ends in a sigmoid
# layer, so predict() already returns values in [0, 1]. It is used in
# place of predict_proba(), which returned the same array but has been
# removed from later Keras releases.
proba_va = model.predict(va_x_feats)
proba_tr = model.predict(tr_x_feats)
proba_te = model.predict(te_x_feats)

# Hard 0/1 tag decisions at a 0.5 threshold. np.round rounds halves to the
# nearest even value, so a probability of exactly 0.5 becomes 0.
classes_va = np.round(proba_va)
classes_tr = np.round(proba_tr)
classes_te = np.round(proba_te)
In [30]:
# Tag names, in the order used to label the per-tag AUC printout below
# (tags[i] names column i of the label and prediction arrays).
tags = [
    'rock', 'pop', 'alternative', 'indie', 'electronic',
    'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
    'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
    'soul', 'indie rock', 'Mellow', 'electronica', '80s',
    'folk', '90s', 'chill', 'instrumental', 'punk',
    'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
    'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
    'party', 'country', 'easy listening', 'sexy', 'catchy',
    'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
    'rnb', 'indie pop', 'sad', 'House', 'happy',
]
In [31]:
from sklearn.metrics import roc_auc_score

# Columns (tags) included in the aggregate score: all 50 of them.
indices = range(50)


def _split_auc(y_true, y_score):
    """ROC AUC over the selected tag columns, aggregated across tags with
    roc_auc_score's default averaging."""
    return roc_auc_score(y_true[:, indices], y_score[:, indices])


tr_auc = _split_auc(tr_y, proba_tr)
print('training auc: {}'.format(tr_auc))

va_auc = _split_auc(va_y, proba_va)
print('validation auc: {}'.format(va_auc))

te_auc = _split_auc(te_y, proba_te)
print('test auc: {}'.format(te_auc))
In [49]:
# Test-set ROC AUC computed separately for each tag column. The [i] list
# index keeps a 2-D (n_tracks, 1) slice for roc_auc_score.
te_aucs = [
    roc_auc_score(te_y[:, [i]], proba_te[:, [i]])
    for i in range(50)
]

# Print the tags from lowest to highest AUC.
# Note: this rebinds `indices`, which an earlier cell set to range(50).
indices = np.argsort(te_aucs)
for i in indices:
    auc_column = '{0:.2f}\t'.format(te_aucs[i])
    print(auc_column + tags[i])
In [ ]: