In [1]:
%matplotlib inline
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
data_dir = 'data'
faotm = os.path.join(data_dir, 'aotm-2011/aotm-2011-subset.pkl')
fmap = os.path.join(data_dir, 'aotm-2011/map_song_track.pkl')
ftag = os.path.join(data_dir, 'msd/msd_tagtraum_cd2c.cls')
Load playlists.
In [3]:
playlists = pkl.load(open(faotm, 'rb'))
In [4]:
print('#Playlists: %d' % len(playlists))
In [5]:
playlists[0]
Out[5]:
In [6]:
print('#Songs: %d' % len({songID for p in playlists for songID in p['filtered_lists'][0]}))
In [7]:
lengths = [len(p['filtered_lists'][0]) for p in playlists]
#plt.hist(lengths, bins=20)
print('Average playlist length: %.1f' % np.mean(lengths))
Load the song_id --> track_id mapping: a song may correspond to multiple tracks.
In [8]:
song2TrackID = pkl.load(open(fmap, 'rb'))
In [9]:
{ k : song2TrackID[k] for k in list(song2TrackID.keys())[:10] }
Out[9]:
Load song tags and build the track_id --> tag mapping.
In [10]:
track2Tags = dict()
In [11]:
with open(ftag) as f:
    for line in f:
        if line[0] == '#': continue
        tid, tag = line.strip().split('\t')
        #print(tid, tag)
        track2Tags[tid] = tag
In [12]:
print('#(Track, Tag): %d' % len(track2Tags))
In [13]:
{ k : track2Tags[k] for k in list(track2Tags.keys())[:10] }
Out[13]:
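Since a song may map to several tracks, its tags are naturally the union of its tracks' tags. A small sketch of this lookup (the helper name song_tags is ours, not part of the original pipeline):
In [ ]:
def song_tags(song_id):
    """Union of tags over all tracks mapped to this song (may be empty)."""
    return {track2Tags[tid] for tid in song2TrackID[song_id] if tid in track2Tags}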
Use the subset of playlists such that the first song (i.e. the seed song) in each playlist has at least one tag.
In [14]:
subset_ix = []
In [89]:
seedSong2Tag = { }
for ix in range(len(playlists)):
    # the list of song IDs in the playlist
    songIDs = playlists[ix]['filtered_lists'][0]
    # seed song
    seedSongID = songIDs[0]
    seedTrackIDs = song2TrackID[seedSongID]
    # make sure that at least one track for the song has a corresponding tag
    flag = [(trackID in track2Tags) for trackID in seedTrackIDs]
    if not np.any(flag):
        continue
    seedSong2Tag[playlists[ix]['mix_id']] = [track2Tags[seedTrackIDs[i]]
                                             for i in range(len(flag)) if flag[i]]
    subset_ix.append(ix)
In [91]:
#seedSong2Tag
In [16]:
playlists_subset = [playlists[ix] for ix in subset_ix]
In [17]:
print('#Playlists used: %d' % len(subset_ix))
The set of unique songs: in multilabel learning, we have one binary label for each song in this set.
In [18]:
song_set = sorted({songID for p in playlists_subset for songID in p['filtered_lists'][0]})
In [19]:
print('#Songs used: %d' % len(song_set))
In [20]:
print(song_set[:10])
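Each song in this set becomes one binary label. For concreteness, a sketch of an explicit song-to-label-index map (the name song_indicator is ours, mirroring the tag_indicator mapping built below):
In [ ]:
# hypothetical helper (not in the original notebook): song ID -> label index
song_indicator = {songID: ix for ix, songID in enumerate(song_set)}
print(song_indicator[song_set[0]])  # 0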
For the most part, playlists contain fewer than 10 songs; the most common playlist length is 2 songs.
In [21]:
playlist_lengths = [len(playlist['filtered_lists'][0]) for playlist in playlists_subset]
plt.hist(playlist_lengths, bins=20)
print('Average playlist length: %.1f' % np.mean(playlist_lengths))
The song_id --> song name mapping.
In [22]:
songID2Name = {s[1]: s[0] for p in playlists_subset for s in p['playlist']}
In [23]:
#songID2Name
Indicator of tags: the tag --> index mapping.
In [24]:
# the set of unique tags
tag_set = sorted(set(track2Tags.values()))
In [25]:
print('#Tags: %d' % len(tag_set))
In [26]:
tag_indicator = { tag: ix for ix, tag in enumerate(tag_set) }
In [27]:
tag_indicator
Out[27]:
Build features (one-hot encoding of tags) for a song given its song_id.
In [28]:
def gen_features(song_id, song2TrackID=song2TrackID, tag_indicator=tag_indicator):
    """
    Generate a one-hot feature vector for a given song ID.
    """
    features = np.zeros(len(tag_set), dtype=float)
    trackIDs = song2TrackID[song_id]
    cnt = 0
    for trackID in trackIDs:
        if trackID in track2Tags:
            cnt += 1
            tag = track2Tags[trackID]
            tag_ix = tag_indicator[tag]
            features[tag_ix] = 1
    # the song must have at least one tag, else the feature vector is useless
    assert cnt >= 1
    return features
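As a quick sanity check (a sketch, not from the original notebook), the feature vector of a seed song should contain as many ones as the song has distinct tags:
In [ ]:
# every seed song in the subset is guaranteed to have at least one tag
seed0 = playlists_subset[0]['filtered_lists'][0][0]
f0 = gen_features(seed0)
print('%d tag(s) set out of %d' % (int(f0.sum()), len(f0)))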
In [29]:
def gen_feature_map(song_id, seed):
    """
    Generate feature mapping for a given (label, query) pair
    """
    #return gen_features(song_id) - gen_features(seed)  # feature map
    return gen_features(seed)  # a trivial feature map
In [30]:
def gen_training_set(label_ix, playlists=playlists_subset, song_set=song_set):
    """
    Create the labelled dataset for a given song index.
    Input:
        - label_ix: song index, a number in {0, ..., #songs - 1}
        - playlists: which playlists to create features for
    Output:
        - (Feature, Label) pair (X, y) with #playlists rows:
          X comprises the features for each seed song and the given song,
          y comprises the indicators of whether the given song is present
          in the respective playlist
    """
    assert 0 <= label_ix < len(song_set)
    N = len(playlists)
    d = len(tag_set)
    X = np.zeros((N, d), dtype=float)
    y = np.zeros(N, dtype=float)
    whichSong = song_set[label_ix]
    for i in range(N):
        playlist = playlists[i]['filtered_lists'][0]
        seed = playlist[0]
        X[i, :] = gen_feature_map(whichSong, seed)
        y[i] = int(whichSong in playlist)
    return X, y
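A quick shape check of the generated dataset (a sketch; the shapes follow from the definitions above):
In [ ]:
X0, y0 = gen_training_set(0)
print(X0.shape, y0.shape)  # (#playlists, #tags), (#playlists,)
print('positives for label 0: %d' % int(y0.sum()))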
In [31]:
gen_feature_map(song_set[100], playlists_subset[0]['filtered_lists'][0][0])
Out[31]:
Train a logistic regression model for each label (i.e. a binary relevance reduction of the multilabel problem); class_weight='balanced' compensates for the rarity of positive examples per label.
In [32]:
classifiers = [LogisticRegression(class_weight='balanced') for i in range(len(song_set))]
In [75]:
allPreds = [ ]
allTruths = [ ]
coefMat = [ ]
labelIndices = [ ]
Y = np.nan * np.ones((len(playlists_subset), len(song_set)))
for label_ix in range(len(song_set)):
    X, y = gen_training_set(label_ix)
    Y[:, label_ix] = y
    # by fixing the random seed, the same playlists end up in the test set each time
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=31)
    # skip labels with no positive training example
    if np.max(y_train) == 0.0:  # or np.max(y_test) == 0.0:
        continue
    classifiers[label_ix].fit(X_train, y_train)
    allPreds.append(classifiers[label_ix].decision_function(X_test))
    allTruths.append(y_test)
    coefMat.append(classifiers[label_ix].coef_.reshape(-1))
    labelIndices.append(label_ix)
    #print(classifiers[label_ix].coef_)
    #print(classifiers[label_ix].intercept_)
In [76]:
allPreds = np.array(allPreds).T
allTruths = np.array(allTruths).T
print(allPreds.shape)
print(allTruths.shape)
Compute AUC.
In [35]:
aucs = [ ]
for i in range(allPreds.shape[0]):
    pred = allPreds[i, :]
    truth = allTruths[i, :]
    if np.max(truth) == 0.0:
        continue
    aucs.append(roc_auc_score(truth, pred))
print('Average AUC: %1.4f' % np.mean(aucs))
plt.hist(aucs, bins=10);
Compute average precision.
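A minimal sketch, mirroring the AUC loop above and using average_precision_score from sklearn.metrics:
In [ ]:
from sklearn.metrics import average_precision_score

aps = [ ]
for i in range(allPreds.shape[0]):
    pred = allPreds[i, :]
    truth = allTruths[i, :]
    if np.max(truth) == 0.0:
        continue
    aps.append(average_precision_score(truth, pred))
print('Average precision: %1.4f' % np.mean(aps))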
Coefficient matrix of shape (#Genres, #Songs).
In [36]:
coefMat = np.array(coefMat).T
In [37]:
coefMat.shape
Out[37]:
In [38]:
#sns.heatmap(coefMat[:, :30])
Top 10 songs of each genre w.r.t. the coefficients.
In [39]:
labelIndices = np.array(labelIndices)
In [65]:
Top10Songs_ix = [ ]
for i in range(coefMat.shape[0]):
    ix = np.argsort(coefMat[i, :])[::-1][:10]
    Top10Songs_ix.append(labelIndices[ix])

Bot10Songs_ix = [ ]
for i in range(coefMat.shape[0]):
    ix = np.argsort(coefMat[i, :])[:10]
    Bot10Songs_ix.append(labelIndices[ix])
In [41]:
#Top10Songs_ix
In [42]:
#np.array(song_set)[Top10Songs_ix[0]]
In [68]:
cols = ['Genre.Count'] + ['Top %d' % k for k in range(1, 11)] + ['Bot %d' % k for k in range(1, 11)]
Top10Songs = pd.DataFrame(np.zeros((len(tag_set), 21), dtype=object),
                          index=tag_set, columns=cols)
In [96]:
# number of playlists whose seed song carries each genre tag
S = X.sum(axis=0)
idx = np.argsort(S)[::-1]
#[(tag_set[i], S[i]) for i in idx]
In [98]:
# number of playlists each song appears in
plt.hist(Y.sum(axis=0));
plt.xlabel('# of playlist appearances');
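As a related summary (a sketch, not in the original notebook), the fraction of songs appearing in exactly one playlist indicates how sparse the labels are:
In [ ]:
appearances = Y.sum(axis=0)
print('Songs in exactly one playlist: %.1f%%' % (100.0 * np.mean(appearances == 1)))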
In [70]:
for i in range(len(tag_set)):
    row = tag_set[i]
    Top10Songs.loc[row, 'Genre.Count'] = S[i]
    for j in range(10):
        song_ix = Top10Songs_ix[i][j]
        songID = song_set[song_ix]
        songName = (songID, songID2Name[songID][0], songID2Name[songID][1])
        col = 'Top %d' % (j+1)
        Top10Songs.loc[row, col] = songName
        song_ix = Bot10Songs_ix[i][j]
        songID = song_set[song_ix]
        songName = (songID, songID2Name[songID][0], songID2Name[songID][1])
        col = 'Bot %d' % (j+1)
        Top10Songs.loc[row, col] = songName
Top10Songs = Top10Songs.sort_values(['Genre.Count'], ascending=False)
In [73]:
Top10Songs.head(5)
Out[73]:
In [93]:
rapPlaylists = [ k for k in seedSong2Tag if 'Rap' in seedSong2Tag[k] ]
In [95]:
[ p['playlist'] for p in playlists_subset if p['mix_id'] in rapPlaylists ]
Out[95]: