Map tracks in the 30Music dataset to the Million Song Dataset (MSD)



In [ ]:

    
%matplotlib inline
import os, sys
import pickle as pkl
import re
from urllib.parse import unquote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [ ]:

    
# Directory that holds the 30Music input files used in this notebook.
data_dir = 'data/30music'

# Build the three input paths relative to that directory.
ftrack, fartist, flastfm_track = (
    os.path.join(data_dir, rel_path)
    for rel_path in ('tracks.csv', 'persons.csv', 'lastfm/lastfm_tracks.pkl')
)

Load data

Tracks data.



In [ ]:

    
# The track table is ';'-separated. keep_default_na=False stops pandas from
# converting strings such as 'NA' or 'null' to NaN, so such fields stay as text.
tracks = pd.read_csv(
    ftrack,
    sep=';',
    keep_default_na=False,
)



In [ ]:

    
#tracks.set_index('ID', inplace=True)



In [ ]:

    
# `tracks` is read without an index column, so its index is only the default
# 0..n-1 range and counting unique index values would always equal the row
# count. Count distinct values of the 'ID' column instead.
print(tracks.shape[0])
print('#tracks:', tracks['ID'].unique().shape[0])
tracks.head()



In [ ]:

    
tracks.dtypes



In [ ]:

    
tnames = tracks['Name'].values



In [ ]:

    
tnames[9]



In [ ]:

    
unquote_plus(tnames[9])



In [ ]:

    
unquote_plus(tnames[9]).split('/')[-1].split('_')[-1]

Artist data.



In [ ]:

    
# The artist table is ';'-separated; its 'ID' column becomes the index so that
# artists can be looked up by id with `artists.loc[...]`.
artists = pd.read_csv(
    fartist,
    sep=';',
    index_col='ID',
)



In [ ]:

    
# Row count, then the number of distinct artist ids (the index is the 'ID' column).
print(len(artists))
print('#artists:', len(artists.index.unique()))
artists.head()



In [ ]:

    
anames = artists['Name'].values



In [ ]:

    
unquote_plus(anames[1])

Build mapping

1. Match artist

LastFM artists.



In [ ]:

    
# Load the pickled LastFM track list inside a `with` block so the file handle is
# closed as soon as loading finishes (a bare open() call would leave it open).
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open(flastfm_track, 'rb') as fd:
    lastfm_tracks = pkl.load(fd)



In [ ]:

    
print(len(lastfm_tracks))
lastfm_tracks[0]



In [ ]:

    
# Distinct lower-cased values of the third field of every LastFM record
# (the field used as the artist elsewhere in this notebook), in sorted order.
lastfm_artists = list({str(record[2]).lower() for record in lastfm_tracks})
lastfm_artists.sort()



In [ ]:

    
print(len(lastfm_artists))
lastfm_artists[1100]



In [ ]:

    
# Draw the 3x5 demo matrix from a fixed-seed generator so that it, and every
# cell that uses `dat`, gives the same output on each Restart & Run All.
# RandomState(...) is local, so the global NumPy random state is left untouched.
dat = np.random.RandomState(0).rand(3, 5)
dat



In [ ]:

    
np.mean(dat, axis=0)



In [ ]:

    
np.var(dat, axis=0)



In [ ]:

    
from scipy.stats import moment, kurtosis, skew, describe
# Central moments of order 1, 2 and 3, computed along axis 0 (one value per column of `dat`).
moment(dat, moment=[1,2,3], axis=0)



In [ ]:

    
rset = describe(dat, axis=0)



In [ ]:

    
type(rset)



In [ ]:

    
dat



In [ ]:

    
rset



In [ ]:

    
describe([1,2,3,4,5])



In [ ]:

    
ab = np.zeros(6)



In [ ]:

    
ab.ndim



In [ ]:

    
np.zeros(6).tolist()



In [ ]:

    
# Grow the list in place, two elements at a time.
aa = []
aa.extend([1, 2])
aa.extend([3, 4])
aa.extend([5, 6])
aa



In [ ]:

    
rset.kurtosis



In [ ]:

    
rset.skewness



In [ ]:

    
skew(dat, axis=0)

30Music artists.



In [ ]:

    
def parse_artist_name(artist_name):
    """Extract the trailing name component from a URL-quoted artist string.

    The input is decoded with `unquote_plus`, then cut down to the text after
    the last '/', then after the last '_', then after the last '!'. Surrounding
    whitespace is removed from the result.

    Parameters
    ----------
    artist_name : str
        URL-quoted artist string.

    Returns
    -------
    str
        The decoded, trimmed trailing component (may be empty).
    """
    decoded = unquote_plus(artist_name)
    # rpartition(sep)[2] is the text after the last `sep`, or the whole string
    # when `sep` does not occur.
    for separator in ('/', '_', '!'):
        decoded = decoded.rpartition(separator)[2]
    return decoded.strip()



In [ ]:

    
# Sorted list of distinct parsed artist names. Each raw value is passed through
# str() first, so non-string entries are parsed from their string form.
artists_30music = sorted(set(map(parse_artist_name, map(str, artists['Name'].values))))



In [ ]:

    
print(len(artists_30music))
artists_30music[1700]

LastFM (title, artist) <--> [track_id, ...] mapping. One (title, artist) pair can map to more than one track.



In [ ]:

    
# Re-read the pickled LastFM track list (the same file an earlier cell loads).
# The `with` block closes the file handle once loading finishes; a bare open()
# call would leave it open.
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open(flastfm_track, 'rb') as fd:
    lastfm_tracks = pkl.load(fd)



In [ ]:

    
ta2tid = dict()



In [ ]:

    
# Group LastFM track ids by their (title, artist) key.
# The dict is rebuilt from scratch in this cell so that re-running the cell
# cannot append every id a second time.
ta2tid = dict()
for i, item in enumerate(lastfm_tracks):
    if (i+1) % 1000 == 0:
        # '\r' returns to the start of the line so the counter overwrites itself.
        sys.stdout.write('\r%d / %d' % (i+1, len(lastfm_tracks)))
        sys.stdout.flush()

    # item[0] is stored as the track id; (item[1], item[2]) forms the lookup key.
    tid = item[0]
    key = (item[1], item[2])
    ta2tid.setdefault(key, []).append(tid)



In [ ]:

    
len(ta2tid)



In [ ]:

    
np.sum([len(x) for x in ta2tid.values()])

30Music (title, artist) <--> [track_id, ...] mapping.



In [ ]:

    
aa = 'hello (year 2018)'
aa



In [ ]:

    
# str.replace substitutes literal text only, so the pattern was never applied as
# a regular expression and `bb` came out identical to `aa`. re.sub applies it;
# the raw string also avoids the invalid '\(' escapes.
bb = re.sub(r'\(.*\)', '', aa)
bb



In [ ]:

    
def parse_track_name(track_name):
    """Extract a cleaned track title from a URL-quoted track string.

    The input is decoded with `unquote_plus` and reduced to the text after the
    last '/' and then after the last '_'. Parenthesised segments such as
    '(Live)' are removed and surrounding whitespace is trimmed.

    The original body stopped at a bare `name.replace` and returned None.
    Stripping parentheticals is the completion suggested by the
    'hello (year 2018)' experiment in this notebook -- confirm this is the
    intended normalisation.

    Parameters
    ----------
    track_name : str
        URL-quoted track string.

    Returns
    -------
    str
        The decoded title without parenthesised segments (may be empty).
    """
    name = unquote_plus(track_name).split('/')[-1].split('_')[-1]
    # [^)]* keeps each match inside one pair of parentheses, so text between
    # two separate parentheticals is preserved.
    name = re.sub(r'\([^)]*\)', '', name)
    return name.strip()



In [ ]:

    
ta2num = dict()



In [ ]:

    
# Group 30Music track ids by their (title, artist name) key.
# The dict is rebuilt from scratch in this cell so that re-running the cell
# cannot append every id a second time.
# NOTE(review): title and artist are used exactly as stored in the tables; if
# they are URL-quoted like the samples decoded earlier in this notebook, they
# probably need parsing before being compared with the LastFM keys -- confirm.
ta2num = dict()
n_tracks = tracks.shape[0]
# Walk the three columns in parallel instead of building a row Series with
# tracks.loc[ix] on every iteration.
track_fields = zip(tracks['ID'], tracks['Name'], tracks['ArtistsID'])
for ix, (num, title, aid) in enumerate(track_fields):
    if (ix+1) % 1000 == 0:
        # '\r' returns to the start of the line so the counter overwrites itself.
        sys.stdout.write('\r%d / %d' % (ix+1, n_tracks))
        sys.stdout.flush()

    # Resolve the artist id to the artist's stored name via the ID-indexed table.
    artist = artists.loc[aid, 'Name']
    key = (title, artist)
    ta2num.setdefault(key, []).append(num)



In [ ]:

    
len(ta2num)



In [ ]:

    
np.sum([len(x) for x in ta2num.values()])



In [ ]:

Match



In [ ]:

    
# (title, artist) keys that occur in both the LastFM and the 30Music mapping.
intersection = {pair for pair in ta2tid if pair in ta2num}



In [ ]:

    
len(intersection)

Playlist



In [ ]:

    
fplaylist = os.path.join(data_dir, 'playlist.csv')



In [ ]:

    
playlist = pd.read_csv(fplaylist, index_col='ID', sep=';')



In [ ]:

    
playlist.head()

Filter out playlists without track data.



In [ ]:

    
#playlist[playlist['TracksID'].isin([np.nan])].head()
playlist[playlist['TracksID'].isnull()].head()



In [ ]:

    
playlist[playlist['TracksID'].notnull()].shape



In [ ]:

    
# Keep only the playlists whose 'TracksID' field is present, then report the
# row count and the number of distinct playlist ids (the index).
playlist = playlist.loc[~playlist['TracksID'].isnull()]
print(len(playlist))
print('#playlist:', len(playlist.index.unique()))

Histogram of playlist length (i.e., the number of tracks/songs).



In [ ]:

    
# Histogram of the '#Tracks' column; the y axis is logarithmic.
ax = plt.subplot(111)
playlist['#Tracks'].hist(ax=ax)
ax.set(xlabel='playlist length', ylabel='#playlists', yscale='log')



In [ ]:

    
playlist['#Tracks'].describe()



In [ ]:

    
playlist['#Tracks'].median()