Map tracks in the 30Music dataset to the Million Song Dataset (MSD)


In [ ]:
%matplotlib inline
import os, sys
import pickle as pkl
import re
from urllib.parse import unquote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [ ]:
# Input file locations, all built relative to the 30Music data directory.
data_dir = 'data/30music'
# Tracks table (read into `tracks`).
ftrack = os.path.join(data_dir, 'tracks.csv')
# Artists table (read into `artists`).
fartist = os.path.join(data_dir, 'persons.csv')
# Pickled LastFM track records (read into `lastfm_tracks`).
flastfm_track = os.path.join(data_dir, 'lastfm/lastfm_tracks.pkl')

Load data

Tracks data.


In [ ]:
# keep_default_na=False stops pandas from converting strings such as 'NA' or 'null'
# into NaN, so names are read verbatim. 'ID' stays a regular column, not the index.
tracks = pd.read_csv(ftrack, sep=';', keep_default_na=False)

In [ ]:
#tracks.set_index('ID', inplace=True)

In [ ]:
# `tracks` keeps the default positional index (the set_index('ID') call is commented out),
# so counting unique index values would always equal the row count. Count the 'ID' column
# instead, so that duplicated track IDs actually show up as a difference.
print(tracks.shape[0])
print('#tracks:', tracks['ID'].unique().shape[0])
tracks.head()

In [ ]:
# Column dtypes as inferred by read_csv.
tracks.dtypes

In [ ]:
# Raw values of the 'Name' column, used below to inspect how track names are encoded.
tnames = tracks['Name'].values

In [ ]:
# One raw name, exactly as stored in the table.
tnames[9]

In [ ]:
# The same name after URL-decoding ('+' becomes a space, %XX escapes are resolved).
unquote_plus(tnames[9])

In [ ]:
# Keep only the text after the last '/' and then after the last '_' of the decoded name.
unquote_plus(tnames[9]).split('/')[-1].split('_')[-1]

Artist data.


In [ ]:
# Read names verbatim, as is done for the tracks table: with pandas' default NA handling
# an artist literally called 'NA', 'null' or 'nan' would be loaded as a missing value (NaN).
artists = pd.read_csv(fartist, index_col='ID', sep=';', keep_default_na=False)

In [ ]:
# Row count versus number of distinct artist IDs (the index); the two agree
# when no ID is repeated.
print(len(artists))
print('#artists:', len(artists.index.unique()))
artists.head()

In [ ]:
# Raw values of the artists' 'Name' column.
anames = artists['Name'].values

In [ ]:
# URL-decoded form of one artist name.
unquote_plus(anames[1])

Build mapping

1. Match artist

LastFM artists.


In [ ]:
# Load the pickled LastFM track records. The context manager closes the file handle as
# soon as loading is done instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load a file from a trusted source.
with open(flastfm_track, 'rb') as fd:
    lastfm_tracks = pkl.load(fd)

In [ ]:
# Number of LastFM records, and the first one as a sample of the record layout.
print(len(lastfm_tracks))
lastfm_tracks[0]

In [ ]:
# Distinct lower-cased artist names (field 2 of each record), in sorted order.
lastfm_artists = sorted({str(t[2]).lower() for t in lastfm_tracks})

In [ ]:
# Number of distinct LastFM artist names and one arbitrary entry.
print(len(lastfm_artists))
lastfm_artists[1100]

In [ ]:
# NOTE(review): the cells from here down to the "30Music artists" section compute summary
# statistics on random data and do not touch any of the music tables -- they look like
# scratch experiments; confirm whether they can be removed.
# Random 3x5 array (unseeded, so the values change on every run).
dat = np.random.rand(3, 5)
dat

In [ ]:
# Mean of each column.
np.mean(dat, axis=0)

In [ ]:
# Variance of each column.
np.var(dat, axis=0)

In [ ]:
# Central moments of order 1, 2 and 3 for each column.
from scipy.stats import moment, kurtosis, skew, describe
moment(dat, moment=[1,2,3], axis=0)

In [ ]:
# Several descriptive statistics per column in a single result object.
rset = describe(dat, axis=0)

In [ ]:
# Inspect the type of the result object.
type(rset)

In [ ]:
# The input array again, for comparison with the statistics below.
dat

In [ ]:
# Full content of the result object.
rset

In [ ]:
# The same summary for a plain 1-D list.
describe([1,2,3,4,5])

In [ ]:
# 1-D array of six zeros.
ab = np.zeros(6)

In [ ]:
# Number of array dimensions.
ab.ndim

In [ ]:
# Conversion of an array to a plain Python list.
np.zeros(6).tolist()

In [ ]:
# `+=` with a list extends the existing list in place.
aa = []
aa += [1, 2]
aa += [3, 4]
aa += [5, 6]
aa

In [ ]:
# Kurtosis field of the describe() result.
rset.kurtosis

In [ ]:
# Skewness field of the describe() result.
rset.skewness

In [ ]:
# Skewness computed directly, for comparison with the field above.
skew(dat, axis=0)

30Music artists.


In [ ]:
def parse_artist_name(artist_name):
    """Return the cleaned, human-readable part of a URL-encoded artist name.

    The name is URL-decoded ('+' becomes a space, %XX escapes are resolved); then
    only the text after the last '/', after the last '_' and after the last '!'
    is kept, in that order. Surrounding whitespace is removed from the result.
    """
    decoded = unquote_plus(artist_name)
    for separator in ('/', '_', '!'):
        # rpartition yields the whole string as its last item when the separator is absent.
        decoded = decoded.rpartition(separator)[2]
    return decoded.strip()

In [ ]:
# Distinct cleaned 30Music artist names, in sorted order. str() coerces any
# non-string entry of the 'Name' column before it is parsed.
artists_30music = sorted(set(map(parse_artist_name, map(str, artists['Name'].values))))

In [ ]:
# Number of distinct cleaned artist names and one arbitrary entry.
print(len(artists_30music))
artists_30music[1700]

LastFM (title, artist) <--> [track_id, ...] mapping; one (title, artist) pair can have more than one track.


In [ ]:
# Load the pickled LastFM track records. The context manager closes the file handle as
# soon as loading is done instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load a file from a trusted source.
with open(flastfm_track, 'rb') as fd:
    lastfm_tracks = pkl.load(fd)

In [ ]:
# Maps a LastFM (title, artist) pair to the list of track IDs that share that pair.
ta2tid = dict()

In [ ]:
# Group LastFM track IDs by their (title, artist) pair.
# The mapping is rebuilt from scratch inside this cell: re-running the cell on an already
# filled dict would append every track ID a second time and inflate the totals.
ta2tid = dict()
n_lastfm = len(lastfm_tracks)
for i, item in enumerate(lastfm_tracks):
    # Progress indicator, rewritten in place every 1000 records.
    if (i+1) % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (i+1, n_lastfm))
        sys.stdout.flush()

    # Record layout: item[0] is the track ID, item[1] the title, item[2] the artist.
    ta2tid.setdefault((item[1], item[2]), []).append(item[0])

In [ ]:
# Number of distinct (title, artist) pairs.
len(ta2tid)

In [ ]:
# Total number of track IDs summed over all pairs.
np.sum([len(x) for x in ta2tid.values()])

30Music (title, artist) <--> track_id mapping.


In [ ]:
# Sample title with a parenthesised suffix, used to try out how to strip that suffix.
aa = 'hello (year 2018)'
aa

In [ ]:
# str.replace() treats its first argument as literal text, so a regex pattern never matches
# and the string comes back unchanged. re.sub() with a raw-string pattern removes the
# parenthesised part.
bb = re.sub(r'\(.*\)', '', aa)
bb

In [ ]:
def parse_track_name(track_name):
    """Return the cleaned, human-readable title of a URL-encoded track name.

    The name is URL-decoded ('+' becomes a space, %XX escapes are resolved) and only
    the text after the last '/' and then after the last '_' is kept. Parenthesised
    parts such as ' (year 2018)' are removed together with the whitespace in front of
    them, and surrounding whitespace is stripped.

    Parameters
    ----------
    track_name : str
        URL-encoded track name.

    Returns
    -------
    str
        The cleaned title (may be empty).
    """
    name = unquote_plus(track_name).split('/')[-1].split('_')[-1]
    # str.replace() only matches literal text, so a regex is used; [^()]* keeps each
    # match inside one pair of parentheses instead of spanning several groups.
    name = re.sub(r'\s*\([^()]*\)', '', name)
    return name.strip()

In [ ]:
# Maps a 30Music (title, artist) pair to the list of track 'ID' values sharing that pair.
ta2num = dict()

In [ ]:
# Group 30Music track IDs by their (title, artist) pair.
# The mapping is rebuilt inside this cell so that re-running it cannot append duplicates.
ta2num = dict()
n_tracks = tracks.shape[0]
artist_names = artists['Name']
# Walking the three columns in lockstep avoids building a new Series for every row
# (tracks.loc[ix][...]), and the progress counter no longer relies on integer index labels.
rows = zip(tracks['ID'], tracks['Name'], tracks['ArtistsID'])
for pos, (num, title, aid) in enumerate(rows, start=1):
    # Progress indicator, rewritten in place every 1000 tracks.
    if pos % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (pos, n_tracks))
        sys.stdout.flush()

    # Raises KeyError when the artist ID is not present in the artists table.
    artist = artist_names.loc[aid]
    ta2num.setdefault((title, artist), []).append(num)

In [ ]:
# Number of distinct (title, artist) pairs.
len(ta2num)

In [ ]:
# Total number of track IDs summed over all pairs.
np.sum([len(x) for x in ta2num.values()])

In [ ]:

Match


In [ ]:
# (title, artist) pairs that occur in both the LastFM and the 30Music mapping.
# NOTE(review): the 30Music keys are built from the raw 'Name' values, without
# unquote_plus / parse_artist_name being applied -- confirm that both sides use the same
# text format, otherwise few pairs will match.
intersection = set(ta2tid.keys()) & set(ta2num.keys())

In [ ]:
# Number of matched pairs.
len(intersection)

Playlist


In [ ]:
# Location of the playlist table inside the 30Music data directory.
fplaylist = os.path.join(data_dir, 'playlist.csv')

In [ ]:
# Playlist table, indexed by the 'ID' column.
playlist = pd.read_csv(fplaylist, index_col='ID', sep=';')

In [ ]:
playlist.head()

Filter out playlists without track data.


In [ ]:
# Playlists whose 'TracksID' value is missing.
playlist[playlist['TracksID'].isnull()].head()

In [ ]:
# Shape of the table once those playlists are excluded.
playlist[playlist['TracksID'].notnull()].shape

In [ ]:
# Keep only the playlists whose 'TracksID' value is present, then compare the number
# of rows with the number of distinct playlist IDs (the index).
playlist = playlist.loc[playlist['TracksID'].notnull()]
print(len(playlist))
print('#playlist:', len(playlist.index.unique()))

Histogram of playlist length (i.e., the number of tracks/songs).


In [ ]:
# Histogram of the '#Tracks' column; the y axis is logarithmic.
ax = plt.subplot(111)
playlist['#Tracks'].hist(ax=ax)
# The trailing ';' suppresses the return value of ax.set() in the cell output.
ax.set(xlabel='playlist length', ylabel='#playlists', yscale='log');

In [ ]:
# Summary statistics of the playlist length.
playlist['#Tracks'].describe()

In [ ]:
# Median playlist length.
playlist['#Tracks'].median()