In [ ]:
%matplotlib inline
import os, sys
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
from urllib.parse import unquote_plus
In [ ]:
# Locations of the 30Music input files, all relative to the dataset root.
data_dir = 'data/30music'
ftrack, fartist, flastfm_track = (
    os.path.join(data_dir, rel_path)
    for rel_path in ('tracks.csv', 'persons.csv', 'lastfm/lastfm_tracks.pkl')
)
Tracks data.
In [ ]:
# Load the track table (';'-separated). keep_default_na=False stops pandas
# from converting its default NA markers (e.g. empty fields) into NaN.
tracks = pd.read_csv(ftrack, sep=';', keep_default_na=False) #, index_col='ID')
In [ ]:
#tracks.set_index('ID', inplace=True)
In [ ]:
# Row count next to the number of distinct index labels, then a preview
# of the first rows.
print(tracks.shape[0])
print('#tracks:', tracks.index.unique().shape[0])
tracks.head()
In [ ]:
# Column dtypes of the loaded track table.
tracks.dtypes
In [ ]:
# Raw 'Name' column of the track table as an array.
tnames = tracks['Name'].values
In [ ]:
# Inspect one raw track name.
tnames[9]
In [ ]:
# Same entry after URL-decoding ('+' becomes a space, %xx escapes resolved).
unquote_plus(tnames[9])
In [ ]:
# Decoded entry reduced to the text after the last '/' and then the last '_'.
unquote_plus(tnames[9]).split('/')[-1].split('_')[-1]
Artist data.
In [ ]:
# Load the artist table (';'-separated), indexed by its 'ID' column.
artists = pd.read_csv(fartist, index_col='ID', sep=';')
In [ ]:
# Row count next to the number of distinct artist IDs, then a preview
# of the first rows.
print(artists.shape[0])
print('#artists:', artists.index.unique().shape[0])
artists.head()
In [ ]:
# Raw 'Name' column of the artist table as an array.
anames = artists['Name'].values
In [ ]:
# URL-decode one artist name to see its readable form.
unquote_plus(anames[1])
LastFM artists.
In [ ]:
# Load the pickled LastFM track records. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(flastfm_track, 'rb') as fin:
    lastfm_tracks = pkl.load(fin)
In [ ]:
# Number of LastFM records and a look at the first one.
print(len(lastfm_tracks))
lastfm_tracks[0]
In [ ]:
# Distinct lowercased values of field 2 of every LastFM record, sorted.
lastfm_artists = sorted(set(map(lambda rec: str(rec[2]).lower(), lastfm_tracks)))
In [ ]:
# Number of distinct lowercased artist strings, plus one sample entry.
print(len(lastfm_artists))
lastfm_artists[1100]
In [ ]:
# Scratch data: 3x5 array of uniform random samples from [0, 1).
dat = np.random.rand(3, 5)
dat
In [ ]:
# Mean along axis 0, i.e. one value per column.
np.mean(dat, axis=0)
In [ ]:
# Variance along axis 0 (NumPy default ddof=0), one value per column.
np.var(dat, axis=0)
In [ ]:
# Moments of order 1, 2 and 3 along axis 0 via scipy.stats.moment.
from scipy.stats import moment, kurtosis, skew, describe
moment(dat, moment=[1,2,3], axis=0)
In [ ]:
# Summary statistics along axis 0; kept so its fields can be inspected below.
rset = describe(dat, axis=0)
In [ ]:
# Check what kind of object describe() returned.
type(rset)
In [ ]:
# Re-display the scratch array.
dat
In [ ]:
# Display the describe() result.
rset
In [ ]:
# describe() applied to a plain Python list.
describe([1,2,3,4,5])
In [ ]:
# Scratch: 1-D array of six zeros.
ab = np.zeros(6)
In [ ]:
# Number of dimensions of ab.
ab.ndim
In [ ]:
# Convert a zeros array into a plain Python list.
np.zeros(6).tolist()
In [ ]:
# Scratch: build one flat list by extending it with successive pairs.
aa = []
for pair in ([1, 2], [3, 4], [5, 6]):
    aa.extend(pair)
aa
In [ ]:
# Kurtosis field of the describe() result.
rset.kurtosis
In [ ]:
# Skewness field of the describe() result.
rset.skewness
In [ ]:
# Skewness along axis 0 computed directly with scipy.stats.skew.
skew(dat, axis=0)
30Music artists.
In [ ]:
def parse_artist_name(artist_name):
    """Return a cleaned artist name from a URL-encoded 30Music name field.

    The value is URL-decoded ('+' becomes a space, %xx escapes are resolved),
    then reduced to the text after the last '/', after the last '_' and after
    the last '!', in that order. Surrounding whitespace is stripped.

    Args:
        artist_name: URL-encoded name string.

    Returns:
        The cleaned name as a string (possibly empty).
    """
    name = unquote_plus(artist_name)
    # Each separator keeps only the trailing segment; order matters because
    # every split operates on the result of the previous one.
    for sep in ('/', '_', '!'):
        name = name.split(sep)[-1]
    return name.strip()
In [ ]:
# Distinct cleaned 30Music artist names, sorted.
artists_30music = sorted(set(map(parse_artist_name, map(str, artists['Name'].values))))
In [ ]:
# Number of distinct cleaned artist names, plus one sample entry.
print(len(artists_30music))
artists_30music[1700]
LastFM (title, artist) <--> [track_id, ...]
mapping; one (title, artist) pair can have more than one track.
In [ ]:
# (Re)load the pickled LastFM track records. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(flastfm_track, 'rb') as fin:
    lastfm_tracks = pkl.load(fin)
In [ ]:
# Maps a (field 1, field 2) pair of a LastFM record to the list of track IDs
# sharing that pair; filled by the loop in the next cell.
ta2tid = dict()
In [ ]:
# Group LastFM track IDs by the (item[1], item[2]) pair of each record.
# Several records can share a pair, so every key maps to a list of IDs.
n_lastfm = len(lastfm_tracks)
for i, item in enumerate(lastfm_tracks):
    # Progress indicator, rewritten in place every 1000 records.
    if (i + 1) % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (i + 1, n_lastfm))
        sys.stdout.flush()
    tid = item[0]
    key = (item[1], item[2])
    # setdefault creates the list on first sight of a key, then appends.
    ta2tid.setdefault(key, []).append(tid)
In [ ]:
# Number of distinct keys in the LastFM mapping.
len(ta2tid)
In [ ]:
# Total number of track IDs stored across all keys.
np.sum([len(x) for x in ta2tid.values()])
30Music (title, artist) <--> track_id
mapping.
In [ ]:
# Scratch string with a parenthesised suffix, used to try out its removal.
aa = 'hello (year 2018)'
aa
In [ ]:
# str.replace() matches its first argument literally, so a regex pattern
# never matched and left the string unchanged; re.sub() applies it as a
# regular expression and removes the parenthesised part.
import re
bb = re.sub(r'\(.*\)', '', aa)
bb
In [ ]:
def parse_track_name(track_name):
    """Return a cleaned track title from a URL-encoded 30Music name field.

    The value is URL-decoded, reduced to the text after the last '/' and then
    after the last '_', and any parenthesised part is removed. Surrounding
    whitespace is stripped.

    NOTE(review): the original body ended in a bare `name.replace` and
    returned None. Removing the parenthesised part follows the scratch
    experiment on 'hello (year 2018)' earlier in the notebook; stripping
    whitespace mirrors parse_artist_name. Confirm both match the intended
    cleaning.

    Args:
        track_name: URL-encoded name string.

    Returns:
        The cleaned title as a string (possibly empty).
    """
    import re  # local import: `re` is not among the notebook's imports

    name = unquote_plus(track_name).split('/')[-1].split('_')[-1]
    # Greedy match: drops everything from the first '(' to the last ')'.
    name = re.sub(r'\(.*\)', '', name)
    return name.strip()
In [ ]:
# Maps a (track name, artist name) pair to the list of 30Music track IDs
# sharing that pair; filled by the loop in the next cell.
ta2num = dict()
In [ ]:
# Group 30Music track IDs by (track 'Name', artist 'Name'), both taken raw
# from the tables. Walking the index and the three columns in lockstep
# avoids a separate `.loc` row lookup on `tracks` for every iteration.
n_tracks = tracks.shape[0]
track_rows = zip(tracks.index, tracks['ID'], tracks['Name'], tracks['ArtistsID'])
for ix, num, title, aid in track_rows:
    # Progress indicator, rewritten in place every 1000 index labels.
    if (ix + 1) % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (ix + 1, n_tracks))
        sys.stdout.flush()
    # Resolve the artist ID to its name via the ID-indexed artist table.
    artist = artists.loc[aid, 'Name']
    key = (title, artist)
    # setdefault creates the list on first sight of a key, then appends.
    ta2num.setdefault(key, []).append(num)
In [ ]:
# Number of distinct keys in the 30Music mapping.
len(ta2num)
In [ ]:
# Total number of track IDs stored across all keys.
np.sum([len(x) for x in ta2num.values()])
In [ ]:
In [ ]:
# Keys present in both mappings; `&` on two dict key views yields a set.
intersection = ta2tid.keys() & ta2num.keys()
In [ ]:
# Number of keys shared by the LastFM and 30Music mappings.
len(intersection)
In [ ]:
# Path of the playlist table inside the dataset directory.
fplaylist = os.path.join(data_dir, 'playlist.csv')
In [ ]:
# Load the playlist table (';'-separated), indexed by its 'ID' column.
playlist = pd.read_csv(fplaylist, index_col='ID', sep=';')
In [ ]:
# Preview the first playlist rows.
playlist.head()
Filtering out playlists without tracks data.
In [ ]:
# Preview playlists whose 'TracksID' field is missing.
#playlist[playlist['TracksID'].isin([np.nan])].head()
playlist[playlist['TracksID'].isnull()].head()
In [ ]:
# Shape of the table restricted to playlists that do have 'TracksID'.
playlist[playlist['TracksID'].notnull()].shape
In [ ]:
# Keep only playlists with a non-null 'TracksID', then report the remaining
# row count next to the number of distinct playlist IDs.
has_tracks = playlist['TracksID'].notnull()
playlist = playlist[has_tracks]
print(playlist.shape[0])
print('#playlist:', playlist.index.unique().shape[0])
Histogram of playlist length (i.e., the number of tracks/songs).
In [ ]:
# Histogram of the '#Tracks' column on a single axes; the count axis is
# log-scaled.
ax = plt.subplot(111)
playlist['#Tracks'].hist(ax=ax)
ax.set_xlabel('playlist length')
ax.set_ylabel('#playlists')
ax.set_yscale('log')
In [ ]:
# Summary statistics of the '#Tracks' column.
playlist['#Tracks'].describe()
In [ ]:
# Median of the '#Tracks' column.
playlist['#Tracks'].median()