Common statistics for the 30Music dataset.
In [1]:
%matplotlib inline
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
data_dir = 'data/30music'
In [3]:
falbums = os.path.join(data_dir, 'albums.csv')
In [4]:
albums = pd.read_csv(falbums, index_col='ID', sep=';')
In [5]:
print(albums.shape[0])
print('#albums:', albums.index.unique().shape[0])
albums.head()
Out[5]:
In [6]:
fartists = os.path.join(data_dir, 'persons.csv')
In [7]:
artists = pd.read_csv(fartists, index_col='ID', sep=';')
In [8]:
print(artists.shape[0])
print('#artists:', artists.index.unique().shape[0])
artists.head()
Out[8]:
In [9]:
fusers = os.path.join(data_dir, 'users.csv')
In [10]:
users = pd.read_csv(fusers, index_col='ID', sep=';')
In [11]:
print(users.shape[0])
print('#users:', users.index.unique().shape[0])
users.head()
Out[11]:
In [12]:
ftags = os.path.join(data_dir, 'tags.csv')
In [13]:
tags = pd.read_csv(ftags, index_col='ID', sep=';')
In [14]:
print(tags.shape[0])
print('#tags:', tags.index.unique().shape[0])
tags.head()
Out[14]:
In [15]:
ftracks = os.path.join(data_dir, 'tracks.csv')
In [16]:
tracks = pd.read_csv(ftracks, sep=';', keep_default_na=False)
In [17]:
tracks.set_index('ID', inplace=True)
In [18]:
print(tracks.shape[0])
print('#tracks:', tracks.index.unique().shape[0])
tracks.head()
Out[18]:
In [19]:
fplaylist = os.path.join(data_dir, 'playlist.csv')
In [20]:
playlist = pd.read_csv(fplaylist, index_col='ID', sep=';')
In [21]:
playlist.head()
Out[21]:
Filter out playlists that have no track data.
In [22]:
#playlist[playlist['TracksID'].isin([np.nan])].head()
playlist[playlist['TracksID'].isnull()].head()
Out[22]:
In [23]:
playlist[playlist['TracksID'].notnull()].shape
Out[23]:
In [24]:
playlist = playlist[playlist['TracksID'].notnull()]
print(playlist.shape[0])
print('#playlist:', playlist.index.unique().shape[0])
Histogram of playlist length (i.e., the number of tracks/songs).
In [25]:
ax = plt.subplot(111)
playlist['#Tracks'].hist(ax=ax)
ax.set_xlabel('playlist length')
ax.set_ylabel('#playlists')
ax.set_yscale('log')
In [26]:
playlist['#Tracks'].describe()
Out[26]:
In [27]:
playlist['#Tracks'].median()
Out[27]:
The number of users that created playlists.
In [28]:
nusers = playlist['UserID'].unique().shape[0]
nusers
Out[28]:
Average number of playlists for each user.
In [29]:
playlist.shape[0] / nusers
Out[29]:
Assume a query takes one of two forms:
query = (start_song, length)
query = (start_song, length, user)
In [30]:
query_dict1 = dict()
query_dict2 = dict()
In [31]:
for ix in playlist.index:
uid = playlist.loc[ix, 'UserID']
tracks = [int(x) for x in str(playlist.loc[ix, 'TracksID']).split(',')]
ntracks = len(tracks)
q1 = (tracks[0], ntracks)
q2 = (tracks[0], ntracks, uid)
try:
query_dict1[q1].append(ix)
except KeyError:
query_dict1[q1] = [ix]
try:
query_dict2[q2].append(ix)
except KeyError:
query_dict2[q2] = [ix]
Queries with multiple ground truths.
In [32]:
queries1 = sorted([q1 for q1 in query_dict1 if len(query_dict1[q1]) > 1])
In [33]:
print('query = (start_song, length)')
print('#Queries with multiple ground truth: %d, ratio: %.2f%%' % (len(queries1), 100*len(queries1) / len(query_dict1)))
In [34]:
queries2 = sorted([q2 for q2 in query_dict2 if len(query_dict2[q2]) > 1])
In [35]:
print('query = (start_song, length, user)')
print('#Queries with multiple ground truth: %d, ratio: %.2f%%' % (len(queries2), 100*len(queries2) / len(query_dict2)))
In [36]:
fpref = os.path.join(data_dir, 'love.csv')
In [37]:
pref = pd.read_csv(fpref, sep=';')
In [38]:
print('#likes:', pref.shape[0])
pref.head()
Out[38]: