Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.
All the numbers and figures that appear in the paper, and much more.
In [ ]:
%matplotlib inline
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import utils
# Notebook-wide plotting defaults: seaborn "notebook" context with fonts
# scaled by 1.5, and a wide default matplotlib figure size.
sns.set_context("notebook", font_scale=1.5)
plt.rcParams['figure.figsize'] = (17, 5)
In [ ]:
# Load the four metadata tables through the project helper `utils.load`.
tracks = utils.load('data/fma_metadata/tracks.csv')
genres = utils.load('data/fma_metadata/genres.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')
# Sanity checks: `features` and `tracks` must have exactly the same index,
# and every `echonest` index entry must also appear in the `tracks` index.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()
# Cell output: the shapes of the four tables.
tracks.shape, genres.shape, features.shape, echonest.shape
In [ ]:
# Headline counts: rows in `tracks`, distinct artist and album ids, and the
# number of genres that have at least one track.
n_distinct_artists = len(tracks['artist', 'id'].unique())
n_distinct_albums = len(tracks['album', 'id'].unique())
n_used_genres = sum(genres['#tracks'] > 0)
print('{} tracks, {} artists, {} albums, {} genres'.format(
    len(tracks), n_distinct_artists, n_distinct_albums, n_used_genres))

# The total is divided by 3600 and 24 to be reported in days, i.e. the
# duration column is treated as seconds.
track_durations = tracks['track', 'duration']
mean_duration = track_durations.mean()
total_duration_days = sum(track_durations) / 3600 / 24
print('track duration: {:.0f} days total, {:.0f} seconds average'.format(
    total_duration_days, mean_duration))
In [ ]:
# Size of an average track expressed in audio samples.
# NOTE(review): 44000 * 2 presumably stands for a ~44 kHz sampling rate and
# two channels — confirm.
dimensionality = mean_duration * 44000 * 2
total_audio_samples = dimensionality * len(tracks)
print('sample dimensionality: {:.1e}'.format(dimensionality))
print('total size, i.e. number of audio samples: {:.1e}'.format(total_audio_samples))
In [ ]:
# For every subset value, count the tracks whose subset compares `<=` to it
# and convert that count to days, counting 30 seconds per track.
subset_column = tracks['set', 'subset']
for subset in subset_column.unique():
    indicator = subset_column <= subset
    n_selected = sum(indicator)
    days = n_selected * 30 / 3600 / 24
    print('{:6} {:6} tracks {:.1f} days'.format(subset, n_selected, days))
In [ ]:
# Dataset growth over time, saved to figures/growth.pdf.
# The gap between the largest track id and the number of rows is reported as
# the number of deleted tracks.
largest_track_id = tracks.index.max()
print('{} deleted tracks (largest track_id is {})'.format(
    largest_track_id - len(tracks), largest_track_id))
print('First track: {}'.format(tracks['track', 'date_created'].min()))

# One row per track, indexed by creation date. The single data column holds
# the track ids and is read back below as 'track_id'
# (assumes `tracks.index` is named 'track_id' — TODO confirm).
d = pd.DataFrame(tracks.index, index=tracks['track', 'date_created'].values)
d['indicator'] = 1

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

# Left axis: track id and cumulative number of rows.
d['track_id'].plot(ax=ax1)
d['indicator'].cumsum().plot(ax=ax1)
ax1.set_ylabel('#tracks')
ax1.set_ylim(0, 160000)

# Right axis: number of rows per 2-month bin.
(d['indicator'] * -100).plot(ax=ax2, style='r')  # needed for no apparent reason
color = sns.color_palette('deep', 3)[2]
d['indicator'].resample('2M').sum().fillna(0).plot(ax=ax2, style='--', color=color)
ax2.set_ylabel('#tracks added')
ax2.set_ylim(0, 4000)
ax2.grid(False)

# The first line on ax2 is the helper series plotted at -100 (outside the
# visible range), so the legend uses ax2's second line.
lns = ax1.get_lines() + [ax2.get_lines()[1]]
ax1.legend(lns, ['largest track id', '#tracks still present', '#tracks added per 2 months'], loc='lower right')
plt.savefig('figures/growth.pdf')
In [ ]:
# Names of the splits and subsets (also used by later cells).
SPLITS = ['training', 'validation', 'test']
SUBSETS = ['small', 'medium', 'large']

# For each subset: number of tracks per split, and the ratio of the training
# count to the validation and test counts.
print('subset #train #val #test val_ratio test_ratio')
for subset in SUBSETS:
    in_subset = tracks['set', 'subset'] <= subset
    counts = [int(((tracks['set', 'split'] == split) & in_subset).sum())
              for split in SPLITS]
    # Divide by an array: `int / list` is not defined and raises TypeError.
    ratios = counts[0] / np.array(counts[1:])
    print('{:8s} {:7d} {:7d} {:7d} {:8.2f} {:9.2f}'.format(subset, *counts, *ratios))
In [ ]:
# For the small and medium subsets, display one row per top genre with the
# number of tracks in each split and the training/validation and
# training/test ratios.
for subset_name in ['small', 'medium']:
    in_subset = tracks['set', 'subset'] <= subset_name
    present_genres = tracks.loc[in_subset, ('track', 'genre_top')].unique()
    d = genres.reset_index().set_index('title')
    d = d.loc[present_genres]
    for split in SPLITS:
        in_split = tracks['set', 'split'] == split
        selected = tracks.loc[in_subset & in_split, ('track', 'genre_top')]
        d['#' + split] = selected.value_counts()
    d['val_ratio'] = d['#training'] / d['#validation']
    d['test_ratio'] = d['#training'] / d['#test']
    ipd.display(d.sort_values('#training', ascending=False))
In [ ]:
# Number of tracks per split for every genre id, where a track counts for a
# genre when that id is contained in its `genres_all` entry.
d = pd.DataFrame(index=genres.index, columns=SPLITS)
for genre in genres.index:
    has_genre = tracks['track', 'genres_all'].map(lambda ids: genre in ids)
    split_counts = tracks.loc[has_genre, ('set', 'split')].value_counts()
    d.loc[genre] = split_counts
d['val_ratio'] = d['training'] / d['validation']
d['test_ratio'] = d['training'] / d['test']
d = d.sort_values('training', ascending=False)
# Show both ends of the ranking.
for excerpt in (d.head(10), d.tail(10)):
    ipd.display(excerpt)
In [ ]:
def isnull(column, df=tracks):
    """Return a boolean Series marking the rows where `column` holds no data.

    Parameters
    ----------
    column : tuple
        Two-level column key; `column[1]` selects the rule that is applied.
    df : DataFrame, optional
        Table to inspect (defaults to `tracks`).

    Rules
    -----
    * 'tags', 'genres', 'genres_all': missing when the entry has length 0.
    * integer columns: missing when the value is <= 0.
    * everything else: missing according to `Series.isnull()`.
    """
    if column[1] in ['tags', 'genres', 'genres_all']:
        return df[column].apply(lambda x: len(x) == 0)
    # `np.int` no longer exists in recent NumPy releases; test the dtype with
    # the pandas helper instead, which also covers every integer width.
    if pd.api.types.is_integer_dtype(df.dtypes[column]):
        return df[column] <= 0
    return df[column].isnull()
def count(series):
    """Count the non-missing entries of the column named by `series.name`.

    Only `series.name` (a two-level column key) is used. Track-level columns
    are counted over all rows of `tracks`; for any other first level the rows
    are first de-duplicated on that level's 'id' column.

    Returns
    -------
    tuple
        `(n, p)`: the number of non-missing entries and the percentage they
        represent among the rows considered.
    """
    column = series.name
    level0 = column[0]
    if level0 == 'track':
        df = tracks
    else:
        df = tracks.drop_duplicates((level0, 'id'))
    present = ~isnull(column, df)
    n = present.sum()
    p = n / len(df) * 100
    return n, p
# Columns / metadata usage across dataset.
# One row per `tracks` column (the 'set' columns excluded): `n` is the number
# of non-missing entries and `p` the corresponding percentage.
d = pd.DataFrame(index=tracks.columns.drop('set'), columns=['n', 'p'])
# Build the table row by row: `DataFrame.apply(..., axis=1)` with a function
# returning a tuple yields a Series of tuples on recent pandas versions
# instead of filling the 'n' and 'p' columns.
d = pd.DataFrame([count(row) for _, row in d.iterrows()],
                 index=d.index, columns=d.columns)
# Builtin `int` replaces the removed `np.int` alias.
d['n'] = d['n'].astype(int)
d
In [ ]:
# Excerpt as example in the paper.
columns = [
    ('track', 'title'),
    ('track', 'genres_all'),
    ('track', 'genre_top'),
    ('track', 'duration'),
    ('track', 'listens'),
    ('album', 'title'),
    ('album', 'listens'),
    ('album', 'tags'),
    ('artist', 'name'),
    ('artist', 'location'),
]
# Keep only the tracks for which every selected column holds data.
non_null = pd.Series(True, index=tracks.index)
for column in columns:
    non_null &= ~isnull(column)
# Fixed seed so that the same eight track ids are drawn on every run.
rng = np.random.RandomState(42)
tids = rng.permutation(tracks.index[non_null])[:8]
tracks.loc[tids, columns].head()
# Disabled LaTeX export of the excerpt:
#tracks.loc[tids, columns].to_latex('figures/tracks.tex', formatters={
#    ('artist', 'longitude'): '{:,.1f}'.format,
#    ('artist', 'latitude'): '{:,.1f}'.format,
#})
In [ ]:
# Ten most frequent values of the track `license` column.
tracks['track', 'license'].value_counts().head(10)
In [ ]:
# Ten most frequent values of the track `language_code` column.
tracks['track', 'language_code'].value_counts().head(10)
In [ ]:
# Histogram of track durations below 800 (x-axis labelled in seconds),
# saved to figures/duration_distribution.pdf.
durations = tracks['track', 'duration']
below_limit = durations.values < 800
plt.figure(figsize=(10, 4))  # Poster: (7, 3)
hist_ax = sns.distplot(durations[below_limit], kde=False, rug=False,
                       color='k', hist_kws={'alpha': 0.4})
hist_ax.set_xlabel('duration [seconds]')
hist_ax.set_ylabel('#tracks')
hist_ax.set_xlim(0, 800)  # Poster: 500
plt.tight_layout()
plt.savefig('figures/duration_distribution.pdf')
# Cell output: summary statistics over all durations (not only the plotted ones).
durations.describe()
In [ ]:
# Uncommon bit rates are VBR encodings.
bit_rates = tracks['track', 'bit_rate']
most_common = bit_rates.value_counts().head(5).index.tolist()
print('Common bit rates: {}'.format(most_common))
# The mean is divided by 1000 to be reported in kbit/s.
print('Average bit rate: {:.0f} kbit/s'.format(bit_rates.mean()/1000))
hist_ax = sns.distplot(bit_rates, kde=False, rug=False)
hist_ax.set_xlabel('bit rate')
hist_ax.set_ylabel('#tracks');
In [ ]:
# Tags.
# Number of tags per track, per distinct album and per distinct artist.
labels = ['track', 'album', 'artist']
unique_albums = tracks.drop_duplicates(('album', 'id'))
unique_artists = tracks.drop_duplicates(('artist', 'id'))
tag_counts = [
    tracks['track', 'tags'].apply(len),
    unique_albums['album', 'tags'].apply(len),
    # NOTE(review): one tag is discounted per artist, presumably because one
    # artist tag is often the artist name — confirm.
    unique_artists['artist', 'tags'].apply(len) - 1,
]
for label, n_tags in zip(labels, tag_counts):
    # The minimum is clipped at 0 because of the subtraction above.
    print('{}: from {} to {} tags'.format(label, max(n_tags.min(), 0), n_tags.max()))

MAX = 13  # Poster: 11
fig, ax1 = plt.subplots(figsize=(10, 4))  # Poster: (7, 3)
ax2 = ax1.twinx()
# Tracks go on the left axis, albums and artists on the right one; the bin
# offsets place the three narrow bars side by side.
placements = [(ax1, 0.25, 'C0'), (ax2, 0.50, 'C1'), (ax2, 0.75, 'C2')]
for (axis, offset, bar_color), label, n_tags in zip(placements, labels, tag_counts):
    axis.hist(n_tags, bins=np.arange(MAX)+offset, rwidth=0.2, color=bar_color, label=label)
ax1.set_xlabel('#tags')
ax1.set_ylabel('#tracks')
ax2.set_ylabel('#artists / #albums')
ax1.set_xlim(0.5, MAX-0.5)
ax1.set_xticks(range(1, MAX))
ax1.set_ylim(0, 5000)
ax2.set_ylim(0, 500)
ax1.legend(loc='upper center')
ax2.legend(loc='upper right')
ax2.grid(False)
fig.tight_layout()
fig.savefig('figures/tag_distribution.pdf')
In [ ]:
# One artist tag is often the artist name.
# Preview the name and tags of the first distinct artists that have tags.
col = 'artist'
unique_artists = tracks.drop_duplicates((col, 'id'))
has_tags = unique_artists[col, 'tags'].apply(len) > 0
preview_columns = [('artist', 'name'), (col, 'tags')]
unique_artists.loc[has_tags, preview_columns].head()
In [ ]:
# Listens, favorites, comments.
def plot(col0, col1, maxval, subplot=None):
    """Draw a histogram of `tracks[col0][col1]` restricted to values < `maxval`.

    Parameters
    ----------
    col0 : str
        'track' uses every row; 'artist' and 'album' use rows de-duplicated
        on the 'id' column of that group.
    col1 : str
        Column of the selected group to plot.
    maxval : number
        Exclusive upper bound on plotted values, also the upper x-limit.
    subplot : optional
        When truthy, passed to `plt.subplot` before drawing.

    Raises
    ------
    ValueError
        If `col0` is not 'track', 'artist' or 'album'.
    """
    if col0 == 'track':
        d = tracks['track']
    elif col0 in ['artist', 'album']:
        d = tracks[col0].drop_duplicates('id')
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "col0 must be 'track', 'artist' or 'album', got {!r}".format(col0))
    if subplot:
        plt.subplot(subplot)
    d = d[col1]
    p = sns.distplot(d[d.values < maxval], kde=False, color='k', hist_kws=dict(alpha=0.4))
    p.set_xlim(-1, maxval)
    p.set_xlabel('#' + col1)
    p.set_ylabel('#' + col0 + 's')
# One figure per entity type; each entry is (column, upper bound, subplot code).
plt.figure(figsize=(17, 10))
for col1, maxval, position in [('listens', 10e3, 221),
                               ('interest', 10e3, 222),
                               ('favorites', 100, 223),
                               ('comments', 20, 224)]:
    plot('track', col1, maxval, position)

plt.figure(figsize=(17, 10))
for col1, maxval, position in [('listens', 100e3, 221),
                               ('favorites', 100, 223),
                               ('comments', 20, 224)]:
    plot('album', col1, maxval, position)

plt.figure(figsize=(17, 5))
for col1, maxval, position in [('favorites', 100, 121),
                               ('comments', 20, 122)]:
    plot('artist', col1, maxval, position)
In [ ]:
# Same as above, formatted for the paper.
# Uses the three-argument histogram helper `plot(col0, col1, maxval)`.
plt.figure(figsize=(10, 4)) # Poster: (7, 3)
plot('album', 'listens', 40e3) # Poster 20e3
plt.tight_layout()
plt.savefig('figures/listens_distribution.pdf')
# Cell output: the largest value of the album `listens` column.
tracks['album', 'listens'].max()
In [ ]:
# Most listened albums.
# One row per album id (first row of each group), ordered by decreasing
# `listens`; the ten first rows are shown.
tracks['album'].groupby('id').first().sort_values('listens', ascending=False).head(10)
In [ ]:
def plot(col0, col1):
    """Plot the yearly number of entries of the date column `tracks[col0][col1]`.

    NOTE: this two-argument helper has the same name as the three-argument
    histogram helper defined earlier in this file and replaces it once this
    definition runs.

    Parameters
    ----------
    col0 : str
        'track' uses every row; 'artist' and 'album' use rows de-duplicated
        on the 'id' column of that group.
    col1 : str
        Column whose values become the index that is resampled with rule 'A'.

    Raises
    ------
    ValueError
        If `col0` is not 'track', 'artist' or 'album'.
    """
    if col0 == 'track':
        d = tracks['track']
    elif col0 in ['artist', 'album']:
        d = tracks[col0].drop_duplicates('id')
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "col0 must be 'track', 'artist' or 'album', got {!r}".format(col0))
    # A constant 1 per entry, indexed by date: summing per bin counts entries.
    d = pd.Series(1, index=d[col1])
    d.resample('A').sum().fillna(0).plot()
# Three figures; each inner list holds the (group, date column) curves that
# are drawn together on one figure.
date_panels = [
    [('track', 'date_recorded'), ('album', 'date_released')],
    [('artist', 'active_year_begin'), ('artist', 'active_year_end')],
    [('track', 'date_created'), ('album', 'date_created'), ('artist', 'date_created')],
]
for panel in date_panels:
    plt.figure()
    for col0, col1 in panel:
        plot(col0, col1)
In [ ]:
# Same as above, formatted for the paper.
# Yearly number of distinct albums by release date, plotted between the
# timestamps parsed from 1990 and 2017.
plt.figure(figsize=(5, 4))
unique_albums = tracks['album'].drop_duplicates('id')
d = pd.Series(1, index=unique_albums['date_released']).resample('A').sum().fillna(0)
range_start = pd.to_datetime(1990, format='%Y')
range_end = pd.to_datetime(2017, format='%Y')
in_range = (d.index >= range_start) & (d.index <= range_end)
d[in_range].plot(color='k')
plt.xlabel('release year')
plt.ylabel('#albums')
plt.tight_layout()
plt.savefig('figures/album_release_year.pdf')
# Cell output: first and last year present in the resampled index.
d.index.min().year, d.index.max().year
In [ ]:
# Number of tracks per artist id and per album id: display the five largest
# counts, then draw a histogram of the counts below 50.
for effect in ['artist', 'album']:
    tracks_per_id = tracks[effect, 'id'].value_counts()
    ipd.display(tracks_per_id.head(5))
    sizes = tracks_per_id.values
    within_range = (sizes < 50) & (sizes >= 0)
    hist_ax = sns.distplot(tracks_per_id[within_range], kde=False)
    hist_ax.set_xlabel('#tracks per ' + effect)
    hist_ax.set_ylabel('#' + effect + 's')
In [ ]:
# Number of distinct artist ids per top-level genre (genres whose `parent`
# is 0), shown as a bar chart in decreasing order.
top_level_titles = genres.loc[genres['parent'] == 0, 'title'].values
# Build the Series directly from the computed values: an empty Series created
# without a dtype has a default dtype that differs between pandas versions.
counts = pd.Series(
    [len(tracks.loc[tracks['track', 'genre_top'] == title, ('artist', 'id')].unique())
     for title in top_level_titles],
    index=top_level_titles,
    name='#artists',
)
counts.sort_values(ascending=False).plot.bar()
plt.ylabel('#artists');
In [ ]:
# The set of `genre_top` values used by tracks must equal the set of titles
# of the genres referenced by the `top_level` column.
# Missing values are dropped on the Series before `unique()`, which works
# whatever the column dtype is.
a = set(tracks['track', 'genre_top'].dropna().unique())
b = set(genres.loc[genres['top_level'].unique(), 'title'].values)
assert a == b
print('{} top-level genres'.format(len(a)))
# Cell output: genres whose `parent` is 0, by decreasing number of tracks.
genres[genres['parent'] == 0].sort_values('#tracks', ascending=False)
Number of genres per track:
* `genres`: they have introduced a limit of 3 genres per track early on.
* `genres_all`: more genres per track as all coarser genres in the hierarchy are included. E.g. an Indie-Rock song is counted as a Rock song too.
In [ ]:
# Genres per track.
# Histogram of the number of entries in `genres` and `genres_all` per track,
# saved to figures/genres_per_track.pdf.
labels = ['genres', 'genres_all'] #, 'genres_top']
d = [tracks['track', label].map(len) for label in labels]
# Legend labels carry the maximum on a second line.
labels = ['{}\nmax: {}'.format(label, d1.max()) for label, d1 in zip(labels, d)]
for l, d1 in zip(labels, d):
    print('{} per track: from {} to {} tags'.format(l, d1.min(), d1.max()))
print('#tracks without genre: {}'.format((tracks['track', 'genres'].map(len) == 0).sum()))
MAX = 9
fig, ax = plt.subplots(figsize=(5, 4))
# Bins are centred on the integers 0 .. MAX-2.
ax.hist(d, bins=np.arange(MAX)-0.5, label=labels)
ax.set_xlabel('#genres per track')
ax.set_ylabel('#tracks')
ax.set_xlim(-0.5, MAX-1.5)
ax.set_xticks(range(MAX-1))
# Format y ticks as thousands ('0', '10k', '20k', ...). A formatter keeps the
# labels correct wherever matplotlib places the ticks, unlike a fixed list of
# labels applied to automatically chosen tick positions.
ax.yaxis.set_major_formatter(plt.FuncFormatter(
    lambda value, pos: '0' if value == 0 else '{:g}k'.format(value / 1000)))
ax.legend(loc='upper right')
fig.tight_layout()
fig.savefig('figures/genres_per_track.pdf')
In [ ]:
# Number of tracks per genre (full).
# Bar chart of the genres with more than 2000 tracks, by decreasing count,
# saved to figures/genre_distribution.pdf.
d = genres[genres['#tracks'] > 2000].sort_values('#tracks', ascending=False) # Poster: 5000
plt.figure(figsize=(10, 4)) # Poster: (7, 4)
# `x` and `y` are passed by keyword: recent seaborn releases no longer accept
# them positionally.
p = sns.barplot(x='title', y='#tracks', data=d, color='k', alpha=0.4)
p.set_xlabel('')
p.set_ylabel('#tracks')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('figures/genre_distribution.pdf')
# Cell output: smallest non-zero and largest number of tracks per genre.
genres.loc[genres['#tracks'] > 0, '#tracks'].min(), genres['#tracks'].max()
In [ ]:
# Number of tracks per top-level genre (medium).
in_medium = tracks['set', 'subset'] <= 'medium'
d = tracks[in_medium]['track', 'genre_top'].value_counts()
plt.figure(figsize=(10, 4))  # Poster: (7, 4)
d.plot.bar(color='k', alpha=0.4)
plt.xlabel('')
plt.ylabel('#tracks')
plt.tight_layout()
plt.savefig('figures/genre_top_distribution.pdf')
# Cell output: the counts that were plotted.
d
Todo:
In [ ]:
# Wrap the genres table in the project helper and render a tree inline as PNG.
# NOTE(review): the meaning of the arguments `[25, 31]` and `1` is defined by
# `utils.Genres.create_tree`, which is not visible here — confirm.
g = utils.Genres(genres)
graph = g.create_tree([25, 31], 1)
ipd.Image(graph.create_png())
In [ ]:
# Tree built from the single argument 14, written to the hierarchy figure.
graph = g.create_tree(14)
graph.write_pdf('figures/genre_hierarchy.pdf');
# Tree built from all roots reported by the helper.
roots = g.find_roots()
print('{} roots'.format(len(roots)))
graph = g.create_tree(roots)
# NOTE(review): this writes to the same path as above, so the first PDF is
# overwritten — confirm that this is intended.
graph.write_pdf('figures/genre_hierarchy.pdf');
In [ ]:
# Binary track-by-genre indicator matrix built from the `genres` entries.
enc = MultiLabelBinarizer()
genres_indicator = enc.fit_transform(tracks['track', 'genres'])
# Titles of the encoded genre ids, in the encoder's column order.
genres_names = genres.loc[enc.classes_, 'title'].values
# Entry (i, j) is the number of tracks tagged with both genre i and genre j.
cross_correlation = genres_indicator.T @ genres_indicator
In [ ]:
# Heat map of the log co-occurrence counts between genres.
# The diagonal is zeroed in place so that it does not dominate the colour scale.
np.fill_diagonal(cross_correlation, 0)
plt.figure(figsize=(28, 28))
# Zero counts map to -inf; silence the expected divide-by-zero warning.
with np.errstate(divide='ignore'):
    log_counts = np.log(cross_correlation)
plt.imshow(log_counts)
plt.yticks(range(len(genres_names)), genres_names);
plt.xticks(range(len(genres_names)), genres_names, rotation=90);
In [ ]:
# Print the N largest co-occurrence counts with the two genre names.
# Only the strict lower triangle is kept so that each pair appears once.
cross_correlation = np.tril(cross_correlation, k=-1)
sort = np.argsort(cross_correlation.flatten())
N = 20
# Reverse the ascending order and take exactly N entries
# (the slice `sort[:-N:-1]` would only return N - 1 of them).
indices = np.unravel_index(sort[::-1][:N], cross_correlation.shape)
for i, j in zip(*indices):
    print('{}: {} | {}'.format(cross_correlation[i, j], genres_names[i], genres_names[j]))
Todo: understand the features by listening to segments that exhibit them, e.g. http://musicinformationretrieval.com/feature_sonification.html.
In [ ]:
# First five rows of the features table, shown with two decimals.
features.head(5).style.format('{:.2f}')
In [ ]:
# Pairwise scatter plots of the ('mfcc', 'mean') and ('mfcc', 'std') feature
# columns whose third-level label lies in the slice '01' to '03'.
sns.pairplot(features.loc[:, ('mfcc', 'mean', slice('01','03'))]);
sns.pairplot(features.loc[:, ('mfcc', 'std', slice('01','03'))]);
In [ ]:
# Number of rows in the echonest table.
print('Echonest features available for {} tracks.'.format(len(echonest)))