In [0]:
Tasks and data taken from https://www.dtic.upf.edu/~aporter/amplab/.
In [1]:
from __future__ import print_function
import pandas # the ultimate weapon
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import os
# Relative path of the folder that holds the CSV files loaded by the cells below.
DATA_DIR = os.path.join('data', 'amplab')
In [3]:
# Selected features exported from AcousticBrainz; the first CSV column becomes the index.
ab_features_path = os.path.join(DATA_DIR, 'acousticbrainz-2015-01-selected-features.csv')
ab_data = pandas.read_csv(ab_features_path, index_col=0)
print('Columns: %s' % ab_data.columns.values)
In [4]:
# Both release-year files are read as header-less (mbid, year) tables indexed by mbid.
year_csv_options = dict(index_col=0, header=None, names=['mbid', 'year'])
release_original = pandas.read_csv(
    os.path.join(DATA_DIR, 'acousticbrainz-2015-01-original-release-years.csv'),
    **year_csv_options)
release_actual = pandas.read_csv(
    os.path.join(DATA_DIR, 'acousticbrainz-2015-01-actual-release-years.csv'),
    **year_csv_options)
In [5]:
# MusicBrainz relations/years table; the first CSV column becomes the index.
musicbrainz_path = os.path.join(DATA_DIR, 'acousticbrainz-2015-01-musicbrainz-relations-years.csv')
musicbrainz_data = pandas.read_csv(musicbrainz_path, index_col=0)
print('Columns: %s' % musicbrainz_data.columns.values)
This file contains values like:
- `Electronic` - a plain genre tag that can be parsed without any pre-processing
- `Hip-Hop; Underground Hip-Hop; Rap; Underground Rap; Underground` - genres separated by semicolons
- `Indie,Instrumental,Experimental` - same as the previous, but separated by commas (without spaces between genres)
- `"Manabi Straight, Karaoke"` - comma-separated genres surrounded by quotation marks
- `"Electronic, Hip Hop, Funk / Soul"`
- ... etc.
In [6]:
# Length of an MBID string (36 characters); the tag-file loaders slice this many
# leading characters off each line to obtain the dictionary key.
MBID_LENGTH = len('00018ca8-d84c-4c3d-bcab-5f9a4b9e4e92')
# Maps MBID -> parsed genre tags.
genre_tags = {}
import re
def parse_genre_tags(string):
    """Split a raw genre field into a list of clean, lower-case tags.

    Double quotes are removed, the text is lower-cased and split on ',',
    ';' and '/'. Surrounding whitespace is stripped from every piece and
    empty pieces are discarded.

    Returns a real list (not a lazy ``filter`` object), so the result can
    be printed and iterated repeatedly on both Python 2 and Python 3.
    """
    # str.replace works for byte and text strings alike; the two-argument
    # translate(None, '"') form only exists for Python 2 byte strings.
    cleaned = string.replace('"', '').lower()
    pieces = (piece.strip() for piece in re.split(',|;|/', cleaned))
    return [piece for piece in pieces if piece]
# Each line is an MBID, one separator character, then the raw genre text.
# The context manager makes sure the file handle is closed after loading.
with open(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-file-genre-tags.csv'), 'r') as genre_file:
    for line in genre_file:
        if not line.strip():
            continue  # a blank line would otherwise create a bogus key
        genre_tags[line[:MBID_LENGTH]] = parse_genre_tags(line[MBID_LENGTH+1:])
In [7]:
print('Examples:')
# Show the parsed genre tags for a few sample recordings.
for example_mbid in ('829d2d93-8d48-49c3-a41e-a2f313d9fb5a',
                     '3f843014-13f6-46a0-9105-1fa32d2b4d8f',
                     '6e76306c-4a8f-4701-81f5-753a6c322655'):
    print(genre_tags[example_mbid])
This file contains tags that people have added to their music files under the "mood" heading; its format is similar to the genre tag file.
In [8]:
# Maps MBID -> parsed mood tags.
mood_tags = {}
def parse_mood_tags(string):
    """Split a raw mood field into a list of clean, lower-case tags.

    The text is lower-cased and split on ',', ';' and '/'. Every piece is
    stripped of surrounding whitespace (matching parse_genre_tags) and empty
    pieces are discarded.

    Returns a real list, so the result prints and iterates the same way on
    Python 2 and Python 3.
    """
    # NOTE(review): unlike parse_genre_tags, double quotes are kept here;
    # confirm the mood file never quotes its values.
    pieces = (piece.strip() for piece in re.split(',|;|/', string.lower()))
    return [piece for piece in pieces if piece]
# Each line is an MBID, one separator character, then the raw mood text.
# The context manager makes sure the file handle is closed after loading.
with open(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-file-mood-tags.csv'), 'r') as mood_file:
    for line in mood_file:
        if not line.strip():
            continue  # a blank line would otherwise create a bogus key
        mood_tags[line[:MBID_LENGTH]] = parse_mood_tags(line[MBID_LENGTH+1:])
In [9]:
print('Examples:')
# Show the parsed mood tags for a few sample recordings.
for example_mbid in ('c2fed0ce-3bb5-48b3-87f9-bc1bbc376c56',
                     '8d66fde0-c289-42a5-8d89-99a319b3a60e',
                     'd7fc72a8-b9f1-4dfe-bcdc-bae9dbcf2654'):
    print(mood_tags[example_mbid])
This file contains a variable number of (tag, count) pairs per track.
In [10]:
# Maps MBID -> list of (tag, count) pairs.
lastfm_tags = {}
try:
    from itertools import izip  # Python 2: lazy zip
except ImportError:
    izip = zip  # Python 3 has no izip; the built-in zip is already lazy

def pairwise(iterable):
    """Return an iterator over consecutive, non-overlapping pairs.

    pairwise([a, b, c, d]) yields (a, b), (c, d). A trailing element without
    a partner is dropped.
    """
    # Both arguments are the *same* iterator, so every output tuple consumes
    # two successive items from the input.
    i = iter(iterable)
    return izip(i, i)
# Each line is "mbid,tag,count,tag,count,...". Counts are kept as the raw
# strings read from the file. The context manager closes the file afterwards.
with open(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-lastfm-tags.csv'), 'r') as lastfm_file:
    for line in lastfm_file:
        if not line.strip():
            continue  # a blank line would otherwise create an empty-string key
        track = line.strip().split(',')
        lastfm_tags[track[0]] = [(tag.lower(), count) for tag, count in pairwise(track[1:])]
In [11]:
print('Examples:')
# Show the (tag, count) pairs for a couple of sample recordings.
for example_mbid in ('02aee867-c503-4749-a877-5c923a6e0908',
                     '00018ca8-d84c-4c3d-bcab-5f9a4b9e4e92'):
    print(lastfm_tags[example_mbid])
Some tracks in AcousticBrainz data dump are missing metadata:
- artist
- artistid
- release
- releaseid
- recording (recordingid is there)

Information about releases (release and releaseid) is missing from all rows.
In [12]:
# Series.count() returns the number of non-null entries in the column.
print('releases:', ab_data['release'].count())
print('releaseids:', ab_data['releaseid'].count())
In [13]:
# Rows whose recording name is unset, restricted to the metadata columns.
metadata_columns = ['artist', 'artistid', 'release', 'releaseid', 'recording']
ab_data.loc[ab_data.recording.isnull(), metadata_columns]
Out[13]:
Some columns also contain incorrect data (random string in place of UUID). These things are not verified during submission to AcousticBrainz.
In [14]:
# Drop bad rows up front so later cells do not have to worry about them.
has_recording = ab_data.recording.notnull()
has_artistid = ab_data.artistid.notnull()
# Some artistids are not UUIDs. Comparing the string length against that of a
# real UUID is a crude filter, but it is good enough for now.
uuid_length = len('bb91ce0e-8493-469f-bd1a-a314b2682400')
artistid_has_uuid_length = ab_data.artistid.str.len() == uuid_length
ab_data = ab_data[has_recording & has_artistid & artistid_has_uuid_length]
Some values in the release-year data sets are incorrect, so we need to fix those as well.
In [15]:
# Blank out rows whose year is earlier than 1890, in both release-year tables.
for release_years in (release_original, release_actual):
    release_years[release_years.year < 1890] = np.nan
In [16]:
# Number of ab_data rows per recordingid value (value_counts orders them by descending count).
duplicates = ab_data['recordingid'].value_counts()
In [17]:
# Attach the ab_data columns to each count and keep one row per recordingid.
# NOTE(review): join() matches on the index, so this presumably relies on
# ab_data being indexed by the same ids found in its recordingid column; confirm.
duplicate_counts = duplicates.to_frame('duplicates')
duplicates_joined = duplicate_counts.join(ab_data).drop_duplicates('recordingid')
In [18]:
print('Top recordings:')
# DataFrame.sort was removed from pandas; sort_values is the supported call.
(duplicates_joined
 .sort_values('duplicates', ascending=False)
 .loc[:, ['artist', 'recording', 'duplicates']]
 .head(10))
Out[18]:
In [19]:
# Histogram of rows-per-recording counts; the symlog y-axis keeps rare, large counts visible.
fig, ax = plt.subplots()
ax.hist(duplicates, bins=50, facecolor='green', alpha=0.6)
ax.set_yscale('symlog')
ax.set_xlabel('duplicate tracks')
ax.set_ylabel('number of tracks')
plt.show()
In [20]:
# Artist lookup table: one row per artistid (first occurrence wins), indexed by artistid.
artist_columns = ab_data[['artistid', 'artist']]
artists = artist_columns.drop_duplicates('artistid').set_index('artistid')
In [21]:
# Per artistid, the number of rows with a non-null artist name (kept in the 'artist' column).
duplicate_artists = ab_data.groupby('artistid')[['artist']].count()
In [22]:
print('Top artists:')
# Both frames have an 'artist' column; lsuffix renames the count column to
# 'artist_count'. DataFrame.sort was removed from pandas, so use sort_values.
(duplicate_artists
 .join(artists, lsuffix='_count')
 .sort_values('artist_count', ascending=False)
 .head(10))
Out[22]:
In [23]:
# Histogram of rows-per-artist counts on a symlog y-axis.
fig, ax = plt.subplots()
ax.hist(duplicate_artists['artist'].values, bins=60, facecolor='green', alpha=0.6)
ax.set_yscale('symlog')
ax.set_xlabel('duplicate artists')
ax.set_ylabel('number of artists')
plt.show()
In [25]:
# Number of ab_data rows per original release year. The index join with
# release_original attaches a 'year' to every row before grouping.
duplicate_years_original = (
    ab_data.loc[:, ['recordingid']]
    .join(release_original)
    .groupby('year')
    .count()
    .rename(columns={'recordingid': 'count'})
)
print('Top years (original):')
# DataFrame.sort was removed from pandas; sort_values is the supported call.
duplicate_years_original.sort_values('count', ascending=False).head(5)
Out[25]:
In [26]:
# Number of ab_data rows per actual release year. The index join with
# release_actual attaches a 'year' to every row before grouping.
duplicate_years_actual = (
    ab_data.loc[:, ['recordingid']]
    .join(release_actual)
    .groupby('year')
    .count()
    .rename(columns={'recordingid': 'count'})
)
print('Top years (actual):')
# DataFrame.sort was removed from pandas; sort_values is the supported call.
duplicate_years_actual.sort_values('count', ascending=False).head(5)
Out[26]:
In [27]:
# Rows per year, for original and actual release years, on shared axes.
fig, ax = plt.subplots()
ax.plot(duplicate_years_original.index, duplicate_years_original, label='original')
ax.plot(duplicate_years_actual.index, duplicate_years_actual, label='actual')
ax.set_ylabel('duplicates')
ax.set_xlabel('year')
ax.legend(loc='upper left')
plt.show()
In [ ]:
# Mean average_loudness per release year, for original and actual release years.
avg_loudness = ab_data.loc[:, ['average_loudness']]
# Use the built-in groupby mean; passing np.mean to aggregate() is deprecated
# in current pandas and yields the same result.
original_loudness = avg_loudness.join(release_original).groupby('year').mean()
actual_loudness = avg_loudness.join(release_actual).groupby('year').mean()
In [ ]:
# Yearly mean of average_loudness; the x-axis is clamped to the years covered by either series.
fig, ax = plt.subplots()
ax.plot(original_loudness.index, original_loudness, label='original')
ax.plot(actual_loudness.index, actual_loudness, label='actual')
first_year = min(min(original_loudness.index), min(actual_loudness.index))
last_year = max(max(original_loudness.index), max(actual_loudness.index))
ax.set_xlim(first_year, last_year)
ax.set_ylabel('average_loudness')
ax.set_xlabel('year')
ax.legend(loc='lower right')
plt.show()
In [ ]:
# Using dynamic_complexity
# NOTE: the *_loudness names are reused from the loudness cell because the
# following plot cell reads them; here they hold yearly dynamic_complexity means.
dyn_complexity = ab_data.loc[:, ['dynamic_complexity']]
# Use the built-in groupby mean; passing np.mean to aggregate() is deprecated
# in current pandas and yields the same result.
original_loudness = dyn_complexity.join(release_original).groupby('year').mean()
actual_loudness = dyn_complexity.join(release_actual).groupby('year').mean()
In [ ]:
# Yearly means labelled as dynamic_complexity; the x-axis is clamped to the years covered by either series.
fig, ax = plt.subplots()
ax.plot(original_loudness.index, original_loudness, label='original')
ax.plot(actual_loudness.index, actual_loudness, label='actual')
first_year = min(min(original_loudness.index), min(actual_loudness.index))
last_year = max(max(original_loudness.index), max(actual_loudness.index))
ax.set_xlim(first_year, last_year)
ax.set_ylabel('dynamic_complexity')
ax.set_xlabel('year')
ax.legend(loc='upper right')
plt.show()
In [ ]:
# Mean replay_gain per release year.
# NOTE: the *_loudness names are reused from the loudness cell because the
# following plot cell reads them; here they hold yearly replay_gain means.
replay_gain = ab_data.loc[:, ['replay_gain']]
# Use the built-in groupby mean; passing np.mean to aggregate() is deprecated
# in current pandas and yields the same result.
original_loudness = replay_gain.join(release_original).groupby('year').mean()
actual_loudness = replay_gain.join(release_actual).groupby('year').mean()
In [ ]:
# Yearly means labelled as replay_gain; the x-axis is clamped to the years covered by either series.
fig, ax = plt.subplots()
ax.plot(original_loudness.index, original_loudness, label='original')
ax.plot(actual_loudness.index, actual_loudness, label='actual')
first_year = min(min(original_loudness.index), min(actual_loudness.index))
last_year = max(max(original_loudness.index), max(actual_loudness.index))
ax.set_xlim(first_year, last_year)
ax.set_ylabel('replay_gain')
ax.set_xlabel('year')
ax.legend(loc='upper right')
plt.show()
In [ ]:
# TODO: Try something else!