AcousticBrainz AMPLab projects


In [0]:

Tasks and data taken from https://www.dtic.upf.edu/~aporter/amplab/.


In [1]:
from __future__ import print_function
import pandas  # the ultimate weapon
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Preparing the data


In [2]:
import os
DATA_DIR = os.path.join('data', 'amplab')

AcousticBrainz


In [3]:
# Data from AcousticBrainz
ab_data = pandas.read_csv(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-selected-features.csv'), index_col=0)

print('Columns: %s' % ab_data.columns.values)


Columns: ['artist' 'artistid' 'release' 'releaseid' 'recording' 'recordingid'
 'lossless' 'codec' 'length' 'bit_rate' 'dissonance_mean'
 'dissonance_median' 'dissonance_var' 'hfc_mean' 'hfc_median' 'hfc_var'
 'spectral_complexity_mean' 'spectral_complexity_median'
 'spectral_complexity_var' 'spectral_rolloff_mean'
 'spectral_rolloff_median' 'spectral_rolloff_var' 'zerocrossingrate_mean'
 'zerocrossingrate_median' 'zerocrossingrate_var' 'average_loudness'
 'dynamic_complexity' 'replay_gain' 'chords_changes_rate' 'key_key'
 'key_scale' 'key_strength' 'tuning_frequency'
 'tuning_equal_tempered_deviation' 'onset_rate' 'bpm' 'danceability'
 'mood_acoustic_acoustic' 'mood_acoustic_not_acoustic'
 'mood_electronic_electronic' 'mood_electronic_not_electronic'
 'mood_aggressive_aggressive' 'mood_aggressive_not_aggressive'
 'mood_happy_happy' 'mood_happy_not_happy' 'mood_party_party'
 'mood_party_not_party' 'mood_sad_sad' 'mood_sad_not_sad'
 'mood_relaxed_relaxed' 'mood_relaxed_not_relaxed'
 'genre_dortmund_alternative' 'genre_dortmund_blues'
 'genre_dortmund_electronic' 'genre_dortmund_folkcountry'
 'genre_dortmund_funksoulrnb' 'genre_dortmund_jazz' 'genre_dortmund_pop'
 'genre_dortmund_raphiphop' 'genre_dortmund_rock' 'genre_electronic'
 'genre_electronic_probability' 'genre_rosamerica_cla'
 'genre_rosamerica_dan' 'genre_rosamerica_hip' 'genre_rosamerica_jaz'
 'genre_rosamerica_pop' 'genre_rosamerica_rhy' 'genre_rosamerica_roc'
 'genre_rosamerica_spe' 'genre_tzanetakis_blu' 'genre_tzanetakis_cla'
 'genre_tzanetakis_cou' 'genre_tzanetakis_dis' 'genre_tzanetakis_hip'
 'genre_tzanetakis_jaz' 'genre_tzanetakis_met' 'genre_tzanetakis_pop'
 'genre_tzanetakis_reg' 'genre_tzanetakis_roc']

Release years


In [4]:
release_original = pandas.read_csv(
    os.path.join(DATA_DIR, 'acousticbrainz-2015-01-original-release-years.csv'),
    index_col=0, header=None, names=['mbid', 'year'])

release_actual = pandas.read_csv(
    os.path.join(DATA_DIR, 'acousticbrainz-2015-01-actual-release-years.csv'),
    index_col=0, header=None, names=['mbid', 'year'])

MusicBrainz entity tag maps


In [5]:
musicbrainz_data = pandas.read_csv(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-musicbrainz-relations-years.csv'),
                                   index_col=0)

print('Columns: %s' % musicbrainz_data.columns.values)


Columns: ['release_id' 'release_year' 'release_group_id' 'work_id' 'disc' 'track']

Metadata genre tags

This file contains values like:

  • Electronic - usual genre tag that can be parsed without any pre-processing
  • Hip-Hop; Underground Hip-Hop; Rap; Underground Rap; Underground - genres separated by semicolons
  • Indie,Instrumental,Experimental - same as the previous but now separated by commas (without spaces between genres)`
  • "Manabi Straight, Karaoke" - comma-separated genres surrounded by quotation marks
  • "Electronic, Hip Hop, Funk / Soul"... etc.

In [6]:
MBID_LENGTH = len('00018ca8-d84c-4c3d-bcab-5f9a4b9e4e92')

genre_tags = {}

import re
def parse_genre_tags(string):
    string = string.translate(None, '"').lower()
    tags = re.split(',|;|/', string)
    tags = [t.strip() for t in tags]
    return filter(None, tags)

for line in open(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-file-genre-tags.csv'), 'r'):
    genre_tags[line[:MBID_LENGTH]] = parse_genre_tags(line[MBID_LENGTH+1:])

In [7]:
print('Examples:')
print(genre_tags['829d2d93-8d48-49c3-a41e-a2f313d9fb5a'])
print(genre_tags['3f843014-13f6-46a0-9105-1fa32d2b4d8f'])
print(genre_tags['6e76306c-4a8f-4701-81f5-753a6c322655'])


Examples:
['love', 'dance', 'pop', 'soundtrack', '80s', 'dirty dancing']
['dark ambient', 'experimental', 'finland', 'industrial', 'ritual']
['classical', 'pop', 'sex', '8 of 10 stars', '80s', 'easy listening']

Metadata mood tags

Contains tags that people have added to their music files under the "mood" heading, similar to the genre tag file.


In [8]:
mood_tags = {}

def parse_mood_tags(string):
    tags = re.split(',|;|/', string.strip().lower())
    return filter(None, tags)

for line in open(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-file-mood-tags.csv'), 'r'):
    mood_tags[line[:MBID_LENGTH]] = parse_mood_tags(line[MBID_LENGTH+1:])

In [9]:
print('Examples:')
print(mood_tags['c2fed0ce-3bb5-48b3-87f9-bc1bbc376c56'])
print(mood_tags['8d66fde0-c289-42a5-8d89-99a319b3a60e'])
print(mood_tags['d7fc72a8-b9f1-4dfe-bcdc-bae9dbcf2654'])


Examples:
['mellow', 'melancholy', 'sad', 'chill', 'intense', 'emotional', 'haunting']
['chill', 'smooth', 'happy', 'mellow', 'feel good', 'crazy']
['mellow', ' smooth', ' chill', ' retro']

Last.fm tags

This file contains variable number of (tag, count) pairs.


In [10]:
lastfm_tags = {}

from itertools import izip
def pairwise(iterable):
    i = iter(iterable)
    return izip(i, i)

for line in open(os.path.join(DATA_DIR, 'acousticbrainz-2015-01-lastfm-tags.csv'), 'r'):
    track = line.strip().split(',')
    tags = []
    for tag, count in pairwise(track[1:]):
        tags.append((tag.lower(), count))
    lastfm_tags[track[0]] = tags

In [11]:
print('Examples:')
print(lastfm_tags['02aee867-c503-4749-a877-5c923a6e0908'])
print(lastfm_tags['00018ca8-d84c-4c3d-bcab-5f9a4b9e4e92'])


Examples:
[('japanese', '100'), ('jpop', '100'), ('j-pop', '50'), ('female vocalists', '50'), ('asian', '50'), ('asian music', '50'), ('asian pop', '50')]
[('video game music', '100'), ('instrumental', '28'), ('soundtrack', '28'), ('classical', '14'), ('game music', '14'), ('orchestral', '14'), ('final fantasy', '14'), ('final fantasy xii', '14'), ('slobeat', '14')]

Problems

Missing metadata

Some tracks in AcousticBrainz data dump are missing metadata:

  • artist
  • artistid
  • release
  • releaseid
  • recording (recordingid is there)

Information about releases (release and releaseid is missing from all rows).


In [12]:
print('releases:', len(ab_data[ab_data.release.notnull()]))
print('releaseids:', len(ab_data[ab_data.releaseid.notnull()]))


releases: 0
releaseids: 0

In [13]:
# Looking for rows where recording name is unset:
ab_data[ab_data.recording.isnull()].loc[:,[
    'artist', 'artistid', 'release', 'releaseid', 'recording']]


Out[13]:
artist artistid release releaseid recording
mbid
40567a08-62bb-4f49-bf6f-6858a58d745e NaN NaN NaN NaN NaN
3c66ffef-cbd6-4b1f-b660-e0ab57eb463b Avril Lavigne NaN NaN NaN NaN
3eec4378-a37e-49bb-93e7-25e9d2c9b669 NaN NaN NaN NaN NaN
470c47aa-88a6-422c-a65d-4b6147280e58 NaN NaN NaN NaN NaN
160452f0-db90-416d-ac78-03feef62f717 NaN NaN NaN NaN NaN
4eb24aaf-95ca-4848-bdc0-bb5bcf98f341 Alicia Keys NaN NaN NaN NaN
6fee7302-b27d-4720-be4a-82dd7cab4da3 NaN NaN NaN NaN NaN
34cde14a-4a1b-4183-8abb-76628055f3a2 NaN NaN NaN NaN NaN
352d9fca-b268-41a6-bd92-213c821f0a8b NaN NaN NaN NaN NaN
907eff31-57d1-4998-93ad-73de66d0f659 NaN NaN NaN NaN NaN
a7340bf0-df10-41c7-891b-924a502c8559 The Selecter 1329faaf-799f-44e2-91f9-87fe52f17afd NaN NaN NaN
ad256b03-3e8c-4777-b691-ee4a5488bd1b NaN usicBrainz Artist Id NaN NaN NaN
6dfe2848-9617-4f0a-a31e-bc78adaf4a01 NaN NaN NaN NaN NaN
485d0a0e-65d2-47da-8171-1ae8fcddb597 NaN NaN NaN NaN NaN
7d924bb6-0b0e-450c-993d-0af172b44248 NaN usicBrainz Artist Id NaN NaN NaN
d27e1f13-8482-4c03-b161-9cc020825a88 NaN NaN NaN NaN NaN
8b1863dc-8a61-4883-b4d2-a195dd385882 NaN usicBrainz Artist Id NaN NaN NaN
beb50e18-7a22-4474-8021-b75eef4783fa NaN NaN NaN NaN NaN
bb91ce0e-8493-469f-bd1a-a314b2682400 Daft Punk usicBrainz Artist Id NaN NaN NaN
91969a43-5c28-4f25-8292-8eabe570171d NaN NaN NaN NaN NaN
1f9e0f8a-1df5-4c49-8fd1-03c3d6dede32 NaN NaN NaN NaN NaN
1a6dd09c-3218-4d13-8ad1-668a51b3f0df NaN NaN NaN NaN NaN
a700b5a4-82e6-40d5-8023-58ccd4ed50b3 NaN NaN NaN NaN NaN

Some columns also contain incorrect data (random string in place of UUID). These things are not verified during submission to AcousticBrainz.


In [14]:
# Remove bad data so we don't have to worry about it later
ab_data = ab_data[ab_data.recording.notnull()]
ab_data = ab_data[ab_data.artistid.notnull()]

# Some artistids are not UUIDs, so we remove them
# Not the best way to filter, but it should work for now
ab_data = ab_data[~(ab_data.artistid.str.len() != len('bb91ce0e-8493-469f-bd1a-a314b2682400'))]

Incorrect release years

Some data in data sets with release years is incorrect, so we need to fix this as well.


In [15]:
release_original[release_original.year < 1890] = np.nan
release_actual[release_actual.year < 1890] = np.nan

Duplicates

Recordings


In [16]:
duplicates = ab_data['recordingid'].value_counts()

In [17]:
duplicates_joined = duplicates.to_frame('duplicates').join(ab_data).drop_duplicates('recordingid')

In [18]:
print('Top recordings:')
duplicates_joined.sort('duplicates', ascending=False).loc[:,['artist', 'recording', 'duplicates']].head(10)


Top recordings:
Out[18]:
artist recording duplicates
ee898790-133f-445a-874f-d996abd843af The Beatles I'm Down 125
b2b50082-0bd1-4702-9a95-3499a4e5781b The Beatles Slow Down 107
659b4269-fe81-40e4-86e9-12879c09c9e6 The Beatles Can't Buy Me Love 94
c1d63906-f64a-4cd1-9873-9f3a9f98883c The Beatles A Hard Day's Night 93
b849acd4-0638-49ea-8e40-7391613d4890 The Beatles Something 92
917f2be3-065e-4d1e-8a76-1b50abd1ad95 The Beatles And I Love Her 90
15127932-c879-466e-b0f8-a1c5022d16e7 The Beatles Octopus's Garden 88
00c47ea6-3a10-4a32-b1f1-990ac756c6a0 The Beatles Ticket to Ride 87
63dd7ef6-6d6e-44d5-a4d9-190e49223077 The Beatles Eight Days a Week 86
485bbe7f-d0f7-4ffe-8adb-0f1093dd2dbf The Beatles Come Together 83

In [19]:
plt.hist(duplicates, bins=50, facecolor='green', alpha=0.6)
plt.yscale('symlog')
plt.xlabel('duplicate tracks')
plt.ylabel('number of tracks')
plt.show()


Artists


In [20]:
# Extracting artists
artists = ab_data.loc[:,['artistid', 'artist']].drop_duplicates('artistid').set_index('artistid')

In [21]:
duplicate_artists = ab_data.loc[:,['artistid', 'artist']].groupby('artistid').count()

In [22]:
print('Top artists:')
duplicate_artists.join(artists, lsuffix='_count').sort('artist_count', ascending=False).head(10)


Top artists:
Out[22]:
artist_count artist
artistid
b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d 23922 The Beatles
24f1766e-9635-4d58-a4d4-9413f9f98a4c 10305 Johann Sebastian Bach
691b0e9d-9e57-41cf-932d-a3d21b068e75 8910 Jean Sibelius
53b106e7-0cc6-42cc-ac95-ed8d30a3a98e 7492 John Williams
1f9df192-a621-4f54-8850-2c5373b7eac9 7208 Ludwig van Beethoven
b972f589-fb0e-474e-b64a-803b0364fa75 5942 Wolfgang Amadeus Mozart
197450cd-0124-4164-b723-3c22dd16494d 5889 Frank Sinatra
72c536dc-7137-4477-a521-567eeb840fa8 5738 Bob Dylan & The Band
5a28f8c2-31fb-4047-ae57-c5c326989262 5345 Hawkwind
9ddd7abc-9e1b-471d-8031-583bc6bc8be9 5020 Pyotr Ilyich Tchaikovsky

In [23]:
plt.hist(duplicate_artists['artist'].values, bins=60, facecolor='green', alpha=0.6)
plt.yscale('symlog')
plt.xlabel('duplicate artists')
plt.ylabel('number of artists')
plt.show()


Years


In [25]:
duplicate_years_original = ab_data.loc[:,['recordingid']].join(release_original).groupby('year').count()
duplicate_years_original.rename(columns={'recordingid': 'count'}, inplace=True)

print('Top years (original):')
duplicate_years_original.sort('count', ascending=False).head(5)


Top years (original):
Out[25]:
count
year
2007 63220
2005 61249
2008 59693
2006 56909
2003 56775

In [26]:
duplicate_years_actual = ab_data.loc[:,['recordingid']].join(release_actual).groupby('year').count()
duplicate_years_actual.rename(columns={'recordingid': 'count'}, inplace=True)

print('Top years (actual):')
duplicate_years_actual.sort('count', ascending=False).head(5)


Top years (actual):
Out[26]:
count
year
2007 72021
2005 69358
2008 69338
2006 66521
2003 62997

In [27]:
plt.plot(duplicate_years_original.index, duplicate_years_original, label='original')
plt.plot(duplicate_years_actual.index, duplicate_years_actual, label='actual')
plt.ylabel('duplicates')
plt.xlabel('year')
plt.legend(loc='upper left')
plt.show()


The Loudness wars

Average loudness


In [ ]:
avg_loudness = ab_data.loc[:,['average_loudness']]

original_loudness = avg_loudness.join(release_original).groupby('year').aggregate(np.mean)
actual_loudness = avg_loudness.join(release_actual).groupby('year').aggregate(np.mean)

In [ ]:
plt.plot(original_loudness.index, original_loudness, label='original')
plt.plot(actual_loudness.index, actual_loudness, label='actual')
plt.axis(xmin=min([min(original_loudness.index), min(actual_loudness.index)]), 
         xmax=max([max(original_loudness.index), max(actual_loudness.index)]))
plt.ylabel('average_loudness')
plt.xlabel('year')
plt.legend(loc='lower right')
plt.show()

Dynamic complexity


In [ ]:
# Using dynamic_complexity
dyn_complexity = ab_data.loc[:,['dynamic_complexity']]

original_loudness = dyn_complexity.join(release_original).groupby('year').aggregate(np.mean)
actual_loudness = dyn_complexity.join(release_actual).groupby('year').aggregate(np.mean)

In [ ]:
plt.plot(original_loudness.index, original_loudness, label='original')
plt.plot(actual_loudness.index, actual_loudness, label='actual')
plt.axis(xmin=min([min(original_loudness.index), min(actual_loudness.index)]), 
         xmax=max([max(original_loudness.index), max(actual_loudness.index)]))
plt.ylabel('dynamic_complexity')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.show()

Replay gain


In [ ]:
replay_gain = ab_data.loc[:,['replay_gain']]

original_loudness = replay_gain.join(release_original).groupby('year').aggregate(np.mean)
actual_loudness = replay_gain.join(release_actual).groupby('year').aggregate(np.mean)

In [ ]:
plt.plot(original_loudness.index, original_loudness, label='original')
plt.plot(actual_loudness.index, actual_loudness, label='actual')
plt.axis(xmin=min([min(original_loudness.index), min(actual_loudness.index)]), 
         xmax=max([max(original_loudness.index), max(actual_loudness.index)]))
plt.ylabel('replay_gain')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.show()

In [ ]:
# TODO: Try something else!