In [1]:
%load_ext watermark
In [2]:
%watermark -a 'Sebastian Raschka' -d -v
[More information](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `watermark` magic command extension.
A subset of 10,000 songs in HDF5 format was downloaded from the Million Song Dataset. A feature list of the file contents can be found here.
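Each `.h5` file bundles several groups of song data (e.g., `metadata`, `analysis`, and `musicbrainz`). As a quick sanity check, here is a minimal sketch for peeking into a single file via pandas' `HDFStore`, mirroring the access pattern used below; the file name is just a hypothetical example:

```python
import pandas as pd

# Hypothetical example file; any .h5 file from the subset works
store = pd.HDFStore('TRAXLZU12903D05F94.h5', mode='r')

# The song metadata lives in the /metadata/songs table
print(store.root.metadata.songs.cols.title[0])
print(store.root.metadata.songs.cols.artist_name[0])

store.close()
```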
The following snippet flattens the directory tree that the Million Song subset comes in:
In [3]:
import os

dir_tree = '/Users/sebastian/Desktop/MillionSongSubset/'

# Move every file out of the nested directory tree into the top-level directory
for dir_path, dir_names, file_names in os.walk(dir_tree):
    for file_name in file_names:
        try:
            os.rename(os.path.join(dir_path, file_name),
                      os.path.join(dir_tree, file_name))
        except OSError:
            print("Could not move %s" % os.path.join(dir_path, file_name))
Now, we create a pandas DataFrame with the three feature columns `file`, `artist`, and `title`, where `artist` and `title` are our input for the lyrics search, and the `file` name merely serves identification purposes.
In [16]:
import os
import pandas as pd

def make_artist_table(base):
    # Collect the paths of all HDF5 files in the base directory
    files = [os.path.join(base, fn) for fn in os.listdir(base) if fn.endswith('.h5')]
    data = {'file': [], 'artist': [], 'title': []}
    # Read the artist and title metadata from each HDF5 file
    for f in files:
        store = pd.HDFStore(f)
        title = store.root.metadata.songs.cols.title[0]
        artist = store.root.metadata.songs.cols.artist_name[0]
        data['file'].append(os.path.basename(f))
        data['title'].append(title.decode("utf-8"))
        data['artist'].append(artist.decode("utf-8"))
        store.close()
    # Convert the dictionary to a pandas DataFrame with a fixed column order
    df = pd.DataFrame.from_dict(data, orient='columns')
    df = df[['file', 'artist', 'title']]
    return df
In [17]:
base = '/Users/sebastian/Desktop/MillionSongSubset/'
df = make_artist_table(base)
df.tail()
Out[17]:
First, we add a new column for the lyrics to our DataFrame.
In [20]:
df['lyrics'] = pd.Series('', index=df.index)
df.tail()
Out[20]:
Then, we use the following code to download the song lyrics from LyricWikia based on the artist and title names in the pandas DataFrame.
In [24]:
# Sebastian Raschka, 2014
#
# Script to download lyrics from http://lyrics.wikia.com/

import unicodedata
import urllib.parse

import lxml.html


class Song(object):
    def __init__(self, artist, title):
        self.artist = self.__format_str(artist)
        self.title = self.__format_str(title)
        self.url = None
        self.lyric = None

    def __format_str(self, s):
        # Strip surrounding whitespace, remove accents, and convert to title case
        s = s.strip()
        try:
            s = ''.join(c for c in unicodedata.normalize('NFD', s)
                        if unicodedata.category(c) != 'Mn')
        except TypeError:
            pass
        s = s.title()
        return s

    def __quote(self, s):
        return urllib.parse.quote(s.replace(' ', '_'))

    def __make_url(self):
        artist = self.__quote(self.artist)
        title = self.__quote(self.title)
        artist_title = '%s:%s' % (artist, title)
        self.url = 'http://lyrics.wikia.com/' + artist_title

    def update(self, artist=None, title=None):
        if artist:
            self.artist = self.__format_str(artist)
        if title:
            self.title = self.__format_str(title)

    def lyricwikia(self):
        self.__make_url()
        try:
            doc = lxml.html.parse(self.url)
            lyricbox = doc.getroot().cssselect('.lyricbox')[0]
        except (IOError, IndexError):
            self.lyric = ''
            return self.lyric
        lyrics = []
        for node in lyricbox:
            # <br> tags separate lines; the lyric text follows in the tail
            if node.tag == 'br':
                lyrics.append('\n')
            if node.tail is not None:
                lyrics.append(node.tail)
        self.lyric = ''.join(lyrics).strip()
        return self.lyric
If this script doesn't work for you, you can find some alternative scripts for downloading lyrics from other websites in my datacollect repository.
In [25]:
song = Song(artist='John Mellencamp', title='Jack and Diane')
lyr = song.lyricwikia()
print(lyr)
In [26]:
import pyprind
In [27]:
pbar = pyprind.ProgBar(df.shape[0])

for row_id in df.index:
    song = Song(artist=df.loc[row_id]['artist'], title=df.loc[row_id]['title'])
    lyr = song.lyricwikia()
    df.loc[row_id, 'lyrics'] = lyr
    pbar.update()
In [28]:
print('Downloaded lyrics for %s songs' % sum(df.lyrics != ''))
df.head()
Out[28]:
In [29]:
df.to_csv('/Users/sebastian/Desktop/df_lyr_backup.csv')
If lyrics were not available, this can be due to one of several reasons: for example, the song may not be listed on LyricWikia, the artist or title in the metadata may be spelled differently from the LyricWikia page, or the song may be an instrumental.
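In such cases, it can be worth retrying a song with a corrected spelling before discarding it; the `Song.update` method rewrites the lookup URL accordingly. A minimal sketch (the names below are hypothetical examples):

```python
song = Song(artist='Sinead OConnor', title='Nothing Compares 2 U')
if not song.lyricwikia():
    # Retry with a hypothetical corrected artist spelling
    song.update(artist="Sinead O'Connor")
    lyr = song.lyricwikia()
```

Here, we simply remove all rows with empty lyrics: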
In [30]:
df = df[df.lyrics!='']
Now, we remove all lyrics that are not in English. As a simple heuristic, we consider a song English if more than 50% of its words appear in an English dictionary. This relatively lenient cutoff ratio of 0.5 accounts for the fact that song lyrics typically also contain names and other special words that are not part of a common English dictionary.
In [32]:
import nltk

# Build the English vocabulary set once (requires the NLTK 'words' corpus,
# available via nltk.download('words')); rebuilding it inside the function
# would repeat this expensive step for every song
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

def eng_ratio(text):
    '''Returns the fraction of words in a text that are not found in an English dictionary.'''
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha())
    unusual = text_vocab.difference(english_vocab)
    diff = len(unusual) / len(text_vocab)
    return diff

text = 'This is a test fahrrad'
print(eng_ratio(text))
In [33]:
before = df.shape[0]

for row_id in df.index:
    text = df.loc[row_id]['lyrics']
    diff = eng_ratio(text)
    if diff >= 0.5:
        df = df[df.index != row_id]

after = df.shape[0]
rem = before - after

print('%s songs have been removed.' % rem)
print('%s songs remain in the dataset.' % after)
In [34]:
df.to_csv('/Users/sebastian/Desktop/df_lyr_backup2.csv', index=False)
Now, we copy all songs for which the lyrics exist to a new directory.
In [35]:
import os
import shutil

new_dir = '/Users/sebastian/Desktop/h5_filtered/'
if not os.path.exists(new_dir):
    os.mkdir(new_dir)

h1 = '/Users/sebastian/Desktop/MillionSongSubset/'
filepaths = [os.path.join(h1, f) for f in os.listdir(h1) if f.endswith('.h5')]

# Copy only the files whose lyrics survived the filtering steps
for f in filepaths:
    base = os.path.basename(f)
    if base in df.file.values:
        target = os.path.join(new_dir, base)
        shutil.copyfile(f, target)
In this step, the dataset is reduced to a "reasonable" size for manual labeling: 1,000 songs for the training dataset and 200 songs for the validation dataset; the remaining songs go into an auxiliary directory.
In [38]:
import random

h2 = '/Users/sebastian/Desktop/h5_filtered/'
filepaths2 = [os.path.join(h2, f) for f in os.listdir(h2) if f.endswith('.h5')]
random.shuffle(filepaths2)

train_dir = '../../dataset/training/h5_train/'
valid_dir = '../../dataset/validation/h5_valid/'
aux_dir = '../../dataset/auxiliary/h5_aux/'

# makedirs also creates missing intermediate directories
for d in (train_dir, valid_dir, aux_dir):
    if not os.path.exists(d):
        os.makedirs(d)

for f in filepaths2[:1000]:
    base = os.path.basename(f)
    target = os.path.join(train_dir, base)
    shutil.copyfile(f, target)

for f in filepaths2[1000:1200]:
    base = os.path.basename(f)
    target = os.path.join(valid_dir, base)
    shutil.copyfile(f, target)

for f in filepaths2[1200:]:
    base = os.path.basename(f)
    target = os.path.join(aux_dir, base)
    shutil.copyfile(f, target)
In [88]:
df_train = make_artist_table('../../dataset/training/h5_train')
df_train['lyrics'] = pd.Series('', index=df_train.index)

pbar = pyprind.ProgBar(df_train.shape[0])
for row_id in df_train.index:
    song = Song(artist=df_train.loc[row_id]['artist'], title=df_train.loc[row_id]['title'])
    lyr = song.lyricwikia()
    df_train.loc[row_id, 'lyrics'] = lyr
    pbar.update()
In [89]:
df_valid = make_artist_table('../../dataset/validation/h5_valid')
df_valid['lyrics'] = pd.Series('', index=df_valid.index)

pbar = pyprind.ProgBar(df_valid.shape[0])
for row_id in df_valid.index:
    song = Song(artist=df_valid.loc[row_id]['artist'], title=df_valid.loc[row_id]['title'])
    lyr = song.lyricwikia()
    df_valid.loc[row_id, 'lyrics'] = lyr
    pbar.update()
In [90]:
df_train.to_csv('../../dataset/training/train_lyrics_1000.csv')
df_valid.to_csv('../../dataset/validation/valid_lyrics_200.csv')
In [39]:
import pandas as pd
In [46]:
df = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')
df.head()
Out[46]:
In [48]:
import os

df['year'] = pd.Series('', index=df.index)

base = '../../dataset/training/h5_train/'

# Look up the release year for each song in its HDF5 file
for row_id in df.index:
    filename = df.loc[row_id]['file']
    filepath = os.path.join(base, filename)
    store = pd.HDFStore(filepath)
    year = store.root.musicbrainz.songs.cols.year[0]
    store.close()
    df.loc[row_id, 'year'] = year
In [49]:
df[['file', 'artist', 'title','lyrics','year']].tail()
Out[49]:
In [22]:
df.to_csv('../../dataset/training/train_lyrics_1000.csv', index=False)
Missing year labels were manually added based on information from http://www.allmusic.com.
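In the Million Song Dataset, a release year of 0 indicates that the year is unknown, so the rows that required manual labeling can be listed with a simple filter; a minimal sketch, assuming the `year` column still holds the raw integer values:

```python
missing = df[df['year'] == 0]
print('%s songs without a year label' % missing.shape[0])
print(missing[['file', 'artist', 'title']])
```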