In [1]:
%load_ext watermark
In [2]:
%watermark -a 'Sebastian Raschka' -d -v
[More information](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `watermark` magic command extension.
A subset of 10,000 songs in HDF5 format was downloaded from the Million Song Dataset. A feature list of the file contents can be found here.
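Each of these `.h5` files can be opened with pandas' `HDFStore` (a thin wrapper around PyTables), which is also how the rest of this notebook reads them. The snippet below is a quick, optional peek that prints the metadata column names of an arbitrary song file, assuming the local path used throughout this notebook:

import os
import pandas as pd

# Grab an arbitrary .h5 file from the (possibly still nested) subset directory
base = '/Users/sebastian/Desktop/MillionSongSubset/'
example = next(os.path.join(dp, f)
               for dp, dn, fns in os.walk(base)
               for f in fns if f.endswith('.h5'))

store = pd.HDFStore(example)
print(store.root.metadata.songs.colnames)  # e.g., includes 'artist_name' and 'title'
store.close()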
The following snippet flattens the directory tree that the Million Song subset comes in:
In [3]:
import os

dir_tree = '/Users/sebastian/Desktop/MillionSongSubset/'

# Move every file in the subtree up into the top-level directory
for dir_path, dir_names, file_names in os.walk(dir_tree):
    for file_name in file_names:
        try:
            os.rename(os.path.join(dir_path, file_name), os.path.join(dir_tree, file_name))
        except OSError:
            print("Could not move %s" % os.path.join(dir_path, file_name))
Now, we create a pandas DataFrame with the three feature columns `file`, `artist`, and `title`, where the artist and title are our input for the lyrics search, and the file name merely serves identification purposes.
In [16]:
import os
import pandas as pd

def make_artist_table(base):
    # Get the paths of all HDF5 files in the base directory
    files = [os.path.join(base, fn) for fn in os.listdir(base) if fn.endswith('.h5')]
    data = {'file': [], 'artist': [], 'title': []}
    # Add artist and title data to the dictionary
    for f in files:
        store = pd.HDFStore(f)
        title = store.root.metadata.songs.cols.title[0]
        artist = store.root.metadata.songs.cols.artist_name[0]
        data['file'].append(os.path.basename(f))
        data['title'].append(title.decode("utf-8"))
        data['artist'].append(artist.decode("utf-8"))
        store.close()
    # Convert the dictionary to a pandas DataFrame
    df = pd.DataFrame.from_dict(data, orient='columns')
    df = df[['file', 'artist', 'title']]
    return df
In [17]:
base = '/Users/sebastian/Desktop/MillionSongSubset/'
df = make_artist_table(base)
df.tail()
Out[17]:
First, we add a new column for the lyrics to our DataFrame.
In [20]:
df['lyrics'] = pd.Series('', index=df.index)
df.tail()
Out[20]:
Then, we use the following code to download the song lyrics from LyricWikia based on the artist and title names in the pandas DataFrame.
In [24]:
# Sebastian Raschka, 2014
#
# Script to download lyrics from http://lyrics.wikia.com/
import urllib
import lxml.html
class Song(object):
def __init__(self, artist, title):
self.artist = self.__format_str(artist)
self.title = self.__format_str(title)
self.url = None
self.lyric = None
def __format_str(self, s):
# remove paranthesis and contents
s = s.strip()
try:
# strip accent
s = ''.join(c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn')
except:
pass
s = s.title()
return s
def __quote(self, s):
return urllib.parse.quote(s.replace(' ', '_'))
def __make_url(self):
artist = self.__quote(self.artist)
title = self.__quote(self.title)
artist_title = '%s:%s' %(artist, title)
url = 'http://lyrics.wikia.com/' + artist_title
self.url = url
def update(self, artist=None, title=None):
if artist:
self.artist = self.__format_str(artist)
if title:
self.title = self.__format_str(title)
def lyricwikia(self):
self.__make_url()
try:
doc = lxml.html.parse(self.url)
lyricbox = doc.getroot().cssselect('.lyricbox')[0]
except (IOError, IndexError) as e:
self.lyric = ''
return self.lyric
lyrics = []
for node in lyricbox:
if node.tag == 'br':
lyrics.append('\n')
if node.tail is not None:
lyrics.append(node.tail)
self.lyric = "".join(lyrics).strip()
return self.lyric
If this script doesn't work for you, you can find some alternatives to download lyrics from other websites in my datacollect repository.
In [25]:
song = Song(artist='John Mellencamp', title='Jack and Diane')
lyr = song.lyricwikia()
print(lyr)
In [26]:
import pyprind
In [27]:
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    song = Song(artist=df.loc[row_id]['artist'], title=df.loc[row_id]['title'])
    lyr = song.lyricwikia()
    df.loc[row_id, 'lyrics'] = lyr
    pbar.update()
In [28]:
print('Downloaded lyrics for %s songs' % sum(df.lyrics != ''))
df.head()
Out[28]:
In [29]:
df.to_csv('/Users/sebastian/Desktop/df_lyr_backup.csv')
If lyrics were not available (for example, because the song is not in the LyricWikia database, or the artist and title did not match LyricWikia's URL scheme), the `lyrics` field was left empty; we drop those rows:
In [30]:
df = df[df.lyrics!='']
Now, we remove all songs whose lyrics are not in English. Put simply, if more than 50% of a song's words are English, we consider it an English song. We use this relatively generous cutoff ratio of 0.5 since lyrics often contain names and other special words that are not part of a common English dictionary.
In [32]:
import nltk

def eng_ratio(text):
    ''' Returns the ratio of words in a text that are not in the English dictionary '''
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha())
    unusual = text_vocab.difference(english_vocab)
    diff = len(unusual) / len(text_vocab)
    return diff

text = 'This is a test fahrrad'
print(eng_ratio(text))
In [33]:
before = df.shape[0]

# Remove songs in which 50% or more of the words are not English
for row_id in df.index:
    text = df.loc[row_id]['lyrics']
    diff = eng_ratio(text)
    if diff >= 0.5:
        df = df[df.index != row_id]

after = df.shape[0]
rem = before - after
print('%s songs have been removed.' % rem)
print('%s songs remain in the dataset.' % after)
In [34]:
df.to_csv('/Users/sebastian/Desktop/df_lyr_backup2.csv', index=False)
Now, we copy all songs for which the lyrics exist to a new directory.
In [35]:
import os
import shutil

new_dir = '/Users/sebastian/Desktop/h5_filtered/'
if not os.path.exists(new_dir):
    os.mkdir(new_dir)

h1 = '/Users/sebastian/Desktop/MillionSongSubset/'
filepaths = [os.path.join(h1, f) for f in os.listdir(h1) if f.endswith('.h5')]

# Copy a file only if its name appears in the filtered DataFrame
for f in filepaths:
    base = os.path.basename(f)
    if base in df.file.values:
        target = os.path.join(new_dir, base)
        shutil.copyfile(f, target)
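As an optional sanity check, the number of copied files should match the number of rows in the filtered DataFrame:

# Optional check: one copied file per remaining DataFrame row
print(len([f for f in os.listdir(new_dir) if f.endswith('.h5')]), df.shape[0])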
In this step, the dataset is reduced to a "reasonable" amount for the manual labeling step: 1000 songs for the training dataset and 200 songs for the validation dataset.
In [38]:
import random

h2 = '/Users/sebastian/Desktop/h5_filtered/'
filepaths2 = [os.path.join(h2, f) for f in os.listdir(h2) if f.endswith('.h5')]
random.shuffle(filepaths2)

train_dir = '../../dataset/training/h5_train/'
valid_dir = '../../dataset/validation/h5_valid/'
aux_dir = '../../dataset/auxiliary/h5_aux/'

for d in (train_dir, valid_dir, aux_dir):
    if not os.path.exists(d):
        os.mkdir(d)

# First 1000 shuffled songs -> training set
for f in filepaths2[:1000]:
    base = os.path.basename(f)
    target = os.path.join(train_dir, base)
    shutil.copyfile(f, target)

# Next 200 songs -> validation set
for f in filepaths2[1000:1200]:
    base = os.path.basename(f)
    target = os.path.join(valid_dir, base)
    shutil.copyfile(f, target)

# Remaining songs -> auxiliary set
for f in filepaths2[1200:]:
    base = os.path.basename(f)
    target = os.path.join(aux_dir, base)
    shutil.copyfile(f, target)
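A quick, optional check that the split sizes come out as intended (1000, 200, and the remainder):

# Optional check: print the number of .h5 files in each split directory
for d in (train_dir, valid_dir, aux_dir):
    print(d, len([f for f in os.listdir(d) if f.endswith('.h5')]))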
In [88]:
df_train = make_artist_table('../../dataset/training/h5_train')
df_train['lyrics'] = pd.Series('', index=df_train.index)

pbar = pyprind.ProgBar(df_train.shape[0])
for row_id in df_train.index:
    song = Song(artist=df_train.loc[row_id]['artist'], title=df_train.loc[row_id]['title'])
    lyr = song.lyricwikia()
    df_train.loc[row_id, 'lyrics'] = lyr
    pbar.update()
In [89]:
df_valid = make_artist_table('../../dataset/validation/h5_valid')
df_valid['lyrics'] = pd.Series('', index=df_valid.index)

pbar = pyprind.ProgBar(df_valid.shape[0])
for row_id in df_valid.index:
    song = Song(artist=df_valid.loc[row_id]['artist'], title=df_valid.loc[row_id]['title'])
    lyr = song.lyricwikia()
    df_valid.loc[row_id, 'lyrics'] = lyr
    pbar.update()
In [90]:
df_train.to_csv('../../dataset/training/train_lyrics_1000.csv')
df_valid.to_csv('../../dataset/validation/valid_lyrics_200.csv')
In [39]:
import pandas as pd
In [46]:
df = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')
df.head()
Out[46]:
In [48]:
import os

df['year'] = pd.Series('', index=df.index)
base = '../../dataset/training/h5_train/'

# Look up the release year for each song in its HDF5 file
for row_id in df.index:
    filename = df.loc[row_id]['file']
    filepath = os.path.join(base, filename)
    store = pd.HDFStore(filepath)
    year = store.root.musicbrainz.songs.cols.year[0]
    df.loc[row_id, 'year'] = year
    store.close()
In [49]:
df[['file', 'artist', 'title','lyrics','year']].tail()
Out[49]:
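In the Million Song Dataset, a year value of 0 indicates that the release year is unknown. An optional filter (the `missing_year` name below is just for illustration) shows which rows still need a manual label:

# Optional: songs whose year field is 0 need to be labeled manually
missing_year = df[df['year'] == 0]
print('%s songs without a year label' % missing_year.shape[0])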
In [22]:
df.to_csv('../../dataset/training/train_lyrics_1000.csv', index=False)
Missing year labels were manually added based on information from http://www.allmusic.com