In [1]:
# Load the `watermark` IPython extension so the %watermark magic can be used in the next cell.
%load_ext watermark
In [2]:
# Stamp the notebook with provenance info. Per the watermark extension's flags:
# -a sets the author name, -d adds the date, -v adds the Python/IPython versions.
%watermark -a 'Sebastian Raschka' -d -v
The lyrics have been removed from the datasets in the public GitHub repository for copyright reasons; however, you can re-download them by following the steps in this IPython notebook.
In [3]:
import pandas as pd
# Read the training CSV (path is relative to this notebook's directory) into `df`;
# the download loop further down writes each song's lyrics into this frame.
df = pd.read_csv('../../dataset/training/train_lyrics_rem_1000.csv')
# Show the last rows as a quick sanity check of what was loaded.
df.tail()
Out[3]:
In [4]:
# Sebastian Raschka, 2014
#
# Script to download lyrics from http://lyrics.wikia.com/
import unicodedata
import urllib
import urllib.parse

import lxml.html
class Song(object):
    """A song, identified by artist and title, whose lyrics can be scraped.

    Attributes:
        artist: Normalized artist name (see ``__format_str``).
        title: Normalized song title (see ``__format_str``).
        url: Page URL built by the most recent ``lyricwikia()`` call,
            or ``None`` if no lookup has been attempted yet.
        lyric: Text returned by the most recent ``lyricwikia()`` call,
            or ``None`` if no lookup has been attempted yet.
    """

    def __init__(self, artist, title):
        """Store the normalized artist/title; no network access happens here."""
        self.artist = self.__format_str(artist)
        self.title = self.__format_str(title)
        self.url = None
        self.lyric = None

    def __format_str(self, s):
        """Normalize a name: trim whitespace, strip accents, title-case.

        Accents are removed by decomposing to NFD and dropping the combining
        marks (Unicode category ``Mn``), e.g. ``'beyoncé'`` -> ``'Beyonce'``.
        Note that parentheses and their contents are NOT removed.
        """
        s = s.strip()
        try:
            decomposed = unicodedata.normalize('NFD', s)
        except TypeError:
            # Best effort: input that cannot be normalized (e.g. a byte
            # string) keeps its accents instead of aborting the lookup.
            pass
        else:
            s = ''.join(c for c in decomposed
                        if unicodedata.category(c) != 'Mn')
        return s.title()

    def __quote(self, s):
        """Replace spaces with underscores, then percent-encode for a URL path."""
        return urllib.parse.quote(s.replace(' ', '_'))

    def __make_url(self):
        """Build the ``Artist:Title`` page URL and store it in ``self.url``."""
        artist = self.__quote(self.artist)
        title = self.__quote(self.title)
        artist_title = '%s:%s' % (artist, title)
        self.url = 'http://lyrics.wikia.com/' + artist_title

    def update(self, artist=None, title=None):
        """Replace artist and/or title; falsy arguments leave the field as is.

        ``url`` and ``lyric`` are not reset here; they are refreshed by the
        next ``lyricwikia()`` call.
        """
        if artist:
            self.artist = self.__format_str(artist)
        if title:
            self.title = self.__format_str(title)

    def lyricwikia(self):
        """Fetch the lyrics for this song and return them as a string.

        The page at ``self.url`` is parsed and the first element matching the
        CSS selector ``.lyricbox`` is read. Returns ``''`` (and sets
        ``self.lyric`` to ``''``) when the page cannot be loaded (IOError)
        or contains no such element (IndexError).
        """
        self.__make_url()
        try:
            doc = lxml.html.parse(self.url)
            lyricbox = doc.getroot().cssselect('.lyricbox')[0]
        except (IOError, IndexError):
            self.lyric = ''
            return self.lyric

        lyrics = []
        for node in lyricbox:
            # Only direct children are visited: each <br> becomes a newline
            # and the text following any child (its tail) is a lyric fragment.
            if node.tag == 'br':
                lyrics.append('\n')
            if node.tail is not None:
                lyrics.append(node.tail)
        self.lyric = "".join(lyrics).strip()
        return self.lyric
# Smoke test: look up a single song and print whatever text comes back
# (lyricwikia() returns an empty string when the lookup fails).
song = Song(artist='John Mellencamp', title='Jack and Diane')
lyr = song.lyricwikia()
print(lyr)
In [6]:
import pyprind

# Fill the 'lyrics' column of the training frame, one lookup per row.
pbar = pyprind.ProgBar(df.shape[0])  # one tick per row
for row_id in df.index:
    # Read each cell with a single (row, column) label lookup rather than
    # materialising the whole row first via df.loc[row_id][...].
    song = Song(artist=df.loc[row_id, 'artist'], title=df.loc[row_id, 'title'])
    lyr = song.lyricwikia()  # '' when the page or lyric box is missing
    df.loc[row_id, 'lyrics'] = lyr
    pbar.update()
df.tail()
Out[6]:
In [ ]:
# Write the frame, now including the 'lyrics' column, to a new CSV;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('../../dataset/training/train_lyrics_1000.csv', index=False)
In [7]:
# Repeat the download for the validation set; `df` is rebound to the new frame.
df = pd.read_csv('../../dataset/validation/valid_lyrics_rem_200.csv')
pbar = pyprind.ProgBar(df.shape[0])  # one tick per row
for row_id in df.index:
    # Read each cell with a single (row, column) label lookup rather than
    # materialising the whole row first via df.loc[row_id][...].
    song = Song(artist=df.loc[row_id, 'artist'], title=df.loc[row_id, 'title'])
    lyr = song.lyricwikia()  # '' when the page or lyric box is missing
    df.loc[row_id, 'lyrics'] = lyr
    pbar.update()
df.tail()
Out[7]:
In [ ]:
# Write the validation frame, now including the 'lyrics' column, to a new CSV;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('../../dataset/validation/valid_lyrics_200.csv', index=False)
In [ ]:
# Repeat the download for the auxiliary set; `df` is rebound to the new frame.
df = pd.read_csv('../../dataset/auxiliary/aux_lyrics_rem.csv')
pbar = pyprind.ProgBar(df.shape[0])  # one tick per row
for row_id in df.index:
    # Read each cell with a single (row, column) label lookup rather than
    # materialising the whole row first via df.loc[row_id][...].
    song = Song(artist=df.loc[row_id, 'artist'], title=df.loc[row_id, 'title'])
    lyr = song.lyricwikia()  # '' when the page or lyric box is missing
    df.loc[row_id, 'lyrics'] = lyr
    pbar.update()
df.tail()
In [ ]:
# Write the auxiliary frame, now including the 'lyrics' column, to a new CSV;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('../../dataset/auxiliary/aux_lyrics.csv', index=False)