In [1]:
%load_ext watermark
%watermark -d -v -a 'Sebastian Raschka'


Sebastian Raschka 13/12/2014 

CPython 2.7.8
IPython 2.1.0



Loading artists and titles from the Million Song Dataset into a Pandas DataFrame


In [2]:
import pandas as pd
import re

store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5')
artists = pd.Series(store.root.metadata.songs.cols.artist_name)
titles = pd.Series(store.root.metadata.songs.cols.title)
store.close()

df = pd.concat([artists, titles], axis=1)

df.columns=['artist', 'title']

df.loc[:, 'artist'] = df.loc[:, 'artist'].apply((lambda x: x.decode('utf-8')))
df.loc[:, 'title'] = df.loc[:, 'title'].apply((lambda x: x.decode('utf-8')))

# removes parentheses + content e.g., 'Bleed (Album Version)' -> 'Bleed'
df.loc[:, 'title'] = df.loc[:, 'title'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))
df.loc[:, 'artist'] = df.loc[:, 'artist'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))

df.tail()


---------------------------------------------------------------------------
NoSuchNodeError                           Traceback (most recent call last)
<ipython-input-2-b11d88fe4c33> in <module>()
      3 
      4 store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5')
----> 5 artists = pd.Series(store.root.metadata.songs.cols.artist_name)
      6 titles = pd.Series(store.root.metadata.songs.cols.title)
      7 store.close()

/Users/sebastian/miniconda3/envs/py27/lib/python2.7/site-packages/tables/group.pyc in __getattr__(self, name)
    809             self._g_add_children_names()
    810             return mydict[name]
--> 811         return self._f_get_child(name)
    812 
    813     def __setattr__(self, name, value):

/Users/sebastian/miniconda3/envs/py27/lib/python2.7/site-packages/tables/group.pyc in _f_get_child(self, childname)
    679         self._g_check_open()
    680 
--> 681         self._g_check_has_child(childname)
    682 
    683         childpath = join_path(self._v_pathname, childname)

/Users/sebastian/miniconda3/envs/py27/lib/python2.7/site-packages/tables/group.pyc in _g_check_has_child(self, name)
    403             raise NoSuchNodeError(
    404                 "group ``%s`` does not have a child named ``%s``"
--> 405                 % (self._v_pathname, name))
    406         return node_type
    407 

NoSuchNodeError: group ``/`` does not have a child named ``metadata``



Save the artist-title table as SQLite3 for fututure analyses


In [3]:
import sqlite3

conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
df.to_sql(name='artist_title', con=conn)
conn.commit()
conn.close()



Code to scrape lyrics from the web


In [4]:
import urllib, re
import bs4
          
def songlyrics(artist, title):
    artist = urllib.quote(artist.lower().replace(' ','-'))
    title = urllib.quote(title.lower().replace(' ','-'))

    try:
        lyrics = urllib.urlopen('http://www.songlyrics.com/%s/%s-lyrics/' % (artist,title))
    except:
        return None
    text = lyrics.read()
    soup = bs4.BeautifulSoup(text)
    lyrics = soup.findAll(attrs= {'id' : 'songLyricsDiv'})
    if not lyrics:
        return None
    else:
        if str(lyrics[0]).startswith("<p class='songLyricsV14 iComment-text' id='songLyricsDiv'></p>"):

            return None
        try:
            return re.sub('<[^<]+?>', '', ''.join(str(lyrics[0])))
        except:
            return None


def lyricsmode(artist, title):
    artist = urllib.quote(artist.lower().replace(' ','_'))
    title = urllib.quote(title.lower().replace(' ','_'))

    try:
        url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0],artist, title)
        lyrics = urllib.urlopen(url)
    except:
        return None 
    text = lyrics.read()
    soup = bs4.BeautifulSoup(text)
    #lyricsmode places the lyrics in a span with an id of "lyrics"
    lyrics = soup.findAll(attrs= {'id' : 'lyrics_text'})
    if not lyrics:
        return None 
    try:
        return re.sub('<[^<]+?>', '', ''.join(str(lyrics[0])))
    except:
        return None  

def get_lyrics(artist, title):
    lyr = songlyrics(artist, title)
    if not lyr:
        lyr = lyricsmode(artist, title)
    return lyr


test = get_lyrics('Bob Dylan','Blowing in the wind')
print(test)
test2 = get_lyrics('test','test')
print(test2)


How many roads must a man walk down
Before you call him a man?
Yes, 'n' how many seas must a white dove sail
Before she sleeps in the sand?
Yes, 'n' how many times must the cannon balls fly
Before they're forever banned?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind

How many times must a man look up
Before he can see the sky?
Yes, 'n' how many ears must one man have
Before he can hear people cry?
Yes, 'n' how many deaths will it take till he knows
That too many people have died?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind

How many years can a mountain exist
Before it's washed to the sea?
Yes, 'n' how many years can some people exist
Before they're allowed to be free?
Yes, 'n' how many times can a man turn his head
Pretending he just doesn't see?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind
None



Code to check if lyrics are English

As a rule of thumb, I assume that every song that has less than 50% English words (i.e., words that are not in the English vocabulary) is non-English.


In [ ]:
import nltk

def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''

    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) 
    common = text_vocab.intersection(english_vocab)
    try:
        diff = len(common)/float(len(text_vocab))
    except ZeroDivisionError:
        diff = 0.0
    return diff
    
text = 'This is a test fahrrad'

print(eng_ratio(text))
lyr = get_lyrics('Pharrell','Happy')
print(eng_ratio(lyr))


0.8
0.986666666667



Annotating the language of the songs

The following labels are being used to annotate the songs:
0 = no lyrics
1 = likely English
2 = likely non-English


In [ ]:
#df = df.loc[:3000, :]
df['lang'] = pd.Series('', index=df.index)
df.tail()

In [ ]:
import pyprind
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    lyr = get_lyrics(artist=df.loc[row_id]['artist'].encode('utf-8'), title=df.loc[row_id]['title'].encode('utf-8'))
    
    if not lyr:
        df.loc[row_id,'lang'] = 0
    elif eng_ratio(lyr) >= 0.5:
        df.loc[row_id,'lang'] = 1
    else:
        df.loc[row_id,'lang'] = 2
    
    pbar.update()


0%                          100%
[                              ]

In [ ]:
df[df['lang'] == 1].shape

In [20]:
df = df[df['lang'] == 1]
df.index = range(df.shape[0])
df.tail()


Out[20]:
artist title lang
647 Suzanne Vega Marlene On The Wall 1
648 Nelly St. Louie 1
649 Larue Reason 1
650 Liam Lynch SOS 1
651 Oasis Boy With The Blues 1

In [21]:
df = df[df['lang'] == 1]

conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
df[df['lang'] == 1].loc[:649,:].to_sql(name='artist_title', con=conn)
conn.commit()
conn.close()



Pick a random song from the database


In [27]:
conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
cursor = conn.cursor()
sql = "SELECT * FROM artist_title ORDER BY RANDOM() LIMIT 1;"
cursor.execute(sql)
result = cursor.fetchone()
artistname = result[1].decode('utf-8')
songtitle = result[2].decode('utf-8')
print('Arist: %s \nSong: %s' % (artistname, songtitle))
conn.close()


Arist: Grave Digger 
Song: My blood will live forever



Check songs in the SQLite database


In [ ]:
import sys
import sqlite3

conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
c = conn.cursor()


try:

    for i in range(1000000):

        
	    sys.stdout.write('\r')
	    sys.stdout.write('%s' %i)

	    sql = "SELECT * FROM artist_title WHERE language IS NULL LIMIT 1;"
	    c.execute(sql)
	    result = c.fetchone()
	    row_id = result[0]
	    artistname = result[1].decode('utf-8')
	    songtitle = result[2].decode('utf-8')
    
    
	    lyr = get_lyrics(artist=artistname, title=songtitle)
    
	    lang = None
	    if not lyr:
	        lang = 0
	    elif eng_ratio(lyr) >= 0.5:
	        lang = 1
	    else:
	        lang = 2
    
	    sql = "UPDATE artist_title SET language=? WHERE rowid=?;"
    
	    c.execute(sql, (lang, row_id))
    
	    sys.stdout.flush()
        
except KeyboardInterrupt:
    pass
finally:
    conn.commit()
    conn.close()


32

In [ ]: