In [1]:

    
%load_ext watermark
%watermark -d -v -a 'Sebastian Raschka'









    



Sebastian Raschka 13/12/2014 

CPython 2.7.8
IPython 2.1.0

Loading artists and titles from the Million Song Dataset into a Pandas DataFrame



In [2]:

    
import pandas as pd
import re

store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5')
artists = pd.Series(store.root.metadata.songs.cols.artist_name)
titles = pd.Series(store.root.metadata.songs.cols.title)
store.close()

df = pd.concat([artists, titles], axis=1)

df.columns=['artist', 'title']

df.loc[:, 'artist'] = df.loc[:, 'artist'].apply((lambda x: x.decode('utf-8')))
df.loc[:, 'title'] = df.loc[:, 'title'].apply((lambda x: x.decode('utf-8')))

# removes parentheses + content e.g., 'Bleed (Album Version)' -> 'Bleed'
df.loc[:, 'title'] = df.loc[:, 'title'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))
df.loc[:, 'artist'] = df.loc[:, 'artist'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))

df.tail()









    



---------------------------------------------------------------------------
NoSuchNodeError                           Traceback (most recent call last)
<ipython-input-2-b11d88fe4c33> in <module>()
      3 
      4 store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5')
----> 5 artists = pd.Series(store.root.metadata.songs.cols.artist_name)
      6 titles = pd.Series(store.root.metadata.songs.cols.title)
      7 store.close()

/Users/sebastian/miniconda3/envs/py27/lib/python2.7/site-packages/tables/group.pyc in __getattr__(self, name)
    809             self._g_add_children_names()
    810             return mydict[name]
--> 811         return self._f_get_child(name)
    812 
    813     def __setattr__(self, name, value):

/Users/sebastian/miniconda3/envs/py27/lib/python2.7/site-packages/tables/group.pyc in _f_get_child(self, childname)
    679         self._g_check_open()
    680 
--> 681         self._g_check_has_child(childname)
    682 
    683         childpath = join_path(self._v_pathname, childname)

/Users/sebastian/miniconda3/envs/py27/lib/python2.7/site-packages/tables/group.pyc in _g_check_has_child(self, name)
    403             raise NoSuchNodeError(
    404                 "group ``%s`` does not have a child named ``%s``"
--> 405                 % (self._v_pathname, name))
    406         return node_type
    407 

NoSuchNodeError: group ``/`` does not have a child named ``metadata``

Save the artist-title table as SQLite3 for fututure analyses



In [3]:

    
import sqlite3

conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
df.to_sql(name='artist_title', con=conn)
conn.commit()
conn.close()

Code to scrape lyrics from the web



In [4]:

    
import urllib, re
import bs4
          
def songlyrics(artist, title):
    artist = urllib.quote(artist.lower().replace(' ','-'))
    title = urllib.quote(title.lower().replace(' ','-'))

    try:
        lyrics = urllib.urlopen('http://www.songlyrics.com/%s/%s-lyrics/' % (artist,title))
    except:
        return None
    text = lyrics.read()
    soup = bs4.BeautifulSoup(text)
    lyrics = soup.findAll(attrs= {'id' : 'songLyricsDiv'})
    if not lyrics:
        return None
    else:
        if str(lyrics[0]).startswith("<p class='songLyricsV14 iComment-text' id='songLyricsDiv'></p>"):

            return None
        try:
            return re.sub('<[^<]+?>', '', ''.join(str(lyrics[0])))
        except:
            return None


def lyricsmode(artist, title):
    artist = urllib.quote(artist.lower().replace(' ','_'))
    title = urllib.quote(title.lower().replace(' ','_'))

    try:
        url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0],artist, title)
        lyrics = urllib.urlopen(url)
    except:
        return None 
    text = lyrics.read()
    soup = bs4.BeautifulSoup(text)
    #lyricsmode places the lyrics in a span with an id of "lyrics"
    lyrics = soup.findAll(attrs= {'id' : 'lyrics_text'})
    if not lyrics:
        return None 
    try:
        return re.sub('<[^<]+?>', '', ''.join(str(lyrics[0])))
    except:
        return None  

def get_lyrics(artist, title):
    lyr = songlyrics(artist, title)
    if not lyr:
        lyr = lyricsmode(artist, title)
    return lyr


test = get_lyrics('Bob Dylan','Blowing in the wind')
print(test)
test2 = get_lyrics('test','test')
print(test2)









    



How many roads must a man walk down
Before you call him a man?
Yes, 'n' how many seas must a white dove sail
Before she sleeps in the sand?
Yes, 'n' how many times must the cannon balls fly
Before they're forever banned?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind

How many times must a man look up
Before he can see the sky?
Yes, 'n' how many ears must one man have
Before he can hear people cry?
Yes, 'n' how many deaths will it take till he knows
That too many people have died?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind

How many years can a mountain exist
Before it's washed to the sea?
Yes, 'n' how many years can some people exist
Before they're allowed to be free?
Yes, 'n' how many times can a man turn his head
Pretending he just doesn't see?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind
None

Code to check if lyrics are English

As a rule of thumb, I assume that every song that has less than 50% English words (i.e., words that are not in the English vocabulary) is non-English.



In [ ]:

    
import nltk

def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''

    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) 
    common = text_vocab.intersection(english_vocab)
    try:
        diff = len(common)/float(len(text_vocab))
    except ZeroDivisionError:
        diff = 0.0
    return diff
    
text = 'This is a test fahrrad'

print(eng_ratio(text))
lyr = get_lyrics('Pharrell','Happy')
print(eng_ratio(lyr))









    



0.8
0.986666666667

Annotating the language of the songs

The following labels are being used to annotate the songs:
0 = no lyrics
1 = likely English
2 = likely non-English



In [ ]:

    
#df = df.loc[:3000, :]
df['lang'] = pd.Series('', index=df.index)
df.tail()



In [ ]:

    
import pyprind
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    lyr = get_lyrics(artist=df.loc[row_id]['artist'].encode('utf-8'), title=df.loc[row_id]['title'].encode('utf-8'))
    
    if not lyr:
        df.loc[row_id,'lang'] = 0
    elif eng_ratio(lyr) >= 0.5:
        df.loc[row_id,'lang'] = 1
    else:
        df.loc[row_id,'lang'] = 2
    
    pbar.update()









    



0%                          100%
[                              ]



In [ ]:

    
df[df['lang'] == 1].shape



In [20]:

    
df = df[df['lang'] == 1]
df.index = range(df.shape[0])
df.tail()









    Out[20]:






  
    
      
      artist
      title
      lang
    
  
  
    
      647
       Suzanne Vega
       Marlene On The Wall
       1
    
    
      648
              Nelly
                 St. Louie
       1
    
    
      649
              Larue
                    Reason
       1
    
    
      650
         Liam Lynch
                       SOS
       1
    
    
      651
              Oasis
        Boy With The Blues
       1



In [21]:

    
df = df[df['lang'] == 1]

conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
df[df['lang'] == 1].loc[:649,:].to_sql(name='artist_title', con=conn)
conn.commit()
conn.close()

Pick a random song from the database



In [27]:

    
conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
cursor = conn.cursor()
sql = "SELECT * FROM artist_title ORDER BY RANDOM() LIMIT 1;"
cursor.execute(sql)
result = cursor.fetchone()
artistname = result[1].decode('utf-8')
songtitle = result[2].decode('utf-8')
print('Arist: %s \nSong: %s' % (artistname, songtitle))
conn.close()









    



Arist: Grave Digger 
Song: My blood will live forever

Check songs in the SQLite database



In [ ]:

    
import sys
import sqlite3

conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
c = conn.cursor()


try:

    for i in range(1000000):

        
	    sys.stdout.write('\r')
	    sys.stdout.write('%s' %i)

	    sql = "SELECT * FROM artist_title WHERE language IS NULL LIMIT 1;"
	    c.execute(sql)
	    result = c.fetchone()
	    row_id = result[0]
	    artistname = result[1].decode('utf-8')
	    songtitle = result[2].decode('utf-8')
    
    
	    lyr = get_lyrics(artist=artistname, title=songtitle)
    
	    lang = None
	    if not lyr:
	        lang = 0
	    elif eng_ratio(lyr) >= 0.5:
	        lang = 1
	    else:
	        lang = 2
    
	    sql = "UPDATE artist_title SET language=? WHERE rowid=?;"
    
	    c.execute(sql, (lang, row_id))
    
	    sys.stdout.flush()
        
except KeyboardInterrupt:
    pass
finally:
    conn.commit()
    conn.close()



In [ ]:

	artist	title	lang
647	Suzanne Vega	Marlene On The Wall	1
648	Nelly	St. Louie	1
649	Larue	Reason	1
650	Liam Lynch	SOS	1
651	Oasis	Boy With The Blues	1