In [1]:
%load_ext watermark
%watermark -d -v -a 'Sebastian Raschka'
In [2]:
import pandas as pd
import re
# Location of the Million Song Dataset summary file (HDF5).
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
MSD_SUMMARY_PATH = '/Users/sebastian/Desktop/msd_summary_file.h5'


def _strip_parenthetical(text):
    '''Remove "(...)" groups from text and tidy the whitespace left behind.

    e.g., 'Bleed (Album Version)' -> 'Bleed'

    Without the whitespace clean-up the result would be 'Bleed ' (trailing
    space), which later ends up as a stray '-' / '_' in the lyrics URLs.
    '''
    without_parens = re.sub(r'\([^)]*\)', '', text)
    return re.sub(r'\s+', ' ', without_parens).strip()


store = pd.HDFStore(MSD_SUMMARY_PATH)
try:
    artists = pd.Series(store.root.metadata.songs.cols.artist_name)
    titles = pd.Series(store.root.metadata.songs.cols.title)
finally:
    # Release the file handle even if reading one of the columns fails.
    store.close()

df = pd.concat([artists, titles], axis=1)
df.columns = ['artist', 'title']

# The columns are decoded from UTF-8 bytes to text, then cleaned.
df.loc[:, 'artist'] = df.loc[:, 'artist'].apply(lambda raw: _strip_parenthetical(raw.decode('utf-8')))
df.loc[:, 'title'] = df.loc[:, 'title'].apply(lambda raw: _strip_parenthetical(raw.decode('utf-8')))

df.tail()
In [3]:
import sqlite3
# Persist the cleaned artist/title table to SQLite.
conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
try:
    df.to_sql(name='artist_title', con=conn)
    conn.commit()
finally:
    # Always release the database handle, even when to_sql raises
    # (e.g. because the table already exists from a previous run).
    conn.close()
In [4]:
import urllib, re
import bs4
def songlyrics(artist, title):
    '''Fetch the lyrics of a song from songlyrics.com.

    artist, title -- strings; they are lower-cased, spaces are replaced by
    '-', and the result is URL-quoted to build the request URL.

    Returns the text of the page element with id 'songLyricsDiv' with all
    markup tags removed, or None when the request fails, the element is
    missing, or the element is the empty placeholder paragraph.
    '''
    artist = urllib.quote(artist.lower().replace(' ', '-'))
    title = urllib.quote(title.lower().replace(' ', '-'))
    url = 'http://www.songlyrics.com/%s/%s-lyrics/' % (artist, title)

    # `except Exception` rather than a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) still propagates to a surrounding loop instead of
    # being silently turned into "no lyrics".
    try:
        response = urllib.urlopen(url)
        try:
            text = response.read()
        finally:
            response.close()
    except Exception:
        return None

    soup = bs4.BeautifulSoup(text)
    lyrics = soup.findAll(attrs={'id': 'songLyricsDiv'})
    if not lyrics:
        return None

    try:
        markup = str(lyrics[0])
    except Exception:
        return None

    # An empty placeholder paragraph counts as "no lyrics".
    if markup.startswith("<p class='songLyricsV14 iComment-text' id='songLyricsDiv'></p>"):
        return None

    # Drop every <...> tag, keeping only the text in between.
    return re.sub('<[^<]+?>', '', markup)
def lyricsmode(artist, title):
    '''Fetch the lyrics of a song from lyricsmode.com.

    artist, title -- strings; they are lower-cased, spaces are replaced by
    '_', and the result is URL-quoted. The URL also contains the first
    character of the quoted artist name as a directory component.

    Returns the text of the page element with id 'lyrics_text' with all
    markup tags removed, or None when the URL cannot be built (e.g. empty
    artist), the request fails, or the element is missing.
    '''
    artist = urllib.quote(artist.lower().replace(' ', '_'))
    title = urllib.quote(title.lower().replace(' ', '_'))

    # `except Exception` rather than a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) still propagates to a surrounding loop instead of
    # being silently turned into "no lyrics".
    try:
        # artist[0] raises IndexError for an empty artist; treated as "not found".
        url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0], artist, title)
        response = urllib.urlopen(url)
        try:
            text = response.read()
        finally:
            response.close()
    except Exception:
        return None

    soup = bs4.BeautifulSoup(text)
    # The lyrics are looked up via the element whose id is 'lyrics_text'.
    lyrics = soup.findAll(attrs={'id': 'lyrics_text'})
    if not lyrics:
        return None

    try:
        # Drop every <...> tag, keeping only the text in between.
        return re.sub('<[^<]+?>', '', str(lyrics[0]))
    except Exception:
        return None
def get_lyrics(artist, title):
    '''Look a song up on songlyrics.com first, then on lyricsmode.com.

    Returns the first non-empty result; if the first source yields nothing,
    whatever the second source returns (possibly None) is passed through.
    '''
    return songlyrics(artist, title) or lyricsmode(artist, title)
# Smoke test 1: a query for a real, well-known song.
test = get_lyrics(artist='Bob Dylan', title='Blowing in the wind')
print(test)

# Smoke test 2: a nonsense query (presumably found on neither site).
test2 = get_lyrics(artist='test', title='test')
print(test2)
As a rule of thumb, I assume that a song is non-English if fewer than 50% of its words are English (i.e., found in the NLTK English vocabulary).
In [ ]:
import nltk
# Holds the lower-cased NLTK word list once it has been built, so that the
# (large) vocabulary set is not rebuilt for every song.
_ENGLISH_VOCAB_CACHE = []


def _english_vocab():
    ''' Returns the lower-cased NLTK English word list as a set (built once) '''
    if not _ENGLISH_VOCAB_CACHE:
        _ENGLISH_VOCAB_CACHE.append(
            frozenset(w.lower() for w in nltk.corpus.words.words()))
    return _ENGLISH_VOCAB_CACHE[0]


def eng_ratio(text, english_vocab=None):
    ''' Returns the fraction of distinct words in a text that are English.

    text          -- the text to analyse; None or '' yields 0.0.
    english_vocab -- optional collection of lower-case English words; when
                     omitted, the NLTK `words` corpus is used.

    The text is split on whitespace and lower-cased; only purely alphabetic
    tokens are considered, and each distinct token counts once. The result
    is len(tokens found in the vocabulary) / len(tokens), i.e. a value
    between 0.0 (no English words, or no usable tokens) and 1.0.
    '''
    if not text:
        return 0.0
    if english_vocab is None:
        english_vocab = _english_vocab()

    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha())
    if not text_vocab:
        # Nothing alphabetic to judge (avoids a division by zero).
        return 0.0

    common = text_vocab.intersection(english_vocab)
    return len(common) / float(len(text_vocab))
# Quick check on a hand-written sentence containing one German word.
text = 'This is a test fahrrad'
print(eng_ratio(text=text))

# Same measurement on lyrics fetched from the web.
lyr = get_lyrics(artist='Pharrell', title='Happy')
print(eng_ratio(text=lyr))
The following labels are being used to annotate the songs:
0 = no lyrics
1 = likely English
2 = likely non-English
In [ ]:
# Optional: uncomment to work on a small slice only.
#df = df.loc[:3000, :]

# Placeholder column; the annotation loop fills it with the 0/1/2 labels.
df['lang'] = ''
df.tail()
In [ ]:
import pyprind
# Label every song: 0 = no lyrics, 1 = likely English, 2 = likely non-English.
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    lyr = get_lyrics(
        artist=df.loc[row_id, 'artist'].encode('utf-8'),
        title=df.loc[row_id, 'title'].encode('utf-8'),
    )
    # No lyrics -> 0; otherwise 1 when at least half of the words are English, else 2.
    df.loc[row_id, 'lang'] = 0 if not lyr else (1 if eng_ratio(lyr) >= 0.5 else 2)
    pbar.update()
In [ ]:
# (rows, columns) of the songs labelled 1 = likely English.
df.loc[df['lang'] == 1, :].shape
In [20]:
# Keep only the songs labelled 1 (likely English) and renumber the rows 0..n-1.
df = df[df['lang'] == 1].reset_index(drop=True)
df.tail()
Out[20]:
In [21]:
# Keep only the songs labelled 1 (likely English); filtering once is enough.
df = df[df['lang'] == 1]

conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
try:
    # .loc slicing is label-based and inclusive: rows with index labels up to 649.
    df.loc[:649, :].to_sql(name='artist_title', con=conn)
    conn.commit()
finally:
    # Always release the database handle, even when to_sql raises.
    conn.close()
In [27]:
# Draw one random song from the 650-song subset database.
conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
try:
    cursor = conn.cursor()
    sql = "SELECT * FROM artist_title ORDER BY RANDOM() LIMIT 1;"
    cursor.execute(sql)
    result = cursor.fetchone()
finally:
    # Close right after the query so the handle is released even if the
    # decoding/printing below fails.
    conn.close()

# Columns 1 and 2 of the row are used as artist and title.
artistname = result[1].decode('utf-8')
songtitle = result[2].decode('utf-8')
print('Artist: %s \nSong: %s' % (artistname, songtitle))
In [ ]:
import sys
import sqlite3
# Label songs directly in the SQLite table, one un-labelled row at a time:
# 0 = no lyrics, 1 = likely English, 2 = likely non-English.
# Interrupting with Ctrl-C is fine: all changes made so far are committed in
# the `finally` clause, and a later run continues with the remaining rows.
# NOTE(review): assumes the table has a `language` column -- it is not
# created anywhere in this notebook; confirm it was added beforehand.
conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
c = conn.cursor()
try:
    for i in range(1000000):
        # Single-line progress counter.
        sys.stdout.write('\r')
        sys.stdout.write('%s' % i)

        # Select SQLite's rowid explicitly: `SELECT *` does not return it, and
        # the table's first column (the DataFrame index when the table was
        # written via to_sql) is not guaranteed to equal the rowid that the
        # UPDATE below filters on.
        sql = "SELECT rowid, artist, title FROM artist_title WHERE language IS NULL LIMIT 1;"
        c.execute(sql)
        result = c.fetchone()
        if result is None:
            # Every row has been labelled; nothing left to do.
            break

        row_id = result[0]
        artistname = result[1].decode('utf-8')
        songtitle = result[2].decode('utf-8')

        lyr = get_lyrics(artist=artistname, title=songtitle)
        lang = None
        if not lyr:
            lang = 0
        elif eng_ratio(lyr) >= 0.5:
            lang = 1
        else:
            lang = 2

        sql = "UPDATE artist_title SET language=? WHERE rowid=?;"
        c.execute(sql, (lang, row_id))
        sys.stdout.flush()
except KeyboardInterrupt:
    pass
finally:
    conn.commit()
    conn.close()
In [ ]: