The goal of this notebook is to read a list of artists, download each artist's complete discography listing from allmusic.com, and then retrieve lyrics for their songs from http://rap.genius.com.
Once the lyrics are obtained, I filter the lyrics and search for the occurrence of these key words:
Education, Knowledge, Teacher, Teach, School, Science
In [731]:
import commands as c
import pandas as pd
from pandas import DataFrame
from pandas import Series
import numpy as np
import urllib2
import json
from bs4 import BeautifulSoup
import os
In [2]:
## Read the list of rappers, one per line.
## Use a with-block so the file handle is closed even if readlines() fails
## (the original left the handle open).
infile = "list_of_rappers.txt"
with open(infile) as fin:
    artists = fin.readlines()
In [4]:
## The list Reece gave me was formatted as "number" and "name", so I had to
## create a loop to just get the names from each line.
## ' '.join(tokens[1:]) covers both the one-word and multi-word name cases
## (the original branched on token count, and raised IndexError on a line
## with fewer than two tokens, e.g. a blank line).
artist_names = []
for raw_line in artists:
    tokens = raw_line.split()
    if len(tokens) < 2:
        continue  ## skip blank or malformed lines instead of crashing
    artist_names.append(' '.join(tokens[1:]))
## Dedupe; note set() does not preserve the original ordering.
artist_names = list(set(artist_names))
The following functions, and the cells that execute them, scrape the AllMusic.com website for the tracklists I want for each artist.
In [78]:
def setup_the_allmusic_search(the_artist):
    ## Build the allmusic.com artist-search URL for the given artist name.
    ## Spaces are turned into '+' so the name fits in the URL path.
    search_root = "http://www.allmusic.com/search/artists/"
    return search_root + the_artist.replace(" ", "+")
def get_the_page(the_url):
    ## Send an HTTP request for the given URL and return the raw HTML
    ## source of the response as a string.
    response = urllib2.urlopen(the_url)
    return response.read()
def get_db_top_search_result(artist_name):
    ## Query AllMusic.com for an artist and return the URL of the first
    ## artist hit on the search-results page.
    html = get_the_page(setup_the_allmusic_search(artist_name))
    soup = BeautifulSoup(html)
    first_hit = soup.findAll("li", {"class": "artist"})[0]
    name_div = first_hit.findAll("div", {"class": "name"})[0]
    return name_div.find("a").get("href")
def get_db_discography(url):
    ## Given an artist-page URL, fetch and return the HTML source of the
    ## artist's "/discography" page.
    return get_the_page(url + "/discography")
def get_db_discography_table(htmlSource):
    ## Parse a discography page and return two parallel lists taken from the
    ## first table on the page: the release year of each album, and the URL
    ## of each album's page.
    discog_table = BeautifulSoup(htmlSource).findAll("table")[0]
    the_years = [cell.text.strip()
                 for cell in discog_table.findAll("td", {"class": "year"})]
    the_album_urls = [cell.find("a").get("href")
                      for cell in discog_table.findAll("td", {"class": "title"})]
    return the_years, the_album_urls
def get_db_tracklist(the_album_urls):
    ## Given a list of album URLs, visit each album page and return a flat
    ## numpy array containing the title of every track listed on every album.
    the_titles = []
    for album_url in the_album_urls:
        htmlSource = get_the_page(album_url)
        raw_titles = BeautifulSoup(htmlSource).findAll("div", {"class": "title"})
        the_titles.append([title.text.strip() for title in raw_titles])
    ## np.concatenate raises ValueError on an empty list; an artist with no
    ## albums should just produce an empty tracklist.
    if not the_titles:
        return np.array([])
    return np.concatenate(the_titles)
In [96]:
## Functions are set. Run them in a loop for every artist in the artist list.
## Save the artist name as well as their song title in file "artists_and_songs.txt".
fmt = "%s\t%s\n"
total_song_list = []
fout = open('reece_results/artists_and_songs.txt', 'w')
try:
    fout.write('Artist\tSong\n')
    for artist in artist_names:
        ## Scraping fails intermittently; treat any failure for an artist as
        ## "not found" and move on. The search call is inside the try block so
        ## a search error is also tolerated (the original left it outside,
        ## contradicting its own comment), and `except Exception` replaces the
        ## bare except so KeyboardInterrupt/SystemExit still abort the run.
        try:
            search_result = get_db_top_search_result(artist)
            db_page = get_db_discography(search_result)
            years, albums = get_db_discography_table(db_page)
            all_the_songs = get_db_tracklist(albums)
            total_song_list.append(all_the_songs)
            for song in all_the_songs:
                fout.write(fmt % (artist, song.encode('utf-8')))
        except Exception:
            fout.write(fmt % (artist, 'artist_not_found'))
finally:
    ## make sure the output file is flushed/closed even if the loop dies
    fout.close()
In [97]:
## Each artist's tracklisting would be a list of song titles within the larger
## list for all the artists. Just compress them all into one long list
## of song titles so we can move forward.
## NOTE(review): np.concatenate raises ValueError if total_song_list is empty
## (i.e. no artist produced any songs at all).
total_song_list = np.concatenate(total_song_list)
Before this, the code was focused on getting the artist and song listings for all of the artists. Now that we have those, we query rap genius. Let's work with DataFrames in Pandas to keep things nice and ordered. Also, let's set up our first bit of functions.
In [542]:
def search_rapgenius_for_song_top_result(artist_name, song_name):
    ## Resolve one (artist, song) pair to a rap.genius.com lyrics URL.
    ## Returns one of:
    ##   - a lyrics-page URL (either the API's top hit, or a slug URL built
    ##     by hand from the artist and song names),
    ##   - "no_search_results" when the API response is not usable,
    ##   - "not_searchable" for NaN, blank, or ambiguous track names.
    ##
    ## Unfortunately there's no set format for song URLs in the genius.com site,
    ## so we need to search for the most probable search result for a given
    ## artist/song combination.
    ## First we need to filter out a set of track names that will be ambiguous,
    ## or otherwise unsearchable. All unsearchables will be noted as such.
    bad_names = ['[Untitled]','[Untitled Track]','[Untitled Hidden Track]','','???',
                 '?','DVD','[DVD]','Instrumental','Interlude','Intro','Outro',
                 'Skit','Skit #1','Skit #2','Skit #3','Skit #4','[Silence]',
                 'artist_not_found','[CD-Rom Track]','[CD-ROM Track]']
    bad_contents = 'instrumental'  ## NOTE(review): unused -- the check below inlines the literal
    ## song_name comes from a pandas column, so missing titles arrive as float
    ## NaN; str() makes the sentinel comparison safe for strings and floats.
    if (str(song_name) != "nan"):
        if ('instrumental' not in song_name.lower()) and (song_name not in bad_names):
            ## Strip quotes and escape spaces so the name can be embedded in
            ## the curl command line below.
            song_name = song_name.replace("'","")
            song_name = song_name.replace(" ","+")
            ## NOTE(review): song_name is interpolated into a shell command and
            ## the HTTP response is parsed with eval(). Apostrophes were just
            ## stripped, so the single-quoted shell argument cannot be broken
            ## out of, but eval() on an untrusted response is still risky --
            ## json.loads would be the safe choice here.
            url_string = "curl -d 'name=%s' -d 'genre=rap' 'http://genius-api.com/api/songInfo'" % (song_name)
            all_results = eval(c.getoutput(url_string).split('\n')[-1])
            ## NOTE(review): this demands MORE than one result; a single-hit
            ## response falls through to "no_search_results" -- possibly this
            ## was intended to be `>= 1`. Confirm against the API's behavior.
            if len(all_results) > 1:
                ## Now comes the great song-name replacement: normalize the
                ## artist and song into the slug format rap.genius.com uses in
                ## its lyrics URLs. The order of the masked-profanity
                ## replacements below matters (longer masks must be handled
                ## before shorter ones that are their prefixes).
                the_link = all_results[0]['link']
                artist_name = artist_name.lower()
                artist_name = artist_name.replace(".","")
                artist_name = artist_name.replace("'","")
                artist_name = artist_name.replace(" ","-")
                artist_name = artist_name.replace("tupac","2pac")
                artist_name = artist_name.replace("the-game","game")
                song_name = song_name.lower()
                song_name = song_name.replace("+","-")
                song_name = song_name.replace("(","")
                song_name = song_name.replace(")","")
                song_name = song_name.replace("'","")
                song_name = song_name.replace(".","")
                song_name = song_name.replace(",","")
                song_name = song_name.replace("?","")
                song_name = song_name.replace(":","")
                song_name = song_name.replace("/","-")
                ## Un-censor the masked profanity that AllMusic track titles
                ## use, since genius slugs spell the words out.
                song_name = song_name.replace("a**","ass")
                song_name = song_name.replace("f*****","fuckin")
                song_name = song_name.replace("f****n","fuckin")
                song_name = song_name.replace("f*ck","fuck")
                song_name = song_name.replace("f**k","fuck")
                song_name = song_name.replace("f***k","fuck")
                song_name = song_name.replace("f***","fuck")
                song_name = song_name.replace("f*@k","fuck")
                song_name = song_name.replace("f#@*","fuck")
                song_name = song_name.replace("f*?#","fuck")
                song_name = song_name.replace("f**ck","fuck")
                song_name = song_name.replace("h**s","hoes")
                song_name = song_name.replace("h***s","hoes")
                song_name = song_name.replace("h**","hoe")
                song_name = song_name.replace("buter","butter")
                song_name = song_name.replace("b*****s","bitches")
                song_name = song_name.replace("b******","bitches")
                song_name = song_name.replace("b****","bitch")
                song_name = song_name.replace("b***h","bitch")
                song_name = song_name.replace("b*tch","bitch")
                song_name = song_name.replace("b**tch","bitch")
                song_name = song_name.replace("ni**a","nigga")
                song_name = song_name.replace("n***a","nigga")
                song_name = song_name.replace("n*gga","nigga")
                song_name = song_name.replace("n**gaz","niggaz")
                song_name = song_name.replace("n***az","niggaz")
                song_name = song_name.replace("n****s","niggas")
                song_name = song_name.replace("n****z","niggaz")
                song_name = song_name.replace("n******","niggaz")
                song_name = song_name.replace("s**t","shit")
                song_name = song_name.replace("s***","shit")
                song_name = song_name.replace("sh*t","shit")
                song_name = song_name.replace("shi*t","shit")
                song_name = song_name.replace("sh#t","shit")
                song_name = song_name.replace("p***y","pussy")
                song_name = song_name.replace("p*ssy","pussy")
                ## Fully-masked fallbacks: these are guesses by mask length.
                song_name = song_name.replace("*****","nigga")
                song_name = song_name.replace("****","fuck")
                song_name = song_name.replace("***","ass")
                ## NOTE(review): this "f*" rule runs last and will also rewrite
                ## any remaining "f*" substring produced by earlier steps --
                ## confirm this catch-all is intended.
                song_name = song_name.replace("f*","fuck")
                song_name = song_name.replace("d**k","dick")
                alt_link = "http://rap.genius.com/%s-%s-lyrics" % (artist_name, song_name)
                ## Even though I do the search, I still want to check that my search results
                ## match what I expect for a rap genius lyrics page.
                ## If they don't match, I just use my pre-constructed link.
                ## This step is likely to produce links that don't work. Such is life.
                if the_link.lower() != alt_link.lower():
                    return alt_link.lower()
                else:
                    return the_link
            else:
                return "no_search_results"
        else:
            return "not_searchable"
    else:
        return "not_searchable"
In [157]:
## Read in the file containing artist and song names that was generated above.
## Put it into a dataframe. Columns will simply be Artist and Song.
## pd.read_csv is the public API; pd.io.api.read_csv reaches through a private
## module path that newer pandas versions no longer expose.
infile = 'reece_results/artists_and_songs.txt'
song_df = pd.read_csv(infile, sep="\t")
In [575]:
## Save all the links to the link list, whether searchable or not.
## NOTE(review): the start index 13492 resumes a previously interrupted run
## (see the cells below); rows before it live in `already_found`, which is
## kernel state from an earlier session. A fresh run must start at 0.
link_list = []
for ii in range(13492, len(song_df)):
    the_link = search_rapgenius_for_song_top_result(song_df.Artist[ii], song_df.Song[ii])
    link_list.append(the_link)
The above loop took a very long time to run, so the next few cells were made to accommodate that. Whenever the loop caught a snag (or when I had to cut it off and head to work/home), I saved the progress it had made to "already_found", then restarted the loop from whatever index it had left off at.
Once the whole thing was done, I was so paranoid about my already-retrieved data that I did the following:
In [576]:
## Progress check: `ii` is the last row index processed by the loop above.
print "%i links" % ii
In [577]:
## already good up to ii = 13492
## NOTE(review): `already_found` is kernel state carried over from an earlier
## session and is not defined anywhere in this notebook. Appending `link_list`
## nests the newest batch as a single list element; the flattening loop a few
## cells below undoes that nesting.
already_found.append(link_list)
In [578]:
## Keep a safety copy of the retrieved links before reshaping them.
## list() makes an actual (shallow) copy -- the original bare assignment only
## aliased the same list, so it offered no protection against later mutation.
jeepers = list(already_found)
In [585]:
## Flatten the resume structure: every entry before the last 11 is a single
## link string, while the final 11 entries are nested lists of links that were
## appended batch-wise, so their elements get spliced in one by one.
creepers = []
cutoff = len(jeepers) - 11
for pos in range(cutoff):
    creepers.append(jeepers[pos])
for pos in range(cutoff, len(jeepers)):
    creepers.extend(jeepers[pos])
In [586]:
## Sanity check: this should equal the number of rows in song_df.
len(creepers)
Out[586]:
In [587]:
## Persist every (row id, artist, song, link) tuple to lyrics_links.txt so the
## slow link-resolution work above never has to be repeated.
## The with-block guarantees the file is closed even if a write raises (the
## original leaked the handle on error).
fmt = "%i\t%s\t%s\t%s\n"
with open("lyrics_links.txt", "w") as fout:
    fout.write("ID\tArtist\tSong\tLink\n")
    for jj in range(len(creepers)):
        fout.write(fmt % (jj, song_df.Artist[jj], song_df.Song[jj], creepers[jj]))
In [588]:
## Read in the lyrics list to a data frame
## Column headers: Artist, Song, and Link
## Read the lyrics-link list back into a data frame.
## Column headers: ID, Artist, Song, and Link.
## pd.read_csv is the public API; pd.io.api.read_csv is a private module path.
infile = "lyrics_links.txt"
lyrics_df = pd.read_csv(infile, sep="\t")
In [723]:
def get_the_rapgenius_lyrics_page(the_url):
    ## Fetch the lyrics payload for one rap.genius.com lyrics URL through the
    ## genius-api.com proxy. Returns the parsed payload dict on success, or
    ## the string "not_found" on any failure (server error, bad gateway, or a
    ## malformed/truncated payload).
    the_cmd = "curl -d 'link=%s' -d 'genre=rap' 'http://genius-api.com/api/lyricsInfo'" % the_url
    the_json_data = c.getoutput(the_cmd)
    if ("Server error" in the_json_data) or ('502 Bad Gateway' in the_json_data):
        return "not_found"
    ## Turn escaped newlines inside the lyrics into plain whitespace so the
    ## payload fits on one line before parsing.
    the_json_data = the_json_data.replace("\\n", '\n')
    the_json_data = the_json_data.replace('\r', ' ')
    ## curl output can contain noise before the JSON body; keep everything
    ## from the last '{"lyrics"' marker onward.
    just_lyrics = '{"lyrics"' + the_json_data.split('{"lyrics"')[-1]
    try:
        ## json.loads replaces the original eval(): evaluating an untrusted
        ## HTTP response executes arbitrary expressions, and eval also chokes
        ## on the JSON literals true/false/null.
        just_lyrics = json.loads(just_lyrics.replace('\n', ' '))
    except ValueError:
        ## malformed or truncated payload
        return "not_found"
    if hasattr(just_lyrics, 'keys') and 'lyrics' in just_lyrics.keys():
        return just_lyrics
    return "not_found"
def save_the_lyrics_page(the_dataframe, the_index, the_page_json):
## Once of the lyrics have been obtained, save the lyrics to file.
if the_page_json != "not_found":
ii = the_index
the_artist = the_dataframe.Artist[ii].replace(' ','_')
the_song = the_dataframe.Song[ii].replace(' ','_')
the_song = the_dataframe.Song[ii].replace("\\",'_')
the_song = the_dataframe.Song[ii].replace("/",'_')
title_string = "%s_%s_%i.txt" % (the_artist, the_song, ii)
the_verses = [section['verses'] for section in the_page_json['lyrics']['sections']]
outdir = "reece_results/lyric_files/"
outfile = outdir+"%s" % title_string
fout = open(outfile,'w')
fmt = "%s\n"
for jj in range(len(the_verses)):
for kk in range(len(the_verses[jj])):
if (len(the_verses[jj][kk]) > 0) & ('content' in the_verses[jj][kk].keys()):
fout.write(fmt % the_verses[jj][kk]['content'])
fout.close()
else:
return 'Next...\n'
In [729]:
# done up to 13971
## Resume the lyrics download at row 13971 (earlier rows were fetched in a
## previous session). Each page is fetched and written to disk immediately,
## so an interrupted run loses at most one song.
for jj in range(13971, len(lyrics_df)):
    save_the_lyrics_page(lyrics_df, jj, get_the_rapgenius_lyrics_page(lyrics_df.Link[jj]))
In [988]:
## Read in all of the file names
indir = 'reece_results/lyric_files/'
files = [f for f in os.listdir(indir) if f.endswith('.txt')]
In [948]:
def mash_it_up(the_files_list):
    ## Concatenate the contents of every named lyric file into one big text
    ## block, used downstream for corpus-wide word counts. Empty files
    ## contribute nothing.
    base_dir = "reece_results/lyric_files/"
    chunks = []
    for fname in the_files_list:
        contents = open(base_dir + fname).readlines()
        if contents:
            chunks.append(''.join(contents))
    return ''.join(chunks)
def parse_the_lyrics_file(the_file):
    ## Read one lyrics file and flatten it into a numpy array of lowercase
    ## words: newlines and hyphens become word separators, the punctuation
    ## marks below are deleted outright, and the result is split on
    ## whitespace so every element is a single word.
    lyrics_path = "reece_results/lyric_files/" + the_file
    raw_lines = open(lyrics_path).readlines()
    punctuation = ["'", '"', '.', ',', '?', '!', ':', ';', '(', ')', '/', '\\']
    flattened = ''
    for raw in raw_lines:
        cleaned = raw.replace('\n', ' ').replace('-', ' ')
        for mark in punctuation:
            cleaned = cleaned.replace(mark, '')
        flattened += cleaned.lower()
    return np.array(flattened.split())
def word_count(text, word):
    ## Count how many times `word` appears in `text`.
    ## `text` is coerced to a numpy array so the equality comparison is
    ## vectorized; the return value is a plain integer.
    if not isinstance(text, np.ndarray):
        text = np.array(text)
    return int(np.count_nonzero(text == word))
def word_dict(the_lyrics, word_list=None):
    ## Given a flat text block (a string), return a {word: count} dictionary.
    ## If `word_list` is None, every unique word in the text is a candidate.
    ## Words of length 1 are skipped, and words occurring at most once are
    ## dropped from the result.
    if word_list is None:  ## `is None`, not `== None`
        word_list = list(set(the_lyrics.lower().split()))
    tokens = np.array(the_lyrics.lower().split())
    allwords = {}
    for word in word_list:
        if len(word) > 1:
            ## count once -- the original called word_count() twice per word,
            ## doubling the work; the vectorized comparison is inlined here
            count = int(np.count_nonzero(tokens == word))
            if count > 1:
                allwords[word] = count
    return allwords
def top_words(the_lyrics):
    ## Count every repeated word in the text (via word_dict) and return a
    ## DataFrame with columns Word / Count, sorted by Count descending.
    lyrics_dict = word_dict(the_lyrics)
    ## Build from parallel lists: the original passed the {word: count} dict
    ## straight to DataFrame with columns=['Word','Count'], which does not
    ## produce Word/Count columns (scalar-dict construction fails outright).
    words = list(lyrics_dict.keys())
    the_dataframe = DataFrame({'Word': words,
                               'Count': [lyrics_dict[w] for w in words]},
                              columns=['Word', 'Count'])
    ## DataFrame.sort() took `columns=` (not `column=`, as originally written)
    ## and has since been removed from pandas; prefer sort_values when present.
    if hasattr(the_dataframe, 'sort_values'):
        return the_dataframe.sort_values(by='Count', ascending=False)
    return the_dataframe.sort(columns='Count', ascending=False)
def unique_artists(file_list):
    ## Despite the name, this maps each lyric-file name back to its row index
    ## in the original dataframe: the index is the last '_'-separated token of
    ## the file name, just before the ".txt" extension.
    song_indices = []
    for fname in file_list:
        index_token = fname.split('_')[-1].split('.')[0]
        ## int() replaces the original eval() -- there is no reason to
        ## evaluate file-name text as arbitrary Python code.
        song_indices.append(int(index_token))
    return song_indices
def lyric_counts(the_file_list, df_lyrics):
    ## For every saved lyrics file, look up its artist/song in `df_lyrics`
    ## (the file name ends in the dataframe row index) and count the six
    ## keywords of interest. Empty files contribute zero counts.
    ## Returns eight parallel lists: (artists, songs, science, education,
    ## knowledge, teacher, teach, school).
    indir = "reece_results/lyric_files/"
    keywords = ['science', 'education', 'knowledge', 'teacher', 'teach', 'school']
    artists = []
    songs = []
    ## one count list per keyword, replacing six copy-pasted blocks
    counts = dict((word, []) for word in keywords)
    for fname in the_file_list:
        ## The row index is the final '_'-separated token before ".txt";
        ## int() replaces the original eval() on file-name text.
        ind = int(fname.split('_')[-1].split('.')[0])
        artists.append(df_lyrics.Artist[ind])
        songs.append(df_lyrics.Song[ind])
        lyrics_text = open(indir + fname).readlines()
        if len(lyrics_text) != 0:
            text_array = np.array(' '.join(lyrics_text).lower().split())
            for word in keywords:
                counts[word].append(int(np.count_nonzero(text_array == word)))
        else:
            ## empty file: zero for every keyword
            for word in keywords:
                counts[word].append(0)
    return (artists, songs, counts['science'], counts['education'],
            counts['knowledge'], counts['teacher'], counts['teach'],
            counts['school'])
In [753]:
## Concatenate every lyric file into one big text block for corpus-wide counts.
bigtext = mash_it_up(files)  ## big text block
In [782]:
## Count each of the six keywords of interest across the entire corpus at once.
table = word_dict(bigtext, word_list = ['science','education', 'knowledge', 'teacher', 'teach', 'school'])
In [783]:
## Counts of all 6 important words in ALL of the text.
## NOTE(review): word_dict drops words whose total count is <= 1, so a
## keyword can be missing from this table entirely.
print table
In [986]:
## Just counting the number of times a "nigga" variant was used.
print word_count(np.array(bigtext.split()), 'nigga')
print word_count(np.array(bigtext.split()), 'niggas')
print word_count(np.array(bigtext.split()), 'niggaz')
print word_count(np.array(bigtext.split()), 'nigger')
print word_count(np.array(bigtext.split()), 'niggers')
In [808]:
## Map each saved lyric file back to its row in lyrics_df (the row index is
## encoded at the end of the file name) and pull the matching artist names.
indices = unique_artists(files)
artists = lyrics_df.Artist[indices]
In [819]:
## Ranked table of every repeated word in the corpus (see top_words).
every_unique_word = top_words(bigtext)
In [829]:
## Dictionary of counts for every repeated word in the corpus.
every_word = word_dict(bigtext)
In [839]:
import operator
## Sort (word, count) pairs by count, ascending; the export cell below walks
## the list in reverse to write the most frequent words first.
sorted_words = sorted(every_word.items(), key=operator.itemgetter(1))
In [854]:
## Just a quick little loop to save the counts for every single word throughout
## all of the lyric files, most frequent first.
## with-open guarantees the file is closed even if a write raises, and
## reversed() replaces the original manual range(len-1, -1, -1) index walk.
fdir = "reece_results/"
fmt = "%s,%i\n"
with open(fdir + "word_results.csv", "w") as fout:
    for word, count in reversed(sorted_words):
        fout.write(fmt % (word, count))
In [949]:
## Per-song counts for all six keywords of interest, as eight parallel lists.
artists, songs, science, education, knowledge, teacher, teach, school = lyric_counts(files, lyrics_df)
In [951]:
## Assemble the per-song keyword counts into one dataframe for sorting/export.
newdf = DataFrame({'Artist Name':artists, 'Song Name':songs, 'science':science, 'education':education, 'knowledge':knowledge, 'teacher':teacher, 'teach':teach, 'school':school})
In [952]:
## Save sorted results for each word: the 50 highest-count songs per keyword.
## NOTE(review): DataFrame.sort() took the keyword `columns=`, not `column=`
## (so the original six calls raised TypeError), and .sort() has since been
## removed from pandas; sort_values is used when available.
def _save_top50(frame, word):
    ## Write the 50 highest-count songs for `word` to its own CSV file.
    if hasattr(frame, 'sort_values'):
        ranked = frame.sort_values(by=word, ascending=False)
    else:
        ranked = frame.sort(columns=word, ascending=False)
    ranked[:50].to_csv('reece_results/songs_top_%s.csv' % word)

for keyword in ['science', 'education', 'knowledge', 'teacher', 'teach', 'school']:
    _save_top50(newdf, keyword)
In [971]:
## Checking to see how many of my saved lyric files were empty
the_count = 0
for mm in range(len(files)):
intext = open('reece_results/lyric_files/'+files[mm]).readlines()
if len(intext) == 0:
the_count = the_count + 1
print the_count