This notebook contains the Python code described in my blog post on scraping song lyrics from Genius.
Head over to my GitHub repository to clone my Python wrapper.
In [137]:
# Sign up for a free account at Genius.com to access the API
# http://genius.com/api-clients
import os

# Read the token from the GENIUS_CLIENT_ACCESS_TOKEN environment variable when it
# is set, so a real credential never has to be saved inside the notebook. The
# original placeholder string remains the fallback value.
client_access_token = os.environ.get('GENIUS_CLIENT_ACCESS_TOKEN', 'CLIENT_ACCESS_TOKEN')
In [138]:
# Let's take a look at how we might search for an artist using the Genius API.
import requests
try:
    # Python 2
    import urllib2
    from urllib import quote
except ImportError:
    # Python 3 split urllib2 into urllib.request / urllib.parse. Keep the name
    # `urllib2` bound so the cells that call urllib2.Request / urllib2.urlopen
    # continue to work unchanged.
    import urllib.request as urllib2
    from urllib.parse import quote

# Format a request URL for the Genius API
search_term = 'Andy Shauf'
_URL_API = "https://api.genius.com/"
_URL_SEARCH = "search?q="
# quote() percent-encodes the search term (e.g. the space) so it is safe in a URL
querystring = _URL_API + _URL_SEARCH + quote(search_term)
request = urllib2.Request(querystring)
# The API token is sent as a Bearer token in the Authorization header
request.add_header("Authorization", "Bearer " + client_access_token)
# request.add_header("User-Agent","curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpnSSL 0.9.6b) (ipv6 enabled)")
# Override the library's default User-Agent header with an empty value
request.add_header("User-Agent", "")
In [139]:
# Now that we’ve formatted the URL, we can make a request to the database.
import json
# Give up if the server has not answered within 3 seconds
response = urllib2.urlopen(request, timeout=3)
try:
    raw = response.read()
finally:
    # Release the underlying connection even if reading the body fails
    response.close()
# Decode the JSON response body into Python objects
json_obj = json.loads(raw)
In [140]:
# The JSON object is just a normal python dictionary
# list(...) displays its top-level keys; unlike dict.viewkeys() it is available
# on both Python 2 and Python 3.
list(json_obj)
Out[140]:
In [141]:
# The 'hits' key stores info on each song in the search result.
# From here it's easy to grab the song title, album, etc.
# List each key contained within a single search hit
# (list() over the dict yields its keys without an identity comprehension)
list(json_obj['response']['hits'][0]['result'])
Out[141]:
In [94]:
# Collect the title of every song returned by the search
search_hits = json_obj['response']['hits']
[hit['result']['title'] for hit in search_hits]
Out[94]:
In [95]:
# Print the image URL of the primary artist on the first search hit
first_result = json_obj['response']['hits'][0]['result']
print(first_result['primary_artist']['image_url'])
In [161]:
# If you have an artist or song ID, you can access that entry
# directly by reformatting the request URL.
song_id = 82926
querystring = "https://api.genius.com/songs/" + str(song_id)
request = urllib2.Request(querystring)
request.add_header("Authorization", "Bearer " + client_access_token)
# Override the library's default User-Agent header with an empty value
request.add_header("User-Agent", "")
# Give up if the server has not answered within 3 seconds
response = urllib2.urlopen(request, timeout=3)
try:
    raw = response.read()
finally:
    # Release the underlying connection even if reading the body fails
    response.close()
json_obj = json.loads(raw)
# Show the (title, primary artist name) pair for the requested song
song_info = json_obj['response']['song']
print((song_info['title'], song_info['primary_artist']['name']))
In [165]:
from bs4 import BeautifulSoup
import re
URL = 'https://genius.com/Andy-shauf-the-magician-lyrics'
# Use the same 3-second timeout as the API requests so a stalled connection
# cannot hang the notebook indefinitely.
page = requests.get(URL, timeout=3)
page.raise_for_status()  # fail early on a 4xx/5xx response instead of parsing an error page
html = BeautifulSoup(page.text, "html.parser") # Parse the page's HTML into a searchable tree
# Scrape the song lyrics from the HTML
lyrics_div = html.find("div", class_="lyrics")
if lyrics_div is None:
    # find() returns None when nothing matches; report that explicitly rather
    # than failing with an AttributeError on the next line.
    raise ValueError('No <div class="lyrics"> element found at ' + URL)
# Drop non-ASCII characters, then decode back to text so that the string
# concatenation in the print below works on Python 3 as well as Python 2.
lyrics = lyrics_div.get_text().encode('ascii','ignore').decode('ascii')
# Optional clean-up steps:
# lyrics = re.sub('\[.*\]','',lyrics) # Remove [Verse] and [Bridge] stuff
# lyrics = re.sub('\n{2}','',lyrics) # Remove gaps between verses
# lyrics = str(lyrics).strip('\n')
# Preview only the first 150 characters
print(lyrics[:150]+'...')
You may need to run the code below from the terminal after cloning the repo: https://github.com/johnwmillr/geniusapi
In [ ]:
# Create an instance of the API interface
# `genius` is a local module rather than a standard-library one (presumably the
# wrapper from the cloned repo), so it must be importable from the current
# working directory or Python path.
import genius
# No arguments are passed here; presumably the wrapper locates the access
# token on its own (TODO confirm against the wrapper's README).
api = genius.Genius()
In [ ]:
# Search for an artist
# Use the `api` instance created in the previous cell (the name `G` was never
# defined in this notebook). max_songs=5 is passed through to the wrapper,
# presumably capping how many songs are collected for the artist.
artist = api.search_artist('Andy Shauf', max_songs=5)
print(artist)
In [ ]:
# Search for a specific song
# Use the `api` instance created earlier (the name `G` was never defined in
# this notebook). The first argument is presumably the song title; the second
# is the name of the artist found above.
song = api.search_song('Wendell Walker', artist.name)
# Attach the song to the artist, then show the artist and its first song
artist.add_song(song)
print(artist)
print(artist.songs[0])