In [1]:
from pymongo import MongoClient
# Connect to MongoDB using the driver's default host/port and grab a handle on
# the collection that the scraping cell below writes lyrics documents into.
client = MongoClient()
db = client.get_database("lyrical_classifier")
song_coll = db.get_collection("songs")

In [2]:
import requests
# Ask the Spotify Web API track search for up to 50 US-market tracks matching
# the artist name. The response object `r` is read by the following cell.
# NOTE(review): newer versions of the Spotify Web API reportedly require an
# OAuth bearer token on this endpoint -- confirm before relying on an
# unauthenticated call.
artist_name = "Kendrick Lamar"
r = requests.get(
    "https://api.spotify.com/v1/search",
    # Passing the query as `params` lets requests URL-encode it, so names
    # containing "&", "#" or "+" cannot corrupt the query string.
    params={"q": artist_name, "type": "track", "market": "US", "limit": 50},
    timeout=10,  # seconds; without a timeout a stalled connection hangs the cell
)
# Fail here with a clear HTTP error rather than a KeyError on the JSON body later.
r.raise_for_status()

In [3]:
# Keep only the search hits whose first-listed artist is the requested one
# (case-insensitive comparison).
wanted_artist = artist_name.lower()
matching_artist = [
    item
    for item in r.json()["tracks"]["items"]
    if item["artists"][0]["name"].lower() == wanted_artist
]

# De-duplicate by exact track title, keeping the first occurrence of each title
# in the order the API returned them.
seen_songs = set()
songs = []
for track in matching_artist:
    title = track["name"]
    if title in seen_songs:
        continue
    seen_songs.add(title)
    songs.append(track)

In [ ]:
from bs4 import BeautifulSoup

# Scrape azlyrics for every de-duplicated track and upsert one document per
# Spotify track id into `song_coll`.
#
# For each track: search azlyrics for "<artist> <title>", follow the first
# search hit, extract the lyrics text from that page and store it together with
# the track metadata. Any StandardError raised while handling a track is
# reported and the loop moves on to the next track (best-effort crawl).

# User-Agent sent with the lyrics-page request only, exactly as before.
# NOTE(review): presumably needed because the site rejects the default
# python-requests agent -- confirm.
LYRICS_HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
# Position of the lyrics <div> among the divs of the page's main column.
# NOTE(review): tied to the azlyrics layout this was written against -- confirm.
LYRICS_DIV_INDEX = 7
# Seconds; without a timeout a stalled connection would hang the cell forever.
REQUEST_TIMEOUT = 10

for track in songs:
    try:
        # ASCII-only title ("?" replaces anything else); used for search and storage.
        song_name = track["name"].encode("ascii", "replace")
        spotify_id = track["id"]
        album_name = track["album"]["name"]

        # Use `artist_name` (not a hard-coded literal) so the search matches the
        # artist written to the document, and pass the query through `params`
        # so titles containing "&" or "#" are URL-encoded instead of truncated.
        search_song = requests.get(
            "http://search.azlyrics.com/search.php",
            params={"q": "{artist_name} {song_name}".format(artist_name=artist_name, song_name=song_name)},
            timeout=REQUEST_TIMEOUT,
        )
        # Do not parse an HTTP error page as if it were a result list.
        search_song.raise_for_status()
        soup = BeautifulSoup(search_song.text, 'html.parser')
        result_links = soup.select("tr a")
        if not result_links:
            raise LookupError("no azlyrics search results")
        lyrics_url = result_links[0]['href']
        print(lyrics_url)

        lyrics_page = requests.get(lyrics_url, headers=LYRICS_HEADERS, timeout=REQUEST_TIMEOUT)
        # Do not store the text of an error/block page as lyrics.
        lyrics_page.raise_for_status()
        lyrics_soup = BeautifulSoup(lyrics_page.content, 'html.parser')
        column_divs = lyrics_soup.select(".col-xs-12.col-lg-8.text-center div")
        if len(column_divs) <= LYRICS_DIV_INDEX:
            raise LookupError("unexpected lyrics page layout at {0}".format(lyrics_url))
        lyrics = column_divs[LYRICS_DIV_INDEX].get_text()

        # Replace-or-insert the whole document, keyed on the Spotify track id.
        # NOTE(review): Collection.update is deprecated in PyMongo 3 and removed
        # in PyMongo 4; replace_one(..., upsert=True) looks like the equivalent
        # there -- confirm against the installed driver version.
        song_coll.update({
                "spotify_id": spotify_id
            },{
                "spotify_id": spotify_id,
                "artist": artist_name,
                "song": song_name,
                "album": album_name,
                # Non-ASCII characters are dropped from the stored lyrics.
                "lyrics": lyrics.encode('ascii', 'ignore')
            }, upsert=True)
    except StandardError as e:
        # Best-effort: say which track failed and why, then carry on.
        print("Skipping {0!r}: {1}".format(track.get("name"), e))