In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer


import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

py.sign_in('maxrose', '••••••••••')

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)
df = pd.read_table('data/bs_covers.csv', encoding= 'utf-8',header=0,delimiter='|')
bs = pd.read_table('data/allalbums.zip', encoding= 'utf-8',header=0,delimiter='|')
bs['releasedate'] = pd.to_datetime(pd.Series(bs.releasedate))
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)
plt.rcParams['figure.figsize'] = 8, 6

bs.head(1)
#df.groupby('recordname').hist()


Out[1]:
artistid artist songname recordingid releasename release_group releasedate labelname countryid countryname workid rating
0 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1987-04-30 Capitol Records 82 Ghana 15243 0

In [161]:
print bs.shape
bs.releasedate.min()


(1755420, 14)
Out[161]:
Timestamp('1980-01-01 00:00:00')

In [2]:
# Update various data mistakes
bs.set_value(bs.songname.str.contains('Keep'), 'songname', 'Keep Your Hands Off My Baby')
bs.set_value(bs.songname.str.contains('Baby You’re a Rich Man'), 'songname', 'Baby, You’re a Rich Man')
bs.set_value(bs.songname.str.contains("It's Only Rock 'n' Roll (but I Like It)"), 'songname', "It's Only Rock 'n Roll (But I Like It)")
bs.set_value(bs.songname.str.contains("When I’m Sixty‐Four"), 'songname', "When I'm Sixty-Four")
bs.set_value(bs.songname.str.contains("Ob‐La‐Di, Ob‐La‐Da"), 'songname', "Ob-La-Di, Ob-La-Da")
bs.set_value(bs.songname.str.contains("Sure To Fall"), 'songname', "Sure To Fall (In Love With You)")
bs.set_value(bs.songname.str.contains("Honey, Don’t!"), 'songname', "Honey, Don’t")
bs.set_value(bs.songname.str.contains("Love in Vain Blues"), 'songname', "Love in Vain")


/Users/maxrose/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: UserWarning:

This pattern has match groups. To actually get the groups, use str.extract.

Out[2]:
artistid artist songname recordingid releasename release_group releasedate labelname countryid countryname workid rating
0 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1987-04-30 Capitol Records 82 Ghana 15243 0
1 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1987-04-30 Capitol Records 198 Sudan 15243 0
2 303 The Beatles Back in the U.S.S.R. 159792 1967–1970 17637 2010-10-18 Apple Records 250 Åland Islands 159792 0
3 303 The Beatles Back in the U.S.S.R. 159792 The Beatles 18243 1980-01-01 Capitol Records 189 Slovakia 159792 0
4 303 The Beatles While My Guitar Gently Weeps 159798 The Beatles 18243 1980-01-01 Apple Records 23 Benin 159798 0
5 303 The Beatles While My Guitar Gently Weeps 159798 The Beatles 18243 1987-08-24 Parlophone 128 Malawi 159798 0
6 303 The Beatles While My Guitar Gently Weeps 159798 The Beatles Box Set 1572813 1988-11-15 Parlophone 198 Sudan 159798 0
7 303 The Beatles Happiness Is a Warm Gun 159799 The Beatles 18243 1987-08-24 Parlophone 92 Guyana 159799 0
8 303 The Beatles Happiness Is a Warm Gun 159799 The Beatles 18243 2009-09-09 Apple Records 198 Sudan 159799 0
9 303 The Beatles Blackbird 159802 The Beatles 18243 1980-01-01 Capitol Records 14 Austria 159802 0
10 303 The Beatles Blackbird 159802 The Beatles 18243 1980-01-01 Parlophone 137 Mayotte 159802 0
11 303 The Beatles Blackbird 159802 The Beatles 18243 1987-08-24 Parlophone 137 Mayotte 159802 0
12 303 The Beatles Blackbird 159802 The Beatles 18243 1987-08-25 Capitol Records 7 Anguilla 159802 0
13 303 The Beatles I Will 159807 The Beatles 18243 1987-08-24 Parlophone 84 Greece 6279932 0
14 303 The Beatles I Will 159807 The Beatles 18243 2009-09-09 Apple Records 171 Portugal 6279932 0
15 303 The Beatles Julia 159808 Imagine: John Lennon: Music From the Motion Pi... 52754 1980-01-01 Capitol Records 5 Andorra 9740934 0
16 303 The Beatles Julia 159808 Imagine: John Lennon: Music From the Motion Pi... 52754 1988-10-10 Parlophone 86 Grenada 9740934 0
17 303 The Beatles From Me to You 177449 1 21569 2000-11-13 EMI 160 Norway 177449 0
18 303 The Beatles From Me to You 177449 Meet The Beatles 1399698 2014-07-22 Capitol Records 198 Sudan 177449 0
19 303 The Beatles Can’t Buy Me Love 177452 1 21569 2000-11-13 Apple Records 7 Anguilla 1141092 0
20 303 The Beatles We Can Work It Out 177460 1962–1966 50580 1980-01-01 Euroton 32 Brunei 5682217 0
21 303 The Beatles We Can Work It Out 177460 1962–1966 50580 1993-10-05 Capitol Records 67 Estonia 5682217 0
22 303 The Beatles We Can Work It Out 177460 Past Masters, Volume Two 39741 1980-01-01 Odeon 131 Mali 5682217 0
23 303 The Beatles Yer Blues 182546 The Beatles 18243 1987-08-24 Parlophone 157 Niue 182546 0
24 303 The Beatles Yer Blues 182546 The Beatles Box Set 1572813 1988-11-15 Apple Records 80 Georgia 182546 0
25 303 The Beatles Yer Blues 182546 The Beatles Box Set 1572813 1988-11-15 Apple Records 198 Sudan 182546 0
26 303 The Beatles Sexy Sadie 182549 The Beatles 18243 1980-01-01 Capitol Records 175 Romania 182549 0
27 303 The Beatles Sexy Sadie 182549 The Beatles Box Set 1572813 1988-11-15 Apple Records 92 Guyana 182549 0
28 303 The Beatles Sexy Sadie 182549 The Beatles Box Set 1572813 1988-11-15 Capitol Records 71 Fiji 182549 0
29 303 The Beatles Helter Skelter 182550 The Beatles Box Set 1572813 1988-11-15 Apple Records 254 Serbia 182550 86
... ... ... ... ... ... ... ... ... ... ... ... ...
1755390 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 227 Vatican City 5652898 0
1755391 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 228 Venezuela 5652898 0
1755392 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 229 Vietnam 5652898 0
1755393 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 230 British Virgin Islands 5652898 0
1755394 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 231 U.S. Virgin Islands 5652898 0
1755395 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 232 Wallis and Futuna 5652898 0
1755396 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 233 Western Sahara 5652898 0
1755397 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 234 Yemen 5652898 0
1755398 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 235 Yugoslavia 5652898 0
1755399 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 236 Democratic Republic of the Congo 5652898 0
1755400 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 237 Zambia 5652898 0
1755401 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 238 Zimbabwe 5652898 0
1755402 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 242 Serbia and Montenegro 5652898 0
1755403 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 243 Soviet Union 5652898 0
1755404 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 244 East Germany 5652898 0
1755405 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 245 Czechoslovakia 5652898 0
1755406 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 247 Montenegro 5652898 0
1755407 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 248 South Georgia and the South Sandwich Islands 5652898 0
1755408 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 249 Palestine 5652898 0
1755409 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 250 Åland Islands 5652898 0
1755410 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 251 Guernsey 5652898 0
1755411 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 252 Isle of Man 5652898 0
1755412 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 253 Jersey 5652898 0
1755413 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 254 Serbia 5652898 0
1755414 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 255 Saint Barthélemy 5652898 0
1755415 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 256 Saint Martin (French part) 5652898 0
1755416 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 257 South Sudan 5652898 0
1755417 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 258 Bonaire, Sint Eustatius and Saba 5652898 0
1755418 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 259 Curaçao 5652898 0
1755419 825 The Rolling Stones Honky Tonk Women 17923564 Sticky Fingers 31058 2015-06-09 UMe 260 Sint Maarten (Dutch part) 5652898 0

1755420 rows × 12 columns


In [ ]:
bs.replace

In [3]:
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()

#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()


aggregations = {
    'releasename':{'num_releases':'count'},
    'releasedate':{'maxreleasedate':'max'},
    'countryname':{'countries':'count'},
    'rating': {'avg_rating':'mean'}
}
#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename','release_group','releasedate']).agg(aggregations).reset_index()
bsagg = bs.groupby(['workid', 'artist', 'songname']).agg(aggregations).reset_index()
bsagg.columns = bsagg.columns.droplevel(1)
bsagg.columns = ['workid','artist','songname','maxreleasedate','num_releases','countries','avg_rating']

In [ ]:
bsagg[bsagg.avg_rating > 0].songname

In [214]:
bsagg.head()


Out[214]:
workid artist songname maxreleasedate num_releases countries avg_rating
0 3631 The Beatles Tomorrow Never Knows 2014-01-21 3570 3570 0.0
1 6794 The Beatles Only a Northern Song 2012-06-04 3825 3825 0.0
2 6795 The Beatles All Together Now 2012-06-04 3825 3825 0.0
3 6796 The Beatles Hey Bulldog 2012-06-04 3825 3825 0.0
4 6797 The Beatles It’s All Too Much 2012-06-04 3825 3825 0.0

In [234]:
vc = pd.DataFrame(bs.songname.value_counts().reset_index())
#bs.drop(['timesrec_x','timesrec_y'],axis=1, inplace=True)                  

vc.columns = ['songname', 'timesrec']

#trk.head()
bs = pd.merge(bs, vc, on='songname')
bs.head()


Out[234]:
artistid artist songname recordingid releasename release_group releasedate labelname countryid countryname workid rating timesrec
0 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1987-04-30 Capitol Records 82 Ghana 15243 0 2805
1 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1987-04-30 Capitol Records 198 Sudan 15243 0 2805
2 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1980-01-01 Mobile Fidelity Sound Lab 1 Afghanistan 15243 0 2805
3 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1980-01-01 Mobile Fidelity Sound Lab 2 Albania 15243 0 2805
4 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1980-01-01 Mobile Fidelity Sound Lab 3 Algeria 15243 0 2805

In [ ]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1}) 
#bs.boxplot(kind='box')

In [ ]:
import plotly
plotly.offline.iplot([{
    'x': bs.releasedate,
    'y': bs.timesrec
    
}])

In [ ]:
sns.heatmap(bs.corr())

In [ ]:
#bsagg[['artist','num_releases','maxreleasedate']].scatter_matrix()
from pandas.tools.plotting import scatter_matrix

scatter_matrix(bs, alpha=0.5, figsize=(9, 9), diagonal='kde')

In [218]:
def detect_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [235]:
bsagg['title_sent'] = bsagg.songname.apply(detect_sentiment)
bsagg['lyric_sent'] = bsagg.lyrics.apply(detect_sentiment)

In [ ]:
bsagg.boxplot(column='sentiment', by='songname')

In [ ]:
bsagg[bsagg.sentiment > 0.4].groupby(bsagg.artist).count()

In [236]:
# encoding: utf-8
import requests as req
from urllib import quote_plus
from bs4 import BeautifulSoup, SoupStrainer
only_lyrics = SoupStrainer('div',class_='lyricbox')

In [10]:
#lyrics = []
#songs = ['please please me', 'yesterday','i wanna hold your hand']
#artist = ['The Beatles']
#quote_plus(song.encode('utf-8'), safe='-')
baseurl = 'http://lyrics.wikia.com/wiki/'
def getLyrics(workid,artist, song):
    ssong = song.replace("\’", '\'')
    print ssong
    tUrl = "%s%s:%s" %(baseurl,artist,ssong)
    result = req.get(tUrl)
    lyric = BeautifulSoup(result.text, 'html.parser', parse_only=only_lyrics).get_text(' ',strip=True)
    #lyrics.append(workid, song, lyric)
    return lyric

In [11]:
lyrics = []

for idx, row in bsagg.iterrows():
    lyrics.append(getLyrics(row.workid, row.artist, row.songname))
bsagg['lyrics'] = lyrics


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-11-5fc7b4f282a5> in <module>()
      2 
      3 for idx, row in bsagg.iterrows():
----> 4     lyrics.append(getLyrics(row.workid, row.artist, row.songname))
      5 bsagg['lyrics'] = lyrics

<ipython-input-10-a892932f5bff> in getLyrics(workid, artist, song)
      5 baseurl = 'http://lyrics.wikia.com/wiki/'
      6 def getLyrics(workid,artist, song):
----> 7     ssong = song.replace("\’", '\'')
      8     print ssong
      9     tUrl = "%s%s:%s" %(baseurl,artist,ssong)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 1: ordinal not in range(128)

In [242]:
bsagg['title_sent'] = bsagg.songname.apply(detect_sentiment)
bsagg['lyric_sent'] = bsagg.lyrics.apply(detect_sentiment)

In [240]:
bsagg.drop(['sentiment'], axis=1, inplace=True)

In [243]:
bsagg.head()


Out[243]:
workid artist songname maxreleasedate num_releases countries avg_rating lyrics title_sent lyric_sent
0 3631 The Beatles Tomorrow Never Knows 2014-01-21 3570 3570 0.0 Turn off your mind Relax and float down stream... 0.0 -0.101587
1 6794 The Beatles Only a Northern Song 2012-06-04 3825 3825 0.0 If you're listening to this song You may think... 0.0 -0.045179
2 6795 The Beatles All Together Now 2012-06-04 3825 3825 0.0 One, two, three, four Can I have a little more... 0.0 0.122569
3 6796 The Beatles Hey Bulldog 2012-06-04 3825 3825 0.0 Sheepdog standing in the rain Bullfrog doing i... 0.0 0.129570
4 6797 The Beatles It’s All Too Much 2012-06-04 3825 3825 0.0 Spoken: To your mother! It's all too much It's... 0.2 0.224747

In [245]:
bsagg[bsagg.lyrics == ''].count()


Out[245]:
workid            31
artist            31
songname          31
maxreleasedate    31
num_releases      31
countries         31
avg_rating        31
lyrics            31
title_sent        31
lyric_sent        31
dtype: int64

In [ ]: