In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
import plotly.plotly as py
import plotly.tools as tls
# --- Plotly / cufflinks configuration ---------------------------------------
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

# Credentials are read from the environment so that no API key is stored in
# the notebook. PLOTLY_USERNAME falls back to the account name used before.
# Sign-in is skipped when no key is provided; cufflinks is switched to offline
# mode just below. NOTE(review): confirm nothing else in the notebook needs an
# authenticated Plotly session.
import os
plotly_user = os.environ.get('PLOTLY_USERNAME', 'maxrose')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_api_key:
    py.sign_in(plotly_user, plotly_api_key)
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

# --- Load data ---------------------------------------------------------------
# Both inputs are pipe-delimited, UTF-8 encoded, with a header row.
#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)
df = pd.read_table('data/bs_covers.csv', encoding='utf-8', header=0, delimiter='|')
bs = pd.read_table('data/allalbums.zip', encoding='utf-8', header=0, delimiter='|')

# Parse release dates so that min()/max() further down compare datetimes.
bs['releasedate'] = pd.to_datetime(bs.releasedate)
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)

# Default figure size for the matplotlib plots in this notebook.
plt.rcParams['figure.figsize'] = 8, 6

# Show the first row as the cell output.
bs.head(1)
#df.groupby('recordname').hist()
Out[1]:
In [161]:
# Sanity check: print the frame dimensions, then show the earliest release
# date as the cell's output value.
print(bs.shape)
bs['releasedate'].min()
Out[161]:
In [2]:
# Update various data mistakes.
#
# Each entry is (fragment to look for in `songname`, corrected title). Every
# row whose title contains the fragment gets the corrected title, in the order
# listed here.
#
# The fragments are matched as plain text (regex=False): several of them
# contain characters that are special in a regular expression, e.g. the
# parentheses in "(but I Like It)", which as a regex form a group and so do
# not match the literal brackets. na=False treats missing titles as "no
# match" so the mask is always boolean.
#
# The u'' prefix keeps patterns and replacements as text on Python 2, where
# the titles are loaded as unicode; it has no effect on Python 3.
SONGNAME_FIXES = [
    (u'Keep', u'Keep Your Hands Off My Baby'),
    (u'Baby You’re a Rich Man', u'Baby, You’re a Rich Man'),
    (u"It's Only Rock 'n' Roll (but I Like It)", u"It's Only Rock 'n Roll (But I Like It)"),
    (u"When I’m Sixty‐Four", u"When I'm Sixty-Four"),
    (u"Ob‐La‐Di, Ob‐La‐Da", u"Ob-La-Di, Ob-La-Da"),
    (u"Sure To Fall", u"Sure To Fall (In Love With You)"),
    (u"Honey, Don’t!", u"Honey, Don’t"),
    (u"Love in Vain Blues", u"Love in Vain"),
]

for wrong_fragment, corrected_title in SONGNAME_FIXES:
    needs_fix = bs.songname.str.contains(wrong_fragment, regex=False, na=False)
    # .loc with a boolean mask updates all matching rows in place.
    bs.loc[needs_fix, 'songname'] = corrected_title
Out[2]:
In [ ]:
# NOTE(review): bare attribute reference with no call. Evaluating this cell
# only displays the bound method `bs.replace`; it does not modify `bs`.
bs.replace
In [3]:
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()
#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()
# Summarise the release rows into one row per (workid, artist, songname).
song_keys = ['workid', 'artist', 'songname']

# Source column -> aggregation applied to it within each song group.
aggregations = {
    'releasedate': 'max',
    'releasename': 'count',
    'countryname': 'count',
    'rating': 'mean',
}

# Source column -> name of the resulting summary column. Renaming by source
# column (rather than overwriting `columns` with a positional list) ensures
# each label stays attached to the right aggregate whatever order the
# aggregation dict is iterated in.
summary_names = {
    'releasedate': 'maxreleasedate',
    'releasename': 'num_releases',
    'countryname': 'countries',
    'rating': 'avg_rating',
}

# Final column layout of the summary frame.
summary_order = song_keys + ['maxreleasedate', 'num_releases', 'countries', 'avg_rating']

bsagg = (
    bs.groupby(song_keys)
    .agg(aggregations)
    .reset_index()
    .rename(columns=summary_names)
    .reindex(columns=summary_order)
)
In [ ]:
# Titles of the songs whose average rating is above zero.
has_rating = bsagg['avg_rating'] > 0
bsagg.loc[has_rating, 'songname']
In [214]:
# Preview the first five rows of the per-song summary.
bsagg.head(5)
Out[214]:
In [234]:
# Number of rows in `bs` per song title, as a two-column frame.
vc = bs.songname.value_counts().reset_index()
vc.columns = ['songname', 'timesrec']

# Remove any count columns left behind by an earlier run of this cell before
# merging; otherwise a re-run produces timesrec_x / timesrec_y instead of a
# single timesrec column.
stale_counts = ['timesrec', 'timesrec_x', 'timesrec_y']
bs = pd.merge(bs.drop(stale_counts, axis=1, errors='ignore'), vc, on='songname')
bs.head()
Out[234]:
In [ ]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1})
#bs.boxplot(kind='box')
In [ ]:
import plotly

# One trace: release date on the x axis, per-title row count on the y axis.
releases_trace = dict(x=bs.releasedate, y=bs.timesrec)
plotly.offline.iplot([releases_trace])
In [ ]:
# Heatmap of the pairwise column correlations of `bs`.
correlations = bs.corr()
sns.heatmap(correlations)
In [ ]:
#bsagg[['artist','num_releases','maxreleasedate']].scatter_matrix()
# `scatter_matrix` is exposed from `pandas.plotting` in newer pandas releases;
# fall back to the older `pandas.tools.plotting` location when that import is
# unavailable.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of `bs` with kernel-density plots on the diagonal.
scatter_matrix(bs, alpha=0.5, figsize=(9, 9), diagonal='kde')
In [218]:
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The text is wrapped in a ``TextBlob`` and the ``polarity`` attribute of
    its ``sentiment`` is returned unchanged.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity
In [235]:
# Sentiment polarity of each song title.
bsagg['title_sent'] = bsagg.songname.apply(detect_sentiment)

# The `lyrics` column is only added by the scraping cell further down, so on a
# top-to-bottom run it does not exist yet at this point. Score the lyrics here
# only when they are already present; the same computation is repeated after
# the lyrics have been fetched.
if 'lyrics' in bsagg.columns:
    bsagg['lyric_sent'] = bsagg.lyrics.apply(detect_sentiment)
In [ ]:
# Box plot of the `sentiment` column, one box per song title.
# NOTE(review): no cell shown in this notebook creates a `sentiment` column
# (a later cell drops one), so this presumably relies on state from an earlier
# session and would fail on a fresh kernel — confirm or switch to an existing
# column.
bsagg.boxplot(column='sentiment', by='songname')
In [ ]:
# Per-artist non-null counts for the rows whose `sentiment` exceeds 0.4.
# Grouping uses the `bsagg.artist` Series rather than the column name.
# NOTE(review): depends on a `sentiment` column that no cell shown in this
# notebook creates — confirm where it is meant to come from.
bsagg[bsagg.sentiment > 0.4].groupby(bsagg.artist).count()
In [236]:
# encoding: utf-8
import requests as req
# quote_plus lives in `urllib` on Python 2 and in `urllib.parse` on Python 3.
try:
    from urllib import quote_plus
except ImportError:
    from urllib.parse import quote_plus
from bs4 import BeautifulSoup, SoupStrainer

# Parse filter limiting BeautifulSoup to <div class="lyricbox"> elements.
only_lyrics = SoupStrainer('div', class_='lyricbox')
In [10]:
#lyrics = []
#songs = ['please please me', 'yesterday','i wanna hold your hand']
#artist = ['The Beatles']
#quote_plus(song.encode('utf-8'), safe='-')
baseurl = 'http://lyrics.wikia.com/wiki/'


def getLyrics(workid, artist, song, timeout=30):
    """Fetch the lyrics text for one song from the lyrics wiki.

    The page requested is ``baseurl + artist + ':' + song``, with curly
    apostrophes in the title replaced by straight ones. The response is parsed
    with the module-level ``only_lyrics`` filter and the text of the matching
    elements is returned, joined with single spaces and stripped.

    Parameters
    ----------
    workid : any
        Accepted for call compatibility; not used by the lookup.
    artist : str
        Artist name, used as the first part of the page title.
    song : str
        Song title. Byte strings are decoded as UTF-8 first (assumed to match
        the encoding the data files are read with).
    timeout : float, optional
        Seconds to wait for the HTTP request before requests raises, so a
        stalled connection cannot hang the scraping loop indefinitely.

    Returns
    -------
    str
        Extracted lyrics text. NOTE(review): this appears to be an empty
        string when the page has no lyric box (the notebook later counts rows
        with ``lyrics == ''``) — confirm.
    """
    if isinstance(song, bytes):
        song = song.decode('utf-8')
    # Replace the curly apostrophe (U+2019) itself. The previous pattern "\’"
    # was a backslash followed by the curly apostrophe, which does not occur
    # in titles, so nothing was ever replaced.
    ssong = song.replace(u'\u2019', u"'")
    print(ssong)
    tUrl = "%s%s:%s" % (baseurl, artist, ssong)
    result = req.get(tUrl, timeout=timeout)
    lyric = BeautifulSoup(result.text, 'html.parser', parse_only=only_lyrics).get_text(' ', strip=True)
    return lyric
In [11]:
# Fetch the lyrics for every row of the per-song summary, in row order, and
# store them as a new `lyrics` column.
lyrics = [
    getLyrics(row.workid, row.artist, row.songname)
    for _, row in bsagg.iterrows()
]
bsagg['lyrics'] = lyrics
In [242]:
# Score titles and lyrics: (new sentiment column, text column it is based on).
for sentiment_col, text_col in (('title_sent', 'songname'), ('lyric_sent', 'lyrics')):
    bsagg[sentiment_col] = bsagg[text_col].apply(detect_sentiment)
In [240]:
# Remove the obsolete `sentiment` column. errors='ignore' makes the cleanup
# safe to re-run and safe on a frame that never had the column, instead of
# raising KeyError.
bsagg.drop(['sentiment'], axis=1, inplace=True, errors='ignore')
In [243]:
# Preview the first five rows, now including the lyrics and sentiment columns.
bsagg.head(5)
Out[243]:
In [245]:
# Per-column non-null counts for the rows whose lyrics came back empty.
no_lyrics = bsagg['lyrics'] == ''
bsagg[no_lyrics].count()
Out[245]:
In [ ]: