In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer


import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

py.sign_in('maxrose', '••••••••••')

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')


covers = pd.read_table('data/bs_covers.csv', encoding= 'utf-8',header=0,delimiter='|')
bs = pd.read_table('data/allalbums_newmeta2.csv', encoding= 'utf-8',header=0,delimiter='|')
bs['releasedate'] = pd.to_datetime(pd.Series(bs.releasedate))
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)
plt.rcParams['figure.figsize'] = 8, 6

bs.head(4)
#covers..groupby('recordname').hist()


Out[1]:
artistid artist songname releasename workid recordingid release_group releasedate label_cnt country_cnt rating
0 303 The Beatles A Taste of Honey 1962 Live Recordings 295302 4732310 420949 1962-01-01 1 1 0
1 303 The Beatles Ask Me Why 1962 Live Recordings 295296 4732323 420949 1962-01-01 1 1 0
2 303 The Beatles Be-Bop-A-Lula 1962 Live Recordings 8387505 4732321 420949 1962-01-01 1 1 0
3 303 The Beatles Everybody’s Trying to Be My Baby 1962 Live Recordings 368508 4732314 420949 1962-01-01 1 1 0

In [3]:
# Update various data mistakes
bs.set_value(bs.songname.str.contains('Keep'), 'songname', 'Keep Your Hands Off My Baby')
bs.set_value(bs.songname.str.contains('Baby You’re a Rich Man'), 'songname', 'Baby, You’re a Rich Man')
bs.set_value(bs.songname.str.contains("It's Only Rock 'n' Roll (but I Like It)"), 'songname', "It's Only Rock 'n Roll (But I Like It)")
bs.set_value(bs.songname.str.contains("When I’m Sixty‐Four"), 'songname', "When I'm Sixty-Four")
bs.set_value(bs.songname.str.contains("Ob‐La‐Di, Ob‐La‐Da"), 'songname', "Ob-La-Di, Ob-La-Da")
bs.set_value(bs.songname.str.contains("Sure To Fall"), 'songname', "Sure To Fall (In Love With You)")
bs.set_value(bs.songname.str.contains("Honey, Don’t!"), 'songname', "Honey, Don’t")
bs.set_value(bs.songname.str.contains("Love in Vain Blues"), 'songname', "Love in Vain")


/Users/maxrose/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: UserWarning:

This pattern has match groups. To actually get the groups, use str.extract.

Out[3]:
artistid artist songname releasename workid recordingid release_group releasedate label_cnt country_cnt rating
0 303 The Beatles A Taste of Honey 1962 Live Recordings 295302 4732310 420949 1962-01-01 1 1 0
1 303 The Beatles Ask Me Why 1962 Live Recordings 295296 4732323 420949 1962-01-01 1 1 0
2 303 The Beatles Be-Bop-A-Lula 1962 Live Recordings 8387505 4732321 420949 1962-01-01 1 1 0
3 303 The Beatles Everybody’s Trying to Be My Baby 1962 Live Recordings 368508 4732314 420949 1962-01-01 1 1 0
4 303 The Beatles Falling in Love Again (Can’t Help It) 1962 Live Recordings 2374813 4732319 420949 1962-01-01 1 1 0
5 303 The Beatles Hallelujah, I Love Her So 1962 Live Recordings 2648186 4732322 420949 1962-01-01 1 1 0
6 303 The Beatles Hippy Hippy Shake 1962 Live Recordings 6569228 4732303 420949 1962-01-01 1 1 0
7 303 The Beatles I Remember You 1962 Live Recordings 506842 4732329 420949 1962-01-01 1 1 0
8 303 The Beatles I Saw Her Standing There 1962 Live Recordings 295291 4732300 420949 1962-01-01 1 1 0
9 303 The Beatles I Wish I Could Shimmy Like My Sister Kate 1962 Live Recordings 12516025 4732327 420949 1962-01-01 1 1 0
10 303 The Beatles I’m Gonna Sit Right Down and Cry (Over You) 1962 Live Recordings 737424 4732301 420949 1962-01-01 1 1 0
11 303 The Beatles I’m Talking About You 1962 Live Recordings 8034555 4732326 420949 1962-01-01 1 1 0
12 303 The Beatles Lend Me Your Comb 1962 Live Recordings 12523637 4732305 420949 1962-01-01 1 1 0
13 303 The Beatles Little Queenie 1962 Live Recordings 6007740 4732318 420949 1962-01-01 1 1 0
14 303 The Beatles Long Tall Sally 1962 Live Recordings 9026995 4732328 420949 1962-01-01 1 1 0
15 303 The Beatles Matchbox 1962 Live Recordings 12446929 4732325 420949 1962-01-01 1 1 0
16 303 The Beatles Red Sails in the Sunset 1962 Live Recordings 2954920 4732324 420949 1962-01-01 1 1 0
17 303 The Beatles Roll Over Beethoven 1962 Live Recordings 8322242 4732302 420949 1962-01-01 2 2 0
18 303 The Beatles Sweet Little Sixteen 1962 Live Recordings 6007733 4732304 420949 1962-01-01 1 1 0
19 303 The Beatles Till There Was You 1962 Live Recordings 286220 4732313 420949 1962-01-01 1 1 0
20 303 The Beatles Twist and Shout 1962 Live Recordings 7426353 4732308 420949 1962-01-01 2 2 0
21 303 The Beatles A Hard Day’s Night Best Of the Beatles 6253455 16359607 1392597 1962-01-01 1 1 0
22 303 The Beatles All My Loving Best Of the Beatles 2261384 14506508 1392597 1962-01-01 1 1 0
23 303 The Beatles Do You Want to Know a Secret Best Of the Beatles 295301 295301 1392597 1962-01-01 1 1 0
24 303 The Beatles Eight Days a Week Best Of the Beatles 368502 6385770 1392597 1962-01-01 1 1 0
25 303 The Beatles I’ll Follow the Sun Best Of the Beatles 368499 3527048 1392597 1962-01-01 1 1 0
26 303 The Beatles Misery Best Of the Beatles 295292 295292 1392597 1962-01-01 1 1 0
27 303 The Beatles No Reply Best Of the Beatles 368495 2710474 1392597 1962-01-01 1 1 0
28 303 The Beatles Roll Over Beethoven Best Of the Beatles 8322242 16359627 1392597 1962-01-01 1 1 0
29 303 The Beatles Twist and Shout Best Of the Beatles 7426353 16359609 1392597 1962-01-01 1 1 0
... ... ... ... ... ... ... ... ... ... ... ...
2063 825 The Rolling Stones Ruby Tuesday GRRR! 277644 14077603 1201539 2012-11-12 1 1 90
2064 825 The Rolling Stones Ruby Tuesday GRRR! 277644 14243089 1201539 2012-11-12 2 2 90
2065 825 The Rolling Stones Salt of the Earth GRRR! 12444489 14243093 1201539 2012-11-12 1 1 90
2066 825 The Rolling Stones She’s a Rainbow GRRR! 306564 14077613 1201539 2012-11-12 2 2 90
2067 825 The Rolling Stones Start Me Up GRRR! 185377 14243121 1201539 2012-11-12 3 3 90
2068 825 The Rolling Stones Street Fighting Man GRRR! 198365 14077611 1201539 2012-11-12 3 3 90
2069 825 The Rolling Stones Streets of Love GRRR! 12726465 4506741 1201539 2012-11-12 2 2 90
2070 825 The Rolling Stones Sympathy for the Devil GRRR! 198360 198360 1201539 2012-11-12 1 1 90
2071 825 The Rolling Stones Sympathy for the Devil GRRR! 198360 14243091 1201539 2012-11-12 2 2 90
2072 825 The Rolling Stones That's How Strong My Love Is GRRR! 8115933 14243064 1201539 2012-11-12 1 1 90
2073 825 The Rolling Stones The Last Time GRRR! 7841507 246835 1201539 2012-11-12 3 3 90
2074 825 The Rolling Stones The Red Rooster GRRR! 587423 14077592 1201539 2012-11-12 1 1 90
2075 825 The Rolling Stones Time Is on My Side GRRR! 4378180 14077595 1201539 2012-11-12 1 1 90
2076 825 The Rolling Stones Time Is on My Side GRRR! 4378180 14243074 1201539 2012-11-12 1 1 90
2077 825 The Rolling Stones Tumbling Dice GRRR! 6788059 14077616 1201539 2012-11-12 1 1 90
2078 825 The Rolling Stones Tumbling Dice GRRR! 6788059 14243103 1201539 2012-11-12 2 2 90
2079 825 The Rolling Stones Under My Thumb GRRR! 435908 14077601 1201539 2012-11-12 1 1 90
2080 825 The Rolling Stones Under My Thumb GRRR! 435908 14243079 1201539 2012-11-12 1 1 90
2081 825 The Rolling Stones Undercover of the Night GRRR! 12460142 685731 1201539 2012-11-12 3 3 90
2082 825 The Rolling Stones Waiting on a Friend GRRR! 12460097 14077627 1201539 2012-11-12 1 1 90
2083 825 The Rolling Stones Waiting on a Friend GRRR! 12460097 14243119 1201539 2012-11-12 2 2 90
2084 825 The Rolling Stones We Love You GRRR! 5652857 14077605 1201539 2012-11-12 2 2 90
2085 825 The Rolling Stones Wild Horses GRRR! 224903 224903 1201539 2012-11-12 1 1 90
2086 825 The Rolling Stones Wild Horses GRRR! 224903 14243100 1201539 2012-11-12 2 2 90
2087 825 The Rolling Stones You Can’t Always Get What You Want GRRR! 245459 14077609 1201539 2012-11-12 1 1 90
2088 825 The Rolling Stones You Can’t Always Get What You Want GRRR! 245459 14243098 1201539 2012-11-12 2 2 90
2089 825 The Rolling Stones (I Can’t Get No) Satisfaction Sweet Summer Sun: Hyde Park Live 357708 15493278 1318409 2013-11-11 1 1 0
2090 825 The Rolling Stones Gimme Shelter Sweet Summer Sun: Hyde Park Live 382868 15493273 1318409 2013-11-11 1 1 0
2091 825 The Rolling Stones (I Can’t Get No) Satisfaction The Rolling Stones Live: The Rolling Stones 50... 357708 17443789 1494552 2013-04-30 1 1 80
2092 825 The Rolling Stones Wild Horses Wild Horses 224903 224903 1524223 2015-05-12 1 1 0

2093 rows × 11 columns


In [2]:
print bs.shape
print covers.shape
print covers.columns
bs.describe()


(2093, 11)
(6123, 7)
Index([u'recording_id', u'source_artist', u'recording_artist', u'record_name',
       u'rec_artist_id', u'workid', u'rating'],
      dtype='object')
Out[2]:
artistid workid recordingid release_group label_cnt country_cnt rating
count 2093.000000 2.093000e+03 2.093000e+03 2.093000e+03 2093.000000 2093.000000 2093.000000
mean 537.688008 3.853344e+06 6.432656e+06 6.387133e+05 4.200191 4.200191 41.798853
std 259.732383 4.367795e+06 6.097454e+06 5.705731e+05 4.811113 4.811113 40.098924
min 303.000000 3.631000e+03 1.524300e+04 2.717000e+03 1.000000 1.000000 0.000000
25% 303.000000 2.862180e+05 3.056350e+05 5.128900e+04 1.000000 1.000000 0.000000
50% 303.000000 6.747610e+05 4.732308e+06 5.035320e+05 2.000000 2.000000 50.000000
75% 825.000000 7.025700e+06 1.252034e+07 1.201539e+06 6.000000 6.000000 80.000000
max 825.000000 1.295406e+07 1.858489e+07 1.618206e+06 25.000000 25.000000 100.000000

In [4]:
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()

#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()


aggregations = {
    'releasename':{'num_releases':'count'}
    ,'releasedate':{'maxreleasedate':'max'}
    ,'rating': {'avg_rating':'max'}
    ,'country_cnt':{'countries':'count'}
    #,'songname': {'times_rec':'value_counts'}
}
#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename','release_group','releasedate']).agg(aggregations).reset_index()
bsagg = bs.groupby(['workid', 'artist', 'songname']).agg(aggregations).reset_index()
bsagg.columns = bsagg.columns.droplevel(1)
bsagg.columns = ['workid','artist','songname','minreleasedate','num_releases','countries','avg_rating']

In [5]:
bsagg[bsagg.songname == 'Paint It Black']#[['songname', 'avg_rating']]


Out[5]:
workid artist songname minreleasedate num_releases countries avg_rating
189 2750305 The Rolling Stones Paint It Black 2012-11-12 20 20 92

In [5]:
bsagg.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 7 columns):
workid            374 non-null int64
artist            374 non-null object
songname          374 non-null object
minreleasedate    374 non-null datetime64[ns]
num_releases      374 non-null int64
countries         374 non-null int64
avg_rating        374 non-null int64
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 20.5+ KB

In [10]:
#vc = pd.DataFrame(bs.songname.value_counts().reset_index())
#vc.columns = ['songname', 'timesrec']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
#bs = pd.merge(bs, vc, on='songname')
#bs[bs.songname == 'Paint It Black']
covers[covers.workid == 2750305].count()


Out[10]:
recording_id        105
source_artist       105
recording_artist    105
record_name         105
rec_artist_id       105
workid              105
rating                8
src_id              105
dtype: int64

In [ ]:
covers['src_id'] = covers.source_artist.map({303:0,825:1}) 
#covers.drop(['srcid'], axis = 1, inplace = True)
#covers.plot(kind='box')
covers.groupby(['src_id', 'workid']).rating.mean()

In [ ]:
import plotly
plotly.offline.iplot([{
    'x': bsagg.num_releases,
    'y': bsagg.avg_rating
    
}])

In [11]:
sns.heatmap(covers.corr())


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ae4d050>

In [ ]:
#bsagg[['artist','num_releases','maxreleasedate']].scatter_matrix()
from pandas.tools.plotting import scatter_matrix

scatter_matrix(bsagg, alpha=0.5, figsize=(12, 12), diagonal='kde')

In [9]:
def detect_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [10]:
bsagg['sentiment'] = bsagg.songname.apply(detect_sentiment)

In [118]:
bsagg.groupby('artist')[['num_releases']].boxplot()


/Users/maxrose/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.py:3082: FutureWarning:


The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.

Out[118]:
OrderedDict([(u'The Beatles',
              {'boxes': [<matplotlib.lines.Line2D at 0x12adc1510>,
                <matplotlib.lines.Line2D at 0x12b228a50>,
                <matplotlib.lines.Line2D at 0x12c2449d0>,
                <matplotlib.lines.Line2D at 0x12b7f4c90>,
                <matplotlib.lines.Line2D at 0x12b5ba8d0>],
               'caps': [<matplotlib.lines.Line2D at 0x123a9bb50>,
                <matplotlib.lines.Line2D at 0x12c273d90>,
                <matplotlib.lines.Line2D at 0x12b145710>,
                <matplotlib.lines.Line2D at 0x12b1f4d50>,
                <matplotlib.lines.Line2D at 0x12b7d8350>,
                <matplotlib.lines.Line2D at 0x12b7d8990>,
                <matplotlib.lines.Line2D at 0x12b7cdf50>,
                <matplotlib.lines.Line2D at 0x12b7c95d0>,
                <matplotlib.lines.Line2D at 0x12b5cbb90>,
                <matplotlib.lines.Line2D at 0x12b59b210>],
               'fliers': [<matplotlib.lines.Line2D at 0x12b228950>,
                <matplotlib.lines.Line2D at 0x12c273790>,
                <matplotlib.lines.Line2D at 0x12b7f4650>,
                <matplotlib.lines.Line2D at 0x12b5ba290>,
                <matplotlib.lines.Line2D at 0x12b59be90>],
               'means': [],
               'medians': [<matplotlib.lines.Line2D at 0x12ae71210>,
                <matplotlib.lines.Line2D at 0x12b1eae10>,
                <matplotlib.lines.Line2D at 0x12b7d8fd0>,
                <matplotlib.lines.Line2D at 0x12b7c9c10>,
                <matplotlib.lines.Line2D at 0x12b59b850>],
               'whiskers': [<matplotlib.lines.Line2D at 0x12adc1ad0>,
                <matplotlib.lines.Line2D at 0x12ae7f8d0>,
                <matplotlib.lines.Line2D at 0x12ae1f890>,
                <matplotlib.lines.Line2D at 0x12ad69550>,
                <matplotlib.lines.Line2D at 0x12b7c8690>,
                <matplotlib.lines.Line2D at 0x12b7c8cd0>,
                <matplotlib.lines.Line2D at 0x12b7cd2d0>,
                <matplotlib.lines.Line2D at 0x12b7cd910>,
                <matplotlib.lines.Line2D at 0x12b5baed0>,
                <matplotlib.lines.Line2D at 0x12b5cb550>]}),
             (u'The Rolling Stones',
              {'boxes': [<matplotlib.lines.Line2D at 0x12adc1a90>,
                <matplotlib.lines.Line2D at 0x12b168f90>,
                <matplotlib.lines.Line2D at 0x12c195bd0>,
                <matplotlib.lines.Line2D at 0x12c190810>,
                <matplotlib.lines.Line2D at 0x12ae75450>],
               'caps': [<matplotlib.lines.Line2D at 0x12b163b50>,
                <matplotlib.lines.Line2D at 0x12b1681d0>,
                <matplotlib.lines.Line2D at 0x12b172290>,
                <matplotlib.lines.Line2D at 0x12b1728d0>,
                <matplotlib.lines.Line2D at 0x12c161e90>,
                <matplotlib.lines.Line2D at 0x12c187510>,
                <matplotlib.lines.Line2D at 0x12ae7aad0>,
                <matplotlib.lines.Line2D at 0x12ae44150>,
                <matplotlib.lines.Line2D at 0x12ae68710>,
                <matplotlib.lines.Line2D at 0x12ae68d50>],
               'fliers': [<matplotlib.lines.Line2D at 0x12b181450>,
                <matplotlib.lines.Line2D at 0x12c195590>,
                <matplotlib.lines.Line2D at 0x12c1901d0>,
                <matplotlib.lines.Line2D at 0x12ae44dd0>,
                <matplotlib.lines.Line2D at 0x12ae74a10>],
               'means': [],
               'medians': [<matplotlib.lines.Line2D at 0x12b181890>,
                <matplotlib.lines.Line2D at 0x12b172f10>,
                <matplotlib.lines.Line2D at 0x12c187b50>,
                <matplotlib.lines.Line2D at 0x12ae44790>,
                <matplotlib.lines.Line2D at 0x12ae743d0>],
               'whiskers': [<matplotlib.lines.Line2D at 0x12b163290>,
                <matplotlib.lines.Line2D at 0x12b163590>,
                <matplotlib.lines.Line2D at 0x12b1555d0>,
                <matplotlib.lines.Line2D at 0x12b155c10>,
                <matplotlib.lines.Line2D at 0x12c161210>,
                <matplotlib.lines.Line2D at 0x12c161850>,
                <matplotlib.lines.Line2D at 0x12c190e10>,
                <matplotlib.lines.Line2D at 0x12ae7a490>,
                <matplotlib.lines.Line2D at 0x12ae75a50>,
                <matplotlib.lines.Line2D at 0x12ae680d0>]})])

In [11]:
bsagg[(bsagg.sentiment < 0 ) | (bsagg.sentiment > 0.5)]['sentiment'].groupby(bsagg.artist).count()


Out[11]:
artist
The Beatles           31
The Rolling Stones    20
Name: sentiment, dtype: int64

In [ ]: