In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

py.sign_in('maxrose', '••••••••••')

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)
df = pd.read_table('data/Allversion_noorig.csv', encoding= 'utf-8',header=0,delimiter='|')
bs = pd.read_table('data/allalbums.zip', encoding= 'utf-8',header=0,delimiter='|')
bs['releasedate'] = pd.to_datetime(pd.Series(bs.releasedate))
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)
plt.rcParams['figure.figsize'] = 8, 6

print bs.head()
#df.groupby('recordname').hist()


   artistid       artist                      songname  recordingid  \
0       303  The Beatles        And Your Bird Can Sing        15243   
1       303  The Beatles        And Your Bird Can Sing        15243   
2       303  The Beatles          Back in the U.S.S.R.       159792   
3       303  The Beatles          Back in the U.S.S.R.       159792   
4       303  The Beatles  While My Guitar Gently Weeps       159798   

   releasename  release_group releasedate        labelname  countryid  \
0     Revolver         212336  1987-04-30  Capitol Records         82   
1     Revolver         212336  1987-04-30  Capitol Records        198   
2    1967–1970          17637  2010-10-18    Apple Records        250   
3  The Beatles          18243  1980-01-01  Capitol Records        189   
4  The Beatles          18243  1980-01-01    Apple Records         23   

     countryname  workid  rating  
0          Ghana   15243       0  
1          Sudan   15243       0  
2  Åland Islands  159792       0  
3       Slovakia  159792       0  
4          Benin  159798       0  

In [2]:
print bs.shape
bs.releasedate.min()


(1755420, 12)
Out[2]:
Timestamp('1980-01-01 00:00:00')

In [3]:
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()

#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()


aggregations = {
    'releasename':{'releases':'value_counts'},
    'releasedate':{'maxreleasedate':'max'},
    'countryname':{'countries':'count'},
    'rating': {'avg_rating':'mean'}
}
#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename','release_group','releasedate']).agg(aggregations).reset_index()
bsagg = bs.groupby(['workid', 'artist', 'songname']).agg(aggregations).reset_index()

bsagg.columns = bsagg.columns.droplevel(1)

In [4]:
#bsagg.reset_index()
#bsagg.add_suffix('_Count').reset_index()

#bsagg.columns = bsagg.columns.droplevel(1)
#bsagg.reset_index()
bsagg.columns = ['workid','artist','songname','releases','maxreleasedate','countries','avg_rating']
bsagg.tail()


Out[4]:
workid artist songname releases maxreleasedate countries avg_rating
2449 12900117 The Beatles You'll Be Mine 1995-11-21 NaN 1020 0.0
2450 12925430 The Rolling Stones Miss Amanda Jones 2008-12-24 NaN 1020 0.0
2451 12925430 The Rolling Stones Miss Amanda Jones 2008-12-24 NaN 1020 0.0
2452 12954064 The Rolling Stones Ride On, Baby 2010-04-03 NaN 1020 0.0
2453 12954064 The Rolling Stones Ride On, Baby 2010-04-03 NaN 1020 0.0

In [80]:
bsagg.shape


Out[80]:
(95744, 7)

In [2]:
vc = pd.DataFrame(df.recordname.value_counts().reset_index())
                  

vc.columns = ['recordname', 'timesrec']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
df = pd.merge(df, vc, on='recordname')
df.head()


Out[2]:
recordingid sourceartist recording_artist recordname recartistcredit timesrec
0 18869916 The Rolling Stones Depeche Mode Route 66 (Beatmasters Mix) 317 1
1 1454251 The Beatles The Beatles 'Til There Was You 303 2
2 8165114 The Beatles Cassandra Wilson 'Til There Was You 33634 2
3 1532489 The Beatles Rod Stewart 'Till There Was You 4541 1
4 15563136 The Rolling Stones Chuck Berry 'round and 'round 1710 1

In [ ]:
df.sort_values(['sourceartist', 'timesrec'], axis=0, ascending=False, inplace=False)

In [3]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1}) 
df.plot(kind='box')


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x118f045d0>

In [28]:
#df.boxplot(column='a', by='b')

#df.describe()

import plotly
plotly.offline.iplot([{
    'x': df.recordname,
    'y': df.timesrec,
    'name': col
}  for col in df.columns])



In [4]:
import plotly
from plotly.graph_objs import Scatter, Layout

plotly.offline.iplot({
    "data": [Scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1])],
    "layout": Layout(title="hello world")
})



In [ ]:
#df['trkcnt'] 
trk = df.trackname.value_counts().reset_index()
trk.columns = ['trackname', 'trkcnt']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
df = pd.merge(df, trk, on='trackname')

In [ ]:
sns.heatmap(df.corr())

In [ ]:
df.head()

In [ ]:
df.scatter_matrix(filename='cufflinks/scatter-matrix', world_readable=False)

In [ ]: