notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

py.sign_in('maxrose', '••••••••••')

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)
df = pd.read_table('data/Allversion_noorig.csv', encoding= 'utf-8',header=0,delimiter='|')
bs = pd.read_table('data/allalbums.zip', encoding= 'utf-8',header=0,delimiter='|')
bs['releasedate'] = pd.to_datetime(pd.Series(bs.releasedate))
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)
plt.rcParams['figure.figsize'] = 8, 6

print bs.head()
#df.groupby('recordname').hist()









    











    



   artistid       artist                      songname  recordingid  \
0       303  The Beatles        And Your Bird Can Sing        15243   
1       303  The Beatles        And Your Bird Can Sing        15243   
2       303  The Beatles          Back in the U.S.S.R.       159792   
3       303  The Beatles          Back in the U.S.S.R.       159792   
4       303  The Beatles  While My Guitar Gently Weeps       159798   

   releasename  release_group releasedate        labelname  countryid  \
0     Revolver         212336  1987-04-30  Capitol Records         82   
1     Revolver         212336  1987-04-30  Capitol Records        198   
2    1967–1970          17637  2010-10-18    Apple Records        250   
3  The Beatles          18243  1980-01-01  Capitol Records        189   
4  The Beatles          18243  1980-01-01    Apple Records         23   

     countryname  workid  rating  
0          Ghana   15243       0  
1          Sudan   15243       0  
2  Åland Islands  159792       0  
3       Slovakia  159792       0  
4          Benin  159798       0



In [2]:

    
print bs.shape
bs.releasedate.min()









    



(1755420, 12)






    Out[2]:





Timestamp('1980-01-01 00:00:00')



In [3]:

    
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()

#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()


aggregations = {
    'releasename':{'releases':'value_counts'},
    'releasedate':{'maxreleasedate':'max'},
    'countryname':{'countries':'count'},
    'rating': {'avg_rating':'mean'}
}
#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename','release_group','releasedate']).agg(aggregations).reset_index()
bsagg = bs.groupby(['workid', 'artist', 'songname']).agg(aggregations).reset_index()

bsagg.columns = bsagg.columns.droplevel(1)



In [4]:

    
#bsagg.reset_index()
#bsagg.add_suffix('_Count').reset_index()

#bsagg.columns = bsagg.columns.droplevel(1)
#bsagg.reset_index()
bsagg.columns = ['workid','artist','songname','releases','maxreleasedate','countries','avg_rating']
bsagg.tail()









    Out[4]:






  
    
      
      workid
      artist
      songname
      releases
      maxreleasedate
      countries
      avg_rating
    
  
  
    
      2449
      12900117
      The Beatles
      You'll Be Mine
      1995-11-21
      NaN
      1020
      0.0
    
    
      2450
      12925430
      The Rolling Stones
      Miss Amanda Jones
      2008-12-24
      NaN
      1020
      0.0
    
    
      2451
      12925430
      The Rolling Stones
      Miss Amanda Jones
      2008-12-24
      NaN
      1020
      0.0
    
    
      2452
      12954064
      The Rolling Stones
      Ride On, Baby
      2010-04-03
      NaN
      1020
      0.0
    
    
      2453
      12954064
      The Rolling Stones
      Ride On, Baby
      2010-04-03
      NaN
      1020
      0.0



In [80]:

    
bsagg.shape









    Out[80]:





(95744, 7)



In [2]:

    
vc = pd.DataFrame(df.recordname.value_counts().reset_index())
                  

vc.columns = ['recordname', 'timesrec']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
df = pd.merge(df, vc, on='recordname')
df.head()









    Out[2]:






  
    
      
      recordingid
      sourceartist
      recording_artist
      recordname
      recartistcredit
      timesrec
    
  
  
    
      0
      18869916
      The Rolling Stones
      Depeche Mode
      Route 66 (Beatmasters Mix)
      317
      1
    
    
      1
      1454251
      The Beatles
      The Beatles
      'Til There Was You
      303
      2
    
    
      2
      8165114
      The Beatles
      Cassandra Wilson
      'Til There Was You
      33634
      2
    
    
      3
      1532489
      The Beatles
      Rod Stewart
      'Till There Was You
      4541
      1
    
    
      4
      15563136
      The Rolling Stones
      Chuck Berry
      'round and 'round
      1710
      1



In [ ]:

    
df.sort_values(['sourceartist', 'timesrec'], axis=0, ascending=False, inplace=False)



In [3]:

    
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1}) 
df.plot(kind='box')









    Out[3]:





<matplotlib.axes._subplots.AxesSubplot at 0x118f045d0>



In [28]:

    
#df.boxplot(column='a', by='b')

#df.describe()

import plotly
plotly.offline.iplot([{
    'x': df.recordname,
    'y': df.timesrec,
    'name': col
}  for col in df.columns])



In [4]:

    
import plotly
from plotly.graph_objs import Scatter, Layout

plotly.offline.iplot({
    "data": [Scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1])],
    "layout": Layout(title="hello world")
})



In [ ]:

    
#df['trkcnt'] 
trk = df.trackname.value_counts().reset_index()
trk.columns = ['trackname', 'trkcnt']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
df = pd.merge(df, trk, on='trackname')



In [ ]:

    
sns.heatmap(df.corr())



In [ ]:

    
df.head()



In [ ]:

    
df.scatter_matrix(filename='cufflinks/scatter-matrix', world_readable=False)



In [ ]:

	workid	artist	songname	releases	maxreleasedate	countries
2449	12900117	The Beatles	You'll Be Mine	1995-11-21	NaN	1020
2450	12925430	The Rolling Stones	Miss Amanda Jones	2008-12-24	NaN	1020
2451	12925430	The Rolling Stones	Miss Amanda Jones	2008-12-24	NaN	1020
2452	12954064	The Rolling Stones	Ride On, Baby	2010-04-03	NaN	1020
2453	12954064	The Rolling Stones	Ride On, Baby	2010-04-03	NaN	1020

	recordingid	sourceartist	recording_artist	recordname	recartistcredit	timesrec
0	18869916	The Rolling Stones	Depeche Mode	Route 66 (Beatmasters Mix)	317	1
1	1454251	The Beatles	The Beatles	'Til There Was You	303	2
2	8165114	The Beatles	Cassandra Wilson	'Til There Was You	33634	2
3	1532489	The Beatles	Rod Stewart	'Till There Was You	4541	1
4	15563136	The Rolling Stones	Chuck Berry	'round and 'round	1710	1