In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf
py.sign_in('maxrose', '••••••••••')
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)
df = pd.read_table('data/Allversion_noorig.csv', encoding= 'utf-8',header=0,delimiter='|')
bs = pd.read_table('data/allalbums.zip', encoding= 'utf-8',header=0,delimiter='|')
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)
plt.rcParams['figure.figsize'] = 8, 6
print bs.head()
#df.groupby('recordname').hist()
Out[3]:
artistid
artist
songname
recordingid
releasename
release_group
releasedate
labelname
countryname
workid
rating
0
303
The Beatles
And Your Bird Can Sing
15243
Revolver
212336
1987-4- 30
Apple Records
Afghanistan
15243
0
1
303
The Beatles
And Your Bird Can Sing
15243
Revolver
212336
1987-4- 30
Apple Records
Albania
15243
0
2
303
The Beatles
And Your Bird Can Sing
15243
Revolver
212336
1987-4- 30
Apple Records
Algeria
15243
0
3
303
The Beatles
And Your Bird Can Sing
15243
Revolver
212336
1987-4- 30
Apple Records
American Samoa
15243
0
4
303
The Beatles
And Your Bird Can Sing
15243
Revolver
212336
1987-4- 30
Apple Records
Andorra
15243
0
In [4]:
print bs.shape
print bs.head()
(1755420, 11)
artistid artist songname recordingid releasename \
0 303 The Beatles And Your Bird Can Sing 15243 Revolver
1 303 The Beatles And Your Bird Can Sing 15243 Revolver
2 303 The Beatles And Your Bird Can Sing 15243 Revolver
3 303 The Beatles And Your Bird Can Sing 15243 Revolver
4 303 The Beatles And Your Bird Can Sing 15243 Revolver
release_group releasedate labelname countryname workid rating
0 212336 1987-4- 30 Apple Records Afghanistan 15243 0
1 212336 1987-4- 30 Apple Records Albania 15243 0
2 212336 1987-4- 30 Apple Records Algeria 15243 0
3 212336 1987-4- 30 Apple Records American Samoa 15243 0
4 212336 1987-4- 30 Apple Records Andorra 15243 0
In [46]:
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()
#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()
aggregations = {
'countryname':{'countries':'value_counts'},
'rating': {'avg_rating':'mean'}
}
bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename','release_group','releasedate']).agg(aggregations)
bsagg.add_suffix('_Count').reset_index()
Out[46]:
level_0
level_1
level_2
level_3
level_4
level_5
countryname_Count
rating_Count
countries_Count
avg_rating_Count
0
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
1
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
2
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
3
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
4
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
5
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
6
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
7
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
8
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
9
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
10
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
11
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
12
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
13
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
14
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
15
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
16
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
17
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
18
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
19
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
20
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
21
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
22
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
23
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
24
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
25
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
26
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
27
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
28
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
29
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
...
...
...
...
...
...
...
...
...
1177826
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177827
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177828
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177829
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177830
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177831
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177832
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177833
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177834
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177835
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177836
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177837
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177838
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177839
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177840
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177841
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177842
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177843
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177844
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177845
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177846
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177847
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177848
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177849
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177850
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177851
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177852
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177853
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177854
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177855
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177856 rows × 8 columns
In [44]:
bsagg.reset_index()
Out[44]:
level_0
level_1
level_2
level_3
level_4
level_5
countryname
rating
0
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
1
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
2
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
3
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
4
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
5
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
6
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
7
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
8
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
9
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
10
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
11
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
12
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
13
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
14
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
15
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
16
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
17
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
18
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
19
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
20
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
21
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
22
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
23
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
24
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
25
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
26
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
27
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
28
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
29
3631
The Beatles
Tomorrow Never Knows
Anthology 2
35848
0
NaN
0.0
...
...
...
...
...
...
...
...
...
1177826
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177827
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177828
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177829
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177830
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177831
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177832
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177833
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177834
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177835
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177836
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177837
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177838
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177839
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177840
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177841
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177842
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177843
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177844
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177845
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177846
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177847
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177848
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177849
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177850
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177851
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177852
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177853
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177854
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177855
12954064
The Rolling Stones
Ride On, Baby
Flowers
210935
2010-4-3
NaN
0.0
1177856 rows × 8 columns
In [2]:
vc = pd.DataFrame(df.recordname.value_counts().reset_index())
vc.columns = ['recordname', 'timesrec']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
df = pd.merge(df, vc, on='recordname')
df.head()
Out[2]:
recordingid
sourceartist
recording_artist
recordname
recartistcredit
timesrec
0
18869916
The Rolling Stones
Depeche Mode
Route 66 (Beatmasters Mix)
317
1
1
1454251
The Beatles
The Beatles
'Til There Was You
303
2
2
8165114
The Beatles
Cassandra Wilson
'Til There Was You
33634
2
3
1532489
The Beatles
Rod Stewart
'Till There Was You
4541
1
4
15563136
The Rolling Stones
Chuck Berry
'round and 'round
1710
1
In [ ]:
df.sort_values(['sourceartist', 'timesrec'], axis=0, ascending=False, inplace=False)
In [3]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1})
df.plot(kind='box')
Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x118f045d0>
In [28]:
#df.boxplot(column='a', by='b')
#df.describe()
import plotly
plotly.offline.iplot([{
'x': df.recordname,
'y': df.timesrec,
'name': col
} for col in df.columns])
In [4]:
import plotly
from plotly.graph_objs import Scatter, Layout
plotly.offline.iplot({
"data": [Scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1])],
"layout": Layout(title="hello world")
})
In [ ]:
#df['trkcnt']
trk = df.trackname.value_counts().reset_index()
trk.columns = ['trackname', 'trkcnt']
#df.head(50)
#df.trackname.value_counts()
#df.drop('trkcnt', axis=1, inplace=True)
#trk.head()
df = pd.merge(df, trk, on='trackname')
In [ ]:
sns.heatmap(df.corr())
In [ ]:
df.head()
In [ ]:
df.scatter_matrix(filename='cufflinks/scatter-matrix', world_readable=False)
In [ ]:
Content source: maxrose61/GA_DS
Similar notebooks: