In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import plotly.plotly as py
import plotly.tools as tls
import os

# Embed the public cufflinks demo chart in the notebook output.
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

# Never keep the plotly API key in the notebook: read it from the environment.
# PLOTLY_USERNAME falls back to the account name the notebook used before.
plotly_username = os.environ.get('PLOTLY_USERNAME', 'maxrose')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_api_key:
    py.sign_in(plotly_username, plotly_api_key)
# When no key is configured the sign-in is skipped; cufflinks is set to
# offline mode on the next line.
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
# Load the two pipe-delimited source tables (the album table is read straight
# from a zip archive).
df = pd.read_csv('data/Allversion_noorig.csv', encoding='utf-8', header=0, sep='|')
bs = pd.read_csv('data/allalbums.zip', encoding='utf-8', header=0, sep='|')

# Parse the release dates so they can be compared and aggregated as timestamps.
bs['releasedate'] = pd.to_datetime(bs['releasedate'])

# Default figure size for every matplotlib plot in the notebook.
plt.rcParams['figure.figsize'] = 8, 6

# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the bare `print x` statement, which is a SyntaxError on Python 3.
print(bs.head())
In [2]:
# Show (rows, columns) of the album table; the call form of print works on
# both Python 2 and Python 3.
print(bs.shape)
# Earliest release date in the table (displayed as the cell output).
bs.releasedate.min()
Out[2]:
In [3]:
# Build one summary row per (workid, artist, songname).
#
# The nested-dict form of .agg() ({'col': {'new_name': 'func'}}) was deprecated
# in pandas 0.20 and removed in 1.0, and 'value_counts' is not a reducer (it
# yields several rows per group). The aggregation is therefore written as a
# flat column -> reducer mapping followed by an explicit rename.
# NOTE(review): 'releases' is computed as the number of distinct release names
# per song; switch 'nunique' to 'count' if every release row should be counted.
group_keys = ['workid', 'artist', 'songname']
aggregations = {
    'releasename': 'nunique',
    'releasedate': 'max',
    'countryname': 'count',
    'rating': 'mean',
}
output_names = {
    'releasename': 'releases',
    'releasedate': 'maxreleasedate',
    'countryname': 'countries',
    'rating': 'avg_rating',
}
# Select the aggregated columns explicitly so their order never depends on
# dict ordering. Resulting columns, in order:
# workid, artist, songname, releases, maxreleasedate, countries, avg_rating
value_order = ['releasename', 'releasedate', 'countryname', 'rating']
bsagg = (
    bs.groupby(group_keys)
    .agg(aggregations)[value_order]
    .rename(columns=output_names)
    .reset_index()
)
In [4]:
# Relabel the aggregated columns by NAME rather than by position. Assigning a
# plain list to bsagg.columns silently mislabels the data whenever the
# aggregation emits its columns in a different order than the list assumes.
# rename() ignores keys that are absent, so re-running this cell is harmless.
bsagg = bsagg.rename(columns={
    'releasename': 'releases',
    'releasedate': 'maxreleasedate',
    'countryname': 'countries',
    'rating': 'avg_rating',
})
bsagg.tail()
Out[4]:
In [80]:
# Display the (rows, columns) tuple of the aggregated frame `bsagg`.
bsagg.shape
Out[80]:
In [2]:
# Count how many rows share each recordname. The two columns are relabelled
# positionally because the names produced by value_counts().reset_index()
# differ between pandas versions.
vc = df.recordname.value_counts().reset_index()
vc.columns = ['recordname', 'timesrec']

# Attach the count to every row. Any 'timesrec' column left over from an
# earlier run of this cell is dropped first; otherwise a second execution
# would produce timesrec_x / timesrec_y instead of a single column.
df = pd.merge(
    df.drop(['timesrec'], axis=1, errors='ignore'),
    vc,
    on='recordname',
)
df.head()
Out[2]:
In [ ]:
# Show the rows ordered by sourceartist, then timesrec, both descending.
# This returns a sorted copy for display only; `df` itself is left untouched.
df.sort_values(by=['sourceartist', 'timesrec'], ascending=False)
In [3]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1})
# Box-and-whisker plot of the frame's columns, via the .plot accessor.
df.plot.box()
Out[3]:
In [28]:
# Importing the submodule binds `plotly` and guarantees plotly.offline is loaded.
import plotly.offline

# Plot timesrec against recordname as ONE trace. The previous comprehension
# over df.columns emitted an identical trace (same x and y) for every column
# of the frame, differing only in its legend name.
plotly.offline.iplot([{
    'x': df.recordname,
    'y': df.timesrec,
    'name': 'timesrec'
}])
In [4]:
import plotly
from plotly.graph_objs import Scatter, Layout

# Minimal offline-plot smoke test: a four-point scatter trace with a title.
demo_trace = Scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1])
demo_layout = Layout(title="hello world")
plotly.offline.iplot(dict(data=[demo_trace], layout=demo_layout))
In [ ]:
# Count how many rows share each trackname. The two columns are relabelled
# positionally because the names produced by value_counts().reset_index()
# differ between pandas versions.
trk = df.trackname.value_counts().reset_index()
trk.columns = ['trackname', 'trkcnt']

# Attach the count to every row. A 'trkcnt' column left over from an earlier
# run of this cell is dropped first, so re-executing the cell does not create
# trkcnt_x / trkcnt_y duplicates.
df = pd.merge(
    df.drop(['trkcnt'], axis=1, errors='ignore'),
    trk,
    on='trackname',
)
In [ ]:
# Correlate only the numeric/boolean columns. Older pandas dropped text
# columns from DataFrame.corr() silently, but pandas >= 2.0 raises on them
# unless they are excluded explicitly.
numeric_cols = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_cols.corr())
In [ ]:
# Preview the first rows of `df` as the cell output.
df.head()
In [ ]:
# Draw a scatter-matrix of the frame under the plot name 'cufflinks/scatter-matrix',
# with world_readable=False passed through.
# NOTE(review): DataFrame.scatter_matrix is presumably the method cufflinks adds to
# DataFrames when imported — confirm it is available in the installed version.
df.scatter_matrix(filename='cufflinks/scatter-matrix', world_readable=False)
In [ ]: