In [60]:
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import pandas as pd
import numpy as np
% matplotlib inline
In [11]:
# Remote location of the training CSV; it is fetched each time this cell runs.
TRAIN_CSV_URL = 'https://s3.amazonaws.com/marweezys-bucket/all_state_insurance_prediction/train.csv'

# Read the file into the notebook-wide frame `df` and preview its first rows.
df = pd.read_csv(TRAIN_CSV_URL)
df.head()
Out[11]:
In [ ]:
# Show pandas' descriptive-statistics summary of `df` as the cell output.
df.describe()
In [7]:
# Remove pandas' row/column display limits so printed frames are not truncated.
# NOTE: these options are global and stay in effect for every later cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Parenthesised single-argument print: same output under Python 2's print
# statement, and valid syntax on Python 3 as well.
print(df.head(5))
In [16]:
# Count plots, four per figure, for the first n_rows * n_cols columns after the
# leading column of `df` (presumably the categorical features -- confirm
# against the dataset schema).
cols = df.columns[1:]
n_cols = 4
n_rows = 29
for i in range(n_rows):
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, sharey=True, figsize=(12, 8))
    for j in range(n_cols):
        col = cols[i * n_cols + j]
        # countplot needs the column passed as `x`; it has no `title`
        # parameter, so the panel title is set on the Axes instead.
        sns.countplot(x=col, data=df, ax=ax[j])
        ax[j].set_title(col)
In [50]:
# Kernel-density plots for the last 15 columns of `df`, laid out as
# n_rows figures with n_cols independent-y panels each.
cols = df.columns[-15:]
n_cols = 5
n_rows = 3
for i in range(n_rows):
    first = i * n_cols  # index in `cols` of this figure's left-most panel
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, sharey=False, figsize=(12, 10))
    for j, panel in enumerate(ax):
        col = cols[first + j]
        df[col].plot(kind='kde', title=col, ax=panel)
In [57]:
# Pairwise scatter/histogram grid over the columns from position 15 onward.
# `.ix` is deprecated and was removed in pandas 1.0; `.iloc` is the explicit
# positional equivalent of the integer slice used here.
cols = df.iloc[:, 15:]
sns.set(font_scale=1.4)
# pairplot is figure-level and builds its own figure, so no plt.figure() call
# is made beforehand (one would only leave an extra, empty figure behind).
hm = sns.pairplot(cols)
plt.show()
In [34]:
# Annotated correlation heatmap for the last 15 columns of `df`.
cols = df.columns[-15:].tolist()

# rowvar=False treats each column as a variable, which is the same as
# correlating the rows of the transposed matrix.
cm = np.corrcoef(df[cols].values, rowvar=False)

plt.figure(figsize=(15, 15))
sns.set(font_scale=1.4)
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 5},
    yticklabels=cols,
    xticklabels=cols,
)
hm = sns.heatmap(cm, **heatmap_options)
plt.show()
In [61]:
def CohenEffectSize(group1, group2):
    """Return the absolute Cohen's d effect size between two samples.

    The absolute difference of the two group means is divided by the square
    root of a pooled variance, in which each group's ``.var()`` is weighted
    by that group's length.

    Parameters
    ----------
    group1, group2 : objects supporting ``len()``, ``.mean()`` and ``.var()``
        The notebook passes pandas Series.

    Returns
    -------
    float
        The non-negative effect size, or NaN when it is undefined: either
        group is empty, or the pooled variance is zero or NaN.
    """
    n1, n2 = len(group1), len(group2)
    # An empty group has no mean or variance; bail out before dividing.
    if n1 == 0 or n2 == 0:
        return float('nan')

    diff = abs(group1.mean() - group2.mean())
    var1, var2 = group1.var(), group2.var()
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)

    # `not (x > 0)` is true for zero and for NaN alike, so this one guard
    # prevents both a division by zero and a NaN reaching sqrt().
    if not (pooled_var > 0):
        return float('nan')
    return diff / sqrt(pooled_var)
In [62]:
# Label the 'A'-vs-'B' effect size on `loss` for features cat1..cat72
# (presumably the two-level categorical columns -- confirm against the data).
# Each entry is (inclusive upper bound on d, label); the first match wins.
EFFECT_SIZE_LABELS = [
    (0.01, 'VERY SMALL'),
    (0.2, 'SMALL'),
    (0.5, 'MEDIUM'),
    (0.8, 'LARGE'),
    (1.2, 'VERY LARGE'),
    (2.0, 'HUGE'),
]

for i in range(1, 73):
    col = 'cat{}'.format(i)
    group1 = df.loc[df[col] == 'A', 'loss']
    group2 = df.loc[df[col] == 'B', 'loss']
    d = CohenEffectSize(group1, group2)

    if d != d:
        # NaN is the only value unequal to itself: the effect size is undefined.
        label = 'UNDEFINED'
    else:
        # Values above the last bound used to print nothing; report them too.
        label = 'ABOVE HUGE'
        for upper_bound, name in EFFECT_SIZE_LABELS:
            if d <= upper_bound:
                label = name
                break

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('{} ==> cat{} | d={}'.format(label, i, d))
In [ ]:
# Disabled cell, kept commented out. For each feature cat1..cat72 whose
# effect size d is >= 1.0, it would open a 10x10 figure and overlay KDE plots
# of `loss` for the rows where the feature equals 'A' and where it equals 'B'.
# for i in range(1,73):
#     temp_vec = []
#     group1 = df[df['cat{}'.format(i)]=='A'].loss
#     group2 = df[df['cat{}'.format(i)]=='B'].loss
#     d = CohenEffectSize(group1,group2)
#     if d >= 1.0:
#         plt.figure(figsize=(10,10))
#         df[df['cat{}'.format(i)]=='A'].loss.plot(kind='kde',title='cat{}'.format(i))
#         df[df['cat{}'.format(i)]=='B'].loss.plot(kind='kde',title='cat{}'.format(i))