EXPLORATORY DATA ANALYSIS


In [60]:
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import pandas as pd
import numpy as np
% matplotlib inline

LOAD DATA


In [11]:
df = pd.read_csv('https://s3.amazonaws.com/marweezys-bucket/all_state_insurance_prediction/train.csv')
df.head()


Out[11]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10 cat11 cat12 cat13 cat14 cat15 cat16 cat17 cat18 cat19 cat20 cat21 cat22 cat23 cat24 cat25 cat26 cat27 cat28 cat29 cat30 cat31 cat32 cat33 cat34 cat35 cat36 cat37 cat38 cat39 cat40 cat41 cat42 cat43 cat44 cat45 cat46 cat47 cat48 cat49 cat50 cat51 cat52 cat53 cat54 cat55 cat56 cat57 cat58 cat59 cat60 cat61 cat62 cat63 cat64 cat65 cat66 cat67 cat68 cat69 cat70 cat71 cat72 cat73 cat74 cat75 cat76 cat77 cat78 cat79 cat80 cat81 cat82 cat83 cat84 cat85 cat86 cat87 cat88 cat89 cat90 cat91 cat92 cat93 cat94 cat95 cat96 cat97 cat98 cat99 cat100 cat101 cat102 cat103 cat104 cat105 cat106 cat107 cat108 cat109 cat110 cat111 cat112 cat113 cat114 cat115 cat116 cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B A B A A A A A A A A A A A B A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A B A D B B D D B D C B D B A A A A A D B C E A C T B G A A I E G J G BU BC C AS S A O LB 0.726300 0.245921 0.187583 0.789639 0.310061 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B B A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A D B B D D A B C B D B A A A A A D D C E E D T L F A A E E I K K BI CQ A AV BM A O DP 0.330514 0.737068 0.592681 0.614134 0.885834 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60
2 5 A B A A B A A A B B B B B A A A A A A A A A A A A A A A A A A A A A A B A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A D B B B D B D C B B B A A A A A D D C E E A D L O A B E F H F A AB DK A C AF A I GK 0.261841 0.358319 0.484196 0.236924 0.397069 0.289648 0.315545 0.27320 0.26076 0.32446 0.381398 0.373424 0.195709 0.774425 3005.09
3 10 B B A B A A A A B A A A A A A A A A A A A A B A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A B A A A D B B D D D B C B D B A A A A A D D C E E D T I D A A E E I K K BI CS C N AE A O DJ 0.321594 0.555782 0.527991 0.373816 0.422268 0.440945 0.391128 0.31796 0.32128 0.44467 0.327915 0.321570 0.605077 0.602642 939.85
4 11 A B A B A A A A B B A B A A A A A A A A A A B A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A B A A A A D B D B D B B C B B C A A A B H D B D E E A P F J A A D E K G B H C C Y BM A K CK 0.273204 0.159990 0.527991 0.473202 0.704268 0.178193 0.247408 0.24564 0.22089 0.21230 0.204687 0.202213 0.246011 0.432606 2763.85

In [ ]:
df.describe()

VIEW HEAD


In [7]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
print df.head(5)


   id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10 cat11 cat12 cat13  \
0   1    A    B    A    B    A    A    A    A    B     A     B     A     A   
1   2    A    B    A    A    A    A    A    A    B     B     A     A     A   
2   5    A    B    A    A    B    A    A    A    B     B     B     B     B   
3  10    B    B    A    B    A    A    A    A    B     A     A     A     A   
4  11    A    B    A    B    A    A    A    A    B     B     A     B     A   

  cat14 cat15 cat16 cat17 cat18 cat19 cat20 cat21 cat22 cat23 cat24 cat25  \
0     A     A     A     A     A     A     A     A     A     B     A     A   
1     A     A     A     A     A     A     A     A     A     A     A     A   
2     A     A     A     A     A     A     A     A     A     A     A     A   
3     A     A     A     A     A     A     A     A     A     B     A     A   
4     A     A     A     A     A     A     A     A     A     B     A     A   

  cat26 cat27 cat28 cat29 cat30 cat31 cat32 cat33 cat34 cat35 cat36 cat37  \
0     A     A     A     A     A     A     A     A     A     A     A     A   
1     A     A     A     A     A     A     A     A     A     A     A     A   
2     A     A     A     A     A     A     A     A     A     A     B     A   
3     A     A     A     A     A     A     A     A     A     A     A     A   
4     A     A     A     A     A     A     A     A     A     A     A     A   

  cat38 cat39 cat40 cat41 cat42 cat43 cat44 cat45 cat46 cat47 cat48 cat49  \
0     A     A     A     A     A     A     A     A     A     A     A     A   
1     A     A     A     A     A     A     A     A     A     A     A     A   
2     A     A     A     A     A     A     A     A     A     A     A     A   
3     A     A     A     A     A     A     A     A     A     A     A     A   
4     A     A     A     A     A     A     A     A     A     A     A     A   

  cat50 cat51 cat52 cat53 cat54 cat55 cat56 cat57 cat58 cat59 cat60 cat61  \
0     A     A     A     A     A     A     A     A     A     A     A     A   
1     A     A     A     A     A     A     A     A     A     A     A     A   
2     A     A     A     A     A     A     A     A     A     A     A     A   
3     A     A     A     A     A     A     A     A     A     A     A     A   
4     A     A     A     A     A     A     A     A     A     A     A     A   

  cat62 cat63 cat64 cat65 cat66 cat67 cat68 cat69 cat70 cat71 cat72 cat73  \
0     A     A     A     A     A     A     A     A     A     A     A     A   
1     A     A     A     A     A     A     A     A     A     A     A     A   
2     A     A     A     A     A     A     A     A     A     A     A     A   
3     A     A     A     A     A     A     A     A     A     A     A     B   
4     A     A     A     A     A     A     A     A     A     A     B     A   

  cat74 cat75 cat76 cat77 cat78 cat79 cat80 cat81 cat82 cat83 cat84 cat85  \
0     A     B     A     D     B     B     D     D     B     D     C     B   
1     A     A     A     D     B     B     D     D     A     B     C     B   
2     A     A     A     D     B     B     B     D     B     D     C     B   
3     A     A     A     D     B     B     D     D     D     B     C     B   
4     A     A     A     D     B     D     B     D     B     B     C     B   

  cat86 cat87 cat88 cat89 cat90 cat91 cat92 cat93 cat94 cat95 cat96 cat97  \
0     D     B     A     A     A     A     A     D     B     C     E     A   
1     D     B     A     A     A     A     A     D     D     C     E     E   
2     B     B     A     A     A     A     A     D     D     C     E     E   
3     D     B     A     A     A     A     A     D     D     C     E     E   
4     B     C     A     A     A     B     H     D     B     D     E     E   

  cat98 cat99 cat100 cat101 cat102 cat103 cat104 cat105 cat106 cat107 cat108  \
0     C     T      B      G      A      A      I      E      G      J      G   
1     D     T      L      F      A      A      E      E      I      K      K   
2     A     D      L      O      A      B      E      F      H      F      A   
3     D     T      I      D      A      A      E      E      I      K      K   
4     A     P      F      J      A      A      D      E      K      G      B   

  cat109 cat110 cat111 cat112 cat113 cat114 cat115 cat116     cont1     cont2  \
0     BU     BC      C     AS      S      A      O     LB  0.726300  0.245921   
1     BI     CQ      A     AV     BM      A      O     DP  0.330514  0.737068   
2     AB     DK      A      C     AF      A      I     GK  0.261841  0.358319   
3     BI     CS      C      N     AE      A      O     DJ  0.321594  0.555782   
4      H      C      C      Y     BM      A      K     CK  0.273204  0.159990   

      cont3     cont4     cont5     cont6     cont7    cont8    cont9  \
0  0.187583  0.789639  0.310061  0.718367  0.335060  0.30260  0.67135   
1  0.592681  0.614134  0.885834  0.438917  0.436585  0.60087  0.35127   
2  0.484196  0.236924  0.397069  0.289648  0.315545  0.27320  0.26076   
3  0.527991  0.373816  0.422268  0.440945  0.391128  0.31796  0.32128   
4  0.527991  0.473202  0.704268  0.178193  0.247408  0.24564  0.22089   

    cont10    cont11    cont12    cont13    cont14     loss  
0  0.83510  0.569745  0.594646  0.822493  0.714843  2213.18  
1  0.43919  0.338312  0.366307  0.611431  0.304496  1283.60  
2  0.32446  0.381398  0.373424  0.195709  0.774425  3005.09  
3  0.44467  0.327915  0.321570  0.605077  0.602642   939.85  
4  0.21230  0.204687  0.202213  0.246011  0.432606  2763.85  

VIEW CATEGORICAL DATA


In [16]:
cols = df.columns[1:]
n_cols = 4
n_rows = 29
for i in range(n_rows):
    fg,ax = plt.subplots(nrows=1,ncols=n_cols,sharey=True,figsize=(12, 8))
    for j in range(n_cols):
        sns.countplot(title=cols[i*n_cols+j], data=df, ax=ax[j])


/home/ubuntu/anaconda2/lib/python2.7/site-packages/matplotlib/pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

VIEW CONTINUOUS DATA

  • Kernal Density Estimation
  • Scatter (pair plot)
  • Heat Map (correlation)

In [50]:
cols = df.columns[-15:]
n_cols = 5
n_rows = 3
for i in range(n_rows):
    fg,ax = plt.subplots(nrows=1,ncols=n_cols,sharey=False,figsize=(12, 10))
    for j in range(n_cols):
        col = cols[i*n_cols+j]
        df[col].plot(kind='kde',title=col, ax=ax[j])



In [57]:
cols = df.ix[:,15:]
plt.figure(figsize=(15,15))
sns.set(font_scale=1.4)
hm = sns.pairplot(cols)
plt.show()


<matplotlib.figure.Figure at 0x7ff172ac11d0>

In [34]:
cols = list(df.columns[-15:])
cm = np.corrcoef(df[cols].values.T)
plt.figure(figsize=(15,15))
sns.set(font_scale=1.4)
hm = sns.heatmap(cm,cbar=True,annot=True,square=True,fmt='.2f',annot_kws={'size':5},yticklabels=cols,xticklabels=cols)
plt.show()


VIEW EFFECT SIZE (Cohen's D)

  • effect size for binary categories

In [61]:
def CohenEffectSize(group1,group2):
    diff = abs(group1.mean() - group2.mean())
    var1, var2 = group1.var(),group2.var()
    n1, n2 = len(group1), len(group2)
    pooled_var = (n1*var1 + n2*var2)/(n1+n2)
    d = diff/sqrt(pooled_var)
    return d

In [62]:
for i in range(1,73):
    temp_vec = []
    group1 = df[df['cat{}'.format(i)]=='A'].loss
    group2 = df[df['cat{}'.format(i)]=='B'].loss
    d = CohenEffectSize(group1,group2)
    if d <= 0.01:
        print 'VERY SMALL ==> cat{} | d={}'.format(i,d)
    elif d <= 0.2:
        print 'SMALL ==> cat{} | d={}'.format(i,d)
    elif d <= 0.5:
        print 'MEDIUM ==> cat{} | d={}'.format(i,d)
    elif d <= 0.8:
        print 'LARGE ==> cat{} | d={}'.format(i,d)
    elif d <= 1.2:
        print 'VERY LARGE ==> cat{} | d={}'.format(i,d)
    elif d <= 2.0:
        print 'HUGE ==> cat{} | d={}'.format(i,d)


LARGE ==> cat1 | d=0.527234418906
MEDIUM ==> cat2 | d=0.476200229156
VERY LARGE ==> cat3 | d=0.864891977738
MEDIUM ==> cat4 | d=0.22909764367
MEDIUM ==> cat5 | d=0.224872349815
MEDIUM ==> cat6 | d=0.256651478496
HUGE ==> cat7 | d=1.89618840506
MEDIUM ==> cat8 | d=0.364065233569
MEDIUM ==> cat9 | d=0.464599483161
VERY LARGE ==> cat10 | d=0.825395544668
LARGE ==> cat11 | d=0.74854499016
VERY LARGE ==> cat12 | d=0.91341827625
LARGE ==> cat13 | d=0.747744872695
VERY LARGE ==> cat14 | d=0.945628535296
SMALL ==> cat15 | d=0.133990554796
VERY LARGE ==> cat16 | d=1.06832510567
VERY LARGE ==> cat17 | d=0.949183875689
LARGE ==> cat18 | d=0.581194935944
MEDIUM ==> cat19 | d=0.359983686376
VERY LARGE ==> cat20 | d=0.952731278095
SMALL ==> cat21 | d=0.13749638646
MEDIUM ==> cat22 | d=0.41657200561
MEDIUM ==> cat23 | d=0.438553357322
LARGE ==> cat24 | d=0.512596933675
MEDIUM ==> cat25 | d=0.30364469796
MEDIUM ==> cat26 | d=0.208346462747
SMALL ==> cat27 | d=0.0300049104631
LARGE ==> cat28 | d=0.638742376891
LARGE ==> cat29 | d=0.590954605747
MEDIUM ==> cat30 | d=0.357727835686
SMALL ==> cat31 | d=0.0756446943794
MEDIUM ==> cat32 | d=0.289987028772
LARGE ==> cat33 | d=0.6002769331
LARGE ==> cat34 | d=0.56805334523
LARGE ==> cat35 | d=0.62738876615
MEDIUM ==> cat36 | d=0.421954036113
SMALL ==> cat37 | d=0.0338703340429
MEDIUM ==> cat38 | d=0.319156158517
SMALL ==> cat39 | d=0.101401863903
LARGE ==> cat40 | d=0.618296035364
MEDIUM ==> cat41 | d=0.455196551506
MEDIUM ==> cat42 | d=0.205224856921
MEDIUM ==> cat43 | d=0.323592448861
SMALL ==> cat44 | d=0.174239660301
LARGE ==> cat45 | d=0.504990338599
LARGE ==> cat46 | d=0.679270980475
LARGE ==> cat47 | d=0.584549304685
MEDIUM ==> cat48 | d=0.427686293808
SMALL ==> cat49 | d=0.119705243102
MEDIUM ==> cat50 | d=0.250186188957
MEDIUM ==> cat51 | d=0.425094812247
SMALL ==> cat52 | d=0.15113295985
SMALL ==> cat53 | d=0.178486076375
SMALL ==> cat54 | d=0.0557488286747
LARGE ==> cat55 | d=0.524848550913
MEDIUM ==> cat56 | d=0.399175232038
VERY LARGE ==> cat58 | d=1.04457660513
VERY LARGE ==> cat59 | d=1.06900426242
MEDIUM ==> cat60 | d=0.307840539322
MEDIUM ==> cat61 | d=0.490799149368
VERY LARGE ==> cat62 | d=0.946441810851
MEDIUM ==> cat63 | d=0.306541398539
HUGE ==> cat64 | d=1.36584814527
LARGE ==> cat65 | d=0.580289838339
MEDIUM ==> cat66 | d=0.281030882426
LARGE ==> cat67 | d=0.669063596502
LARGE ==> cat68 | d=0.506688822337
MEDIUM ==> cat69 | d=0.417518434656
SMALL ==> cat70 | d=0.028926119978
MEDIUM ==> cat71 | d=0.289268768283
MEDIUM ==> cat72 | d=0.485068650309

In [ ]:
# for i in range(1,73):
#     temp_vec = []
#     group1 = df[df['cat{}'.format(i)]=='A'].loss
#     group2 = df[df['cat{}'.format(i)]=='B'].loss
#     d = CohenEffectSize(group1,group2)
#     if d >= 1.0:
#         plt.figure(figsize=(10,10))
#         df[df['cat{}'.format(i)]=='A'].loss.plot(kind='kde',title='cat{}'.format(i))
#         df[df['cat{}'.format(i)]=='B'].loss.plot(kind='kde',title='cat{}'.format(i))