We're going to check the quality of the a new data set...

Download the data file from /home/data/kelleher/MotorInsuranceFraudClaimABTFull.csv


In [2]:
import os, sys
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('/home/data/kelleher/MotorInsuranceFraudClaimABTFull.csv')
df.head()


Out[3]:
ID Insurance Type Income of Policy Holder Marital Status Num Claimants Injury Type Overnight Hospital Stay Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
0 1 CI 0 NaN 2 Soft Tissue No 1625 3250 2 2.0 1.0 0 1
1 2 CI 0 NaN 2 Back Yes 15028 60112 1 0.0 0.0 15028 0
2 3 CI 54613 Married 1 Broken Limb No -99999 0 0 0.0 0.0 572 0
3 4 CI 0 NaN 3 Serious Yes 270200 0 0 0.0 0.0 270200 0
4 5 CI 0 NaN 4 Soft Tissue No 8869 0 0 0.0 0.0 0 1

In [4]:
df.shape


Out[4]:
(500, 14)

In [6]:
df.dtypes


Out[6]:
ID                           int64
Insurance Type              object
Income of Policy Holder      int64
Marital Status              object
Num Claimants                int64
Injury Type                 object
Overnight Hospital Stay     object
Claim Amount                 int64
Total Claimed                int64
Num Claims                   int64
Num Soft Tissue            float64
% Soft Tissue              float64
Claim Amount Received        int64
Fraud Flag                   int64
dtype: object

In [7]:
df.ID[:10]


Out[7]:
0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: ID, dtype: int64

In [9]:
df.ID.min(), df.ID.max()


Out[9]:
(1, 500)

In [11]:
df.ID.unique().shape


Out[11]:
(500,)

In [12]:
df['Income of Policy Holder'].hist()


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x6392d50>

In [18]:
df[df['Income of Policy Holder']<30000][['Income of Policy Holder']].hist(bins=100)


Out[18]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7626490>]], dtype=object)

In [20]:
df[df['Income of Policy Holder']<10000].shape, df[df['Income of Policy Holder']==0].shape


Out[20]:
((330, 14), (330, 14))

In [22]:
df[df['Income of Policy Holder']>0][['Income of Policy Holder']].hist(bins=10)


Out[22]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7ee7310>]], dtype=object)

In [26]:
df.loc[df['Income of Policy Holder']==0, 'Income of Policy Holder'] = np.nan

In [28]:
df['Income of Policy Holder'].mean()


Out[28]:
40411.74705882353

In [29]:
df['Income of Policy Holder'].median()


Out[29]:
40455.0

In [ ]:


In [30]:
df.describe()


Out[30]:
ID Income of Policy Holder Num Claimants Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
count 500.000000 170.000000 500.000000 500.00000 500.00000 500.000000 490.000000 500.000000 500.000000 500.000000
mean 250.500000 40411.747059 1.908000 16373.20400 9597.18600 0.798000 0.234694 0.172012 13051.942000 0.336000
std 144.481833 10319.138946 1.012713 29426.27696 35655.68622 2.666724 0.589635 0.428015 30547.194864 0.472812
min 1.000000 16730.000000 1.000000 -99999.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 125.750000 33552.750000 1.000000 3322.25000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 250.500000 40455.000000 2.000000 5663.00000 0.00000 0.000000 0.000000 0.000000 3253.500000 0.000000
75% 375.250000 46913.500000 3.000000 12245.50000 11282.75000 1.000000 0.000000 0.000000 8191.750000 1.000000
max 500.000000 71284.000000 4.000000 270200.00000 729792.00000 56.000000 5.000000 2.000000 295303.000000 1.000000

In [31]:
df.head()


Out[31]:
ID Insurance Type Income of Policy Holder Marital Status Num Claimants Injury Type Overnight Hospital Stay Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
0 1 CI NaN NaN 2 Soft Tissue No 1625 3250 2 2.0 1.0 0 1
1 2 CI NaN NaN 2 Back Yes 15028 60112 1 0.0 0.0 15028 0
2 3 CI 54613.0 Married 1 Broken Limb No -99999 0 0 0.0 0.0 572 0
3 4 CI NaN NaN 3 Serious Yes 270200 0 0 0.0 0.0 270200 0
4 5 CI NaN NaN 4 Soft Tissue No 8869 0 0 0.0 0.0 0 1

In [39]:
(1.0-1.0*df.dropna().shape[0]/df.shape[0])


Out[39]:
0.6659999999999999

In [44]:
df[df['Income of Policy Holder'] == np.nan].shape


Out[44]:
(0, 14)

In [47]:
df[~ np.isnan(df['Income of Policy Holder'])].shape


Out[47]:
(170, 14)

In [48]:
df.dropna(subset=['Income of Policy Holder'])


Out[48]:
ID Insurance Type Income of Policy Holder Marital Status Num Claimants Injury Type Overnight Hospital Stay Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
2 3 CI 54613.0 Married 1 Broken Limb No -99999 0 0 0.0 0.000000 572 0
6 7 CI 52567.0 Single 3 Broken Limb No 3017 18102 2 1.0 0.500000 0 1
9 10 CI 42300.0 Married 4 Back No 2260 0 0 0.0 0.000000 2260 0
14 15 CI 50654.0 Married 2 Soft Tissue No 1893 5679 4 3.0 0.750000 0 1
18 19 CI 37435.0 Divorced 2 Back No 3071 6142 2 0.0 0.000000 3071 0
27 28 CI 38179.0 Single 2 Back No 5212 10424 1 0.0 0.000000 5212 0
35 36 CI 42239.0 Divorced 3 Soft Tissue No 74365 0 0 0.0 0.000000 0 1
37 38 CI 37001.0 Married 2 Broken Limb No 3649 0 0 0.0 0.000000 3649 0
39 40 CI 48898.0 Married 1 Back No 10650 0 0 0.0 0.000000 0 1
40 41 CI 35139.0 Married 2 Broken Limb No 3873 17578 1 0.0 0.000000 3873 0
41 42 CI 42575.0 Married 3 Broken Limb No 29366 0 0 0.0 0.000000 18607 0
45 46 CI 45351.0 Single 3 Back No 4568 0 0 0.0 0.000000 4568 0
46 47 CI 57509.0 Divorced 4 Soft Tissue No 36758 0 0 0.0 0.000000 0 1
51 52 CI 30294.0 Married 1 Back No 11342 7887 1 1.0 1.000000 0 1
52 53 CI 35469.0 Married 1 Broken Limb Yes 3601 11468 1 2.0 2.000000 8986 0
56 57 CI 32366.0 Married 1 Back No 6341 10598 1 0.0 0.000000 6341 0
57 58 CI 51016.0 Married 1 Serious Yes 1065 24679 1 0.0 0.000000 24627 0
63 64 CI 49844.0 Divorced 1 Soft Tissue No 2017 0 0 0.0 0.000000 0 1
68 69 CI 42780.0 Divorced 1 Back No 27313 0 0 0.0 0.000000 0 1
72 73 CI 23850.0 Single 1 Broken Limb No 9221 0 0 0.0 0.000000 9221 0
73 74 CI 53109.0 Single 1 Back No 2957 0 0 0.0 0.000000 0 1
76 77 CI 52319.0 Married 2 Soft Tissue Yes 7484 5152 1 0.0 0.000000 7484 0
77 78 CI 26422.0 Married 1 Broken Limb No 2487 0 0 0.0 0.000000 2487 0
79 80 CI 40461.0 Single 2 Soft Tissue No 2747 0 0 0.0 0.000000 0 1
80 81 CI 42380.0 Married 1 Back No 42163 0 0 0.0 0.000000 39609 0
85 86 CI 21796.0 Single 1 Soft Tissue No 1052 0 0 0.0 0.000000 0 1
87 88 CI 29510.0 Divorced 2 Soft Tissue No 4961 13595 1 1.0 1.000000 0 1
96 97 CI 43015.0 Single 1 Serious Yes 75619 6727 1 0.0 0.000000 75619 0
100 101 CI 40913.0 Married 3 Broken Limb No 1310 0 0 0.0 0.000000 0 1
101 102 CI 46636.0 Married 1 Broken Limb No 3572 13284 1 2.0 2.000000 4026 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
408 409 CI 53764.0 Single 4 Back No 4498 9613 1 0.0 0.000000 4498 0
409 410 CI 54253.0 Married 2 Back No 11333 0 0 0.0 0.000000 0 1
410 411 CI 49071.0 Divorced 1 Soft Tissue No 3757 0 0 0.0 0.000000 0 1
414 415 CI 35239.0 Married 2 Broken Limb No 11254 9059 1 1.0 1.000000 0 1
417 418 CI 32512.0 Single 4 Soft Tissue No 64756 12314 1 0.0 0.000000 64756 0
427 428 CI 42855.0 Single 2 Serious Yes 2059 22887 1 1.0 1.000000 2059 0
429 430 CI 25584.0 Single 1 Broken Limb No 3445 0 0 0.0 0.000000 6363 0
432 433 CI 41622.0 Married 3 Soft Tissue No 3848 0 0 0.0 0.000000 0 1
440 441 CI 37956.0 Married 1 Broken Limb No 3231 26926 1 0.0 0.000000 3231 0
443 444 CI 39741.0 Married 1 Soft Tissue No 7012 0 0 0.0 0.000000 7012 0
446 447 CI 40811.0 Divorced 1 Back Yes 7872 0 0 0.0 0.000000 0 1
451 452 CI 17455.0 Married 1 Back No 4112 0 0 0.0 0.000000 0 1
453 454 CI 35857.0 Divorced 2 Soft Tissue No 3389 0 0 0.0 0.000000 0 1
454 455 CI 54500.0 Single 4 Soft Tissue No 30091 6164 1 1.0 1.000000 30091 0
457 458 CI 48176.0 Married 3 Soft Tissue Yes 4653 8203 1 0.0 0.000000 4653 0
460 461 CI 47371.0 Married 1 Broken Limb Yes 3194 11668 1 0.0 0.000000 3194 0
463 464 CI 42768.0 Married 1 Broken Limb Yes 45273 0 0 0.0 0.000000 45273 0
464 465 CI 49491.0 Married 1 Serious Yes 27326 42726 1 0.0 0.000000 27326 0
469 470 CI 26165.0 Married 1 Back No 5679 42832 3 1.0 0.333333 5679 0
472 473 CI 51570.0 Single 1 Broken Limb No 28953 0 0 0.0 0.000000 28953 0
474 475 CI 52665.0 Married 2 Serious Yes 3160 0 0 0.0 0.000000 3160 0
475 476 CI 43162.0 Married 1 Soft Tissue Yes 6473 0 0 0.0 0.000000 0 1
478 479 CI 34820.0 Married 2 Broken Limb No 4435 0 0 0.0 0.000000 0 1
482 483 CI 41333.0 Single 1 Back No 9899 0 0 0.0 0.000000 10646 0
488 489 CI 41381.0 Married 1 Back No 5391 0 0 0.0 0.000000 5391 0
489 490 CI 34756.0 Divorced 1 Soft Tissue No 2122 0 0 0.0 0.000000 0 1
490 491 CI 40204.0 Single 1 Back No 75748 11116 1 0.0 0.000000 0 1
493 494 CI 31951.0 Married 1 Broken Limb No 5227 22095 1 0.0 0.000000 5227 0
496 497 CI 29280.0 Married 4 Broken Limb Yes 3199 0 0 NaN 0.000000 0 1
498 499 CI 46683.0 Married 1 Broken Limb No 179448 0 0 0.0 0.000000 179448 0

170 rows × 14 columns


In [ ]:


In [51]:
df.groupby('Insurance Type ').ID.count()


Out[51]:
Insurance Type 
CI    500
Name: ID, dtype: int64

In [ ]:
df.groupby('Insurance Type ').ID.count()

In [57]:
df.groupby('Injury Type').ID.count().sort_values(ascending=False)


Out[57]:
Injury Type
Broken Limb    177
Soft Tissue    172
Back           116
Serious         35
Name: ID, dtype: int64

In [58]:
df.columns


Out[58]:
Index([u'ID', u'Insurance Type ', u'Income of Policy Holder',
       u'Marital Status', u'Num Claimants', u'Injury Type',
       u'Overnight Hospital Stay', u'Claim Amount', u'Total Claimed',
       u'Num Claims', u'Num Soft Tissue', u'% Soft Tissue',
       u'Claim Amount Received', u'Fraud Flag'],
      dtype='object')

In [59]:
df.describe()


Out[59]:
ID Income of Policy Holder Num Claimants Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
count 500.000000 170.000000 500.000000 500.00000 500.00000 500.000000 490.000000 500.000000 500.000000 500.000000
mean 250.500000 40411.747059 1.908000 16373.20400 9597.18600 0.798000 0.234694 0.172012 13051.942000 0.336000
std 144.481833 10319.138946 1.012713 29426.27696 35655.68622 2.666724 0.589635 0.428015 30547.194864 0.472812
min 1.000000 16730.000000 1.000000 -99999.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 125.750000 33552.750000 1.000000 3322.25000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 250.500000 40455.000000 2.000000 5663.00000 0.00000 0.000000 0.000000 0.000000 3253.500000 0.000000
75% 375.250000 46913.500000 3.000000 12245.50000 11282.75000 1.000000 0.000000 0.000000 8191.750000 1.000000
max 500.000000 71284.000000 4.000000 270200.00000 729792.00000 56.000000 5.000000 2.000000 295303.000000 1.000000

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [60]:
df.boxplot(column='Claim Amount', by='Injury Type')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x8690590>

In [61]:
df.boxplot(column='Income of Policy Holder', by='Injury Type')


Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x88c3690>

In [84]:
cols = df.describe().columns[1:4]
Ncol = len(cols)
plt.figure(figsize=(15,15))
n = 0
for i in cols:
    for j in cols:
        n += 1
        if i != j:
            plt.subplot(Ncol, Ncol, n)
            plt.plot(df[i], df[j], '*', alpha=0.2)
            plt.title('%s - %s'%(i, j))
        else:
            plt.subplot(Ncol, Ncol, n)
            plt.hist(df.dropna()[j])
plt.show()