We're going to check the quality of a new data set...

Download the data file from /home/data/kelleher/MotorInsuranceFraudClaimABTFull.csv


In [2]:
import os, sys
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# Read the claims CSV into `df`, then preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='/home/data/kelleher/MotorInsuranceFraudClaimABTFull.csv',
)
df.head(n=5)


Out[3]:
ID Insurance Type Income of Policy Holder Marital Status Num Claimants Injury Type Overnight Hospital Stay Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
0 1 CI 0 NaN 2 Soft Tissue No 1625 3250 2 2.0 1.0 0 1
1 2 CI 0 NaN 2 Back Yes 15028 60112 1 0.0 0.0 15028 0
2 3 CI 54613 Married 1 Broken Limb No -99999 0 0 0.0 0.0 572 0
3 4 CI 0 NaN 3 Serious Yes 270200 0 0 0.0 0.0 270200 0
4 5 CI 0 NaN 4 Soft Tissue No 8869 0 0 0.0 0.0 0 1

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape


Out[4]:
(500, 14)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column;
# with default arguments the non-numeric columns are left out of the table.
df.describe()


Out[4]:
ID Income of Policy Holder Num Claimants Claim Amount Total Claimed Num Claims Num Soft Tissue % Soft Tissue Claim Amount Received Fraud Flag
count 500.000000 500.000000 500.000000 500.00000 500.00000 500.000000 490.000000 500.000000 500.000000 500.000000
mean 250.500000 13739.994000 1.908000 16373.20400 9597.18600 0.798000 0.234694 0.172012 13051.942000 0.336000
std 144.481833 20081.535489 1.012713 29426.27696 35655.68622 2.666724 0.589635 0.428015 30547.194864 0.472812
min 1.000000 0.000000 1.000000 -99999.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 125.750000 0.000000 1.000000 3322.25000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 250.500000 0.000000 2.000000 5663.00000 0.00000 0.000000 0.000000 0.000000 3253.500000 0.000000
75% 375.250000 33918.500000 3.000000 12245.50000 11282.75000 1.000000 0.000000 0.000000 8191.750000 1.000000
max 500.000000 71284.000000 4.000000 270200.00000 729792.00000 56.000000 5.000000 2.000000 295303.000000 1.000000

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [5]:
# Box plot of 'Claim Amount', drawn as one box per 'Injury Type' group.
df.boxplot(by='Injury Type', column='Claim Amount')


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x3bfb150>