We're going to check the quality of a new data set...
Download the data file from /home/data/kelleher/MotorInsuranceFraudClaimABTFull.csv
In [2]:
import os, sys
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
# Load the motor-insurance fraud claims table and preview its first rows.
csv_path = '/home/data/kelleher/MotorInsuranceFraudClaimABTFull.csv'
df = pd.read_csv(csv_path)
df.head()
Out[3]:
In [4]:
# (number of rows, number of columns) of the loaded table.
df.shape
Out[4]:
In [6]:
# Inferred dtype of every column, to spot columns parsed as the wrong type.
df.dtypes
Out[6]:
In [7]:
# Peek at the first ten values of the ID column.
df['ID'].head(10)
Out[7]:
In [9]:
# Smallest and largest ID, shown as a (min, max) pair.
(
    df['ID'].min(),
    df['ID'].max(),
)
Out[9]:
In [11]:
# Number of distinct ID values; compare with the row count from df.shape
# to see whether any ID is repeated.
df['ID'].unique().shape
Out[11]:
In [12]:
# Distribution of policy-holder income over the whole table.
df['Income of Policy Holder'].hist()
Out[12]:
In [18]:
# Zoom in on incomes below 30000 with fine-grained bins.
df.loc[
    df['Income of Policy Holder'] < 30000,
    ['Income of Policy Holder'],
].hist(bins=100)
Out[18]:
In [20]:
# Shapes of two subsets: rows with income below 10000, and rows with income
# exactly 0.
(
    df.loc[df['Income of Policy Holder'] < 10000].shape,
    df.loc[df['Income of Policy Holder'] == 0].shape,
)
Out[20]:
In [22]:
# Income distribution once the zero-income rows are left out.
df.loc[
    df['Income of Policy Holder'] > 0,
    ['Income of Policy Holder'],
].hist(bins=10)
Out[22]:
In [26]:
# Overwrite zero incomes with NaN. This mutates df in place, so every later
# cell sees the modified column.
# NOTE(review): presumably a zero income means "not recorded" rather than a
# genuine value -- confirm with the data owner.
df.loc[df['Income of Policy Holder']==0, 'Income of Policy Holder'] = np.nan
In [28]:
# Mean income, computed after the zero incomes were replaced with NaN.
df['Income of Policy Holder'].mean()
Out[28]:
In [29]:
# Median income, computed after the zero incomes were replaced with NaN.
df['Income of Policy Holder'].median()
Out[29]:
In [ ]:
In [30]:
# Summary statistics of the table after the income clean-up.
df.describe()
Out[30]:
In [31]:
# Preview the first rows again now that zero incomes have become NaN.
df.head()
Out[31]:
In [39]:
# Fraction of rows that contain at least one missing value in any column.
# The float() keeps this a true division even under Python 2.
1.0 - float(len(df.dropna())) / len(df)
Out[39]:
In [44]:
# Shape of the subset of rows whose income is missing.
# NaN never compares equal to anything, itself included, so a filter written
# as `== np.nan` always selects zero rows; isna() is the correct test.
df[df['Income of Policy Holder'].isna()].shape
Out[44]:
In [47]:
# Shape of the subset of rows whose income is present (not NaN).
df[df['Income of Policy Holder'].notna()].shape
Out[47]:
In [48]:
# Display the table without the rows whose income is missing. The result is
# not assigned to anything, so df itself is left unchanged by this cell.
df.dropna(subset=['Income of Policy Holder'])
Out[48]:
In [ ]:
In [51]:
# Number of non-null IDs per insurance type.
# The trailing space in 'Insurance Type ' is part of the key used here.
df.groupby('Insurance Type ')['ID'].count()
Out[51]:
In [ ]:
# Same per-insurance-type ID count as the preceding executed cell; this copy
# was never run (empty execution counter).
df.groupby('Insurance Type ')['ID'].count()
In [57]:
# Number of non-null IDs per injury type, most frequent first.
(
    df.groupby('Injury Type')['ID']
    .count()
    .sort_values(ascending=False)
)
Out[57]:
In [58]:
# List the exact column labels (useful for spotting stray whitespace).
df.columns
Out[58]:
In [59]:
# Summary statistics again, for reference before the plots below.
df.describe()
Out[59]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [60]:
# Spread of claim amounts within each injury type.
df.boxplot('Claim Amount', by='Injury Type')
Out[60]:
In [61]:
# Spread of policy-holder income within each injury type.
df.boxplot('Income of Policy Holder', by='Injury Type')
Out[61]:
In [84]:
# Pairwise view of three columns taken from the describe() output: scatter
# plots off the diagonal, histograms on the diagonal.
# NOTE(review): the [1:4] slice presumably skips an ID column at position 0
# of the describe() output -- confirm against the actual column order.
cols = df.describe().columns[1:4]
Ncol = len(cols)
fig, axes = plt.subplots(Ncol, Ncol, figsize=(15, 15), squeeze=False)
for row, i in enumerate(cols):
    for col, j in enumerate(cols):
        ax = axes[row, col]
        if i != j:
            ax.plot(df[i], df[j], '*', alpha=0.2)
            ax.set_title('%s - %s' % (i, j))
        else:
            # Drop missing values of this column only. Calling df.dropna()
            # first would also discard rows that merely lack a value in some
            # other column, distorting the histogram.
            ax.hist(df[j].dropna())
            ax.set_title(j)
plt.show()