BNP Paribas Cardif Claims Management

BNP Paribas Cardif Claims Management


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import ensemble
import xgboost as xgb
from sklearn.metrics import log_loss, make_scorer, accuracy_score
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import f_classif
import random

# random numbers replication
random.seed(2016)

%matplotlib inline

Preprocessing


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()


Out[2]:
ID target v1 v2 v3 v4 v5 v6 v7 v8 ... v122 v123 v124 v125 v126 v127 v128 v129 v130 v131
0 3 1 1.335739 8.727474 C 3.921026 7.915266 2.599278 3.176895 0.012941 ... 8.000000 1.989780 0.035754 AU 1.804126 3.113719 2.024285 0 0.636365 2.857144
1 4 1 NaN NaN C NaN 9.191265 NaN NaN 2.301630 ... NaN NaN 0.598896 AF NaN NaN 1.957825 0 NaN NaN
2 5 1 0.943877 5.310079 C 4.410969 5.326159 3.979592 3.928571 0.019645 ... 9.333333 2.477596 0.013452 AE 1.773709 3.922193 1.120468 2 0.883118 1.176472
3 6 1 0.797415 8.304757 C 4.225930 11.627438 2.097700 1.987549 0.171947 ... 7.018256 1.812795 0.002267 CJ 1.415230 2.954381 1.990847 1 1.677108 1.034483
4 8 1 NaN NaN C NaN NaN NaN NaN NaN ... NaN NaN NaN Z NaN NaN NaN 0 NaN NaN

5 rows × 133 columns


In [3]:
train.describe()


Out[3]:
ID target v1 v2 v4 v5 v6 v7 v8 v9 ... v121 v122 v123 v124 v126 v127 v128 v129 v130 v131
count 114321.000000 114321.000000 6.448900e+04 6.452500e+04 6.452500e+04 6.569700e+04 6.448900e+04 6.448900e+04 6.570200e+04 6.447000e+04 ... 6.448100e+04 6.447000e+04 63643.000000 6.570200e+04 6.448900e+04 6.448900e+04 6.569700e+04 114321.000000 6.447800e+04 6.442600e+04
mean 114228.928228 0.761199 1.630686e+00 7.464411e+00 4.145098e+00 8.742359e+00 2.436402e+00 2.483921e+00 1.496569e+00 9.031859e+00 ... 2.737596e+00 6.822439e+00 3.549938 9.198120e-01 1.672658e+00 3.239542e+00 2.030373e+00 0.310144 1.925763e+00 1.739389e+00
std 65934.487362 0.426353 1.082813e+00 2.961676e+00 1.148263e+00 2.036018e+00 5.999653e-01 5.894485e-01 2.783003e+00 1.930262e+00 ... 1.356294e+00 1.795978e+00 2.604704 2.099407e+00 5.031683e-01 1.625988e+00 1.074232e+00 0.693262 1.264497e+00 1.134702e+00
min 3.000000 0.000000 -9.996497e-07 -9.817614e-07 -6.475929e-07 -5.287068e-07 -9.055091e-07 -9.468765e-07 -7.783778e-07 -9.828757e-07 ... -9.820642e-07 -9.978497e-07 0.019139 -9.994953e-07 -9.564174e-07 -9.223798e-07 8.197812e-07 0.000000 -9.901257e-07 -9.999134e-07
25% 57280.000000 1.000000 9.135798e-01 5.316428e+00 3.487398e+00 7.605918e+00 2.065064e+00 2.101477e+00 8.658986e-02 7.853659e+00 ... 1.786965e+00 5.647712e+00 1.963315 2.053777e-02 1.417600e+00 2.101900e+00 1.393830e+00 0.000000 1.106172e+00 1.012658e+00
50% 114189.000000 1.000000 1.469550e+00 7.023803e+00 4.205991e+00 8.670867e+00 2.412790e+00 2.452166e+00 3.860317e-01 9.059582e+00 ... 2.436195e+00 6.749117e+00 2.739239 1.398639e-01 1.614802e+00 2.963620e+00 1.798436e+00 0.000000 1.560138e+00 1.589403e+00
75% 171206.000000 1.000000 2.136128e+00 9.465497e+00 4.833250e+00 9.771353e+00 2.775285e+00 2.834285e+00 1.625246e+00 1.023256e+01 ... 3.379175e+00 7.911392e+00 4.075361 8.718333e-01 1.843886e+00 4.108146e+00 2.390158e+00 0.000000 2.332425e+00 2.261905e+00
max 228713.000000 1.000000 2.000000e+01 2.000000e+01 2.000000e+01 2.000000e+01 2.000000e+01 2.000000e+01 2.000000e+01 2.000000e+01 ... 2.000000e+01 2.000000e+01 19.686069 2.000000e+01 1.563161e+01 2.000000e+01 2.000000e+01 11.000000 2.000000e+01 2.000000e+01

8 rows × 114 columns


In [4]:
# Check for null values
train.isnull().any().any()


Out[4]:
True

In [14]:
# features that are string
types = train.columns.to_series().groupby(train.dtypes).groups
ctext = types[np.dtype('object')]
ctext


Out[14]:
['v3',
 'v22',
 'v24',
 'v30',
 'v31',
 'v47',
 'v52',
 'v56',
 'v66',
 'v71',
 'v74',
 'v75',
 'v79',
 'v91',
 'v107',
 'v110',
 'v112',
 'v113',
 'v125']

In [13]:
types[np.dtype('object')]


Out[13]:
['v3',
 'v22',
 'v24',
 'v30',
 'v31',
 'v47',
 'v52',
 'v56',
 'v66',
 'v71',
 'v74',
 'v75',
 'v79',
 'v91',
 'v107',
 'v110',
 'v112',
 'v113',
 'v125']

In [6]:
# fill na with mode for categorical data
for c in ctext:
    mode = train[c].mode()[0]
    train[c] = train[c].fillna(mode)
    mode = test[c].mode()[0]
    test[c] = test[c].fillna(mode)
train[ctext].head()


Out[6]:
v3 v22 v24 v30 v31 v47 v52 v56 v66 v71 v74 v75 v79 v91 v107 v110 v112 v113 v125
0 C XDX C C A C G DI C F B D E A E B O G AU
1 C GUV C C A E G DY A F B D D B B A U G AF
2 C FQ E C A C F AS A B B B E G C B S G AE
3 C ACUE D C B C H BW A F B D B B B B J G CJ
4 C HIT E C A I H BW C F B D C G C A T G Z

In [7]:
# transform string columns to numbers
enc = LabelEncoder()
for c in ctext:
    train[c] = enc.fit_transform(train[c])
    test[c] = enc.fit_transform(test[c])
train[ctext].head()


Out[7]:
v3 v22 v24 v30 v31 v47 v52 v56 v66 v71 v74 v75 v79 v91 v107 v110 v112 v113 v125
0 2 16670 2 2 0 2 6 85 2 4 1 3 4 0 4 1 14 17 21
1 2 7733 2 2 0 4 6 101 0 4 1 3 3 1 1 0 20 17 6
2 2 7086 4 2 0 2 5 17 0 1 1 1 4 6 2 1 18 17 5
3 2 1510 3 2 1 2 7 47 0 4 1 3 1 1 1 1 9 17 64
4 2 8037 4 2 0 8 7 47 2 4 1 3 2 6 2 0 19 17 89

In [8]:
# fill na with mean for numerical data
columns = list(test.columns)
for c in columns:
    mean = train[c].mean()
    train[c] = train[c].fillna(mean)
    mean = test[c].mean()
    test[c] = test[c].fillna(mean)
train.head()


Out[8]:
ID target v1 v2 v3 v4 v5 v6 v7 v8 ... v122 v123 v124 v125 v126 v127 v128 v129 v130 v131
0 3 1 1.335739 8.727474 2 3.921026 7.915266 2.599278 3.176895 0.012941 ... 8.000000 1.989780 0.035754 21 1.804126 3.113719 2.024285 0 0.636365 2.857144
1 4 1 1.630686 7.464411 2 4.145098 9.191265 2.436402 2.483921 2.301630 ... 6.822439 3.549938 0.598896 6 1.672658 3.239542 1.957825 0 1.925763 1.739389
2 5 1 0.943877 5.310079 2 4.410969 5.326159 3.979592 3.928571 0.019645 ... 9.333333 2.477596 0.013452 5 1.773709 3.922193 1.120468 2 0.883118 1.176472
3 6 1 0.797415 8.304757 2 4.225930 11.627438 2.097700 1.987549 0.171947 ... 7.018256 1.812795 0.002267 64 1.415230 2.954381 1.990847 1 1.677108 1.034483
4 8 1 1.630686 7.464411 2 4.145098 8.742359 2.436402 2.483921 1.496569 ... 6.822439 3.549938 0.919812 89 1.672658 3.239542 2.030373 0 1.925763 1.739389

5 rows × 133 columns


In [9]:
# Check for null values
train.isnull().any().any(), test.isnull().any().any()


Out[9]:
(False, False)

In [10]:
# Save transform datasets
train.to_csv("train_new.csv", index=False)
test.to_csv("test_new.csv", index=False)

Feature selection


In [11]:
y_train = train['target']
train_data = train.drop(['target'], axis=1)
features_names = list(train_data.columns.values)
train_new = SelectKBest(f_classif, k=40).fit(train_data, y_train)
train_data.shape


Out[11]:
(114321, 132)

In [12]:
fe=train_new.get_support() 
fe


Out[12]:
array([False, False, False, False,  True, False, False, False, False,
       False,  True, False,  True, False,  True, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False,  True,  True, False,
        True, False,  True, False,  True, False, False, False,  True,
       False, False,  True,  True, False,  True,  True, False, False,
       False,  True,  True, False, False, False, False,  True,  True,
       False,  True, False,  True, False, False, False, False, False,
        True, False, False, False,  True, False, False,  True, False,
       False, False,  True, False,  True, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False, False,  True,  True, False, False,  True, False, False,
       False, False,  True, False,  True, False,  True, False, False,
       False, False, False,  True,  True, False], dtype=bool)

In [13]:
features_selected= [features_names[i] for i in list(fe.nonzero()[0])]
features_selected


Out[13]:
['v4',
 'v10',
 'v12',
 'v14',
 'v17',
 'v21',
 'v31',
 'v33',
 'v34',
 'v36',
 'v38',
 'v40',
 'v44',
 'v47',
 'v48',
 'v50',
 'v51',
 'v55',
 'v56',
 'v61',
 'v62',
 'v64',
 'v66',
 'v72',
 'v76',
 'v79',
 'v83',
 'v85',
 'v88',
 'v93',
 'v101',
 'v106',
 'v110',
 'v111',
 'v114',
 'v119',
 'v121',
 'v123',
 'v129',
 'v130']

In [14]:
# dataset with selected features
train_new = pd.concat([train['ID'], train[features_selected]], axis=1)
test_new = pd.concat([test['ID'], test[features_selected]], axis=1)
train_new.head()


Out[14]:
ID v4 v10 v12 v14 v17 v21 v31 v33 v34 ... v101 v106 v110 v111 v114 v119 v121 v123 v129 v130
0 3 3.921026 0.503281 6.085711 11.636387 3.670350 7.730923 0 1.010829 7.270147 ... 8.389237 12.579184 1 0.433213 15.634907 -6.297423e-07 0.803572 1.989780 0 0.636365
1 4 4.145098 1.312910 6.507647 11.636386 3.832270 6.763110 0 2.161633 3.615077 ... 6.866414 11.791360 0 3.365314 10.308044 3.168970e+00 2.737596 3.549938 0 1.925763
2 5 4.410969 0.765864 6.384670 9.603542 3.170847 5.245035 0 1.734693 4.043864 ... 5.879353 11.670572 1 3.367348 11.205561 -2.792745e-07 2.238806 2.477596 2 0.883118
3 6 4.225930 6.542669 9.646653 14.094723 3.610789 7.517125 1 1.666667 8.703550 ... 8.507281 12.554274 1 2.643678 13.777666 5.655086e-01 1.956521 1.812795 1 1.677108
4 8 4.145098 1.050328 6.320087 10.991098 3.832270 6.414567 0 2.161633 6.083151 ... 6.866414 11.791360 0 3.365314 14.097099 3.168970e+00 2.737596 3.549938 0 1.925763

5 rows × 41 columns

Model


In [18]:
# models parameters
g={'ne':200,'md':20,'mf':40,'rs':2016}

etc = ensemble.ExtraTreesClassifier(n_estimators=g['ne'], max_depth=g['md'], 
    max_features=g['mf'], random_state=g['rs'], criterion='entropy', 
    min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)   

etr = ensemble.ExtraTreesRegressor(n_estimators=g['ne'], max_depth=g['md'],
    max_features=g['mf'], random_state=g['rs'], min_samples_split= 4, 
    min_samples_leaf= 2, verbose = 0, n_jobs =-1)  

rfc = ensemble.RandomForestClassifier(n_estimators=g['ne'], max_depth=g['md'],
    max_features=g['mf'], random_state=g['rs'], criterion='entropy', 
    min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)

rfr = ensemble.RandomForestRegressor(n_estimators=g['ne'], max_depth=g['md'],
    max_features=g['mf'], random_state=g['rs'], min_samples_split= 4,
    min_samples_leaf= 2, verbose = 0, n_jobs =-1)

xgr = xgb.XGBRegressor(n_estimators=g['ne'], max_depth=g['md'], seed=g['rs'],
    missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85,
    objective='reg:linear')

xgc = xgb.XGBClassifier(n_estimators=g['ne'], max_depth=g['md'], seed=g['rs'],
    missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85, 
        objective='binary:logistic') #try 'binary:logitraw'

#clf = {'etc':etc, 'etr':etr, 'rfc':rfc, 'rfr':rfr, 'xgr':xgr, 'xgc':xgc}
#clf = {'etc':etc, 'rfc':rfc, 'xgc':xgc}
clf = {'etr':etr, 'rfr':rfr, 'rfc':rfc}

In [19]:
# train models with gridSerach
import time
start_time = time.time()

id_test = test['ID']
y_pred = []
best_score = 0.0
id_results = id_test[:]
LL = make_scorer(log_loss, greater_is_better=False)

for c in clf:
    if c[:1] != "x":  #not xgb
        model = GridSearchCV(
            estimator=clf[c],
            param_grid={},
            n_jobs=-1,
            cv=2,
            verbose=0,
            scoring=LL)
        model.fit(train_new, y_train.values)
        if c[-1:] != "c":  #not classifier
            y_pred = model.predict(test_new)
            print("Ensemble Model: ", c, " Best CV score: ", model.best_score_,
                  " Time: ", round(((time.time() - start_time) / 60), 2))

        else:  #classifier
            best_score = (
                log_loss(y_train.values, model.predict_proba(train))) * -1
            y_pred = model.predict_proba(test_new)[:, 1]
            print("Ensemble Model: ", c, " Best CV score: ", best_score,
                  " Time: ", round(((time.time() - start_time) / 60), 2))
    else:  #xgb
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            train_new,
            y_train,
            test_size=0.35,
            train_size=0.65,
            random_state=g['rs'])
        model = clf[c]
        model.fit(
            X_fit,
            y_fit.values,
            early_stopping_rounds=20,
            eval_metric="logloss",
            eval_set=[(X_eval, y_eval)],
            verbose=0)
        if c == "xgr":  #xgb regressor
            best_score = (
                log_loss(y_train.values, model.predict(train_new))) * -1
            y_pred = model.predict(test_new)
        else:  #xgb classifier
            best_score = (
                log_loss(y_train.values, model.predict_proba(train_new))) * -1
            y_pred = model.predict_proba(test_new)[:, 1]
        print("Ensemble Model: ", c, " Best CV score: ", best_score, " Time: ",
              round(((time.time() - start_time) / 60), 2))

    for i in range(len(y_pred)):
        if y_pred[i] < 0.0:
            y_pred[i] = 0.0
        if y_pred[i] > 1.0:
            y_pred[i] = 1.0

    df_in = pd.DataFrame({"ID": id_test, c: y_pred})
    id_results = pd.concat([id_results, df_in[c]], axis=1)


Ensemble Model:  etr  Best CV score:  -0.475762871196  Time:  7.01
Ensemble Model:  rfr  Best CV score:  -0.503053270884  Time:  26.28
Ensemble Model:  xgr  Best CV score:  nan  Time:  29.1

In [20]:
id_results['avg'] = id_results.drop('ID', axis=1).apply(np.average, axis=1)
id_results['min'] = id_results.drop('ID', axis=1).apply(min, axis=1)
id_results['max'] = id_results.drop('ID', axis=1).apply(max, axis=1)
id_results['diff'] = id_results['max'] - id_results['min']
for i in range(10):
    print(i, len(id_results[id_results['diff'] > (i / 10)]))
id_results.to_csv("results_analysis.csv", index=False)
ds = id_results[['ID', 'avg']]
ds.columns = ['ID', 'PredictedProb']
ds.to_csv('submission.csv', index=False)


0 114393
1 26152
2 2709
3 285
4 40
5 9
6 1
7 0
8 0
9 0

In [ ]: