In [1]:
import os
import pandas as pd
import math
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [29]:
data = pd.read_csv('../kaggle_porto_seguro/train.csv')
data.head()


Out[29]:
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ... ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 7 0 2 2 5 1 0 0 1 0 ... 9 1 5 8 0 1 1 0 0 1
1 9 0 1 1 7 0 0 0 0 1 ... 3 1 1 9 0 1 1 0 1 0
2 13 0 5 4 9 1 0 0 0 1 ... 4 2 7 7 0 1 1 0 1 0
3 16 0 0 1 2 0 0 1 0 0 ... 2 2 4 9 0 0 0 0 0 0
4 17 0 0 2 0 1 0 1 0 0 ... 3 1 1 3 0 0 0 1 1 0

5 rows × 59 columns


In [30]:
pos_data = data[(data.target == 1)]
neg_data = data[(data.target == 0)]
pos_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 21694 entries, 9 to 595158
Data columns (total 59 columns):
id                21694 non-null int64
target            21694 non-null int64
ps_ind_01         21694 non-null int64
ps_ind_02_cat     21694 non-null int64
ps_ind_03         21694 non-null int64
ps_ind_04_cat     21694 non-null int64
ps_ind_05_cat     21694 non-null int64
ps_ind_06_bin     21694 non-null int64
ps_ind_07_bin     21694 non-null int64
ps_ind_08_bin     21694 non-null int64
ps_ind_09_bin     21694 non-null int64
ps_ind_10_bin     21694 non-null int64
ps_ind_11_bin     21694 non-null int64
ps_ind_12_bin     21694 non-null int64
ps_ind_13_bin     21694 non-null int64
ps_ind_14         21694 non-null int64
ps_ind_15         21694 non-null int64
ps_ind_16_bin     21694 non-null int64
ps_ind_17_bin     21694 non-null int64
ps_ind_18_bin     21694 non-null int64
ps_reg_01         21694 non-null float64
ps_reg_02         21694 non-null float64
ps_reg_03         21694 non-null float64
ps_car_01_cat     21694 non-null int64
ps_car_02_cat     21694 non-null int64
ps_car_03_cat     21694 non-null int64
ps_car_04_cat     21694 non-null int64
ps_car_05_cat     21694 non-null int64
ps_car_06_cat     21694 non-null int64
ps_car_07_cat     21694 non-null int64
ps_car_08_cat     21694 non-null int64
ps_car_09_cat     21694 non-null int64
ps_car_10_cat     21694 non-null int64
ps_car_11_cat     21694 non-null int64
ps_car_11         21694 non-null int64
ps_car_12         21694 non-null float64
ps_car_13         21694 non-null float64
ps_car_14         21694 non-null float64
ps_car_15         21694 non-null float64
ps_calc_01        21694 non-null float64
ps_calc_02        21694 non-null float64
ps_calc_03        21694 non-null float64
ps_calc_04        21694 non-null int64
ps_calc_05        21694 non-null int64
ps_calc_06        21694 non-null int64
ps_calc_07        21694 non-null int64
ps_calc_08        21694 non-null int64
ps_calc_09        21694 non-null int64
ps_calc_10        21694 non-null int64
ps_calc_11        21694 non-null int64
ps_calc_12        21694 non-null int64
ps_calc_13        21694 non-null int64
ps_calc_14        21694 non-null int64
ps_calc_15_bin    21694 non-null int64
ps_calc_16_bin    21694 non-null int64
ps_calc_17_bin    21694 non-null int64
ps_calc_18_bin    21694 non-null int64
ps_calc_19_bin    21694 non-null int64
ps_calc_20_bin    21694 non-null int64
dtypes: float64(10), int64(49)
memory usage: 9.9 MB

In [31]:
# sample negatives
neg_data = neg_data.sample(n=21694)
neg_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 21694 entries, 564168 to 583460
Data columns (total 59 columns):
id                21694 non-null int64
target            21694 non-null int64
ps_ind_01         21694 non-null int64
ps_ind_02_cat     21694 non-null int64
ps_ind_03         21694 non-null int64
ps_ind_04_cat     21694 non-null int64
ps_ind_05_cat     21694 non-null int64
ps_ind_06_bin     21694 non-null int64
ps_ind_07_bin     21694 non-null int64
ps_ind_08_bin     21694 non-null int64
ps_ind_09_bin     21694 non-null int64
ps_ind_10_bin     21694 non-null int64
ps_ind_11_bin     21694 non-null int64
ps_ind_12_bin     21694 non-null int64
ps_ind_13_bin     21694 non-null int64
ps_ind_14         21694 non-null int64
ps_ind_15         21694 non-null int64
ps_ind_16_bin     21694 non-null int64
ps_ind_17_bin     21694 non-null int64
ps_ind_18_bin     21694 non-null int64
ps_reg_01         21694 non-null float64
ps_reg_02         21694 non-null float64
ps_reg_03         21694 non-null float64
ps_car_01_cat     21694 non-null int64
ps_car_02_cat     21694 non-null int64
ps_car_03_cat     21694 non-null int64
ps_car_04_cat     21694 non-null int64
ps_car_05_cat     21694 non-null int64
ps_car_06_cat     21694 non-null int64
ps_car_07_cat     21694 non-null int64
ps_car_08_cat     21694 non-null int64
ps_car_09_cat     21694 non-null int64
ps_car_10_cat     21694 non-null int64
ps_car_11_cat     21694 non-null int64
ps_car_11         21694 non-null int64
ps_car_12         21694 non-null float64
ps_car_13         21694 non-null float64
ps_car_14         21694 non-null float64
ps_car_15         21694 non-null float64
ps_calc_01        21694 non-null float64
ps_calc_02        21694 non-null float64
ps_calc_03        21694 non-null float64
ps_calc_04        21694 non-null int64
ps_calc_05        21694 non-null int64
ps_calc_06        21694 non-null int64
ps_calc_07        21694 non-null int64
ps_calc_08        21694 non-null int64
ps_calc_09        21694 non-null int64
ps_calc_10        21694 non-null int64
ps_calc_11        21694 non-null int64
ps_calc_12        21694 non-null int64
ps_calc_13        21694 non-null int64
ps_calc_14        21694 non-null int64
ps_calc_15_bin    21694 non-null int64
ps_calc_16_bin    21694 non-null int64
ps_calc_17_bin    21694 non-null int64
ps_calc_18_bin    21694 non-null int64
ps_calc_19_bin    21694 non-null int64
ps_calc_20_bin    21694 non-null int64
dtypes: float64(10), int64(49)
memory usage: 9.9 MB

In [32]:
all_data = pd.concat([pos_data, neg_data])
all_data = all_data.sample(frac=1).reset_index(drop=True) # shuffle
all_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43388 entries, 0 to 43387
Data columns (total 59 columns):
id                43388 non-null int64
target            43388 non-null int64
ps_ind_01         43388 non-null int64
ps_ind_02_cat     43388 non-null int64
ps_ind_03         43388 non-null int64
ps_ind_04_cat     43388 non-null int64
ps_ind_05_cat     43388 non-null int64
ps_ind_06_bin     43388 non-null int64
ps_ind_07_bin     43388 non-null int64
ps_ind_08_bin     43388 non-null int64
ps_ind_09_bin     43388 non-null int64
ps_ind_10_bin     43388 non-null int64
ps_ind_11_bin     43388 non-null int64
ps_ind_12_bin     43388 non-null int64
ps_ind_13_bin     43388 non-null int64
ps_ind_14         43388 non-null int64
ps_ind_15         43388 non-null int64
ps_ind_16_bin     43388 non-null int64
ps_ind_17_bin     43388 non-null int64
ps_ind_18_bin     43388 non-null int64
ps_reg_01         43388 non-null float64
ps_reg_02         43388 non-null float64
ps_reg_03         43388 non-null float64
ps_car_01_cat     43388 non-null int64
ps_car_02_cat     43388 non-null int64
ps_car_03_cat     43388 non-null int64
ps_car_04_cat     43388 non-null int64
ps_car_05_cat     43388 non-null int64
ps_car_06_cat     43388 non-null int64
ps_car_07_cat     43388 non-null int64
ps_car_08_cat     43388 non-null int64
ps_car_09_cat     43388 non-null int64
ps_car_10_cat     43388 non-null int64
ps_car_11_cat     43388 non-null int64
ps_car_11         43388 non-null int64
ps_car_12         43388 non-null float64
ps_car_13         43388 non-null float64
ps_car_14         43388 non-null float64
ps_car_15         43388 non-null float64
ps_calc_01        43388 non-null float64
ps_calc_02        43388 non-null float64
ps_calc_03        43388 non-null float64
ps_calc_04        43388 non-null int64
ps_calc_05        43388 non-null int64
ps_calc_06        43388 non-null int64
ps_calc_07        43388 non-null int64
ps_calc_08        43388 non-null int64
ps_calc_09        43388 non-null int64
ps_calc_10        43388 non-null int64
ps_calc_11        43388 non-null int64
ps_calc_12        43388 non-null int64
ps_calc_13        43388 non-null int64
ps_calc_14        43388 non-null int64
ps_calc_15_bin    43388 non-null int64
ps_calc_16_bin    43388 non-null int64
ps_calc_17_bin    43388 non-null int64
ps_calc_18_bin    43388 non-null int64
ps_calc_19_bin    43388 non-null int64
ps_calc_20_bin    43388 non-null int64
dtypes: float64(10), int64(49)
memory usage: 19.5 MB

In [33]:
# standardize data
df_x = all_data.iloc[:,2:]
df_y = all_data.iloc[:,:2]

scaler = StandardScaler().fit(df_x)
df_x = pd.DataFrame(scaler.transform(df_x))

all_data = pd.concat([df_y, df_x], axis=1)
all_data.head()


Out[33]:
id target 0 1 2 3 4 5 6 7 ... 47 48 49 50 51 52 53 54 55 56
0 397909 1 0.006190 -0.540064 -0.176771 1.149286 -0.336169 -0.735220 1.537339 -0.462444 ... 1.095924 -0.361753 -0.511807 -0.564091 -0.375711 0.770888 0.895072 -0.632721 1.376676 -0.421670
1 993433 0 -0.491145 -0.540064 1.643982 -0.864413 0.324924 1.360137 -0.650475 -0.462444 ... -1.042133 0.470427 0.666527 0.524612 -0.375711 0.770888 -1.117228 -0.632721 1.376676 -0.421670
2 804416 1 1.498193 3.892554 2.372283 1.149286 -0.336169 -0.735220 1.537339 -0.462444 ... 1.095924 -0.361753 1.844861 2.339118 -0.375711 0.770888 0.895072 -0.632721 1.376676 -0.421670
3 576857 0 -0.491145 0.937475 -0.176771 1.149286 -0.336169 -0.735220 -0.650475 2.162426 ... 0.668313 2.134787 0.666527 0.524612 -0.375711 -1.297206 -1.117228 -0.632721 1.376676 -0.421670
4 267011 0 -0.988479 0.937475 0.187380 1.149286 -0.336169 -0.735220 1.537339 -0.462444 ... 0.668313 2.134787 0.077360 0.161711 -0.375711 0.770888 -1.117228 -0.632721 -0.726388 2.371523

5 rows × 59 columns


In [34]:
x_train = all_data.iloc[:40000,2:]
y_train = all_data.iloc[:40000,1]
x_test = all_data.iloc[40000:,2:]
y_test = all_data.iloc[40000:,1]
y_test.head()


Out[34]:
40000    0
40001    1
40002    0
40003    1
40004    1
Name: target, dtype: int64

In [39]:
mlp = MLPClassifier(hidden_layer_sizes=(300), 
                    activation='logistic')

mlp.fit(x_train, y_train)


Out[39]:
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [40]:
mlp.score(x_test, y_test)


Out[40]:
0.5832349468713105

In [41]:
test_data = pd.read_csv('../kaggle_porto_seguro/test.csv')
test_data.head()


Out[41]:
id ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ... ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 0 0 1 8 1 0 0 1 0 0 ... 1 1 1 12 0 1 1 0 0 1
1 1 4 2 5 1 0 0 0 0 1 ... 2 0 3 10 0 0 1 1 0 1
2 2 5 1 3 0 0 0 0 0 1 ... 4 0 2 4 0 0 0 0 0 0
3 3 0 1 6 0 0 1 0 0 0 ... 5 1 0 5 1 0 1 0 0 0
4 4 5 1 7 0 0 0 0 0 1 ... 4 0 0 4 0 1 1 0 0 1

5 rows × 58 columns


In [42]:
predictions = mlp.predict(test_data.iloc[:,1:])

In [43]:
submission = pd.DataFrame()
submission['id'] = test_data.iloc[:, 0]
submission['target'] = predictions
submission.head()


Out[43]:
id target
0 0 0
1 1 1
2 2 0
3 3 0
4 4 1

In [44]:
submission.to_csv('kaggle_submission_nn.csv', index=False)

In [ ]: