In [1]:
import os
import pandas as pd
import math
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
In [29]:
data = pd.read_csv('../kaggle_porto_seguro/train.csv')
data.head()
Out[29]:
id
target
ps_ind_01
ps_ind_02_cat
ps_ind_03
ps_ind_04_cat
ps_ind_05_cat
ps_ind_06_bin
ps_ind_07_bin
ps_ind_08_bin
...
ps_calc_11
ps_calc_12
ps_calc_13
ps_calc_14
ps_calc_15_bin
ps_calc_16_bin
ps_calc_17_bin
ps_calc_18_bin
ps_calc_19_bin
ps_calc_20_bin
0
7
0
2
2
5
1
0
0
1
0
...
9
1
5
8
0
1
1
0
0
1
1
9
0
1
1
7
0
0
0
0
1
...
3
1
1
9
0
1
1
0
1
0
2
13
0
5
4
9
1
0
0
0
1
...
4
2
7
7
0
1
1
0
1
0
3
16
0
0
1
2
0
0
1
0
0
...
2
2
4
9
0
0
0
0
0
0
4
17
0
0
2
0
1
0
1
0
0
...
3
1
1
3
0
0
0
1
1
0
5 rows × 59 columns
In [30]:
pos_data = data[(data.target == 1)]
neg_data = data[(data.target == 0)]
pos_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21694 entries, 9 to 595158
Data columns (total 59 columns):
id 21694 non-null int64
target 21694 non-null int64
ps_ind_01 21694 non-null int64
ps_ind_02_cat 21694 non-null int64
ps_ind_03 21694 non-null int64
ps_ind_04_cat 21694 non-null int64
ps_ind_05_cat 21694 non-null int64
ps_ind_06_bin 21694 non-null int64
ps_ind_07_bin 21694 non-null int64
ps_ind_08_bin 21694 non-null int64
ps_ind_09_bin 21694 non-null int64
ps_ind_10_bin 21694 non-null int64
ps_ind_11_bin 21694 non-null int64
ps_ind_12_bin 21694 non-null int64
ps_ind_13_bin 21694 non-null int64
ps_ind_14 21694 non-null int64
ps_ind_15 21694 non-null int64
ps_ind_16_bin 21694 non-null int64
ps_ind_17_bin 21694 non-null int64
ps_ind_18_bin 21694 non-null int64
ps_reg_01 21694 non-null float64
ps_reg_02 21694 non-null float64
ps_reg_03 21694 non-null float64
ps_car_01_cat 21694 non-null int64
ps_car_02_cat 21694 non-null int64
ps_car_03_cat 21694 non-null int64
ps_car_04_cat 21694 non-null int64
ps_car_05_cat 21694 non-null int64
ps_car_06_cat 21694 non-null int64
ps_car_07_cat 21694 non-null int64
ps_car_08_cat 21694 non-null int64
ps_car_09_cat 21694 non-null int64
ps_car_10_cat 21694 non-null int64
ps_car_11_cat 21694 non-null int64
ps_car_11 21694 non-null int64
ps_car_12 21694 non-null float64
ps_car_13 21694 non-null float64
ps_car_14 21694 non-null float64
ps_car_15 21694 non-null float64
ps_calc_01 21694 non-null float64
ps_calc_02 21694 non-null float64
ps_calc_03 21694 non-null float64
ps_calc_04 21694 non-null int64
ps_calc_05 21694 non-null int64
ps_calc_06 21694 non-null int64
ps_calc_07 21694 non-null int64
ps_calc_08 21694 non-null int64
ps_calc_09 21694 non-null int64
ps_calc_10 21694 non-null int64
ps_calc_11 21694 non-null int64
ps_calc_12 21694 non-null int64
ps_calc_13 21694 non-null int64
ps_calc_14 21694 non-null int64
ps_calc_15_bin 21694 non-null int64
ps_calc_16_bin 21694 non-null int64
ps_calc_17_bin 21694 non-null int64
ps_calc_18_bin 21694 non-null int64
ps_calc_19_bin 21694 non-null int64
ps_calc_20_bin 21694 non-null int64
dtypes: float64(10), int64(49)
memory usage: 9.9 MB
In [31]:
# sample negatives
neg_data = neg_data.sample(n=21694)
neg_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21694 entries, 564168 to 583460
Data columns (total 59 columns):
id 21694 non-null int64
target 21694 non-null int64
ps_ind_01 21694 non-null int64
ps_ind_02_cat 21694 non-null int64
ps_ind_03 21694 non-null int64
ps_ind_04_cat 21694 non-null int64
ps_ind_05_cat 21694 non-null int64
ps_ind_06_bin 21694 non-null int64
ps_ind_07_bin 21694 non-null int64
ps_ind_08_bin 21694 non-null int64
ps_ind_09_bin 21694 non-null int64
ps_ind_10_bin 21694 non-null int64
ps_ind_11_bin 21694 non-null int64
ps_ind_12_bin 21694 non-null int64
ps_ind_13_bin 21694 non-null int64
ps_ind_14 21694 non-null int64
ps_ind_15 21694 non-null int64
ps_ind_16_bin 21694 non-null int64
ps_ind_17_bin 21694 non-null int64
ps_ind_18_bin 21694 non-null int64
ps_reg_01 21694 non-null float64
ps_reg_02 21694 non-null float64
ps_reg_03 21694 non-null float64
ps_car_01_cat 21694 non-null int64
ps_car_02_cat 21694 non-null int64
ps_car_03_cat 21694 non-null int64
ps_car_04_cat 21694 non-null int64
ps_car_05_cat 21694 non-null int64
ps_car_06_cat 21694 non-null int64
ps_car_07_cat 21694 non-null int64
ps_car_08_cat 21694 non-null int64
ps_car_09_cat 21694 non-null int64
ps_car_10_cat 21694 non-null int64
ps_car_11_cat 21694 non-null int64
ps_car_11 21694 non-null int64
ps_car_12 21694 non-null float64
ps_car_13 21694 non-null float64
ps_car_14 21694 non-null float64
ps_car_15 21694 non-null float64
ps_calc_01 21694 non-null float64
ps_calc_02 21694 non-null float64
ps_calc_03 21694 non-null float64
ps_calc_04 21694 non-null int64
ps_calc_05 21694 non-null int64
ps_calc_06 21694 non-null int64
ps_calc_07 21694 non-null int64
ps_calc_08 21694 non-null int64
ps_calc_09 21694 non-null int64
ps_calc_10 21694 non-null int64
ps_calc_11 21694 non-null int64
ps_calc_12 21694 non-null int64
ps_calc_13 21694 non-null int64
ps_calc_14 21694 non-null int64
ps_calc_15_bin 21694 non-null int64
ps_calc_16_bin 21694 non-null int64
ps_calc_17_bin 21694 non-null int64
ps_calc_18_bin 21694 non-null int64
ps_calc_19_bin 21694 non-null int64
ps_calc_20_bin 21694 non-null int64
dtypes: float64(10), int64(49)
memory usage: 9.9 MB
In [32]:
all_data = pd.concat([pos_data, neg_data])
all_data = all_data.sample(frac=1).reset_index(drop=True) # shuffle
all_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43388 entries, 0 to 43387
Data columns (total 59 columns):
id 43388 non-null int64
target 43388 non-null int64
ps_ind_01 43388 non-null int64
ps_ind_02_cat 43388 non-null int64
ps_ind_03 43388 non-null int64
ps_ind_04_cat 43388 non-null int64
ps_ind_05_cat 43388 non-null int64
ps_ind_06_bin 43388 non-null int64
ps_ind_07_bin 43388 non-null int64
ps_ind_08_bin 43388 non-null int64
ps_ind_09_bin 43388 non-null int64
ps_ind_10_bin 43388 non-null int64
ps_ind_11_bin 43388 non-null int64
ps_ind_12_bin 43388 non-null int64
ps_ind_13_bin 43388 non-null int64
ps_ind_14 43388 non-null int64
ps_ind_15 43388 non-null int64
ps_ind_16_bin 43388 non-null int64
ps_ind_17_bin 43388 non-null int64
ps_ind_18_bin 43388 non-null int64
ps_reg_01 43388 non-null float64
ps_reg_02 43388 non-null float64
ps_reg_03 43388 non-null float64
ps_car_01_cat 43388 non-null int64
ps_car_02_cat 43388 non-null int64
ps_car_03_cat 43388 non-null int64
ps_car_04_cat 43388 non-null int64
ps_car_05_cat 43388 non-null int64
ps_car_06_cat 43388 non-null int64
ps_car_07_cat 43388 non-null int64
ps_car_08_cat 43388 non-null int64
ps_car_09_cat 43388 non-null int64
ps_car_10_cat 43388 non-null int64
ps_car_11_cat 43388 non-null int64
ps_car_11 43388 non-null int64
ps_car_12 43388 non-null float64
ps_car_13 43388 non-null float64
ps_car_14 43388 non-null float64
ps_car_15 43388 non-null float64
ps_calc_01 43388 non-null float64
ps_calc_02 43388 non-null float64
ps_calc_03 43388 non-null float64
ps_calc_04 43388 non-null int64
ps_calc_05 43388 non-null int64
ps_calc_06 43388 non-null int64
ps_calc_07 43388 non-null int64
ps_calc_08 43388 non-null int64
ps_calc_09 43388 non-null int64
ps_calc_10 43388 non-null int64
ps_calc_11 43388 non-null int64
ps_calc_12 43388 non-null int64
ps_calc_13 43388 non-null int64
ps_calc_14 43388 non-null int64
ps_calc_15_bin 43388 non-null int64
ps_calc_16_bin 43388 non-null int64
ps_calc_17_bin 43388 non-null int64
ps_calc_18_bin 43388 non-null int64
ps_calc_19_bin 43388 non-null int64
ps_calc_20_bin 43388 non-null int64
dtypes: float64(10), int64(49)
memory usage: 19.5 MB
In [33]:
# standardize data
df_x = all_data.iloc[:,2:]
df_y = all_data.iloc[:,:2]
scaler = StandardScaler().fit(df_x)
df_x = pd.DataFrame(scaler.transform(df_x))
all_data = pd.concat([df_y, df_x], axis=1)
all_data.head()
Out[33]:
id
target
0
1
2
3
4
5
6
7
...
47
48
49
50
51
52
53
54
55
56
0
397909
1
0.006190
-0.540064
-0.176771
1.149286
-0.336169
-0.735220
1.537339
-0.462444
...
1.095924
-0.361753
-0.511807
-0.564091
-0.375711
0.770888
0.895072
-0.632721
1.376676
-0.421670
1
993433
0
-0.491145
-0.540064
1.643982
-0.864413
0.324924
1.360137
-0.650475
-0.462444
...
-1.042133
0.470427
0.666527
0.524612
-0.375711
0.770888
-1.117228
-0.632721
1.376676
-0.421670
2
804416
1
1.498193
3.892554
2.372283
1.149286
-0.336169
-0.735220
1.537339
-0.462444
...
1.095924
-0.361753
1.844861
2.339118
-0.375711
0.770888
0.895072
-0.632721
1.376676
-0.421670
3
576857
0
-0.491145
0.937475
-0.176771
1.149286
-0.336169
-0.735220
-0.650475
2.162426
...
0.668313
2.134787
0.666527
0.524612
-0.375711
-1.297206
-1.117228
-0.632721
1.376676
-0.421670
4
267011
0
-0.988479
0.937475
0.187380
1.149286
-0.336169
-0.735220
1.537339
-0.462444
...
0.668313
2.134787
0.077360
0.161711
-0.375711
0.770888
-1.117228
-0.632721
-0.726388
2.371523
5 rows × 59 columns
In [34]:
x_train = all_data.iloc[:40000,2:]
y_train = all_data.iloc[:40000,1]
x_test = all_data.iloc[40000:,2:]
y_test = all_data.iloc[40000:,1]
y_test.head()
Out[34]:
40000 0
40001 1
40002 0
40003 1
40004 1
Name: target, dtype: int64
In [39]:
mlp = MLPClassifier(hidden_layer_sizes=(300),
activation='logistic')
mlp.fit(x_train, y_train)
Out[39]:
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=300, learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=None,
shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
verbose=False, warm_start=False)
In [40]:
mlp.score(x_test, y_test)
Out[40]:
0.5832349468713105
In [41]:
test_data = pd.read_csv('../kaggle_porto_seguro/test.csv')
test_data.head()
Out[41]:
id
ps_ind_01
ps_ind_02_cat
ps_ind_03
ps_ind_04_cat
ps_ind_05_cat
ps_ind_06_bin
ps_ind_07_bin
ps_ind_08_bin
ps_ind_09_bin
...
ps_calc_11
ps_calc_12
ps_calc_13
ps_calc_14
ps_calc_15_bin
ps_calc_16_bin
ps_calc_17_bin
ps_calc_18_bin
ps_calc_19_bin
ps_calc_20_bin
0
0
0
1
8
1
0
0
1
0
0
...
1
1
1
12
0
1
1
0
0
1
1
1
4
2
5
1
0
0
0
0
1
...
2
0
3
10
0
0
1
1
0
1
2
2
5
1
3
0
0
0
0
0
1
...
4
0
2
4
0
0
0
0
0
0
3
3
0
1
6
0
0
1
0
0
0
...
5
1
0
5
1
0
1
0
0
0
4
4
5
1
7
0
0
0
0
0
1
...
4
0
0
4
0
1
1
0
0
1
5 rows × 58 columns
In [42]:
predictions = mlp.predict(test_data.iloc[:,1:])
In [43]:
submission = pd.DataFrame()
submission['id'] = test_data.iloc[:, 0]
submission['target'] = predictions
submission.head()
Out[43]:
id
target
0
0
0
1
1
1
2
2
0
3
3
0
4
4
1
In [44]:
submission.to_csv('kaggle_submission_nn.csv', index=False)
In [ ]:
Content source: abevieiramota/data-science-cookbook
Similar notebooks: