notebook.community

Edit and run



In [1]:

    
import os
import pandas as pd
import math
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler



In [29]:

    
data = pd.read_csv('../kaggle_porto_seguro/train.csv')
data.head()









    Out[29]:






  
    
      
      id
      target
      ps_ind_01
      ps_ind_02_cat
      ps_ind_03
      ps_ind_04_cat
      ps_ind_05_cat
      ps_ind_06_bin
      ps_ind_07_bin
      ps_ind_08_bin
      ...
      ps_calc_11
      ps_calc_12
      ps_calc_13
      ps_calc_14
      ps_calc_15_bin
      ps_calc_16_bin
      ps_calc_17_bin
      ps_calc_18_bin
      ps_calc_19_bin
      ps_calc_20_bin
    
  
  
    
      0
      7
      0
      2
      2
      5
      1
      0
      0
      1
      0
      ...
      9
      1
      5
      8
      0
      1
      1
      0
      0
      1
    
    
      1
      9
      0
      1
      1
      7
      0
      0
      0
      0
      1
      ...
      3
      1
      1
      9
      0
      1
      1
      0
      1
      0
    
    
      2
      13
      0
      5
      4
      9
      1
      0
      0
      0
      1
      ...
      4
      2
      7
      7
      0
      1
      1
      0
      1
      0
    
    
      3
      16
      0
      0
      1
      2
      0
      0
      1
      0
      0
      ...
      2
      2
      4
      9
      0
      0
      0
      0
      0
      0
    
    
      4
      17
      0
      0
      2
      0
      1
      0
      1
      0
      0
      ...
      3
      1
      1
      3
      0
      0
      0
      1
      1
      0
    
  

5 rows × 59 columns



In [30]:

    
pos_data = data[(data.target == 1)]
neg_data = data[(data.target == 0)]
pos_data.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 21694 entries, 9 to 595158
Data columns (total 59 columns):
id                21694 non-null int64
target            21694 non-null int64
ps_ind_01         21694 non-null int64
ps_ind_02_cat     21694 non-null int64
ps_ind_03         21694 non-null int64
ps_ind_04_cat     21694 non-null int64
ps_ind_05_cat     21694 non-null int64
ps_ind_06_bin     21694 non-null int64
ps_ind_07_bin     21694 non-null int64
ps_ind_08_bin     21694 non-null int64
ps_ind_09_bin     21694 non-null int64
ps_ind_10_bin     21694 non-null int64
ps_ind_11_bin     21694 non-null int64
ps_ind_12_bin     21694 non-null int64
ps_ind_13_bin     21694 non-null int64
ps_ind_14         21694 non-null int64
ps_ind_15         21694 non-null int64
ps_ind_16_bin     21694 non-null int64
ps_ind_17_bin     21694 non-null int64
ps_ind_18_bin     21694 non-null int64
ps_reg_01         21694 non-null float64
ps_reg_02         21694 non-null float64
ps_reg_03         21694 non-null float64
ps_car_01_cat     21694 non-null int64
ps_car_02_cat     21694 non-null int64
ps_car_03_cat     21694 non-null int64
ps_car_04_cat     21694 non-null int64
ps_car_05_cat     21694 non-null int64
ps_car_06_cat     21694 non-null int64
ps_car_07_cat     21694 non-null int64
ps_car_08_cat     21694 non-null int64
ps_car_09_cat     21694 non-null int64
ps_car_10_cat     21694 non-null int64
ps_car_11_cat     21694 non-null int64
ps_car_11         21694 non-null int64
ps_car_12         21694 non-null float64
ps_car_13         21694 non-null float64
ps_car_14         21694 non-null float64
ps_car_15         21694 non-null float64
ps_calc_01        21694 non-null float64
ps_calc_02        21694 non-null float64
ps_calc_03        21694 non-null float64
ps_calc_04        21694 non-null int64
ps_calc_05        21694 non-null int64
ps_calc_06        21694 non-null int64
ps_calc_07        21694 non-null int64
ps_calc_08        21694 non-null int64
ps_calc_09        21694 non-null int64
ps_calc_10        21694 non-null int64
ps_calc_11        21694 non-null int64
ps_calc_12        21694 non-null int64
ps_calc_13        21694 non-null int64
ps_calc_14        21694 non-null int64
ps_calc_15_bin    21694 non-null int64
ps_calc_16_bin    21694 non-null int64
ps_calc_17_bin    21694 non-null int64
ps_calc_18_bin    21694 non-null int64
ps_calc_19_bin    21694 non-null int64
ps_calc_20_bin    21694 non-null int64
dtypes: float64(10), int64(49)
memory usage: 9.9 MB



In [31]:

    
# sample negatives
neg_data = neg_data.sample(n=21694)
neg_data.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 21694 entries, 564168 to 583460
Data columns (total 59 columns):
id                21694 non-null int64
target            21694 non-null int64
ps_ind_01         21694 non-null int64
ps_ind_02_cat     21694 non-null int64
ps_ind_03         21694 non-null int64
ps_ind_04_cat     21694 non-null int64
ps_ind_05_cat     21694 non-null int64
ps_ind_06_bin     21694 non-null int64
ps_ind_07_bin     21694 non-null int64
ps_ind_08_bin     21694 non-null int64
ps_ind_09_bin     21694 non-null int64
ps_ind_10_bin     21694 non-null int64
ps_ind_11_bin     21694 non-null int64
ps_ind_12_bin     21694 non-null int64
ps_ind_13_bin     21694 non-null int64
ps_ind_14         21694 non-null int64
ps_ind_15         21694 non-null int64
ps_ind_16_bin     21694 non-null int64
ps_ind_17_bin     21694 non-null int64
ps_ind_18_bin     21694 non-null int64
ps_reg_01         21694 non-null float64
ps_reg_02         21694 non-null float64
ps_reg_03         21694 non-null float64
ps_car_01_cat     21694 non-null int64
ps_car_02_cat     21694 non-null int64
ps_car_03_cat     21694 non-null int64
ps_car_04_cat     21694 non-null int64
ps_car_05_cat     21694 non-null int64
ps_car_06_cat     21694 non-null int64
ps_car_07_cat     21694 non-null int64
ps_car_08_cat     21694 non-null int64
ps_car_09_cat     21694 non-null int64
ps_car_10_cat     21694 non-null int64
ps_car_11_cat     21694 non-null int64
ps_car_11         21694 non-null int64
ps_car_12         21694 non-null float64
ps_car_13         21694 non-null float64
ps_car_14         21694 non-null float64
ps_car_15         21694 non-null float64
ps_calc_01        21694 non-null float64
ps_calc_02        21694 non-null float64
ps_calc_03        21694 non-null float64
ps_calc_04        21694 non-null int64
ps_calc_05        21694 non-null int64
ps_calc_06        21694 non-null int64
ps_calc_07        21694 non-null int64
ps_calc_08        21694 non-null int64
ps_calc_09        21694 non-null int64
ps_calc_10        21694 non-null int64
ps_calc_11        21694 non-null int64
ps_calc_12        21694 non-null int64
ps_calc_13        21694 non-null int64
ps_calc_14        21694 non-null int64
ps_calc_15_bin    21694 non-null int64
ps_calc_16_bin    21694 non-null int64
ps_calc_17_bin    21694 non-null int64
ps_calc_18_bin    21694 non-null int64
ps_calc_19_bin    21694 non-null int64
ps_calc_20_bin    21694 non-null int64
dtypes: float64(10), int64(49)
memory usage: 9.9 MB



In [32]:

    
all_data = pd.concat([pos_data, neg_data])
all_data = all_data.sample(frac=1).reset_index(drop=True) # shuffle
all_data.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43388 entries, 0 to 43387
Data columns (total 59 columns):
id                43388 non-null int64
target            43388 non-null int64
ps_ind_01         43388 non-null int64
ps_ind_02_cat     43388 non-null int64
ps_ind_03         43388 non-null int64
ps_ind_04_cat     43388 non-null int64
ps_ind_05_cat     43388 non-null int64
ps_ind_06_bin     43388 non-null int64
ps_ind_07_bin     43388 non-null int64
ps_ind_08_bin     43388 non-null int64
ps_ind_09_bin     43388 non-null int64
ps_ind_10_bin     43388 non-null int64
ps_ind_11_bin     43388 non-null int64
ps_ind_12_bin     43388 non-null int64
ps_ind_13_bin     43388 non-null int64
ps_ind_14         43388 non-null int64
ps_ind_15         43388 non-null int64
ps_ind_16_bin     43388 non-null int64
ps_ind_17_bin     43388 non-null int64
ps_ind_18_bin     43388 non-null int64
ps_reg_01         43388 non-null float64
ps_reg_02         43388 non-null float64
ps_reg_03         43388 non-null float64
ps_car_01_cat     43388 non-null int64
ps_car_02_cat     43388 non-null int64
ps_car_03_cat     43388 non-null int64
ps_car_04_cat     43388 non-null int64
ps_car_05_cat     43388 non-null int64
ps_car_06_cat     43388 non-null int64
ps_car_07_cat     43388 non-null int64
ps_car_08_cat     43388 non-null int64
ps_car_09_cat     43388 non-null int64
ps_car_10_cat     43388 non-null int64
ps_car_11_cat     43388 non-null int64
ps_car_11         43388 non-null int64
ps_car_12         43388 non-null float64
ps_car_13         43388 non-null float64
ps_car_14         43388 non-null float64
ps_car_15         43388 non-null float64
ps_calc_01        43388 non-null float64
ps_calc_02        43388 non-null float64
ps_calc_03        43388 non-null float64
ps_calc_04        43388 non-null int64
ps_calc_05        43388 non-null int64
ps_calc_06        43388 non-null int64
ps_calc_07        43388 non-null int64
ps_calc_08        43388 non-null int64
ps_calc_09        43388 non-null int64
ps_calc_10        43388 non-null int64
ps_calc_11        43388 non-null int64
ps_calc_12        43388 non-null int64
ps_calc_13        43388 non-null int64
ps_calc_14        43388 non-null int64
ps_calc_15_bin    43388 non-null int64
ps_calc_16_bin    43388 non-null int64
ps_calc_17_bin    43388 non-null int64
ps_calc_18_bin    43388 non-null int64
ps_calc_19_bin    43388 non-null int64
ps_calc_20_bin    43388 non-null int64
dtypes: float64(10), int64(49)
memory usage: 19.5 MB



In [33]:

    
# standardize data
df_x = all_data.iloc[:,2:]
df_y = all_data.iloc[:,:2]

scaler = StandardScaler().fit(df_x)
df_x = pd.DataFrame(scaler.transform(df_x))

all_data = pd.concat([df_y, df_x], axis=1)
all_data.head()









    Out[33]:






  
    
      
      id
      target
      0
      1
      2
      3
      4
      5
      6
      7
      ...
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
    
  
  
    
      0
      397909
      1
      0.006190
      -0.540064
      -0.176771
      1.149286
      -0.336169
      -0.735220
      1.537339
      -0.462444
      ...
      1.095924
      -0.361753
      -0.511807
      -0.564091
      -0.375711
      0.770888
      0.895072
      -0.632721
      1.376676
      -0.421670
    
    
      1
      993433
      0
      -0.491145
      -0.540064
      1.643982
      -0.864413
      0.324924
      1.360137
      -0.650475
      -0.462444
      ...
      -1.042133
      0.470427
      0.666527
      0.524612
      -0.375711
      0.770888
      -1.117228
      -0.632721
      1.376676
      -0.421670
    
    
      2
      804416
      1
      1.498193
      3.892554
      2.372283
      1.149286
      -0.336169
      -0.735220
      1.537339
      -0.462444
      ...
      1.095924
      -0.361753
      1.844861
      2.339118
      -0.375711
      0.770888
      0.895072
      -0.632721
      1.376676
      -0.421670
    
    
      3
      576857
      0
      -0.491145
      0.937475
      -0.176771
      1.149286
      -0.336169
      -0.735220
      -0.650475
      2.162426
      ...
      0.668313
      2.134787
      0.666527
      0.524612
      -0.375711
      -1.297206
      -1.117228
      -0.632721
      1.376676
      -0.421670
    
    
      4
      267011
      0
      -0.988479
      0.937475
      0.187380
      1.149286
      -0.336169
      -0.735220
      1.537339
      -0.462444
      ...
      0.668313
      2.134787
      0.077360
      0.161711
      -0.375711
      0.770888
      -1.117228
      -0.632721
      -0.726388
      2.371523
    
  

5 rows × 59 columns



In [34]:

    
x_train = all_data.iloc[:40000,2:]
y_train = all_data.iloc[:40000,1]
x_test = all_data.iloc[40000:,2:]
y_test = all_data.iloc[40000:,1]
y_test.head()









    Out[34]:





40000    0
40001    1
40002    0
40003    1
40004    1
Name: target, dtype: int64



In [39]:

    
mlp = MLPClassifier(hidden_layer_sizes=(300), 
                    activation='logistic')

mlp.fit(x_train, y_train)









    Out[39]:





MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)



In [40]:

    
mlp.score(x_test, y_test)









    Out[40]:





0.5832349468713105



In [41]:

    
test_data = pd.read_csv('../kaggle_porto_seguro/test.csv')
test_data.head()









    Out[41]:






  
    
      
      id
      ps_ind_01
      ps_ind_02_cat
      ps_ind_03
      ps_ind_04_cat
      ps_ind_05_cat
      ps_ind_06_bin
      ps_ind_07_bin
      ps_ind_08_bin
      ps_ind_09_bin
      ...
      ps_calc_11
      ps_calc_12
      ps_calc_13
      ps_calc_14
      ps_calc_15_bin
      ps_calc_16_bin
      ps_calc_17_bin
      ps_calc_18_bin
      ps_calc_19_bin
      ps_calc_20_bin
    
  
  
    
      0
      0
      0
      1
      8
      1
      0
      0
      1
      0
      0
      ...
      1
      1
      1
      12
      0
      1
      1
      0
      0
      1
    
    
      1
      1
      4
      2
      5
      1
      0
      0
      0
      0
      1
      ...
      2
      0
      3
      10
      0
      0
      1
      1
      0
      1
    
    
      2
      2
      5
      1
      3
      0
      0
      0
      0
      0
      1
      ...
      4
      0
      2
      4
      0
      0
      0
      0
      0
      0
    
    
      3
      3
      0
      1
      6
      0
      0
      1
      0
      0
      0
      ...
      5
      1
      0
      5
      1
      0
      1
      0
      0
      0
    
    
      4
      4
      5
      1
      7
      0
      0
      0
      0
      0
      1
      ...
      4
      0
      0
      4
      0
      1
      1
      0
      0
      1
    
  

5 rows × 58 columns



In [42]:

    
predictions = mlp.predict(test_data.iloc[:,1:])



In [43]:

    
submission = pd.DataFrame()
submission['id'] = test_data.iloc[:, 0]
submission['target'] = predictions
submission.head()



In [44]:

    
submission.to_csv('kaggle_submission_nn.csv', index=False)



In [ ]:

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	...	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	...	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	...	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	...	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	...	3	1	1	3	0	0	1	1	0

	id	target	0	1	2	3	4	5	6	7	...	47	48	49	50	51	52	53	54	55	56
0	397909	1	0.006190	-0.540064	-0.176771	1.149286	-0.336169	-0.735220	1.537339	-0.462444	...	1.095924	-0.361753	-0.511807	-0.564091	-0.375711	0.770888	0.895072	-0.632721	1.376676	-0.421670
1	993433	0	-0.491145	-0.540064	1.643982	-0.864413	0.324924	1.360137	-0.650475	-0.462444	...	-1.042133	0.470427	0.666527	0.524612	-0.375711	0.770888	-1.117228	-0.632721	1.376676	-0.421670
2	804416	1	1.498193	3.892554	2.372283	1.149286	-0.336169	-0.735220	1.537339	-0.462444	...	1.095924	-0.361753	1.844861	2.339118	-0.375711	0.770888	0.895072	-0.632721	1.376676	-0.421670
3	576857	0	-0.491145	0.937475	-0.176771	1.149286	-0.336169	-0.735220	-0.650475	2.162426	...	0.668313	2.134787	0.666527	0.524612	-0.375711	-1.297206	-1.117228	-0.632721	1.376676	-0.421670
4	267011	0	-0.988479	0.937475	0.187380	1.149286	-0.336169	-0.735220	1.537339	-0.462444	...	0.668313	2.134787	0.077360	0.161711	-0.375711	0.770888	-1.117228	-0.632721	-0.726388	2.371523

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_09_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin
0	0	0	1	8	1	0	1	0	...	1	1	1	12	0	1	1	0	1
1	1	4	2	5	1	0	0	1	...	2	0	3	10	0	0	1	1	1
2	2	5	1	3	0	0	0	1	...	4	0	2	4	0	0	0	0	0
3	3	0	1	6	0	1	0	0	...	5	1	0	5	1	0	1	0	0
4	4	5	1	7	0	0	0	1	...	4	0	0	4	0	1	1	0	1

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	...	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	...	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	...	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	...	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	...	3	1	1	3	0	0	1	1	0

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_09_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin
0	0	0	1	8	1	0	1	0	...	1	1	1	12	0	1	1	0	1
1	1	4	2	5	1	0	0	1	...	2	0	3	10	0	0	1	1	1
2	2	5	1	3	0	0	0	1	...	4	0	2	4	0	0	0	0	0
3	3	0	1	6	0	1	0	0	...	5	1	0	5	1	0	1	0	0
4	4	5	1	7	0	0	0	1	...	4	0	0	4	0	1	1	0	1

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	...	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	...	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	...	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	...	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	...	3	1	1	3	0	0	1	1	0

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_09_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin
0	0	0	1	8	1	0	1	0	...	1	1	1	12	0	1	1	0	1
1	1	4	2	5	1	0	0	1	...	2	0	3	10	0	0	1	1	1
2	2	5	1	3	0	0	0	1	...	4	0	2	4	0	0	0	0	0
3	3	0	1	6	0	1	0	0	...	5	1	0	5	1	0	1	0	0
4	4	5	1	7	0	0	0	1	...	4	0	0	4	0	1	1	0	1