In [1]:
import pandas as pd
import numpy as np
In [2]:
import matplotlib.pyplot as plt
In [3]:
% matplotlib inline
In [4]:
data=pd.read_csv('train.csv')
In [5]:
data
Out[5]:
id
target
ps_ind_01
ps_ind_02_cat
ps_ind_03
ps_ind_04_cat
ps_ind_05_cat
ps_ind_06_bin
ps_ind_07_bin
ps_ind_08_bin
...
ps_calc_11
ps_calc_12
ps_calc_13
ps_calc_14
ps_calc_15_bin
ps_calc_16_bin
ps_calc_17_bin
ps_calc_18_bin
ps_calc_19_bin
ps_calc_20_bin
0
7
0
2
2
5
1
0
0
1
0
...
9
1
5
8
0
1
1
0
0
1
1
9
0
1
1
7
0
0
0
0
1
...
3
1
1
9
0
1
1
0
1
0
2
13
0
5
4
9
1
0
0
0
1
...
4
2
7
7
0
1
1
0
1
0
3
16
0
0
1
2
0
0
1
0
0
...
2
2
4
9
0
0
0
0
0
0
4
17
0
0
2
0
1
0
1
0
0
...
3
1
1
3
0
0
0
1
1
0
5
19
0
5
1
4
0
0
0
0
0
...
4
2
0
9
0
1
0
1
1
1
6
20
0
2
1
3
1
0
0
1
0
...
3
0
0
10
0
1
0
0
1
0
7
22
0
5
1
4
0
0
1
0
0
...
7
1
3
6
1
0
1
0
1
0
8
26
0
5
1
3
1
0
0
0
1
...
4
2
1
5
0
1
0
0
0
1
9
28
1
1
1
2
0
0
0
1
0
...
3
5
0
6
0
1
0
0
1
0
10
34
0
5
2
2
0
0
0
0
1
...
6
3
3
6
0
1
1
0
1
1
11
35
0
2
1
3
1
0
0
1
0
...
9
2
1
10
0
1
0
1
0
0
12
36
0
2
1
1
1
0
0
0
1
...
6
3
3
8
0
0
1
0
0
1
13
43
0
1
1
3
1
0
0
1
0
...
2
0
4
3
0
0
1
0
1
0
14
46
0
5
1
11
0
0
0
0
0
...
4
1
3
9
0
0
0
0
1
0
15
48
0
5
1
3
1
0
0
1
0
...
5
2
6
7
0
1
1
0
1
0
16
50
0
1
2
1
0
0
0
0
1
...
3
3
1
8
0
0
1
0
0
0
17
58
0
5
1
6
0
1
1
0
0
...
9
1
3
9
0
1
1
0
0
0
18
61
0
5
1
8
0
0
1
0
0
...
3
1
6
5
0
0
0
1
0
0
19
64
1
0
1
2
1
0
1
0
0
...
10
3
1
11
0
1
1
0
1
0
20
65
0
0
1
2
0
0
1
0
0
...
7
2
2
4
0
1
0
0
1
0
21
66
0
0
1
5
1
4
1
0
0
...
5
5
3
8
0
0
0
0
1
0
22
72
0
5
3
6
1
3
0
1
0
...
8
1
3
5
0
0
0
0
0
1
23
74
0
2
1
2
1
0
0
1
0
...
7
1
3
9
0
1
0
1
0
0
24
77
0
0
1
2
0
0
1
0
0
...
5
1
2
8
0
1
1
1
0
0
25
78
0
0
1
7
0
0
1
0
0
...
6
4
4
4
0
0
1
1
0
1
26
79
0
0
1
4
1
0
0
0
1
...
4
2
4
3
0
1
1
1
0
1
27
80
0
4
1
6
0
0
0
0
0
...
7
2
2
11
0
1
0
1
0
0
28
84
1
0
2
0
1
4
1
0
0
...
3
2
0
8
0
1
1
0
0
0
29
85
0
3
2
2
0
0
1
0
0
...
2
0
4
7
0
1
0
0
0
0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
595182
1487945
0
0
1
2
0
0
1
0
0
...
12
0
5
8
0
0
0
1
1
0
595183
1487951
0
1
1
9
0
0
1
0
0
...
2
2
3
7
0
1
1
0
1
0
595184
1487952
0
1
1
5
0
0
1
0
0
...
7
0
3
9
0
0
0
0
0
1
595185
1487954
0
0
2
8
1
0
0
0
1
...
10
1
1
11
0
1
0
1
0
0
595186
1487957
0
0
1
3
1
0
0
1
0
...
10
0
2
9
0
0
1
1
0
0
595187
1487958
0
0
1
7
0
4
0
0
0
...
5
1
4
9
0
1
1
0
0
1
595188
1487962
0
2
1
3
0
0
1
0
0
...
8
1
6
6
0
1
1
0
0
0
595189
1487963
0
3
1
9
0
0
1
0
0
...
7
0
8
4
0
1
0
1
1
0
595190
1487964
0
1
2
3
0
0
1
0
0
...
1
0
4
6
0
0
1
0
0
0
595191
1487968
0
5
1
3
0
0
0
0
1
...
8
2
1
11
0
0
1
1
0
1
595192
1487973
0
0
1
1
0
0
0
1
0
...
5
1
5
5
0
1
1
0
1
1
595193
1487975
0
0
2
0
0
-1
0
1
0
...
6
1
3
5
0
1
0
1
0
0
595194
1487976
0
0
1
7
1
0
0
1
0
...
6
1
4
12
0
1
1
0
0
0
595195
1487980
0
0
3
3
1
0
0
1
0
...
8
0
2
5
0
1
0
0
1
1
595196
1487983
0
1
1
2
0
0
1
0
0
...
6
0
1
11
1
1
0
0
1
1
595197
1487988
0
1
1
7
1
0
0
1
0
...
4
1
3
6
0
0
1
1
1
0
595198
1487990
0
5
1
7
1
0
0
0
0
...
6
2
2
14
0
1
1
0
0
0
595199
1487992
0
3
1
3
1
0
0
0
1
...
1
1
1
10
0
0
1
0
1
0
595200
1487994
0
1
1
1
0
3
1
0
0
...
2
3
3
12
0
0
1
0
0
0
595201
1487996
0
0
2
2
1
0
0
0
0
...
4
1
2
6
1
1
0
1
1
0
595202
1488001
0
4
1
3
0
0
0
1
0
...
11
6
2
6
0
1
1
0
1
0
595203
1488005
0
3
2
3
1
0
0
1
0
...
5
2
1
6
0
0
0
0
0
0
595204
1488008
0
1
2
2
1
0
0
1
0
...
3
1
3
9
0
1
0
0
1
0
595205
1488009
0
0
3
6
1
0
0
1
0
...
7
1
2
6
1
0
1
0
0
0
595206
1488011
0
0
1
2
0
0
0
0
0
...
8
0
1
7
0
1
1
0
0
0
595207
1488013
0
3
1
10
0
0
0
0
0
...
4
1
9
6
0
1
1
0
1
1
595208
1488016
0
5
1
3
0
0
0
0
0
...
4
1
3
8
1
0
1
0
1
1
595209
1488017
0
1
1
10
0
0
1
0
0
...
3
2
2
6
0
0
1
0
0
0
595210
1488021
0
5
2
3
1
0
0
0
1
...
4
1
4
2
0
1
1
1
0
0
595211
1488027
0
0
1
8
0
0
1
0
0
...
4
4
3
8
0
1
0
0
0
0
595212 rows × 59 columns
In [11]:
from xgboost import XGBClassifier
In [13]:
model=XGBClassifier(max_depth=5, learning_rate=0.095, n_estimators=300, silent=True, objective='binary:logistic', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
In [15]:
y=data.target
data = data.drop(['id','target'],axis=1)
In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [19]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=test_size, random_state=seed)
In [26]:
model.fit(data,y)
Out[26]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
gamma=0, learning_rate=0.095, max_delta_step=0, max_depth=5,
min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
objective='binary:logistic', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
In [23]:
y_pred=model.predict(X_test)
In [24]:
accuracy = accuracy_score(y_pred,y_test)
In [25]:
accuracy
Out[25]:
0.96349658894206291
In [36]:
test_data=pd.read_csv('test.csv')
id_01=test_data.id
In [30]:
test_data = test_data.drop(['id'],axis=1)
test_data.head()
Out[30]:
ps_ind_01
ps_ind_02_cat
ps_ind_03
ps_ind_04_cat
ps_ind_05_cat
ps_ind_06_bin
ps_ind_07_bin
ps_ind_08_bin
ps_ind_09_bin
ps_ind_10_bin
...
ps_calc_11
ps_calc_12
ps_calc_13
ps_calc_14
ps_calc_15_bin
ps_calc_16_bin
ps_calc_17_bin
ps_calc_18_bin
ps_calc_19_bin
ps_calc_20_bin
0
0
1
8
1
0
0
1
0
0
0
...
1
1
1
12
0
1
1
0
0
1
1
4
2
5
1
0
0
0
0
1
0
...
2
0
3
10
0
0
1
1
0
1
2
5
1
3
0
0
0
0
0
1
0
...
4
0
2
4
0
0
0
0
0
0
3
0
1
6
0
0
1
0
0
0
0
...
5
1
0
5
1
0
1
0
0
0
4
5
1
7
0
0
0
0
0
1
0
...
4
0
0
4
0
1
1
0
0
1
5 rows × 57 columns
In [31]:
predictions=model.predict_proba(test_data)
In [32]:
predictions
Out[32]:
array([[ 0.97149295, 0.02850703],
[ 0.97010922, 0.02989077],
[ 0.97593033, 0.02406965],
...,
[ 0.9570936 , 0.0429064 ],
[ 0.97389305, 0.02610697],
[ 0.97206235, 0.02793766]], dtype=float32)
In [37]:
p=pd.DataFrame(predictions).drop([0],axis=1)
p
Out[37]:
1
0
0.028507
1
0.029891
2
0.024070
3
0.013895
4
0.033502
5
0.044280
6
0.017185
7
0.036855
8
0.051538
9
0.049058
10
0.031651
11
0.017965
12
0.055400
13
0.056822
14
0.040306
15
0.018269
16
0.023472
17
0.052463
18
0.010320
19
0.050178
20
0.034624
21
0.039136
22
0.048343
23
0.031339
24
0.021085
25
0.024448
26
0.090273
27
0.035440
28
0.027273
29
0.018544
...
...
892786
0.017111
892787
0.036884
892788
0.042038
892789
0.036242
892790
0.028578
892791
0.029502
892792
0.025095
892793
0.046357
892794
0.021107
892795
0.028772
892796
0.049289
892797
0.073454
892798
0.033845
892799
0.101784
892800
0.028131
892801
0.025324
892802
0.025526
892803
0.030707
892804
0.046696
892805
0.042055
892806
0.024705
892807
0.027491
892808
0.024442
892809
0.022619
892810
0.020697
892811
0.069824
892812
0.040311
892813
0.042906
892814
0.026107
892815
0.027938
892816 rows × 1 columns
In [38]:
answer=pd.concat([pd.DataFrame(test_data['id']),p],axis=1)
answer.columns=[['id','target']]
answer.to_csv('prediction_final_xgbParamsChanged.csv',index=False)
In [39]:
answer
Out[39]:
id
target
0
0
0.028507
1
1
0.029891
2
2
0.024070
3
3
0.013895
4
4
0.033502
5
5
0.044280
6
6
0.017185
7
8
0.036855
8
10
0.051538
9
11
0.049058
10
12
0.031651
11
14
0.017965
12
15
0.055400
13
18
0.056822
14
21
0.040306
15
23
0.018269
16
24
0.023472
17
25
0.052463
18
27
0.010320
19
29
0.050178
20
30
0.034624
21
31
0.039136
22
32
0.048343
23
33
0.031339
24
37
0.021085
25
38
0.024448
26
39
0.090273
27
40
0.035440
28
41
0.027273
29
42
0.018544
...
...
...
892786
1487982
0.017111
892787
1487984
0.036884
892788
1487985
0.042038
892789
1487986
0.036242
892790
1487987
0.028578
892791
1487989
0.029502
892792
1487991
0.025095
892793
1487993
0.046357
892794
1487995
0.021107
892795
1487997
0.028772
892796
1487998
0.049289
892797
1487999
0.073454
892798
1488000
0.033845
892799
1488002
0.101784
892800
1488003
0.028131
892801
1488004
0.025324
892802
1488006
0.025526
892803
1488007
0.030707
892804
1488010
0.046696
892805
1488012
0.042055
892806
1488014
0.024705
892807
1488015
0.027491
892808
1488018
0.024442
892809
1488019
0.022619
892810
1488020
0.020697
892811
1488022
0.069824
892812
1488023
0.040311
892813
1488024
0.042906
892814
1488025
0.026107
892815
1488026
0.027938
892816 rows × 2 columns
In [ ]:
Content source: bhupendrathore/Porto-Seguro-s-Safe-Driver-Prediction-
Similar notebooks: