In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt

In [3]:
% matplotlib inline

Reading Data


In [4]:
data=pd.read_csv('train.csv')

In [5]:
data


Out[5]:
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ... ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 7 0 2 2 5 1 0 0 1 0 ... 9 1 5 8 0 1 1 0 0 1
1 9 0 1 1 7 0 0 0 0 1 ... 3 1 1 9 0 1 1 0 1 0
2 13 0 5 4 9 1 0 0 0 1 ... 4 2 7 7 0 1 1 0 1 0
3 16 0 0 1 2 0 0 1 0 0 ... 2 2 4 9 0 0 0 0 0 0
4 17 0 0 2 0 1 0 1 0 0 ... 3 1 1 3 0 0 0 1 1 0
5 19 0 5 1 4 0 0 0 0 0 ... 4 2 0 9 0 1 0 1 1 1
6 20 0 2 1 3 1 0 0 1 0 ... 3 0 0 10 0 1 0 0 1 0
7 22 0 5 1 4 0 0 1 0 0 ... 7 1 3 6 1 0 1 0 1 0
8 26 0 5 1 3 1 0 0 0 1 ... 4 2 1 5 0 1 0 0 0 1
9 28 1 1 1 2 0 0 0 1 0 ... 3 5 0 6 0 1 0 0 1 0
10 34 0 5 2 2 0 0 0 0 1 ... 6 3 3 6 0 1 1 0 1 1
11 35 0 2 1 3 1 0 0 1 0 ... 9 2 1 10 0 1 0 1 0 0
12 36 0 2 1 1 1 0 0 0 1 ... 6 3 3 8 0 0 1 0 0 1
13 43 0 1 1 3 1 0 0 1 0 ... 2 0 4 3 0 0 1 0 1 0
14 46 0 5 1 11 0 0 0 0 0 ... 4 1 3 9 0 0 0 0 1 0
15 48 0 5 1 3 1 0 0 1 0 ... 5 2 6 7 0 1 1 0 1 0
16 50 0 1 2 1 0 0 0 0 1 ... 3 3 1 8 0 0 1 0 0 0
17 58 0 5 1 6 0 1 1 0 0 ... 9 1 3 9 0 1 1 0 0 0
18 61 0 5 1 8 0 0 1 0 0 ... 3 1 6 5 0 0 0 1 0 0
19 64 1 0 1 2 1 0 1 0 0 ... 10 3 1 11 0 1 1 0 1 0
20 65 0 0 1 2 0 0 1 0 0 ... 7 2 2 4 0 1 0 0 1 0
21 66 0 0 1 5 1 4 1 0 0 ... 5 5 3 8 0 0 0 0 1 0
22 72 0 5 3 6 1 3 0 1 0 ... 8 1 3 5 0 0 0 0 0 1
23 74 0 2 1 2 1 0 0 1 0 ... 7 1 3 9 0 1 0 1 0 0
24 77 0 0 1 2 0 0 1 0 0 ... 5 1 2 8 0 1 1 1 0 0
25 78 0 0 1 7 0 0 1 0 0 ... 6 4 4 4 0 0 1 1 0 1
26 79 0 0 1 4 1 0 0 0 1 ... 4 2 4 3 0 1 1 1 0 1
27 80 0 4 1 6 0 0 0 0 0 ... 7 2 2 11 0 1 0 1 0 0
28 84 1 0 2 0 1 4 1 0 0 ... 3 2 0 8 0 1 1 0 0 0
29 85 0 3 2 2 0 0 1 0 0 ... 2 0 4 7 0 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
595182 1487945 0 0 1 2 0 0 1 0 0 ... 12 0 5 8 0 0 0 1 1 0
595183 1487951 0 1 1 9 0 0 1 0 0 ... 2 2 3 7 0 1 1 0 1 0
595184 1487952 0 1 1 5 0 0 1 0 0 ... 7 0 3 9 0 0 0 0 0 1
595185 1487954 0 0 2 8 1 0 0 0 1 ... 10 1 1 11 0 1 0 1 0 0
595186 1487957 0 0 1 3 1 0 0 1 0 ... 10 0 2 9 0 0 1 1 0 0
595187 1487958 0 0 1 7 0 4 0 0 0 ... 5 1 4 9 0 1 1 0 0 1
595188 1487962 0 2 1 3 0 0 1 0 0 ... 8 1 6 6 0 1 1 0 0 0
595189 1487963 0 3 1 9 0 0 1 0 0 ... 7 0 8 4 0 1 0 1 1 0
595190 1487964 0 1 2 3 0 0 1 0 0 ... 1 0 4 6 0 0 1 0 0 0
595191 1487968 0 5 1 3 0 0 0 0 1 ... 8 2 1 11 0 0 1 1 0 1
595192 1487973 0 0 1 1 0 0 0 1 0 ... 5 1 5 5 0 1 1 0 1 1
595193 1487975 0 0 2 0 0 -1 0 1 0 ... 6 1 3 5 0 1 0 1 0 0
595194 1487976 0 0 1 7 1 0 0 1 0 ... 6 1 4 12 0 1 1 0 0 0
595195 1487980 0 0 3 3 1 0 0 1 0 ... 8 0 2 5 0 1 0 0 1 1
595196 1487983 0 1 1 2 0 0 1 0 0 ... 6 0 1 11 1 1 0 0 1 1
595197 1487988 0 1 1 7 1 0 0 1 0 ... 4 1 3 6 0 0 1 1 1 0
595198 1487990 0 5 1 7 1 0 0 0 0 ... 6 2 2 14 0 1 1 0 0 0
595199 1487992 0 3 1 3 1 0 0 0 1 ... 1 1 1 10 0 0 1 0 1 0
595200 1487994 0 1 1 1 0 3 1 0 0 ... 2 3 3 12 0 0 1 0 0 0
595201 1487996 0 0 2 2 1 0 0 0 0 ... 4 1 2 6 1 1 0 1 1 0
595202 1488001 0 4 1 3 0 0 0 1 0 ... 11 6 2 6 0 1 1 0 1 0
595203 1488005 0 3 2 3 1 0 0 1 0 ... 5 2 1 6 0 0 0 0 0 0
595204 1488008 0 1 2 2 1 0 0 1 0 ... 3 1 3 9 0 1 0 0 1 0
595205 1488009 0 0 3 6 1 0 0 1 0 ... 7 1 2 6 1 0 1 0 0 0
595206 1488011 0 0 1 2 0 0 0 0 0 ... 8 0 1 7 0 1 1 0 0 0
595207 1488013 0 3 1 10 0 0 0 0 0 ... 4 1 9 6 0 1 1 0 1 1
595208 1488016 0 5 1 3 0 0 0 0 0 ... 4 1 3 8 1 0 1 0 1 1
595209 1488017 0 1 1 10 0 0 1 0 0 ... 3 2 2 6 0 0 1 0 0 0
595210 1488021 0 5 2 3 1 0 0 0 1 ... 4 1 4 2 0 1 1 1 0 0
595211 1488027 0 0 1 8 0 0 1 0 0 ... 4 4 3 8 0 1 0 0 0 0

595212 rows × 59 columns


In [11]:
from xgboost import XGBClassifier

In [13]:
model=XGBClassifier(max_depth=5, learning_rate=0.095, n_estimators=300, silent=True, objective='binary:logistic', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)

In [15]:
y=data.target
data = data.drop(['id','target'],axis=1)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [19]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=test_size, random_state=seed)

In [26]:
model.fit(data,y)


Out[26]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.095, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [23]:
y_pred=model.predict(X_test)

In [24]:
accuracy = accuracy_score(y_pred,y_test)

In [25]:
accuracy


Out[25]:
0.96349658894206291

In [36]:
test_data=pd.read_csv('test.csv')
id_01=test_data.id

In [30]:
test_data = test_data.drop(['id'],axis=1)
test_data.head()


Out[30]:
ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ... ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 0 1 8 1 0 0 1 0 0 0 ... 1 1 1 12 0 1 1 0 0 1
1 4 2 5 1 0 0 0 0 1 0 ... 2 0 3 10 0 0 1 1 0 1
2 5 1 3 0 0 0 0 0 1 0 ... 4 0 2 4 0 0 0 0 0 0
3 0 1 6 0 0 1 0 0 0 0 ... 5 1 0 5 1 0 1 0 0 0
4 5 1 7 0 0 0 0 0 1 0 ... 4 0 0 4 0 1 1 0 0 1

5 rows × 57 columns


In [31]:
predictions=model.predict_proba(test_data)

In [32]:
predictions


Out[32]:
array([[ 0.97149295,  0.02850703],
       [ 0.97010922,  0.02989077],
       [ 0.97593033,  0.02406965],
       ..., 
       [ 0.9570936 ,  0.0429064 ],
       [ 0.97389305,  0.02610697],
       [ 0.97206235,  0.02793766]], dtype=float32)

In [37]:
p=pd.DataFrame(predictions).drop([0],axis=1)
p


Out[37]:
1
0 0.028507
1 0.029891
2 0.024070
3 0.013895
4 0.033502
5 0.044280
6 0.017185
7 0.036855
8 0.051538
9 0.049058
10 0.031651
11 0.017965
12 0.055400
13 0.056822
14 0.040306
15 0.018269
16 0.023472
17 0.052463
18 0.010320
19 0.050178
20 0.034624
21 0.039136
22 0.048343
23 0.031339
24 0.021085
25 0.024448
26 0.090273
27 0.035440
28 0.027273
29 0.018544
... ...
892786 0.017111
892787 0.036884
892788 0.042038
892789 0.036242
892790 0.028578
892791 0.029502
892792 0.025095
892793 0.046357
892794 0.021107
892795 0.028772
892796 0.049289
892797 0.073454
892798 0.033845
892799 0.101784
892800 0.028131
892801 0.025324
892802 0.025526
892803 0.030707
892804 0.046696
892805 0.042055
892806 0.024705
892807 0.027491
892808 0.024442
892809 0.022619
892810 0.020697
892811 0.069824
892812 0.040311
892813 0.042906
892814 0.026107
892815 0.027938

892816 rows × 1 columns


In [38]:
answer=pd.concat([pd.DataFrame(test_data['id']),p],axis=1)
answer.columns=[['id','target']]
answer.to_csv('prediction_final_xgbParamsChanged.csv',index=False)

In [39]:
answer


Out[39]:
id target
0 0 0.028507
1 1 0.029891
2 2 0.024070
3 3 0.013895
4 4 0.033502
5 5 0.044280
6 6 0.017185
7 8 0.036855
8 10 0.051538
9 11 0.049058
10 12 0.031651
11 14 0.017965
12 15 0.055400
13 18 0.056822
14 21 0.040306
15 23 0.018269
16 24 0.023472
17 25 0.052463
18 27 0.010320
19 29 0.050178
20 30 0.034624
21 31 0.039136
22 32 0.048343
23 33 0.031339
24 37 0.021085
25 38 0.024448
26 39 0.090273
27 40 0.035440
28 41 0.027273
29 42 0.018544
... ... ...
892786 1487982 0.017111
892787 1487984 0.036884
892788 1487985 0.042038
892789 1487986 0.036242
892790 1487987 0.028578
892791 1487989 0.029502
892792 1487991 0.025095
892793 1487993 0.046357
892794 1487995 0.021107
892795 1487997 0.028772
892796 1487998 0.049289
892797 1487999 0.073454
892798 1488000 0.033845
892799 1488002 0.101784
892800 1488003 0.028131
892801 1488004 0.025324
892802 1488006 0.025526
892803 1488007 0.030707
892804 1488010 0.046696
892805 1488012 0.042055
892806 1488014 0.024705
892807 1488015 0.027491
892808 1488018 0.024442
892809 1488019 0.022619
892810 1488020 0.020697
892811 1488022 0.069824
892812 1488023 0.040311
892813 1488024 0.042906
892814 1488025 0.026107
892815 1488026 0.027938

892816 rows × 2 columns


In [ ]: