notebook.community

Edit and run



In [1]:

    
import os
import pandas as pd
import math
import numpy as np
from sklearn.tree import DecisionTreeClassifier



In [2]:

    
data = pd.read_csv('kaggle_porto_seguro/train.csv')
data.head()









    Out[2]:






  
    
      
      id
      target
      ps_ind_01
      ps_ind_02_cat
      ps_ind_03
      ps_ind_04_cat
      ps_ind_05_cat
      ps_ind_06_bin
      ps_ind_07_bin
      ps_ind_08_bin
      ...
      ps_calc_11
      ps_calc_12
      ps_calc_13
      ps_calc_14
      ps_calc_15_bin
      ps_calc_16_bin
      ps_calc_17_bin
      ps_calc_18_bin
      ps_calc_19_bin
      ps_calc_20_bin
    
  
  
    
      0
      7
      0
      2
      2
      5
      1
      0
      0
      1
      0
      ...
      9
      1
      5
      8
      0
      1
      1
      0
      0
      1
    
    
      1
      9
      0
      1
      1
      7
      0
      0
      0
      0
      1
      ...
      3
      1
      1
      9
      0
      1
      1
      0
      1
      0
    
    
      2
      13
      0
      5
      4
      9
      1
      0
      0
      0
      1
      ...
      4
      2
      7
      7
      0
      1
      1
      0
      1
      0
    
    
      3
      16
      0
      0
      1
      2
      0
      0
      1
      0
      0
      ...
      2
      2
      4
      9
      0
      0
      0
      0
      0
      0
    
    
      4
      17
      0
      0
      2
      0
      1
      0
      1
      0
      0
      ...
      3
      1
      1
      3
      0
      0
      0
      1
      1
      0
    
  

5 rows × 59 columns



In [3]:

    
data.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     595212 non-null int64
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595212 non-null int64
ps_ind_05_cat     595212 non-null int64
ps_ind_06_bin     595212 non-null int64
ps_ind_07_bin     595212 non-null int64
ps_ind_08_bin     595212 non-null int64
ps_ind_09_bin     595212 non-null int64
ps_ind_10_bin     595212 non-null int64
ps_ind_11_bin     595212 non-null int64
ps_ind_12_bin     595212 non-null int64
ps_ind_13_bin     595212 non-null int64
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null int64
ps_ind_17_bin     595212 non-null int64
ps_ind_18_bin     595212 non-null int64
ps_reg_01         595212 non-null float64
ps_reg_02         595212 non-null float64
ps_reg_03         595212 non-null float64
ps_car_01_cat     595212 non-null int64
ps_car_02_cat     595212 non-null int64
ps_car_03_cat     595212 non-null int64
ps_car_04_cat     595212 non-null int64
ps_car_05_cat     595212 non-null int64
ps_car_06_cat     595212 non-null int64
ps_car_07_cat     595212 non-null int64
ps_car_08_cat     595212 non-null int64
ps_car_09_cat     595212 non-null int64
ps_car_10_cat     595212 non-null int64
ps_car_11_cat     595212 non-null int64
ps_car_11         595212 non-null int64
ps_car_12         595212 non-null float64
ps_car_13         595212 non-null float64
ps_car_14         595212 non-null float64
ps_car_15         595212 non-null float64
ps_calc_01        595212 non-null float64
ps_calc_02        595212 non-null float64
ps_calc_03        595212 non-null float64
ps_calc_04        595212 non-null int64
ps_calc_05        595212 non-null int64
ps_calc_06        595212 non-null int64
ps_calc_07        595212 non-null int64
ps_calc_08        595212 non-null int64
ps_calc_09        595212 non-null int64
ps_calc_10        595212 non-null int64
ps_calc_11        595212 non-null int64
ps_calc_12        595212 non-null int64
ps_calc_13        595212 non-null int64
ps_calc_14        595212 non-null int64
ps_calc_15_bin    595212 non-null int64
ps_calc_16_bin    595212 non-null int64
ps_calc_17_bin    595212 non-null int64
ps_calc_18_bin    595212 non-null int64
ps_calc_19_bin    595212 non-null int64
ps_calc_20_bin    595212 non-null int64
dtypes: float64(10), int64(49)
memory usage: 267.9 MB



In [4]:

    
data = data.sample(frac=1).reset_index(drop=True)



In [6]:

    
x_train = data.iloc[:585212,2:]
y_train = data.iloc[:585212,1]
x_test = data.iloc[585212:,2:]
y_test = data.iloc[585212:,1]
y_test.head()









    Out[6]:





585212    0
585213    0
585214    0
585215    0
585216    0
Name: target, dtype: int64



In [7]:

    
d_tree = DecisionTreeClassifier(criterion='entropy')
d_tree.fit(x_train, y_train)









    Out[7]:





DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')



In [8]:

    
d_tree.score(x_test, y_test)









    Out[8]:





0.92749999999999999



In [9]:

    
test_data = pd.read_csv('kaggle_porto_seguro/test.csv')
test_data.head()









    Out[9]:






  
    
      
      id
      ps_ind_01
      ps_ind_02_cat
      ps_ind_03
      ps_ind_04_cat
      ps_ind_05_cat
      ps_ind_06_bin
      ps_ind_07_bin
      ps_ind_08_bin
      ps_ind_09_bin
      ...
      ps_calc_11
      ps_calc_12
      ps_calc_13
      ps_calc_14
      ps_calc_15_bin
      ps_calc_16_bin
      ps_calc_17_bin
      ps_calc_18_bin
      ps_calc_19_bin
      ps_calc_20_bin
    
  
  
    
      0
      0
      0
      1
      8
      1
      0
      0
      1
      0
      0
      ...
      1
      1
      1
      12
      0
      1
      1
      0
      0
      1
    
    
      1
      1
      4
      2
      5
      1
      0
      0
      0
      0
      1
      ...
      2
      0
      3
      10
      0
      0
      1
      1
      0
      1
    
    
      2
      2
      5
      1
      3
      0
      0
      0
      0
      0
      1
      ...
      4
      0
      2
      4
      0
      0
      0
      0
      0
      0
    
    
      3
      3
      0
      1
      6
      0
      0
      1
      0
      0
      0
      ...
      5
      1
      0
      5
      1
      0
      1
      0
      0
      0
    
    
      4
      4
      5
      1
      7
      0
      0
      0
      0
      0
      1
      ...
      4
      0
      0
      4
      0
      1
      1
      0
      0
      1
    
  

5 rows × 58 columns



In [10]:

    
predictions = d_tree.predict(test_data.iloc[:,1:])



In [11]:

    
submission = pd.DataFrame()
submission['id'] = test_data.iloc[:, 0]
submission['target'] = predictions
submission.head()



In [13]:

    
submission.to_csv('kaggle_submission.csv', index=False)



In [ ]:

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	...	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	...	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	...	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	...	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	...	3	1	1	3	0	0	1	1	0

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_09_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin
0	0	0	1	8	1	0	1	0	...	1	1	1	12	0	1	1	0	1
1	1	4	2	5	1	0	0	1	...	2	0	3	10	0	0	1	1	1
2	2	5	1	3	0	0	0	1	...	4	0	2	4	0	0	0	0	0
3	3	0	1	6	0	1	0	0	...	5	1	0	5	1	0	1	0	0
4	4	5	1	7	0	0	0	1	...	4	0	0	4	0	1	1	0	1

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	...	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	...	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	...	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	...	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	...	3	1	1	3	0	0	1	1	0

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_09_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin
0	0	0	1	8	1	0	1	0	...	1	1	1	12	0	1	1	0	1
1	1	4	2	5	1	0	0	1	...	2	0	3	10	0	0	1	1	1
2	2	5	1	3	0	0	0	1	...	4	0	2	4	0	0	0	0	0
3	3	0	1	6	0	1	0	0	...	5	1	0	5	1	0	1	0	0
4	4	5	1	7	0	0	0	1	...	4	0	0	4	0	1	1	0	1

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	...	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	...	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	...	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	...	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	...	3	1	1	3	0	0	1	1	0

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_09_bin	...	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin
0	0	0	1	8	1	0	1	0	...	1	1	1	12	0	1	1	0	1
1	1	4	2	5	1	0	0	1	...	2	0	3	10	0	0	1	1	1
2	2	5	1	3	0	0	0	1	...	4	0	2	4	0	0	0	0	0
3	3	0	1	6	0	1	0	0	...	5	1	0	5	1	0	1	0	0
4	4	5	1	7	0	0	0	1	...	4	0	0	4	0	1	1	0	1