In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/data/credit-default.csv")

In [3]:
df


Out[3]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 unknown 12 repaid furniture 1736 < 100 DM 4 - 7 yrs 3 female none ... real estate 31 none own 1 1 1 none yes unskilled resident
996 < 0 DM 30 repaid car (used) 3857 < 100 DM 1 - 4 yrs 4 divorced male none ... building society savings 40 none own 1 1 1 yes yes mangement self-employed
997 unknown 12 repaid radio/tv 804 < 100 DM > 7 yrs 4 single male none ... other 38 none own 1 1 1 none yes skilled employee
998 < 0 DM 45 repaid radio/tv 1845 < 100 DM 1 - 4 yrs 4 single male none ... unknown/none 23 none for free 1 2 1 yes yes skilled employee
999 1 - 200 DM 45 critical car (used) 4576 101 - 500 DM unemployed 3 single male none ... other 27 none own 1 1 1 none yes skilled employee

1000 rows × 21 columns


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_length     1000 non-null   object
 7   installment_rate      1000 non-null   int64 
 8   personal_status       1000 non-null   object
 9   other_debtors         1000 non-null   object
 10  residence_history     1000 non-null   int64 
 11  property              1000 non-null   object
 12  age                   1000 non-null   int64 
 13  installment_plan      1000 non-null   object
 14  housing               1000 non-null   object
 15  existing_credits      1000 non-null   int64 
 16  default               1000 non-null   int64 
 17  dependents            1000 non-null   int64 
 18  telephone             1000 non-null   object
 19  foreign_worker        1000 non-null   object
 20  job                   1000 non-null   object
dtypes: int64(8), object(13)
memory usage: 164.2+ KB

In [6]:
df.default.value_counts()


Out[6]:
1    700
2    300
Name: default, dtype: int64

In [7]:
df.default.value_counts()/len(df)


Out[7]:
1    0.7
2    0.3
Name: default, dtype: float64
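
In [ ]:
# Equivalent to the manual division above: value_counts supports a normalize
# flag that returns the class proportions directly.
df.default.value_counts(normalize=True)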

In [ ]:
# 1 -> negative sample (0) -> the customer did not default
# 2 -> positive sample (1) -> the customer defaulted (was not able to pay back the loan)

In [ ]:
# P -> 76
# N -> 45

In [13]:
p1 = 76 / (76 + 45)
p1


Out[13]:
0.628099173553719

In [ ]:
# p0 + p1 = 1

# gini = p0*(1-p0) + p1*(1-p1) = (1-p1)*p1 + p1*(1-p1) = 2 * p1 * (1-p1) = 2 * p1 * p0

In [14]:
2 * p1 * (1-p1)


Out[14]:
0.4671812034697083
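
In [ ]:
# A small helper (a sketch, not part of the original flow) wrapping the gini
# computation used above for a node with P positive and N negative samples.
def gini_node(P, N):
    p1 = P / (P + N)
    return 2 * p1 * (1 - p1)   # = p0*(1-p0) + p1*(1-p1) for two classes

gini_node(76, 45)   # should match the value computed above (~0.467)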

In [ ]:


In [ ]:
# P -> 11
# N -> 32

In [15]:
p1 = 32/(32 + 11)
p1


Out[15]:
0.7441860465116279

In [16]:
2 * p1 * (1-p1)


Out[16]:
0.3807463493780422

In [ ]:
# P -> 9
# N -> 1

In [10]:
9/(9+1)


Out[10]:
0.9

In [11]:
# p -> 4
# n -> 6

In [12]:
6/(4+6)


Out[12]:
0.6

In [17]:
p1 = 214/700  # a node with 214 of its 700 samples in one class

In [18]:
2 * p1 * (1-p1)


Out[18]:
0.4245061224489796

In [19]:
(.488 + .221)/2  # simple (unweighted) average of the two child gini values


Out[19]:
0.3545

In [20]:
.424 - .3545  # parent gini minus the unweighted average


Out[20]:
0.0695

In [22]:
(424 * .488 + 276 * .221)/ 700  # weighted average gini of the children (424 and 276 samples)


Out[22]:
0.3827257142857143

In [24]:
0.4245061224489796 - 0.3827257142857143  # information gain = parent gini - weighted child gini


Out[24]:
0.04178040816326534

In [25]:
# Entropy = -sum(pi * log(pi))
# Gini    =  sum(pi * (1 - pi)) = sum(pi) - sum(pi * pi) = 1 - sum(pi * pi) = sum(pi * pj) over all i != j
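
In [ ]:
# A sketch of both impurity measures for an arbitrary class-probability vector.
import numpy as np

def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]                    # drop zero probabilities to avoid log(0)
    return -np.sum(p * np.log2(p))

def gini(p):
    p = np.asarray(p, dtype=float)
    return 1 - np.sum(p ** 2)

entropy([0.7, 0.3]), gini([0.7, 0.3])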

In [ ]:
# the objective of the tree is to find, at each node, the condition that maximizes the information gain

In [ ]:
# stump -> a split condition on a single feature only, never on more than one

In [26]:
# purpose of loan - P1, P2, P3 (categorical)
# age - e.g. 20 to 60 (continuous)

In [27]:
# candidates for a stump
# for each candidate find the information gain
# select the condition which gives the max information gain
# (a sketch that enumerates such candidates follows below)

# categorical feature:
#   purpose == p1
#   purpose == p2
#   purpose == p3

# continuous feature, with thresholds taken at percentiles:
#   age < 25   (10th percentile)
#   age < 30   (20th percentile)
#   age < 37   (30th percentile)
#   ...
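
In [ ]:
# A sketch of how such candidate conditions could be enumerated on this data:
# one equality test per category of `purpose`, and age thresholds taken at
# deciles (the variable names here are illustrative only).
import numpy as np

categorical_candidates = [("purpose", "==", v) for v in df.purpose.unique()]
age_thresholds = df.age.quantile(np.arange(0.1, 1.0, 0.1)).values
continuous_candidates = [("age", "<", t) for t in age_thresholds]

categorical_candidates[:3], continuous_candidates[:3]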

In [28]:
df.shape


Out[28]:
(1000, 21)

In [29]:
df.head()


Out[29]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee

5 rows × 21 columns


In [30]:
g1 = df[df.purpose == "radio/tv"]
g2 =  df[df.purpose != "radio/tv"]

In [31]:
len(g1), len(g2)


Out[31]:
(280, 720)

In [33]:
g1.default.value_counts()/len(g1)


Out[33]:
1    0.778571
2    0.221429
Name: default, dtype: float64

In [35]:
2 * 0.778571 * 0.221429


Out[35]:
0.344796395918

In [34]:
g2.default.value_counts()/len(g2)


Out[34]:
1    0.669444
2    0.330556
Name: default, dtype: float64

In [36]:
2 * .669444 * 0.330556


Out[36]:
0.44257746172800005

In [37]:
df.default.value_counts()/len(df)


Out[37]:
1    0.7
2    0.3
Name: default, dtype: float64

In [38]:
gini_root = 2 * .7 * .3
gini_root


Out[38]:
0.42

In [40]:
0.42 - (280 * 0.344796395918 + 720 * 0.44257746172800005)/ (280+720)
# information gain using purpose == "radio/tv" as the split condition


Out[40]:
0.004801236698799893
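
In [ ]:
# A reusable version of the computation above (a sketch): the information gain
# of any boolean split of df, using the 1/2 coding of `default`. It should
# reproduce the figure just computed for purpose == "radio/tv".
def gini_of(frame):
    p = frame.default.value_counts(normalize=True)
    return 1 - (p ** 2).sum()

def info_gain(frame, mask):
    left, right = frame[mask], frame[~mask]
    weighted = (len(left) * gini_of(left) + len(right) * gini_of(right)) / len(frame)
    return gini_of(frame) - weighted

info_gain(df, df.purpose == "radio/tv")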

In [41]:
df.head()


Out[41]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee

5 rows × 21 columns


In [42]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_length     1000 non-null   object
 7   installment_rate      1000 non-null   int64 
 8   personal_status       1000 non-null   object
 9   other_debtors         1000 non-null   object
 10  residence_history     1000 non-null   int64 
 11  property              1000 non-null   object
 12  age                   1000 non-null   int64 
 13  installment_plan      1000 non-null   object
 14  housing               1000 non-null   object
 15  existing_credits      1000 non-null   int64 
 16  default               1000 non-null   int64 
 17  dependents            1000 non-null   int64 
 18  telephone             1000 non-null   object
 19  foreign_worker        1000 non-null   object
 20  job                   1000 non-null   object
dtypes: int64(8), object(13)
memory usage: 164.2+ KB

In [46]:
df_dummy  = pd.get_dummies(df, drop_first=True)

In [47]:
df_dummy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 49 columns):
 #   Column                                 Non-Null Count  Dtype
---  ------                                 --------------  -----
 0   months_loan_duration                   1000 non-null   int64
 1   amount                                 1000 non-null   int64
 2   installment_rate                       1000 non-null   int64
 3   residence_history                      1000 non-null   int64
 4   age                                    1000 non-null   int64
 5   existing_credits                       1000 non-null   int64
 6   default                                1000 non-null   int64
 7   dependents                             1000 non-null   int64
 8   checking_balance_< 0 DM                1000 non-null   uint8
 9   checking_balance_> 200 DM              1000 non-null   uint8
 10  checking_balance_unknown               1000 non-null   uint8
 11  credit_history_delayed                 1000 non-null   uint8
 12  credit_history_fully repaid            1000 non-null   uint8
 13  credit_history_fully repaid this bank  1000 non-null   uint8
 14  credit_history_repaid                  1000 non-null   uint8
 15  purpose_car (new)                      1000 non-null   uint8
 16  purpose_car (used)                     1000 non-null   uint8
 17  purpose_domestic appliances            1000 non-null   uint8
 18  purpose_education                      1000 non-null   uint8
 19  purpose_furniture                      1000 non-null   uint8
 20  purpose_others                         1000 non-null   uint8
 21  purpose_radio/tv                       1000 non-null   uint8
 22  purpose_repairs                        1000 non-null   uint8
 23  purpose_retraining                     1000 non-null   uint8
 24  savings_balance_501 - 1000 DM          1000 non-null   uint8
 25  savings_balance_< 100 DM               1000 non-null   uint8
 26  savings_balance_> 1000 DM              1000 non-null   uint8
 27  savings_balance_unknown                1000 non-null   uint8
 28  employment_length_1 - 4 yrs            1000 non-null   uint8
 29  employment_length_4 - 7 yrs            1000 non-null   uint8
 30  employment_length_> 7 yrs              1000 non-null   uint8
 31  employment_length_unemployed           1000 non-null   uint8
 32  personal_status_female                 1000 non-null   uint8
 33  personal_status_married male           1000 non-null   uint8
 34  personal_status_single male            1000 non-null   uint8
 35  other_debtors_guarantor                1000 non-null   uint8
 36  other_debtors_none                     1000 non-null   uint8
 37  property_other                         1000 non-null   uint8
 38  property_real estate                   1000 non-null   uint8
 39  property_unknown/none                  1000 non-null   uint8
 40  installment_plan_none                  1000 non-null   uint8
 41  installment_plan_stores                1000 non-null   uint8
 42  housing_own                            1000 non-null   uint8
 43  housing_rent                           1000 non-null   uint8
 44  telephone_yes                          1000 non-null   uint8
 45  foreign_worker_yes                     1000 non-null   uint8
 46  job_skilled employee                   1000 non-null   uint8
 47  job_unemployed non-resident            1000 non-null   uint8
 48  job_unskilled resident                 1000 non-null   uint8
dtypes: int64(8), uint8(41)
memory usage: 102.7 KB

In [48]:
target = "default"

In [49]:
y = df_dummy[target]

In [52]:
X = df_dummy.drop(columns=target)

In [53]:
X.shape


Out[53]:
(1000, 48)

In [54]:
X.head()


Out[54]:
months_loan_duration amount installment_rate residence_history age existing_credits dependents checking_balance_< 0 DM checking_balance_> 200 DM checking_balance_unknown ... property_unknown/none installment_plan_none installment_plan_stores housing_own housing_rent telephone_yes foreign_worker_yes job_skilled employee job_unemployed non-resident job_unskilled resident
0 6 1169 4 4 67 2 1 1 0 0 ... 0 1 0 1 0 1 1 1 0 0
1 48 5951 2 2 22 1 1 0 0 0 ... 0 1 0 1 0 0 1 1 0 0
2 12 2096 2 3 49 1 2 0 0 1 ... 0 1 0 1 0 0 1 0 0 1
3 42 7882 2 4 45 1 2 1 0 0 ... 0 1 0 0 0 0 1 1 0 0
4 24 4870 3 4 53 2 2 1 0 0 ... 1 1 0 0 0 0 1 1 0 0

5 rows × 48 columns


In [55]:
from sklearn import model_selection

In [56]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3, random_state = 1)

In [57]:
from sklearn import tree

In [58]:
est = tree.DecisionTreeClassifier()
est.fit(X_train, y_train)


Out[58]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [59]:
y_test_pred = est.predict(X_test)

In [60]:
from sklearn import metrics

In [61]:
metrics.accuracy_score(y_test, y_test_pred)


Out[61]:
0.6433333333333333

In [62]:
pd.Series(y_train).value_counts()/len(y_train)


Out[62]:
1    0.694286
2    0.305714
Name: default, dtype: float64

In [63]:
pd.Series(y_test).value_counts()/len(y_test)


Out[63]:
1    0.713333
2    0.286667
Name: default, dtype: float64
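
In [ ]:
# The class proportions above differ slightly between train and test. If that
# matters, train_test_split accepts a stratify argument that preserves the class
# balance in both splits (a sketch; these variables are not reused below).
X_tr_s, X_te_s, y_tr_s, y_te_s = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
y_te_s.value_counts() / len(y_te_s)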

In [65]:
pd.DataFrame({"actual": y_test, "prediction": y_test_pred})


Out[65]:
actual prediction
507 2 2
818 1 2
452 1 1
368 2 2
242 2 2
... ... ...
459 1 2
415 1 1
61 1 1
347 1 2
349 2 1

300 rows × 2 columns


In [67]:
import numpy as np

In [68]:
np.sum(y_test == y_test_pred)


Out[68]:
193

In [69]:
193/len(y_test)


Out[69]:
0.6433333333333333

In [70]:
metrics.confusion_matrix(y_test, y_test_pred)


Out[70]:
array([[158,  56],
       [ 51,  35]])

In [71]:
158 + 35


Out[71]:
193

In [73]:
print(metrics.classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           1       0.76      0.74      0.75       214
           2       0.38      0.41      0.40        86

    accuracy                           0.64       300
   macro avg       0.57      0.57      0.57       300
weighted avg       0.65      0.64      0.65       300


In [75]:
est = tree.DecisionTreeClassifier()
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))


train accuracy 1.0
test accuracy 0.6566666666666666

In [76]:
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png
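
In [ ]:
# If Graphviz / the `dot` binary is not available, recent scikit-learn versions
# can render the same tree directly with matplotlib (a sketch; the unpruned tree
# is large, so the figure will be crowded).
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(est, feature_names=list(X.columns), filled=True)
plt.show()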

In [77]:
est = tree.DecisionTreeClassifier(max_depth=3)
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))


train accuracy 0.7514285714285714
test accuracy 0.7333333333333333

In [78]:
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png

In [83]:
est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=20)
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png


train accuracy 0.7385714285714285
test accuracy 0.7433333333333333

In [84]:
est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=20, criterion="entropy")
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png


train accuracy 0.7385714285714285
test accuracy 0.7433333333333333

In [81]:
scores = model_selection.cross_val_score(est, X_train, y_train, cv = 5)
scores


Out[81]:
array([0.67857143, 0.69285714, 0.67142857, 0.73571429, 0.70714286])

In [82]:
np.mean(scores)


Out[82]:
0.6971428571428572

In [88]:
param_grid = {
    "max_depth": np.arange(2, 20),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": np.arange(1, 10) * 5
}


gsearch = model_selection.GridSearchCV(est, param_grid=param_grid, scoring="accuracy",
                                       cv=5, verbose=True, n_jobs=8)
gsearch.fit(X, y)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 1592 tasks      | elapsed:    2.7s
[Parallel(n_jobs=8)]: Done 1620 out of 1620 | elapsed:    2.8s finished
Out[88]:
GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='entropy', max_depth=3,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=20,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19]),
                         'min_samples_leaf': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [91]:
18 * 2 * 9 * 5


Out[91]:
1620

In [92]:
len(np.arange(2, 20))


Out[92]:
18

In [93]:
gsearch.best_score_


Out[93]:
0.7289999999999999

In [94]:
gsearch.best_params_


Out[94]:
{'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 10}

In [95]:
est = tree.DecisionTreeClassifier(max_depth=11, min_samples_leaf=10, criterion="gini")
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png


train accuracy 0.8185714285714286
test accuracy 0.7266666666666667

In [103]:
param_grid = {
    "max_depth": np.arange(2, 6),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": np.arange(5, 20)
}


gsearch = model_selection.GridSearchCV(est, param_grid=param_grid, scoring="accuracy",
                                       cv=5, verbose=True, n_jobs=8)
gsearch.fit(X_train, y_train)


Fitting 5 folds for each of 120 candidates, totalling 600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 600 out of 600 | elapsed:    0.8s finished
Out[103]:
GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=11,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=10,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([2, 3, 4, 5]),
                         'min_samples_leaf': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [104]:
gsearch.score(X_train, y_train), gsearch.best_score_


Out[104]:
(0.7685714285714286, 0.7228571428571429)

In [105]:
gsearch.best_params_


Out[105]:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 12}
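
In [ ]:
# GridSearchCV refits the best configuration on the full training data
# (refit=True by default), so the tuned model can be evaluated on the held-out
# test set directly (a sketch):
best_est = gsearch.best_estimator_
print("test accuracy", metrics.accuracy_score(y_test, best_est.predict(X_test)))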

In [107]:
y_test_pred = est.predict_proba(X_test)[:, 1]  # column 1 = probability of class 2 (default), since est.classes_ is [1, 2]

In [111]:
from sklearn import preprocessing

In [112]:
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y_train)
y_test_le = label_encoder.transform(y_test)

In [114]:
fpr, tpr, thresholds = metrics.roc_curve(y_test_le, y_test_pred)

In [115]:
import matplotlib.pyplot as plt

In [122]:
auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr)
plt.title("ROC curve, auc score:" + str(auc))
plt.xlabel("FPR")
plt.ylabel("TPR")


Out[122]:
Text(0, 0.5, 'TPR')
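
In [ ]:
# The same AUC can be obtained directly from the scores, and the chance diagonal
# makes the curve easier to read (a sketch):
print("AUC:", metrics.roc_auc_score(y_test_le, y_test_pred))
plt.plot(fpr, tpr, label="decision tree")
plt.plot([0, 1], [0, 1], linestyle="--", label="chance")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()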

In [124]:
est.feature_importances_


Out[124]:
array([0.14401518, 0.1239812 , 0.06727951, 0.05809262, 0.01162033,
       0.01539952, 0.        , 0.        , 0.02555282, 0.22336633,
       0.00851682, 0.03585933, 0.02596523, 0.0528968 , 0.01313525,
       0.01886405, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.04727327,
       0.        , 0.00262122, 0.00959882, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.03595749,
       0.01132407, 0.02695599, 0.        , 0.        , 0.0253872 ,
       0.        , 0.01633697, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [ ]:
# the importance of a feature = the sum of the information gains of all the splits
# the tree makes on that feature
#
# feature 1 - purpose
#   purpose == v1 -> IG1
#   purpose == v2 -> IG2
#   ...
#
# feature 2 - age
#   age < 30 -> IG3
#   age < 40 -> IG4
#   ...
#
# importance(purpose) -> sum(IG1, IG2, ...)
# importance(age)     -> sum(IG3, IG4, ...)
# (a sketch that recomputes this from the fitted tree follows below)
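
In [ ]:
# A rough sketch of the idea above: each split of the fitted tree contributes its
# weighted impurity decrease to the feature it splits on, and the totals are
# normalised. This relies on the internal tree_ attribute of a fitted sklearn tree.
import numpy as np

def manual_importances(fitted_tree, n_features):
    t = fitted_tree.tree_
    imp = np.zeros(n_features)
    for node in range(t.node_count):
        left, right = t.children_left[node], t.children_right[node]
        if left == -1:          # leaf node: no split, no contribution
            continue
        decrease = (t.weighted_n_node_samples[node] * t.impurity[node]
                    - t.weighted_n_node_samples[left] * t.impurity[left]
                    - t.weighted_n_node_samples[right] * t.impurity[right])
        imp[t.feature[node]] += decrease
    return imp / imp.sum()      # normalise so the importances sum to 1

manual_importances(est, X.shape[1])[:5]   # should be close to est.feature_importances_[:5]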

In [125]:
est.feature_importances_.sum()


Out[125]:
1.0

In [126]:
df_dummy.columns


Out[126]:
Index(['months_loan_duration', 'amount', 'installment_rate',
       'residence_history', 'age', 'existing_credits', 'default', 'dependents',
       'checking_balance_< 0 DM', 'checking_balance_> 200 DM',
       'checking_balance_unknown', 'credit_history_delayed',
       'credit_history_fully repaid', 'credit_history_fully repaid this bank',
       'credit_history_repaid', 'purpose_car (new)', 'purpose_car (used)',
       'purpose_domestic appliances', 'purpose_education', 'purpose_furniture',
       'purpose_others', 'purpose_radio/tv', 'purpose_repairs',
       'purpose_retraining', 'savings_balance_501 - 1000 DM',
       'savings_balance_< 100 DM', 'savings_balance_> 1000 DM',
       'savings_balance_unknown', 'employment_length_1 - 4 yrs',
       'employment_length_4 - 7 yrs', 'employment_length_> 7 yrs',
       'employment_length_unemployed', 'personal_status_female',
       'personal_status_married male', 'personal_status_single male',
       'other_debtors_guarantor', 'other_debtors_none', 'property_other',
       'property_real estate', 'property_unknown/none',
       'installment_plan_none', 'installment_plan_stores', 'housing_own',
       'housing_rent', 'telephone_yes', 'foreign_worker_yes',
       'job_skilled employee', 'job_unemployed non-resident',
       'job_unskilled resident'],
      dtype='object')

In [128]:
X.columns


Out[128]:
Index(['months_loan_duration', 'amount', 'installment_rate',
       'residence_history', 'age', 'existing_credits', 'dependents',
       'checking_balance_< 0 DM', 'checking_balance_> 200 DM',
       'checking_balance_unknown', 'credit_history_delayed',
       'credit_history_fully repaid', 'credit_history_fully repaid this bank',
       'credit_history_repaid', 'purpose_car (new)', 'purpose_car (used)',
       'purpose_domestic appliances', 'purpose_education', 'purpose_furniture',
       'purpose_others', 'purpose_radio/tv', 'purpose_repairs',
       'purpose_retraining', 'savings_balance_501 - 1000 DM',
       'savings_balance_< 100 DM', 'savings_balance_> 1000 DM',
       'savings_balance_unknown', 'employment_length_1 - 4 yrs',
       'employment_length_4 - 7 yrs', 'employment_length_> 7 yrs',
       'employment_length_unemployed', 'personal_status_female',
       'personal_status_married male', 'personal_status_single male',
       'other_debtors_guarantor', 'other_debtors_none', 'property_other',
       'property_real estate', 'property_unknown/none',
       'installment_plan_none', 'installment_plan_stores', 'housing_own',
       'housing_rent', 'telephone_yes', 'foreign_worker_yes',
       'job_skilled employee', 'job_unemployed non-resident',
       'job_unskilled resident'],
      dtype='object')

In [129]:
importance = pd.DataFrame({"feature": X.columns, "importance": est.feature_importances_})

In [130]:
importance


Out[130]:
feature importance
0 months_loan_duration 0.144015
1 amount 0.123981
2 installment_rate 0.067280
3 residence_history 0.058093
4 age 0.011620
5 existing_credits 0.015400
6 dependents 0.000000
7 checking_balance_< 0 DM 0.000000
8 checking_balance_> 200 DM 0.025553
9 checking_balance_unknown 0.223366
10 credit_history_delayed 0.008517
11 credit_history_fully repaid 0.035859
12 credit_history_fully repaid this bank 0.025965
13 credit_history_repaid 0.052897
14 purpose_car (new) 0.013135
15 purpose_car (used) 0.018864
16 purpose_domestic appliances 0.000000
17 purpose_education 0.000000
18 purpose_furniture 0.000000
19 purpose_others 0.000000
20 purpose_radio/tv 0.000000
21 purpose_repairs 0.000000
22 purpose_retraining 0.000000
23 savings_balance_501 - 1000 DM 0.000000
24 savings_balance_< 100 DM 0.047273
25 savings_balance_> 1000 DM 0.000000
26 savings_balance_unknown 0.002621
27 employment_length_1 - 4 yrs 0.009599
28 employment_length_4 - 7 yrs 0.000000
29 employment_length_> 7 yrs 0.000000
30 employment_length_unemployed 0.000000
31 personal_status_female 0.000000
32 personal_status_married male 0.000000
33 personal_status_single male 0.000000
34 other_debtors_guarantor 0.035957
35 other_debtors_none 0.011324
36 property_other 0.026956
37 property_real estate 0.000000
38 property_unknown/none 0.000000
39 installment_plan_none 0.025387
40 installment_plan_stores 0.000000
41 housing_own 0.016337
42 housing_rent 0.000000
43 telephone_yes 0.000000
44 foreign_worker_yes 0.000000
45 job_skilled employee 0.000000
46 job_unemployed non-resident 0.000000
47 job_unskilled resident 0.000000

In [132]:
importance.iloc[:10, :]


Out[132]:
feature importance
0 months_loan_duration 0.144015
1 amount 0.123981
2 installment_rate 0.067280
3 residence_history 0.058093
4 age 0.011620
5 existing_credits 0.015400
6 dependents 0.000000
7 checking_balance_< 0 DM 0.000000
8 checking_balance_> 200 DM 0.025553
9 checking_balance_unknown 0.223366
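
In [ ]:
# iloc[:10] above shows the first 10 rows by position, not the 10 most important
# features; sorting by importance gives the actual ranking (a sketch):
importance.sort_values("importance", ascending=False).head(10)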

In [133]:
from sklearn import ensemble

In [141]:
forest = ensemble.RandomForestClassifier(max_depth=6, n_estimators=50)
forest.fit(X_train, y_train)


y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.8271428571428572
test accuracy:  0.7433333333333333
training precision:  0.8067226890756303
test precision:  0.7509157509157509
training recall:  0.9876543209876543
test recall:  0.9579439252336449
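
In [ ]:
# With labels coded as 1/2, precision_score and recall_score default to
# pos_label=1, i.e. they score the "did not default" class. To score the
# defaulters instead, the positive label can be passed explicitly (a sketch):
print("test precision (default class): ",
      metrics.precision_score(y_test, y_test_pred, pos_label=2))
print("test recall (default class): ",
      metrics.recall_score(y_test, y_test_pred, pos_label=2))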

In [142]:
forest.feature_importances_


Out[142]:
array([0.1012958 , 0.09682539, 0.03229096, 0.01784331, 0.07696492,
       0.01794807, 0.00650361, 0.07494516, 0.00786617, 0.13197544,
       0.01225512, 0.03187523, 0.0103532 , 0.01512056, 0.01852747,
       0.01660301, 0.00219783, 0.00801843, 0.00745262, 0.00094362,
       0.01404989, 0.00919839, 0.00015451, 0.00255661, 0.02811401,
       0.00788001, 0.02144612, 0.00832382, 0.00725275, 0.01458451,
       0.01335561, 0.01276311, 0.00606711, 0.0126112 , 0.0089155 ,
       0.00606123, 0.00881428, 0.01479555, 0.01289806, 0.0267686 ,
       0.00824498, 0.01707735, 0.01445719, 0.01346965, 0.0030963 ,
       0.00893963, 0.00194125, 0.00835686])

In [145]:
importance = pd.DataFrame({"feature": X.columns, "importance": forest.feature_importances_ }) 
importance.iloc[:10,:]


Out[145]:
feature importance
0 months_loan_duration 0.101296
1 amount 0.096825
2 installment_rate 0.032291
3 residence_history 0.017843
4 age 0.076965
5 existing_credits 0.017948
6 dependents 0.006504
7 checking_balance_< 0 DM 0.074945
8 checking_balance_> 200 DM 0.007866
9 checking_balance_unknown 0.131975

In [146]:
forest.estimators_


Out[146]:
[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1776766514, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1087178500, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=664121568, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=183752352, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1702127644, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=756443523, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=697393087, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=685524680, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1383422173, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=956251155, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=721416188, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1288296911, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=961329987, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=519422218, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=821855837, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1913304890, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=2119238008, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=383506554, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=827006483, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=2129796894, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1678675055, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1822692151, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=801678338, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=220500646, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=69059735, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=737010616, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1875743282, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1127598631, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1480074863, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1865260314, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1390215547, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1661539180, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1001328911, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1373932204, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1171376576, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1296032508, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=194332320, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=891901763, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=647258859, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=478834039, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=563784379, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=558345168, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=368695702, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=106511248, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=474272604, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=457562369, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=495049404, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=609735876, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=129884074, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1933877961, splitter='best')]

In [148]:
from sklearn.tree import export_graphviz
export_graphviz(forest.estimators_[1], out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png

In [179]:
est = tree.DecisionTreeClassifier(max_depth=1, min_samples_leaf=10, criterion="gini")

bagging = ensemble.BaggingClassifier(est, n_estimators=20, random_state=23)
bagging.fit(X_train, y_train)

y_train_pred = bagging.predict(X_train)
y_test_pred = bagging.predict(X_test)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.6942857142857143
test accuracy:  0.7133333333333334
training precision:  0.6942857142857143
test precision:  0.7133333333333334
training recall:  1.0
test recall:  1.0
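
In [ ]:
# The predictions above are all class 1: accuracy and precision equal the class-1
# share of each split and recall is 1.0. The fitted stumps can be inspected via
# estimators_ to see how they vote (a sketch):
[e.predict(X_test.values[:5]) for e in bagging.estimators_[:3]]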

In [180]:
y_train_pred


Out[180]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [190]:
est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=100, criterion="gini")
adaboost = ensemble.AdaBoostClassifier(est, n_estimators=100)
adaboost.fit(X_train, y_train)


y_train_pred = adaboost.predict(X_train)
y_test_pred = adaboost.predict(X_test)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.9228571428571428
test accuracy:  0.7233333333333334
training precision:  0.9268774703557312
test precision:  0.7963800904977375
training recall:  0.9650205761316872
test recall:  0.822429906542056

In [ ]: