In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/data/credit-default.csv")

In [3]:
df


Out[3]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 unknown 12 repaid furniture 1736 < 100 DM 4 - 7 yrs 3 female none ... real estate 31 none own 1 1 1 none yes unskilled resident
996 < 0 DM 30 repaid car (used) 3857 < 100 DM 1 - 4 yrs 4 divorced male none ... building society savings 40 none own 1 1 1 yes yes mangement self-employed
997 unknown 12 repaid radio/tv 804 < 100 DM > 7 yrs 4 single male none ... other 38 none own 1 1 1 none yes skilled employee
998 < 0 DM 45 repaid radio/tv 1845 < 100 DM 1 - 4 yrs 4 single male none ... unknown/none 23 none for free 1 2 1 yes yes skilled employee
999 1 - 200 DM 45 critical car (used) 4576 101 - 500 DM unemployed 3 single male none ... other 27 none own 1 1 1 none yes skilled employee

1000 rows × 21 columns


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_length     1000 non-null   object
 7   installment_rate      1000 non-null   int64 
 8   personal_status       1000 non-null   object
 9   other_debtors         1000 non-null   object
 10  residence_history     1000 non-null   int64 
 11  property              1000 non-null   object
 12  age                   1000 non-null   int64 
 13  installment_plan      1000 non-null   object
 14  housing               1000 non-null   object
 15  existing_credits      1000 non-null   int64 
 16  default               1000 non-null   int64 
 17  dependents            1000 non-null   int64 
 18  telephone             1000 non-null   object
 19  foreign_worker        1000 non-null   object
 20  job                   1000 non-null   object
dtypes: int64(8), object(13)
memory usage: 164.2+ KB

In [6]:
df.default.value_counts()


Out[6]:
1    700
2    300
Name: default, dtype: int64

In [7]:
df.default.value_counts()/len(df)


Out[7]:
1    0.7
2    0.3
Name: default, dtype: float64
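
In [ ]:
# Equivalent to the manual division above: value_counts supports a normalize
# flag that returns the class proportions directly.
df.default.value_counts(normalize=True)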

In [ ]:
# 1 -> negative sample (0) -> the customer did not default
# 2 -> positive sample (1) -> the customer defaulted (was not able to pay back the loan)

In [ ]:
# P -> 76
# N -> 45

In [13]:
p1 = 76 / (76 + 45)
p1


Out[13]:
0.628099173553719

In [ ]:
# p0 + p1 = 1

# gini = p0*(1-p0) + p1*(1-p1) = (1-p1)*p1 + p1*(1-p1) = 2 * p1 * (1-p1) = 2 * p1 * p0

In [14]:
2 * p1 * (1-p1)


Out[14]:
0.4671812034697083
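
In [ ]:
# A small helper (a sketch, not part of the original flow) wrapping the gini
# computation used above for a node with P positive and N negative samples.
def gini_node(P, N):
    p1 = P / (P + N)
    return 2 * p1 * (1 - p1)   # = p0*(1-p0) + p1*(1-p1) for two classes

gini_node(76, 45)   # should match the value computed above (~0.467)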

In [ ]:


In [ ]:
# P -> 11
# N -> 32

In [15]:
p1 = 32/(32 + 11)
p1


Out[15]:
0.7441860465116279

In [16]:
2 * p1 * (1-p1)


Out[16]:
0.3807463493780422

In [ ]:
# P -> 9
# N -> 1

In [10]:
9/(9+1)


Out[10]:
0.9

In [11]:
# p -> 4
# n -> 6

In [12]:
6/(4+6)


Out[12]:
0.6

In [17]:
p1 = 214/700  # a node with 214 of its 700 samples in one class

In [18]:
2 * p1 * (1-p1)


Out[18]:
0.4245061224489796

In [19]:
(.488 + .221)/2  # simple (unweighted) average of the two child gini values


Out[19]:
0.3545

In [20]:
.424 - .3545  # parent gini minus the unweighted average


Out[20]:
0.0695

In [22]:
(424 * .488 + 276 * .221)/ 700  # weighted average gini of the children (424 and 276 samples)


Out[22]:
0.3827257142857143

In [24]:
0.4245061224489796 - 0.3827257142857143  # information gain = parent gini - weighted child gini


Out[24]:
0.04178040816326534

In [25]:
# Entropy = -sum(pi * log(pi))
# Gini    =  sum(pi * (1 - pi)) = sum(pi) - sum(pi * pi) = 1 - sum(pi * pi) = sum(pi * pj) over all i != j
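
In [ ]:
# A sketch of both impurity measures for an arbitrary class-probability vector.
import numpy as np

def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]                    # drop zero probabilities to avoid log(0)
    return -np.sum(p * np.log2(p))

def gini(p):
    p = np.asarray(p, dtype=float)
    return 1 - np.sum(p ** 2)

entropy([0.7, 0.3]), gini([0.7, 0.3])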

In [ ]:
# the objective of the tree is to find, at each node, the condition that maximizes the information gain

In [ ]:
# stump -> a split condition on a single feature only, never on more than one

In [26]:
# purpose of loan - P1, P2, P3 (categorical)
# age - e.g. 20 to 60 (continuous)

In [27]:
# candidates for a stump
# for each candidate find the information gain
# select the condition which gives the max information gain
# (a sketch that enumerates such candidates follows below)

# categorical feature:
#   purpose == p1
#   purpose == p2
#   purpose == p3

# continuous feature, with thresholds taken at percentiles:
#   age < 25   (10th percentile)
#   age < 30   (20th percentile)
#   age < 37   (30th percentile)
#   ...
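
In [ ]:
# A sketch of how such candidate conditions could be enumerated on this data:
# one equality test per category of `purpose`, and age thresholds taken at
# deciles (the variable names here are illustrative only).
import numpy as np

categorical_candidates = [("purpose", "==", v) for v in df.purpose.unique()]
age_thresholds = df.age.quantile(np.arange(0.1, 1.0, 0.1)).values
continuous_candidates = [("age", "<", t) for t in age_thresholds]

categorical_candidates[:3], continuous_candidates[:3]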

In [28]:
df.shape


Out[28]:
(1000, 21)

In [29]:
df.head()


Out[29]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee

5 rows × 21 columns


In [30]:
g1 = df[df.purpose == "radio/tv"]
g2 =  df[df.purpose != "radio/tv"]

In [31]:
len(g1), len(g2)


Out[31]:
(280, 720)

In [33]:
g1.default.value_counts()/len(g1)


Out[33]:
1    0.778571
2    0.221429
Name: default, dtype: float64

In [35]:
2 * 0.778571 * 0.221429


Out[35]:
0.344796395918

In [34]:
g2.default.value_counts()/len(g2)


Out[34]:
1    0.669444
2    0.330556
Name: default, dtype: float64

In [36]:
2 * .669444 * 0.330556


Out[36]:
0.44257746172800005

In [37]:
df.default.value_counts()/len(df)


Out[37]:
1    0.7
2    0.3
Name: default, dtype: float64

In [38]:
gini_root = 2 * .7 * .3
gini_root


Out[38]:
0.42

In [40]:
0.42 - (280 * 0.344796395918 + 720 * 0.44257746172800005)/ (280+720)
# information gain using purpose == "radio/tv" as the split condition


Out[40]:
0.004801236698799893
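
In [ ]:
# A reusable version of the computation above (a sketch): the information gain
# of any boolean split of df, using the 1/2 coding of `default`. It should
# reproduce the figure just computed for purpose == "radio/tv".
def gini_of(frame):
    p = frame.default.value_counts(normalize=True)
    return 1 - (p ** 2).sum()

def info_gain(frame, mask):
    left, right = frame[mask], frame[~mask]
    weighted = (len(left) * gini_of(left) + len(right) * gini_of(right)) / len(frame)
    return gini_of(frame) - weighted

info_gain(df, df.purpose == "radio/tv")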

In [41]:
df.head()


Out[41]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee

5 rows × 21 columns


In [42]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_length     1000 non-null   object
 7   installment_rate      1000 non-null   int64 
 8   personal_status       1000 non-null   object
 9   other_debtors         1000 non-null   object
 10  residence_history     1000 non-null   int64 
 11  property              1000 non-null   object
 12  age                   1000 non-null   int64 
 13  installment_plan      1000 non-null   object
 14  housing               1000 non-null   object
 15  existing_credits      1000 non-null   int64 
 16  default               1000 non-null   int64 
 17  dependents            1000 non-null   int64 
 18  telephone             1000 non-null   object
 19  foreign_worker        1000 non-null   object
 20  job                   1000 non-null   object
dtypes: int64(8), object(13)
memory usage: 164.2+ KB

In [46]:
df_dummy  = pd.get_dummies(df, drop_first=True)

In [47]:
df_dummy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 49 columns):
 #   Column                                 Non-Null Count  Dtype
---  ------                                 --------------  -----
 0   months_loan_duration                   1000 non-null   int64
 1   amount                                 1000 non-null   int64
 2   installment_rate                       1000 non-null   int64
 3   residence_history                      1000 non-null   int64
 4   age                                    1000 non-null   int64
 5   existing_credits                       1000 non-null   int64
 6   default                                1000 non-null   int64
 7   dependents                             1000 non-null   int64
 8   checking_balance_< 0 DM                1000 non-null   uint8
 9   checking_balance_> 200 DM              1000 non-null   uint8
 10  checking_balance_unknown               1000 non-null   uint8
 11  credit_history_delayed                 1000 non-null   uint8
 12  credit_history_fully repaid            1000 non-null   uint8
 13  credit_history_fully repaid this bank  1000 non-null   uint8
 14  credit_history_repaid                  1000 non-null   uint8
 15  purpose_car (new)                      1000 non-null   uint8
 16  purpose_car (used)                     1000 non-null   uint8
 17  purpose_domestic appliances            1000 non-null   uint8
 18  purpose_education                      1000 non-null   uint8
 19  purpose_furniture                      1000 non-null   uint8
 20  purpose_others                         1000 non-null   uint8
 21  purpose_radio/tv                       1000 non-null   uint8
 22  purpose_repairs                        1000 non-null   uint8
 23  purpose_retraining                     1000 non-null   uint8
 24  savings_balance_501 - 1000 DM          1000 non-null   uint8
 25  savings_balance_< 100 DM               1000 non-null   uint8
 26  savings_balance_> 1000 DM              1000 non-null   uint8
 27  savings_balance_unknown                1000 non-null   uint8
 28  employment_length_1 - 4 yrs            1000 non-null   uint8
 29  employment_length_4 - 7 yrs            1000 non-null   uint8
 30  employment_length_> 7 yrs              1000 non-null   uint8
 31  employment_length_unemployed           1000 non-null   uint8
 32  personal_status_female                 1000 non-null   uint8
 33  personal_status_married male           1000 non-null   uint8
 34  personal_status_single male            1000 non-null   uint8
 35  other_debtors_guarantor                1000 non-null   uint8
 36  other_debtors_none                     1000 non-null   uint8
 37  property_other                         1000 non-null   uint8
 38  property_real estate                   1000 non-null   uint8
 39  property_unknown/none                  1000 non-null   uint8
 40  installment_plan_none                  1000 non-null   uint8
 41  installment_plan_stores                1000 non-null   uint8
 42  housing_own                            1000 non-null   uint8
 43  housing_rent                           1000 non-null   uint8
 44  telephone_yes                          1000 non-null   uint8
 45  foreign_worker_yes                     1000 non-null   uint8
 46  job_skilled employee                   1000 non-null   uint8
 47  job_unemployed non-resident            1000 non-null   uint8
 48  job_unskilled resident                 1000 non-null   uint8
dtypes: int64(8), uint8(41)
memory usage: 102.7 KB

In [48]:
target = "default"

In [49]:
y = df_dummy[target]

In [52]:
X = df_dummy.drop(columns=target)

In [53]:
X.shape


Out[53]:
(1000, 48)

In [54]:
X.head()


Out[54]:
months_loan_duration amount installment_rate residence_history age existing_credits dependents checking_balance_< 0 DM checking_balance_> 200 DM checking_balance_unknown ... property_unknown/none installment_plan_none installment_plan_stores housing_own housing_rent telephone_yes foreign_worker_yes job_skilled employee job_unemployed non-resident job_unskilled resident
0 6 1169 4 4 67 2 1 1 0 0 ... 0 1 0 1 0 1 1 1 0 0
1 48 5951 2 2 22 1 1 0 0 0 ... 0 1 0 1 0 0 1 1 0 0
2 12 2096 2 3 49 1 2 0 0 1 ... 0 1 0 1 0 0 1 0 0 1
3 42 7882 2 4 45 1 2 1 0 0 ... 0 1 0 0 0 0 1 1 0 0
4 24 4870 3 4 53 2 2 1 0 0 ... 1 1 0 0 0 0 1 1 0 0

5 rows × 48 columns


In [55]:
from sklearn import model_selection

In [56]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3, random_state = 1)

In [57]:
from sklearn import tree

In [58]:
est = tree.DecisionTreeClassifier()
est.fit(X_train, y_train)


Out[58]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [59]:
y_test_pred = est.predict(X_test)

In [60]:
from sklearn import metrics

In [61]:
metrics.accuracy_score(y_test, y_test_pred)


Out[61]:
0.6433333333333333

In [62]:
pd.Series(y_train).value_counts()/len(y_train)


Out[62]:
1    0.694286
2    0.305714
Name: default, dtype: float64

In [63]:
pd.Series(y_test).value_counts()/len(y_test)


Out[63]:
1    0.713333
2    0.286667
Name: default, dtype: float64
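
In [ ]:
# The class proportions above differ slightly between train and test. If that
# matters, train_test_split accepts a stratify argument that preserves the class
# balance in both splits (a sketch; these variables are not reused below).
X_tr_s, X_te_s, y_tr_s, y_te_s = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
y_te_s.value_counts() / len(y_te_s)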

In [65]:
pd.DataFrame({"actual": y_test, "prediction": y_test_pred})


Out[65]:
actual prediction
507 2 2
818 1 2
452 1 1
368 2 2
242 2 2
... ... ...
459 1 2
415 1 1
61 1 1
347 1 2
349 2 1

300 rows × 2 columns


In [67]:
import numpy as np

In [68]:
np.sum(y_test == y_test_pred)


Out[68]:
193

In [69]:
193/len(y_test)


Out[69]:
0.6433333333333333

In [70]:
metrics.confusion_matrix(y_test, y_test_pred)


Out[70]:
array([[158,  56],
       [ 51,  35]])

In [71]:
158 + 35


Out[71]:
193

In [73]:
print(metrics.classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           1       0.76      0.74      0.75       214
           2       0.38      0.41      0.40        86

    accuracy                           0.64       300
   macro avg       0.57      0.57      0.57       300
weighted avg       0.65      0.64      0.65       300


In [75]:
est = tree.DecisionTreeClassifier()
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))


train accuracy 1.0
test accuracy 0.6566666666666666

In [76]:
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png
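
In [ ]:
# If Graphviz / the `dot` binary is not available, recent scikit-learn versions
# can render the same tree directly with matplotlib (a sketch; the unpruned tree
# is large, so the figure will be crowded).
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(est, feature_names=list(X.columns), filled=True)
plt.show()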

In [77]:
est = tree.DecisionTreeClassifier(max_depth=3)
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))


train accuracy 0.7514285714285714
test accuracy 0.7333333333333333

In [78]:
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png

In [83]:
est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=20)
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png


train accuracy 0.7385714285714285
test accuracy 0.7433333333333333

In [84]:
est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=20, criterion="entropy")
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png


train accuracy 0.7385714285714285
test accuracy 0.7433333333333333

In [81]:
scores = model_selection.cross_val_score(est, X_train, y_train, cv = 5)
scores


Out[81]:
array([0.67857143, 0.69285714, 0.67142857, 0.73571429, 0.70714286])

In [82]:
np.mean(scores)


Out[82]:
0.6971428571428572

In [88]:
param_grid = {
    "max_depth": np.arange(2, 20),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": np.arange(1, 10) * 5
}


gsearch = model_selection.GridSearchCV(est, param_grid=param_grid, scoring="accuracy",
                                       cv=5, verbose=True, n_jobs=8)
gsearch.fit(X, y)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 1592 tasks      | elapsed:    2.7s
[Parallel(n_jobs=8)]: Done 1620 out of 1620 | elapsed:    2.8s finished
Out[88]:
GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='entropy', max_depth=3,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=20,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19]),
                         'min_samples_leaf': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [91]:
18 * 2 * 9 * 5


Out[91]:
1620

In [92]:
len(np.arange(2, 20))


Out[92]:
18

In [93]:
gsearch.best_score_


Out[93]:
0.7289999999999999

In [94]:
gsearch.best_params_


Out[94]:
{'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 10}

In [95]:
est = tree.DecisionTreeClassifier(max_depth=11, min_samples_leaf=10, criterion="gini")
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train accuracy", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy", metrics.accuracy_score(y_test, y_test_pred))
from sklearn.tree import export_graphviz
export_graphviz(est, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png


train accuracy 0.8185714285714286
test accuracy 0.7266666666666667

In [103]:
param_grid = {
    "max_depth": np.arange(2, 6),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": np.arange(5, 20)
}


gsearch = model_selection.GridSearchCV(est, param_grid=param_grid, scoring="accuracy",
                                       cv=5, verbose=True, n_jobs=8)
gsearch.fit(X_train, y_train)


Fitting 5 folds for each of 120 candidates, totalling 600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 600 out of 600 | elapsed:    0.8s finished
Out[103]:
GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=11,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=10,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([2, 3, 4, 5]),
                         'min_samples_leaf': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [104]:
gsearch.score(X_train, y_train), gsearch.best_score_


Out[104]:
(0.7685714285714286, 0.7228571428571429)

In [105]:
gsearch.best_params_


Out[105]:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 12}
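
In [ ]:
# GridSearchCV refits the best configuration on the full training data
# (refit=True by default), so the tuned model can be evaluated on the held-out
# test set directly (a sketch):
best_est = gsearch.best_estimator_
print("test accuracy", metrics.accuracy_score(y_test, best_est.predict(X_test)))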

In [107]:
y_test_pred = est.predict_proba(X_test)[:, 1]  # column 1 = probability of class 2 (default), since est.classes_ is [1, 2]

In [111]:
from sklearn import preprocessing

In [112]:
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y_train)
y_test_le = label_encoder.transform(y_test)

In [114]:
fpr, tpr, thresholds = metrics.roc_curve(y_test_le, y_test_pred)

In [115]:
import matplotlib.pyplot as plt

In [122]:
auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr)
plt.title("ROC curve, auc score:" + str(auc))
plt.xlabel("FPR")
plt.ylabel("TPR")


Out[122]:
Text(0, 0.5, 'TPR')
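
In [ ]:
# The same AUC can be obtained directly from the scores, and the chance diagonal
# makes the curve easier to read (a sketch):
print("AUC:", metrics.roc_auc_score(y_test_le, y_test_pred))
plt.plot(fpr, tpr, label="decision tree")
plt.plot([0, 1], [0, 1], linestyle="--", label="chance")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()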

In [124]:
est.feature_importances_


Out[124]:
array([0.14401518, 0.1239812 , 0.06727951, 0.05809262, 0.01162033,
       0.01539952, 0.        , 0.        , 0.02555282, 0.22336633,
       0.00851682, 0.03585933, 0.02596523, 0.0528968 , 0.01313525,
       0.01886405, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.04727327,
       0.        , 0.00262122, 0.00959882, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.03595749,
       0.01132407, 0.02695599, 0.        , 0.        , 0.0253872 ,
       0.        , 0.01633697, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [ ]:
# the importance of a feature = the sum of the information gains of all the splits
# the tree makes on that feature
#
# feature 1 - purpose
#   purpose == v1 -> IG1
#   purpose == v2 -> IG2
#   ...
#
# feature 2 - age
#   age < 30 -> IG3
#   age < 40 -> IG4
#   ...
#
# importance(purpose) -> sum(IG1, IG2, ...)
# importance(age)     -> sum(IG3, IG4, ...)
# (a sketch that recomputes this from the fitted tree follows below)
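
In [ ]:
# A rough sketch of the idea above: each split of the fitted tree contributes its
# weighted impurity decrease to the feature it splits on, and the totals are
# normalised. This relies on the internal tree_ attribute of a fitted sklearn tree.
import numpy as np

def manual_importances(fitted_tree, n_features):
    t = fitted_tree.tree_
    imp = np.zeros(n_features)
    for node in range(t.node_count):
        left, right = t.children_left[node], t.children_right[node]
        if left == -1:          # leaf node: no split, no contribution
            continue
        decrease = (t.weighted_n_node_samples[node] * t.impurity[node]
                    - t.weighted_n_node_samples[left] * t.impurity[left]
                    - t.weighted_n_node_samples[right] * t.impurity[right])
        imp[t.feature[node]] += decrease
    return imp / imp.sum()      # normalise so the importances sum to 1

manual_importances(est, X.shape[1])[:5]   # should be close to est.feature_importances_[:5]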

In [125]:
est.feature_importances_.sum()


Out[125]:
1.0

In [126]:
df_dummy.columns


Out[126]:
Index(['months_loan_duration', 'amount', 'installment_rate',
       'residence_history', 'age', 'existing_credits', 'default', 'dependents',
       'checking_balance_< 0 DM', 'checking_balance_> 200 DM',
       'checking_balance_unknown', 'credit_history_delayed',
       'credit_history_fully repaid', 'credit_history_fully repaid this bank',
       'credit_history_repaid', 'purpose_car (new)', 'purpose_car (used)',
       'purpose_domestic appliances', 'purpose_education', 'purpose_furniture',
       'purpose_others', 'purpose_radio/tv', 'purpose_repairs',
       'purpose_retraining', 'savings_balance_501 - 1000 DM',
       'savings_balance_< 100 DM', 'savings_balance_> 1000 DM',
       'savings_balance_unknown', 'employment_length_1 - 4 yrs',
       'employment_length_4 - 7 yrs', 'employment_length_> 7 yrs',
       'employment_length_unemployed', 'personal_status_female',
       'personal_status_married male', 'personal_status_single male',
       'other_debtors_guarantor', 'other_debtors_none', 'property_other',
       'property_real estate', 'property_unknown/none',
       'installment_plan_none', 'installment_plan_stores', 'housing_own',
       'housing_rent', 'telephone_yes', 'foreign_worker_yes',
       'job_skilled employee', 'job_unemployed non-resident',
       'job_unskilled resident'],
      dtype='object')

In [128]:
X.columns


Out[128]:
Index(['months_loan_duration', 'amount', 'installment_rate',
       'residence_history', 'age', 'existing_credits', 'dependents',
       'checking_balance_< 0 DM', 'checking_balance_> 200 DM',
       'checking_balance_unknown', 'credit_history_delayed',
       'credit_history_fully repaid', 'credit_history_fully repaid this bank',
       'credit_history_repaid', 'purpose_car (new)', 'purpose_car (used)',
       'purpose_domestic appliances', 'purpose_education', 'purpose_furniture',
       'purpose_others', 'purpose_radio/tv', 'purpose_repairs',
       'purpose_retraining', 'savings_balance_501 - 1000 DM',
       'savings_balance_< 100 DM', 'savings_balance_> 1000 DM',
       'savings_balance_unknown', 'employment_length_1 - 4 yrs',
       'employment_length_4 - 7 yrs', 'employment_length_> 7 yrs',
       'employment_length_unemployed', 'personal_status_female',
       'personal_status_married male', 'personal_status_single male',
       'other_debtors_guarantor', 'other_debtors_none', 'property_other',
       'property_real estate', 'property_unknown/none',
       'installment_plan_none', 'installment_plan_stores', 'housing_own',
       'housing_rent', 'telephone_yes', 'foreign_worker_yes',
       'job_skilled employee', 'job_unemployed non-resident',
       'job_unskilled resident'],
      dtype='object')

In [129]:
importance = pd.DataFrame({"feature": X.columns, "importance": est.feature_importances_})

In [130]:
importance


Out[130]:
feature importance
0 months_loan_duration 0.144015
1 amount 0.123981
2 installment_rate 0.067280
3 residence_history 0.058093
4 age 0.011620
5 existing_credits 0.015400
6 dependents 0.000000
7 checking_balance_< 0 DM 0.000000
8 checking_balance_> 200 DM 0.025553
9 checking_balance_unknown 0.223366
10 credit_history_delayed 0.008517
11 credit_history_fully repaid 0.035859
12 credit_history_fully repaid this bank 0.025965
13 credit_history_repaid 0.052897
14 purpose_car (new) 0.013135
15 purpose_car (used) 0.018864
16 purpose_domestic appliances 0.000000
17 purpose_education 0.000000
18 purpose_furniture 0.000000
19 purpose_others 0.000000
20 purpose_radio/tv 0.000000
21 purpose_repairs 0.000000
22 purpose_retraining 0.000000
23 savings_balance_501 - 1000 DM 0.000000
24 savings_balance_< 100 DM 0.047273
25 savings_balance_> 1000 DM 0.000000
26 savings_balance_unknown 0.002621
27 employment_length_1 - 4 yrs 0.009599
28 employment_length_4 - 7 yrs 0.000000
29 employment_length_> 7 yrs 0.000000
30 employment_length_unemployed 0.000000
31 personal_status_female 0.000000
32 personal_status_married male 0.000000
33 personal_status_single male 0.000000
34 other_debtors_guarantor 0.035957
35 other_debtors_none 0.011324
36 property_other 0.026956
37 property_real estate 0.000000
38 property_unknown/none 0.000000
39 installment_plan_none 0.025387
40 installment_plan_stores 0.000000
41 housing_own 0.016337
42 housing_rent 0.000000
43 telephone_yes 0.000000
44 foreign_worker_yes 0.000000
45 job_skilled employee 0.000000
46 job_unemployed non-resident 0.000000
47 job_unskilled resident 0.000000

In [132]:
importance.iloc[:10, :]


Out[132]:
feature importance
0 months_loan_duration 0.144015
1 amount 0.123981
2 installment_rate 0.067280
3 residence_history 0.058093
4 age 0.011620
5 existing_credits 0.015400
6 dependents 0.000000
7 checking_balance_< 0 DM 0.000000
8 checking_balance_> 200 DM 0.025553
9 checking_balance_unknown 0.223366
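
In [ ]:
# iloc[:10] above shows the first 10 rows by position, not the 10 most important
# features; sorting by importance gives the actual ranking (a sketch):
importance.sort_values("importance", ascending=False).head(10)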

In [133]:
from sklearn import ensemble

In [141]:
forest = ensemble.RandomForestClassifier(max_depth=6, n_estimators=50)
forest.fit(X_train, y_train)


y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.8271428571428572
test accuracy:  0.7433333333333333
training precision:  0.8067226890756303
test precision:  0.7509157509157509
training recall:  0.9876543209876543
test recall:  0.9579439252336449
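
In [ ]:
# With labels coded as 1/2, precision_score and recall_score default to
# pos_label=1, i.e. they score the "did not default" class. To score the
# defaulters instead, the positive label can be passed explicitly (a sketch):
print("test precision (default class): ",
      metrics.precision_score(y_test, y_test_pred, pos_label=2))
print("test recall (default class): ",
      metrics.recall_score(y_test, y_test_pred, pos_label=2))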

In [142]:
forest.feature_importances_


Out[142]:
array([0.1012958 , 0.09682539, 0.03229096, 0.01784331, 0.07696492,
       0.01794807, 0.00650361, 0.07494516, 0.00786617, 0.13197544,
       0.01225512, 0.03187523, 0.0103532 , 0.01512056, 0.01852747,
       0.01660301, 0.00219783, 0.00801843, 0.00745262, 0.00094362,
       0.01404989, 0.00919839, 0.00015451, 0.00255661, 0.02811401,
       0.00788001, 0.02144612, 0.00832382, 0.00725275, 0.01458451,
       0.01335561, 0.01276311, 0.00606711, 0.0126112 , 0.0089155 ,
       0.00606123, 0.00881428, 0.01479555, 0.01289806, 0.0267686 ,
       0.00824498, 0.01707735, 0.01445719, 0.01346965, 0.0030963 ,
       0.00893963, 0.00194125, 0.00835686])

In [145]:
importance = pd.DataFrame({"feature": X.columns, "importance": forest.feature_importances_ }) 
importance.iloc[:10,:]


Out[145]:
feature importance
0 months_loan_duration 0.101296
1 amount 0.096825
2 installment_rate 0.032291
3 residence_history 0.017843
4 age 0.076965
5 existing_credits 0.017948
6 dependents 0.006504
7 checking_balance_< 0 DM 0.074945
8 checking_balance_> 200 DM 0.007866
9 checking_balance_unknown 0.131975

In [146]:
forest.estimators_


Out[146]:
[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1776766514, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1087178500, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=664121568, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=183752352, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1702127644, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=756443523, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=697393087, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=685524680, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1383422173, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=956251155, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=721416188, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1288296911, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=961329987, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=519422218, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=821855837, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1913304890, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=2119238008, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=383506554, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=827006483, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=2129796894, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1678675055, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1822692151, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=801678338, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=220500646, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=69059735, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=737010616, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1875743282, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1127598631, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1480074863, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1865260314, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1390215547, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1661539180, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1001328911, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1373932204, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1171376576, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1296032508, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=194332320, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=891901763, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=647258859, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=478834039, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=563784379, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=558345168, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=368695702, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=106511248, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=474272604, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=457562369, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=495049404, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=609735876, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=129884074, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=6, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1933877961, splitter='best')]

In [148]:
from sklearn.tree import export_graphviz
export_graphviz(forest.estimators_[1], out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png

In [179]:
est = tree.DecisionTreeClassifier(max_depth=1, min_samples_leaf=10, criterion="gini")

bagging = ensemble.BaggingClassifier(est, n_estimators=20, random_state=23)
bagging.fit(X_train, y_train)

y_train_pred = bagging.predict(X_train)
y_test_pred = bagging.predict(X_test)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.6942857142857143
test accuracy:  0.7133333333333334
training precision:  0.6942857142857143
test precision:  0.7133333333333334
training recall:  1.0
test recall:  1.0
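
In [ ]:
# The predictions above are all class 1: accuracy and precision equal the class-1
# share of each split and recall is 1.0. The fitted stumps can be inspected via
# estimators_ to see how they vote (a sketch):
[e.predict(X_test.values[:5]) for e in bagging.estimators_[:3]]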

In [180]:
y_train_pred


Out[180]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [190]:
est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=100, criterion="gini")
adaboost = ensemble.AdaBoostClassifier(est, n_estimators=100)
adaboost.fit(X_train, y_train)


y_train_pred = adaboost.predict(X_train)
y_test_pred = adaboost.predict(X_test)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.9228571428571428
test accuracy:  0.7233333333333334
training precision:  0.9268774703557312
test precision:  0.7963800904977375
training recall:  0.9650205761316872
test recall:  0.822429906542056

In [ ]: