In [1]:
import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
import pipeline.magicloops as magic
import pipeline.util as u
import pipeline.process as pr
import pipeline.read as r
import pipeline.explore as ex
import pipeline.evaluate as ev
%matplotlib inline
# Read the credit data; keep SeriousDlqin2yrs as numeric 0/1 so later code can compare
# against 0 and 1 directly (a categorical version with the labels "Nondelinquent"/"Delinquent"
# was tried first and then dropped).
data = r.read_csv('data/credit-data.csv', parse_zipcodes=["zipcode"], dtype={"PersonID": "category"})
data["debt_ratio_groups"] = pr.cut(data.debt_ratio, [0,0.25,.5,.75,1], labels="auto")
# This apply function is too slow.
# data.debt_ratio = data.debt_ratio.apply(lambda x: pr.cap_values(x,data.debt_ratio.quantile(.99)))
#Fill Na with mean for monthly income and median for number of dependents
na_cols = ["monthly_income","number_of_dependents"]
data[na_cols] = pr.fill_with(df=data,col=na_cols, group="serious_dlqin2yrs")
#binnerize number of dependents data and then make dummies.
data["number_of_dependents_cut"] = pr.cut(data.number_of_dependents, [0,.99,3.01,20.1], \
method=pd.cut, labels=["No dependents","1-3 dependents", "4+ dependents"],include_lowest=True)
data = pr.get_dummies(data.number_of_dependents_cut, data)
df = data
potential_features = list(data.columns[2:14])
y = df.serious_dlqin2yrs
# Candidate features plus a log-transformed copy of monthly income.
X = df[potential_features].join(df.monthly_income.map(lambda x: math.log(x) if x != 0 else 0), rsuffix="_log")
The table below shows summary statistics split by delinquency status. The number of times delinquents are past due is an order of magnitude higher than for nondelinquents, but the high variance and low median on that measure suggest it is driven by a small group with particularly many past-due bills. Nondelinquents are older and have higher incomes on average.
In [3]:
summary = ex.summary_by_outcome(data, "serious_dlqin2yrs")
summary.iloc[:, [0, 1, 2, 4, 6, 7, 8, 10]]
Out[3]:
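Since `ex.summary_by_outcome` lives in the custom pipeline, the same split can be approximated with plain pandas (a sketch only; the pipeline's implementation may differ):
In [ ]:
# Sketch: approximate the outcome-split summary with a pandas groupby/describe.
approx_summary = data.groupby("serious_dlqin2yrs").describe().T
approx_summary.head(16)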
In [4]:
null_income = u.check_nulls(data, "monthly_income")
null_sum = ex.summary_by_outcome(null_income, "serious_dlqin2yrs")
not_null_income = u.get_notnulls(data, "monthly_income")
not_null_sum = ex.summary_by_outcome(not_null_income, "serious_dlqin2yrs")
ratio = round(null_sum.iloc[:, [0, 1, 2, 5, 6, 7, 8, 11]] / not_null_sum.iloc[:, [0, 1, 2, 5, 6, 7, 8, 11]], 2)
ratio.drop("monthly_income")
Out[4]:
The analysis above compares people who report a monthly income with those whose monthly income is missing. The debt ratios differ dramatically: those without income data have 62 to 77 times the debt ratio of those with reported income, whereas we would hope this ratio were close to 1. The counts show that for each person with missing income there are 4 to 5 people with reported income; in other words, records with missing income make up slightly under 20 percent of the data. I will have to be careful when developing models to account for this discrepancy. In a larger project, for example, I would compare models that impute the missing income values against models that drop those rows, and would consider capping debt ratios. The graphs below show that the records with nulls have more extreme debt ratios (compare the bottom corners of the following two pair plots). The solution I will implement is to group debt ratios into bins.
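As a sketch of the two options mentioned above (group-based imputation versus dropping rows) in plain pandas; the notebook itself uses the pipeline's `pr.fill_with` for the actual fill:
In [ ]:
# Option 1 (sketch): impute monthly_income with the mean of its outcome group.
imputed = data.copy()
group_means = imputed.groupby("serious_dlqin2yrs")["monthly_income"].transform("mean")
imputed["monthly_income"] = imputed["monthly_income"].fillna(group_means)
# Option 2 (sketch): drop rows with missing income entirely.
dropped = data.dropna(subset=["monthly_income"])
# Possible extra step (sketch): cap debt_ratio at its 99th percentile.
capped_debt_ratio = data["debt_ratio"].clip(upper=data["debt_ratio"].quantile(.99))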
In [7]:
sns.set(style="white")
sns.pairplot(data, vars=["debt_ratio", "revolving_utilization_of_unsecured_lines", "number_real_estate_loans_or_lines"],
             hue="serious_dlqin2yrs", size=3, plot_kws={'alpha': 0.2})
pl.suptitle("Paired distribution of indicators by delinquency status (keeping NAs)")
Out[7]:
In [8]:
sns.pairplot(u.get_notnulls(data, "monthly_income"),
             vars=["monthly_income", "debt_ratio", "revolving_utilization_of_unsecured_lines", "number_real_estate_loans_or_lines"],
             hue="serious_dlqin2yrs", size=3, plot_kws={'alpha': 0.2})
pl.suptitle("Paired distribution of indicators by delinquency status (removing NAs)")
Out[8]:
In [109]:
import math
"""normalized_data = data.monthly_income.map(lambda x: math.log(x) if x != 0 else 0)
sns.distplot(normalized_data.loc[data.serious_dlqin2yrs==1], color = 'red', hist=False)
sns.distplot(normalized_data.loc[data.serious_dlqin2yrs==0], color = 'black', hist=False)
plt.xlim = (5, 11)
plt.show()
"""
def no_correction_function(x):
    return x

def my_distplot(df, col, binary_split=None, fn=no_correction_function, hist=False, kde=True):
    '''
    df (DataFrame)
    col (column in df); we expect values >= 0
    binary_split (column in df of 0s and 1s)
    fn (function) transformation to apply, e.g. math.log, math.sqrt
    FUTURE: use quantiles or other methods to cut outliers
    FUTURE: allow an arbitrary number of groups, not just a binary split
    '''
    normalized_data = df[col].map(lambda x: fn(x) if x > 0 else x)
    # Human-readable description of the (possibly transformed) column for labels.
    if fn is no_correction_function:
        desc = col
    else:
        desc = "{} of {}".format(fn.__name__, col)
    if binary_split:
        sns.distplot(normalized_data.loc[df[binary_split] == 1], color='red', hist=hist, kde=kde, label='{} = 1'.format(binary_split))
        sns.distplot(normalized_data.loc[df[binary_split] == 0], color='black', hist=hist, kde=kde, label='{} = 0'.format(binary_split))
        plt.title("Distribution of {} split on {}".format(desc, binary_split))
        plt.legend()
    else:
        sns.distplot(normalized_data)
        plt.title("Distribution of {}".format(desc))
    plt.xlabel(desc)

my_distplot(data, 'monthly_income', 'serious_dlqin2yrs', fn=math.log)
In [106]:
my_distplot(data, 'revolving_utilization_of_unsecured_lines', 'serious_dlqin2yrs',fn=math.log)
In [111]:
my_distplot(data, 'number_of_dependents', 'serious_dlqin2yrs', hist=True, kde=True)
In [90]:
normalized_data = data.monthly_income.map(lambda x: math.log(x) if x != 0 else 0)
# Count dependents among delinquents.
data.loc[data.serious_dlqin2yrs == 1, 'number_of_dependents'].value_counts()
# sns.distplot(normalized_data.loc[data.serious_dlqin2yrs == 0], color='black', hist=False)
Out[90]:
In [81]:
my_distplot(data, 'debt_ratio', 'serious_dlqin2yrs')
The correlation plot shows very strong correlations among the different past-due categories. Age and number of open credit lines and loans are weakly negatively correlated with the past-due categories and essentially uncorrelated with debt ratio, income, and number of dependents. Debt ratio, income, and number of dependents are positively correlated with the number of open credit lines and real estate loans.
I expanded the plot for this second round to include delinquency. Nothing is strongly correlated with delinquency, though age has the strongest negative correlation.
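`ex.correlation_plot` below is part of the custom pipeline; a rough seaborn equivalent (a sketch, not the pipeline's implementation) would be:
In [ ]:
# Sketch: correlation heatmap over the numeric columns.
corr = data.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, cmap="coolwarm", center=0, square=True)
plt.title("Correlation matrix (sketch)")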
In [182]:
ex.correlation_plot(data)
pl.suptitle("Correlation matrix")
Out[182]:
In [10]:
print("Table: Average feature values by zip code")
x.summary_by_outcome(data, "zipcode").iloc[:,1::6]
Out[10]:
On average, we do not see major differences between zip codes, despite their representing very different Chicago neighborhoods; revolving utilization of unsecured lines is the exception. The results were similar when looking at medians. Most zip codes hover around 16,600 residents in the sample, while zip code 60625 has double that. In the feature generation section I had hoped to make dummies of the zip codes, but I ran into memory issues, even after reducing the size of the stored integers and using sparse matrices.
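For reference, the zip-code dummies I attempted looked roughly like this (a sketch of the approach described above, small integer dtype plus sparse storage; the exact code I used may have differed, and it still hit memory limits on the full dataset):
In [ ]:
# Sketch: one-hot encode zipcode with a small integer dtype and sparse storage.
zip_dummies = pd.get_dummies(data["zipcode"], prefix="zip", sparse=True, dtype=np.uint8)
data_with_zip = data.join(zip_dummies)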
In [11]:
data["debt_ratio_groups"] = pr.cut(data.debt_ratio, [0,0.25,.5,.75,1], labels="auto")
# This apply function is too slow.
# data.debt_ratio = data.debt_ratio.apply(lambda x: pr.cap_values(x,data.debt_ratio.quantile(.99)))
#Fill Na with mean for monthly income and median for number of dependents
na_cols = ["monthly_income","number_of_dependents"]
data[na_cols] = pr.fill_with(df=data,col=na_cols, group="serious_dlqin2yrs")
#binnerize number of dependents data and then make dummies.
data["number_of_dependents_cut"] = pr.cut(data.number_of_dependents, [0,.99,3.01,20.1], \
method=pd.cut, labels=["No dependents","1-3 dependents", "4+ dependents"],include_lowest=True)
data = pr.get_dummies(data.number_of_dependents_cut, data)
Some preliminary ROC curves follow. I eventually aborted this run of the small loop in favor of running it from a terminal overnight.
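For a single model, a preliminary ROC curve can be produced directly with sklearn before running the full loop (a sketch; it assumes the `X` and `y` defined earlier, and the classifier here is just a placeholder):
In [ ]:
# Sketch: ROC curve for one placeholder classifier on a simple holdout split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, scores)
plt.plot(fpr, tpr, label="RF (AUC = {:.2f})".format(auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()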
In [9]:
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
#df = u.get_subsample(data, 5000)
df = data
potential_features = list(data.columns[2:14])
y = df.serious_dlqin2yrs
# Candidate features plus a log-transformed copy of monthly income.
X = df[potential_features].join(df.monthly_income.map(lambda x: math.log(x) if x != 0 else 0), rsuffix="_log")
def rf_feature_selection(X, y):
    '''
    Identify important features using a forest of randomized trees.
    Based on the sklearn example:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py
    '''
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.show()
In [10]:
rf_feature_selection(X,y)
Collect the top 5 features. Note that feature 12 is log(income), which performed about the same as the raw income measure. It is somewhat surprising that factors more correlated with delinquency, such as number of dependents, did not rank highly, and that the other "number of days past due" features were not as important. The debt ratio groups, which discretize debt ratio, appear to be the least useful.
In [2]:
# Map the top-5 indices from the ranking above back to feature names.
features = [potential_features[i] for i in [5, 0, 4, 1, 7]]
features
Out[2]:
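The indices above were copied by hand from the ranking printout; the same top-5 selection can be done programmatically (a sketch that refits the same forest, since `rf_feature_selection` above does not return it):
In [ ]:
# Sketch: pick the top-5 features by importance instead of hard-coding indices.
forest = ExtraTreesClassifier(n_estimators=250, random_state=0).fit(X, y)
top_idx = np.argsort(forest.feature_importances_)[::-1][:5]
top_features = [X.columns[i] for i in top_idx]
top_features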
In [ ]:
df = data
X = df[features]
y = df.serious_dlqin2yrs
# models_to_run = ['RF', 'DT', 'KNN', 'SVM', 'AB', 'GB', 'LR', 'NB']
grid_size = "smaller"
models_to_run = ['RF', 'DT', 'GB', 'LR', 'NB']

def run_magicloop(grid_size, models_to_run, outfile='results.csv', JUPYTER=1):
    clfs, grid = magic.define_clfs_params(grid_size)
    results_df = magic.clf_loop(models_to_run, clfs, grid, X, y)
    results_df.to_csv(outfile, index=False)
    return results_df

results_df = run_magicloop(grid_size, models_to_run)
In [4]:
results_df
Out[4]:
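To choose among the grid results, the table can be sorted by a metric column; the column name below is an assumption about what `magic.clf_loop` records and may need adjusting:
In [ ]:
# Sketch: rank models by an assumed metric column (adjust to results_df.columns).
metric = "auc-roc"  # hypothetical column name
results_df.sort_values(by=metric, ascending=False).head(10)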
In [21]:
results_df
Out[21]:
In [ ]:
# Compute and plot confusion matrices for a set of predictions.
def plot_cnf(y_test, y_hat, class_names=None):
    if class_names is None:
        class_names = np.unique(y_test)
    cnf_matrix = confusion_matrix(y_test, y_hat)
    np.set_printoptions(precision=2)
    # Plot non-normalized confusion matrix
    pl.figure()
    ev.plot_confusion_matrix(cnf_matrix, classes=class_names,
                             title='Confusion matrix, without normalization')
    # Plot normalized confusion matrix
    pl.figure()
    ev.plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                             title='Normalized confusion matrix')
    pl.show()
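A usage sketch for `plot_cnf` (the classifier and holdout split here are placeholders, not the model actually selected above):
In [ ]:
# Sketch: fit a placeholder model and plot its confusion matrices.
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
plot_cnf(y_test, clf.predict(X_test))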