In [1]:
import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
import pipeline.magicloops as magic
import pipeline.util as u
import pipeline.process as pr
import pipeline.read as r
import pipeline.explore as ex
import pipeline.evaluate as ev
%matplotlib inline
# Read the credit data; keep SeriousDlqin2yrs as numeric 0/1 so later code can compare
# against 0 and 1 directly (a categorical version with the labels "Nondelinquent"/"Delinquent"
# was tried first and then dropped).
data = r.read_csv('data/credit-data.csv', parse_zipcodes=["zipcode"], dtype={"PersonID": "category"})
data["debt_ratio_groups"] = pr.cut(data.debt_ratio, [0,0.25,.5,.75,1], labels="auto")
# This apply function is too slow.
# data.debt_ratio = data.debt_ratio.apply(lambda x: pr.cap_values(x,data.debt_ratio.quantile(.99)))
#Fill Na with mean for monthly income and median for number of dependents
na_cols = ["monthly_income","number_of_dependents"]
data[na_cols] = pr.fill_with(df=data,col=na_cols, group="serious_dlqin2yrs")
#binnerize number of dependents data and then make dummies.
data["number_of_dependents_cut"] = pr.cut(data.number_of_dependents, [0,.99,3.01,20.1], \
method=pd.cut, labels=["No dependents","1-3 dependents", "4+ dependents"],include_lowest=True)
data = pr.get_dummies(data.number_of_dependents_cut, data)
df = data
potential_features = list(data.columns[2:14])
y = df.serious_dlqin2yrs
# Candidate features plus a log-transformed copy of monthly income.
X = df[potential_features].join(df.monthly_income.map(lambda x: math.log(x) if x != 0 else 0), rsuffix="_log")
The table below shows summary statistics split by delinquency status. The number of times delinquents are past due is an order of magnitude higher than for nondelinquents, but the high variance and low median on that measure suggest it is driven by a small group with particularly many past-due bills. Nondelinquents are older and have higher incomes on average.
In [3]:
summary = ex.summary_by_outcome(data, "serious_dlqin2yrs")
summary.iloc[:, [0, 1, 2, 4, 6, 7, 8, 10]]
Out[3]:
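Since `ex.summary_by_outcome` lives in the custom pipeline, the same split can be approximated with plain pandas (a sketch only; the pipeline's implementation may differ):
In [ ]:
# Sketch: approximate the outcome-split summary with a pandas groupby/describe.
approx_summary = data.groupby("serious_dlqin2yrs").describe().T
approx_summary.head(16)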
In [4]:
null_income = u.check_nulls(data, "monthly_income")
null_sum = ex.summary_by_outcome(null_income, "serious_dlqin2yrs")
not_null_income = u.get_notnulls(data, "monthly_income")
not_null_sum = ex.summary_by_outcome(not_null_income, "serious_dlqin2yrs")
ratio = round(null_sum.iloc[:, [0, 1, 2, 5, 6, 7, 8, 11]] / not_null_sum.iloc[:, [0, 1, 2, 5, 6, 7, 8, 11]], 2)
ratio.drop("monthly_income")
Out[4]:
The analysis above compares people who report a monthly income with those whose monthly income is missing. The debt ratios differ dramatically: those without income data have 62 to 77 times the debt ratio of those with reported income, whereas we would hope this ratio were close to 1. The counts show that for each person with missing income there are 4 to 5 people with reported income; in other words, records with missing income make up slightly under 20 percent of the data. I will have to be careful when developing models to account for this discrepancy. In a larger project, for example, I would compare models that impute the missing income values against models that drop those rows, and would consider capping debt ratios. The graphs below show that the records with nulls have more extreme debt ratios (compare the bottom corners of the following two pair plots). The solution I will implement is to group debt ratios into bins.
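As a sketch of the two options mentioned above (group-based imputation versus dropping rows) in plain pandas; the notebook itself uses the pipeline's `pr.fill_with` for the actual fill:
In [ ]:
# Option 1 (sketch): impute monthly_income with the mean of its outcome group.
imputed = data.copy()
group_means = imputed.groupby("serious_dlqin2yrs")["monthly_income"].transform("mean")
imputed["monthly_income"] = imputed["monthly_income"].fillna(group_means)
# Option 2 (sketch): drop rows with missing income entirely.
dropped = data.dropna(subset=["monthly_income"])
# Possible extra step (sketch): cap debt_ratio at its 99th percentile.
capped_debt_ratio = data["debt_ratio"].clip(upper=data["debt_ratio"].quantile(.99))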
In [7]:
sns.set(style="white")
sns.pairplot(data, vars=["debt_ratio", "revolving_utilization_of_unsecured_lines", "number_real_estate_loans_or_lines"],
             hue="serious_dlqin2yrs", size=3, plot_kws={'alpha': 0.2})
pl.suptitle("Paired distribution of indicators by delinquency status (keeping NAs)")
Out[7]:
In [8]:
sns.pairplot(u.get_notnulls(data, "monthly_income"),
             vars=["monthly_income", "debt_ratio", "revolving_utilization_of_unsecured_lines", "number_real_estate_loans_or_lines"],
             hue="serious_dlqin2yrs", size=3, plot_kws={'alpha': 0.2})
pl.suptitle("Paired distribution of indicators by delinquency status (removing NAs)")
Out[8]:
In [109]:
import math
"""normalized_data = data.monthly_income.map(lambda x: math.log(x) if x != 0 else 0)
sns.distplot(normalized_data.loc[data.serious_dlqin2yrs==1], color = 'red', hist=False)
sns.distplot(normalized_data.loc[data.serious_dlqin2yrs==0], color = 'black', hist=False)
plt.xlim = (5, 11)
plt.show()
"""
def no_correction_function(x):
    return x

def my_distplot(df, col, binary_split=None, fn=no_correction_function, hist=False, kde=True):
    '''
    df (DataFrame)
    col (column in df); we expect values >= 0
    binary_split (column in df of 0s and 1s)
    fn (function) transformation to apply, e.g. math.log, math.sqrt
    FUTURE: use quantiles or other methods to cut outliers
    FUTURE: allow an arbitrary number of groups, not just a binary split
    '''
    normalized_data = df[col].map(lambda x: fn(x) if x > 0 else x)
    # Human-readable description of the (possibly transformed) column for labels.
    if fn is no_correction_function:
        desc = col
    else:
        desc = "{} of {}".format(fn.__name__, col)
    if binary_split:
        sns.distplot(normalized_data.loc[df[binary_split] == 1], color='red', hist=hist, kde=kde, label='{} = 1'.format(binary_split))
        sns.distplot(normalized_data.loc[df[binary_split] == 0], color='black', hist=hist, kde=kde, label='{} = 0'.format(binary_split))
        plt.title("Distribution of {} split on {}".format(desc, binary_split))
        plt.legend()
    else:
        sns.distplot(normalized_data)
        plt.title("Distribution of {}".format(desc))
    plt.xlabel(desc)

my_distplot(data, 'monthly_income', 'serious_dlqin2yrs', fn=math.log)
In [106]:
my_distplot(data, 'revolving_utilization_of_unsecured_lines', 'serious_dlqin2yrs',fn=math.log)
In [111]:
my_distplot(data, 'number_of_dependents', 'serious_dlqin2yrs', hist=True, kde=True)
In [90]:
normalized_data = data.monthly_income.map(lambda x: math.log(x) if x != 0 else 0)
# Count dependents among delinquents.
data.loc[data.serious_dlqin2yrs == 1, 'number_of_dependents'].value_counts()
# sns.distplot(normalized_data.loc[data.serious_dlqin2yrs == 0], color='black', hist=False)
Out[90]:
In [81]:
my_distplot(data, 'debt_ratio', 'serious_dlqin2yrs')
The correlation plot shows very strong correlations among the different past-due categories. Age and number of open credit lines and loans are weakly negatively correlated with the past-due categories and essentially uncorrelated with debt ratio, income, and number of dependents. Debt ratio, income, and number of dependents are positively correlated with the number of open credit lines and real estate loans.
I expanded the plot for this second round to include delinquency. Nothing is strongly correlated with delinquency, though age has the strongest negative correlation.
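`ex.correlation_plot` below is part of the custom pipeline; a rough seaborn equivalent (a sketch, not the pipeline's implementation) would be:
In [ ]:
# Sketch: correlation heatmap over the numeric columns.
corr = data.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, cmap="coolwarm", center=0, square=True)
plt.title("Correlation matrix (sketch)")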
In [182]:
ex.correlation_plot(data)
pl.suptitle("Correlation matrix")
Out[182]:
In [10]:
print("Table: Average feature values by zip code")
x.summary_by_outcome(data, "zipcode").iloc[:,1::6]
Out[10]:
On average, we do not see major differences between zip codes, despite their representing very different Chicago neighborhoods; revolving utilization of unsecured lines is the exception. The results were similar when looking at medians. Most zip codes hover around 16,600 residents in the sample, while zip code 60625 has double that. In the feature generation section I had hoped to make dummies of the zip codes, but I ran into memory issues, even after reducing the size of the stored integers and using sparse matrices.
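For reference, the zip-code dummies I attempted looked roughly like this (a sketch of the approach described above, small integer dtype plus sparse storage; the exact code I used may have differed, and it still hit memory limits on the full dataset):
In [ ]:
# Sketch: one-hot encode zipcode with a small integer dtype and sparse storage.
zip_dummies = pd.get_dummies(data["zipcode"], prefix="zip", sparse=True, dtype=np.uint8)
data_with_zip = data.join(zip_dummies)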
In [11]:
data["debt_ratio_groups"] = pr.cut(data.debt_ratio, [0,0.25,.5,.75,1], labels="auto")
# This apply function is too slow.
# data.debt_ratio = data.debt_ratio.apply(lambda x: pr.cap_values(x,data.debt_ratio.quantile(.99)))
#Fill Na with mean for monthly income and median for number of dependents
na_cols = ["monthly_income","number_of_dependents"]
data[na_cols] = pr.fill_with(df=data,col=na_cols, group="serious_dlqin2yrs")
#binnerize number of dependents data and then make dummies.
data["number_of_dependents_cut"] = pr.cut(data.number_of_dependents, [0,.99,3.01,20.1], \
method=pd.cut, labels=["No dependents","1-3 dependents", "4+ dependents"],include_lowest=True)
data = pr.get_dummies(data.number_of_dependents_cut, data)
Some preliminary ROC curves follow. I eventually aborted this run of the small loop in favor of running it from a terminal overnight.
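For a single model, a preliminary ROC curve can be produced directly with sklearn before running the full loop (a sketch; it assumes the `X` and `y` defined earlier, and the classifier here is just a placeholder):
In [ ]:
# Sketch: ROC curve for one placeholder classifier on a simple holdout split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, scores)
plt.plot(fpr, tpr, label="RF (AUC = {:.2f})".format(auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()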
In [9]:
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
#df = u.get_subsample(data, 5000)
df = data
potential_features = list(data.columns[2:14])
y = df.serious_dlqin2yrs
# Candidate features plus a log-transformed copy of monthly income.
X = df[potential_features].join(df.monthly_income.map(lambda x: math.log(x) if x != 0 else 0), rsuffix="_log")
def rf_feature_selection(X, y):
    '''
    Identify important features using a forest of randomized trees.
    Based on the sklearn example:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py
    '''
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.show()
In [10]:
rf_feature_selection(X,y)
Collect the top 5 features. Note that feature 12 is log(income), which performed about the same as the raw income measure. It is somewhat surprising that factors more correlated with delinquency, such as number of dependents, did not rank highly, and that the other "number of days past due" features were not as important. The debt ratio groups, which discretize debt ratio, appear to be the least useful.
In [2]:
# Map the top-5 indices from the ranking above back to feature names.
features = [potential_features[i] for i in [5, 0, 4, 1, 7]]
features
Out[2]:
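The indices above were copied by hand from the ranking printout; the same top-5 selection can be done programmatically (a sketch that refits the same forest, since `rf_feature_selection` above does not return it):
In [ ]:
# Sketch: pick the top-5 features by importance instead of hard-coding indices.
forest = ExtraTreesClassifier(n_estimators=250, random_state=0).fit(X, y)
top_idx = np.argsort(forest.feature_importances_)[::-1][:5]
top_features = [X.columns[i] for i in top_idx]
top_features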
In [ ]:
df = data
X = df[features]
y = df.serious_dlqin2yrs
# models_to_run = ['RF', 'DT', 'KNN', 'SVM', 'AB', 'GB', 'LR', 'NB']
grid_size = "smaller"
models_to_run = ['RF', 'DT', 'GB', 'LR', 'NB']

def run_magicloop(grid_size, models_to_run, outfile='results.csv', JUPYTER=1):
    clfs, grid = magic.define_clfs_params(grid_size)
    results_df = magic.clf_loop(models_to_run, clfs, grid, X, y)
    results_df.to_csv(outfile, index=False)
    return results_df

results_df = run_magicloop(grid_size, models_to_run)
In [4]:
results_df
Out[4]:
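To choose among the grid results, the table can be sorted by a metric column; the column name below is an assumption about what `magic.clf_loop` records and may need adjusting:
In [ ]:
# Sketch: rank models by an assumed metric column (adjust to results_df.columns).
metric = "auc-roc"  # hypothetical column name
results_df.sort_values(by=metric, ascending=False).head(10)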
In [21]:
results_df
Out[21]:
In [ ]:
# Compute and plot confusion matrices for a set of predictions.
def plot_cnf(y_test, y_hat, class_names=None):
    if class_names is None:
        class_names = np.unique(y_test)
    cnf_matrix = confusion_matrix(y_test, y_hat)
    np.set_printoptions(precision=2)
    # Plot non-normalized confusion matrix
    pl.figure()
    ev.plot_confusion_matrix(cnf_matrix, classes=class_names,
                             title='Confusion matrix, without normalization')
    # Plot normalized confusion matrix
    pl.figure()
    ev.plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                             title='Normalized confusion matrix')
    pl.show()
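A usage sketch for `plot_cnf` (the classifier and holdout split here are placeholders, not the model actually selected above):
In [ ]:
# Sketch: fit a placeholder model and plot its confusion matrices.
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
plot_cnf(y_test, clf.predict(X_test))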