""" For Homework 2, Build models to predict "Credit Card Approval" using dataset
http://archive.ics.uci.edu/ml/datasets/Credit+Approval
You may need to do the following -
Impute missing data
Plot and visualize data to see any patterns
For the actual model, the submission Notebook should have the following -
Build Models using Logistics Regression and SVM (you will learn tonight - Wed).
Use Grid Search to evaluate model parameters (Wed Lab) and select a model
Build a Confusion Matrix (Mon Lab) to show how well your prediction did.
The homework is due by Monday Dec'15th Midnight. Upload your submission the same way as Homework 1. """"
In [1]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# This enables inline Plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 2)
In [2]:
pd.__version__
Out[2]:
In [3]:
!pwd
In [4]:
crx_cols = ['A' + str(idx) for idx in range(1, 17)]
creditcheck = pd.read_csv('crx.data.txt', header=None, names=crx_cols)
In [5]:
creditcheck.info()
In [6]:
creditcheck.head(5)
Out[6]:
In [7]:
"""
From documentation:
Attribute Information:
A1: b, a.
A2: continuous.
A3: continuous.
A4: u, y, l, t.
A5: g, p, gg.
A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
A7: v, h, bb, j, n, z, dd, ff, o.
A8: continuous.
A9: t, f.
A10: t, f.
A11: continuous.
A12: t, f.
A13: g, p, s.
A14: continuous.
A15: continuous.
A16: +, - (class attribute)
"""
In [8]:
# so we aren't seeing the missing values...
"""
8. Missing Attribute Values:
37 cases (5%) have one or more missing values. The missing values from particular attributes are:
A1: 12
A2: 12
A4: 6
A5: 6
A6: 9
A7: 9
A14: 13
The '?' markers used for missing values force their columns (notably A2 and A14) to load as strings (object dtype) instead of int/float.
"""
Out[8]:
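One way to sidestep this at load time (not used below, but worth noting) is to tell pandas about the dataset's missing-value marker directly; a minimal sketch:

creditcheck = pd.read_csv('crx.data.txt', header=None, names=crx_cols, na_values='?')
creditcheck.info()  # A2 and A14 now parse as float64, with NaN where '?' was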
In [9]:
# from sklearn.cross_validation import train_test_split
# NOTE: train_test_split returns numpy arrays rather than pandas DataFrames,
# so we use a different approach for this first split
In [10]:
# draw a boolean mask: ~70% of rows go to the training split
rows = np.random.binomial(1, 0.7, size=len(creditcheck)).astype('bool')
In [11]:
credittrain = creditcheck[rows]
credittest = creditcheck[~rows]
In [12]:
credittrain.info()
In [13]:
credittrain.head(5)
Out[13]:
In [14]:
"""
From documentation, columns A1, A2, A4, A5, A6, A7, and A14 have missing values.
Based on .info() result, these missing values are not currently listed as NaN.
"""
Out[14]:
In [15]:
credittrain.A14.unique()
# similar checks were run on the other relevant variables; an automated version follows
Out[15]:
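The per-column checks can be automated; a small sketch over the columns the documentation flags as having missing values:

for col in ['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']:
    print(col, '?' in credittrain[col].values)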
In [16]:
"""
Missing values for A1/A4/A5/A6/A7 (all strings) are input as '?',
for A2 are '?' and A2 is float,
for A14 are '?' but A14 is continuous in specs and "Z5" in practice?
"""
# replace '?' with NaN for both numerics and strings
Out[16]:
In [17]:
credittrain['A1'].value_counts()
Out[17]:
In [18]:
credittrainfull = credittrain.replace(to_replace='?', value=np.nan)
credittrainfull.info()
In [19]:
# convert A2 and A14 to numerics, now that the '?'s are gone
credittrainfull.A2 = credittrainfull.A2.astype(float)
credittrainfull.A14 = credittrainfull.A14.astype(float)
credittrainfull.info()
Now we can impute over the NaNs, sampling replacements from each column's empirical category distribution (strings A1, 4, 5, 6, 7) and from a normal fit to each column's mean and standard deviation (numerics A2, A14).
Note that A1, 4, and 5 are heavily concentrated in one category, while 6 and 7 have more spread.
In [20]:
col_dist = {}
def get_col_dist(col_name):
    # empirical distribution of a column's non-missing values
    # (the '?'s are already NaN here, so mask on notnull rather than on '?')
    non_null = credittrainfull[col_name].notnull()
    counts = credittrainfull.loc[non_null, col_name].value_counts()
    col_data = {}
    col_data['prob'] = (counts / counts.sum()).values
    col_data['values'] = counts.index.values
    return col_data
In [21]:
col_dist['A1'] = get_col_dist('A1')
col_dist['A4'] = get_col_dist('A4')
col_dist['A5'] = get_col_dist('A5')
col_dist['A6'] = get_col_dist('A6')
col_dist['A7'] = get_col_dist('A7')
In [22]:
def impute_cols(val, options):
    # replace a missing value with a draw from the column's empirical distribution
    # (test for NaN, since the '?' markers were already replaced above)
    if pd.isnull(val):
        return np.random.choice(options['values'], p=options['prob'])
    return val
In [23]:
def impute_a1(val):
    return impute_cols(val, col_dist['A1'])
def impute_a4(val):
    return impute_cols(val, col_dist['A4'])
def impute_a5(val):
    return impute_cols(val, col_dist['A5'])
def impute_a6(val):
    return impute_cols(val, col_dist['A6'])
def impute_a7(val):
    return impute_cols(val, col_dist['A7'])
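As an aside, these five near-identical wrappers could be collapsed with functools.partial; a sketch of the idea, not what runs below:

from functools import partial
impute_a1 = partial(impute_cols, options=col_dist['A1'])  # and likewise for A4-A7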
In [24]:
credittrainfull['A1imp'] = credittrainfull.A1.map(impute_a1)
credittrainfull['A4imp'] = credittrainfull.A4.map(impute_a4)
credittrainfull['A5imp'] = credittrainfull.A5.map(impute_a5)
credittrainfull['A6imp'] = credittrainfull.A6.map(impute_a6)
credittrainfull['A7imp'] = credittrainfull.A7.map(impute_a7)
In [25]:
# Imputing the numeric vars in place because I'm afraid of breaking the function from class...
def impute_numeric_cols(col_data):
    # draw replacement values from a normal fit to the column's mean and std
    na_row_count = col_data.isnull().sum()
    return np.random.normal(col_data.mean(), col_data.std(), na_row_count)
In [26]:
A2_rows_mask = credittrainfull['A2'].isnull()
credittrainfull.loc[A2_rows_mask, 'A2'] = impute_numeric_cols(credittrainfull['A2'])
A14_rows_mask = credittrainfull['A14'].isnull()
credittrainfull.loc[A14_rows_mask, 'A14'] = impute_numeric_cols(credittrainfull['A14'])
credittrainfull.info()
In [27]:
"""
# This was my own attempt over the weekend:
# Manually find the most requent values (for strings) and the averages (for numerics).
# I didn't see an easy way to automate this without moving from pandas back to numpy/sklearn, but I'm sure one exists.
print 'A1', credittrainfull.A1.value_counts()[0:1]
print 'A2', credittrainfull.A2.mean
print 'A4', credittrainfull.A4.value_counts()[0:1]
print 'A5', credittrainfull.A5.value_counts()[0:1]
print 'A6', credittrainfull.A6.value_counts()[0:1]
print 'A7', credittrainfull.A7.value_counts()[0:1]
print 'A14', credittrainfull.A14.mean
"""
Out[27]:
In [28]:
"""
# We create a dictionary with each variable's desired fillna value.
dfill = {}
dfill['A1'] = 'b'
dfill['A2'] = 30.8
dfill['A4'] = 'u'
dfill['A5'] = 'g'
dfill['A6'] = 'c'
dfill['A7'] = 'v'
dfill['A14'] = 202
dfill
"""
Out[28]:
In [29]:
"""
credittrainfull['A1imp'] = credittrainfull.A1.fillna(dfill['A1'])
credittrainfull['A2imp'] = credittrainfull.A2.fillna(dfill['A2'])
credittrainfull['A4imp'] = credittrainfull.A4.fillna(dfill['A4'])
credittrainfull['A5imp'] = credittrainfull.A5.fillna(dfill['A5'])
credittrainfull['A6imp'] = credittrainfull.A6.fillna(dfill['A6'])
credittrainfull['A7imp'] = credittrainfull.A7.fillna(dfill['A7'])
credittrainfull['A14imp'] = credittrainfull.A14.fillna(dfill['A14'])
credittrainfull.info()
"""
Out[29]:
In [30]:
grid_plot = sns.FacetGrid(credittrainfull, row='A9', col='A12')
grid_plot.map(sns.regplot, 'A2', 'A3', color='.3', fit_reg=False)
Out[30]:
In [31]:
credittrainnum = credittrainfull[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']]
from pandas.tools.plotting import scatter_matrix
scatter_matrix(credittrainnum, alpha=0.2, figsize=(15, 10), diagonal='hist')
Out[31]:
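A quick numeric complement to the scatter matrix (not part of the original run) is the pairwise correlation table:

credittrainnum.corr()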
""" A1: b, a. -- could be boolean, but since b is more common than a, ambiguous as to which should be True A4: u, y, l, t. A5: g, p, gg. A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff. A7: v, h, bb, j, n, z, dd, ff, o. A9: t, f. [replace with boolean] A10: t, f. [replace with boolean] A12: t, f. [replace with boolean] A13: g, p, s. Noting that A1 / 4/ 5/ 6/ 7 are all imputed """
In [32]:
A1_dummies = pd.get_dummies(credittrainfull.A1imp, prefix='A1')
A4_dummies = pd.get_dummies(credittrainfull.A4imp, prefix='A4')
A5_dummies = pd.get_dummies(credittrainfull.A5imp, prefix='A5')
A6_dummies = pd.get_dummies(credittrainfull.A6imp, prefix='A6')
A7_dummies = pd.get_dummies(credittrainfull.A7imp, prefix='A7')
A13_dummies = pd.get_dummies(credittrainfull.A13, prefix='A13')
# A2 and A14 were imputed in place above, so they stay as numeric features;
# drop only the raw and intermediate categorical columns the dummies replace
creditmodel = credittrainfull.drop(['A1', 'A4', 'A5', 'A6', 'A7', 'A13',
                                    'A1imp', 'A4imp', 'A5imp', 'A6imp', 'A7imp'], axis=1)
creditmodel = creditmodel.merge(A1_dummies, left_index=True, right_index=True)
creditmodel = creditmodel.merge(A4_dummies, left_index=True, right_index=True)
creditmodel = creditmodel.merge(A5_dummies, left_index=True, right_index=True)
creditmodel = creditmodel.merge(A6_dummies, left_index=True, right_index=True)
creditmodel = creditmodel.merge(A7_dummies, left_index=True, right_index=True)
creditmodel = creditmodel.merge(A13_dummies, left_index=True, right_index=True)
creditmodel.info()
In [33]:
bdict = {'t': True, 'f': False}
creditmodel['A9bool'] = creditmodel['A9'].map(bdict)
creditmodel['A10bool'] = creditmodel['A10'].map(bdict)
creditmodel['A12bool'] = creditmodel['A12'].map(bdict)
creditmodel.drop(['A9', 'A10', 'A12'], axis=1, inplace=True)
# encode the class label: '+' (approved) -> 1, '-' (denied) -> 0
creditmodel['A16'] = creditmodel['A16'].replace('-', 0)
creditmodel['A16'] = creditmodel['A16'].replace('+', 1)
In [34]:
# And now we're ready to move back into SKLEARN!
from sklearn.cross_validation import train_test_split
creditmodelout = creditmodel['A16']
creditmodelin = creditmodel.drop('A16', axis=1)
# we'll start with a train/validation split, using the cleaned data
X_train, X_test, y_train, y_test = train_test_split(creditmodelin, creditmodelout, test_size=0.2)
In [35]:
from sklearn.svm import LinearSVC
est = LinearSVC(C=1e-3)
est.fit(X_train, y_train)
Out[35]:
In [36]:
est.score(X_test, y_test)
Out[36]:
Pretty good! Now let's look at a non-linear (rbf) kernel!
In [37]:
from sklearn.svm import SVC
my_svc = SVC()
my_svc.fit(X_train, y_train)
Out[37]:
In [38]:
my_svc.score(X_test, y_test)
Out[38]:
Hmm, it's worse than the linear kernel! Maybe there's a parameter issue we aren't seeing -- good motivation to move on to grid search.
In [39]:
from sklearn.grid_search import GridSearchCV
In [40]:
# first with linear
d_l = {'C': np.logspace(-3.,3.,10)}
gs_l = GridSearchCV(LinearSVC(), d_l)
In [41]:
gs_l.fit(X_train,y_train)
Out[41]:
In [42]:
gs_l.best_params_, gs_l.best_score_
Out[42]:
This looks like an improvement on the linear model with C=0.001 we used initially.
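One caveat: best_score_ is a cross-validation score on the training split, while the earlier number was held-out accuracy. For an apples-to-apples comparison, score the refitted best estimator on the validation split:

# GridSearchCV refits the best estimator on the full training split by default,
# so .score delegates to best_estimator_
gs_l.score(X_test, y_test)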
In [43]:
# Now with SVC
d = {}
d['C'] = np.logspace(-3.,3.,10)
d['gamma'] = np.logspace(-3.,3.,10)
gs_rbf = GridSearchCV(SVC(), d)
In [44]:
gs_rbf.fit(X_train,y_train)
Out[44]:
In [45]:
gs_rbf.best_params_, gs_rbf.best_score_
Out[45]:
So that's also an improvement on the default parameters, but not enough to catch the linear estimator. If I have enough time, I may try one or more of the alternative kernel functions built into SVC ('poly' or 'sigmoid').
In [46]:
sig_svc = SVC(kernel='sigmoid')
sig_svc.fit(X_train, y_train)
Out[46]:
In [47]:
sig_svc.score(X_test, y_test)
Out[47]:
In [48]:
gs_sig = GridSearchCV(SVC(kernel='sigmoid'), d)
In [49]:
gs_sig.fit(X_train,y_train)
Out[49]:
In [50]:
gs_sig.best_params_, gs_sig.best_score_
Out[50]:
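For completeness, the 'poly' kernel mentioned above can be checked the same way; a quick sketch reusing the same parameter grid d (degree left at its default of 3):

gs_poly = GridSearchCV(SVC(kernel='poly'), d)
gs_poly.fit(X_train, y_train)
gs_poly.best_params_, gs_poly.best_score_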
In [51]:
# but the big question here is: how much cleaning (i.e. fixing variable types, creating dummies)
# should we be doing with the credittest df?
In [52]:
"""
I'm out of time, but it looks to me like we should:
- convert '?'s into NaNs, so that A2 and A14 show up correctly as numeric
- convert A9, A10, and A12 to booleans
- run our two main types of model (linear, SVC) with the parameters determined through grid search
"
In [53]:
from sklearn.metrics import confusion_matrix
In [54]:
# this function takes the following form:
# confusion_matrix(y_true, y_pred)
# We have y_true as the y_test array from train_test_split (the validation data);
# y_pred comes from calling .predict(X_test) on each fitted estimator.
# In other words, we run this function separately for each estimator type.
In [55]:
est_pred = est.predict(X_test)
cm_l = confusion_matrix(y_test, est_pred)
print(cm_l)
In [56]:
my_svc_pred = my_svc.predict(X_test)
cm_rbf = confusion_matrix(y_test, my_svc_pred)
print(cm_rbf)
In [57]:
"""
For a pretty plot of confusion matrix 'cm':
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
"""
Out[57]:
In [58]:
plt.matshow(cm_l)
plt.title('Confusion matrix - linear estimator')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [59]:
plt.matshow(cm_rbf)
plt.title('Confusion matrix - rbf estimator')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
Note that viewing the two diagrams side by side suggests an improper conclusion (that rbf is producing a more accurate estimate), because each colorbar is scaled to its own plot's value range rather than to a shared scale with a zero minimum...
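A hedged fix is to pin both plots to the same zero-based color scale, for example:

shared_max = max(cm_l.max(), cm_rbf.max())
for cm, name in [(cm_l, 'linear'), (cm_rbf, 'rbf')]:
    plt.matshow(cm, vmin=0, vmax=shared_max)
    plt.title('Confusion matrix - %s estimator' % name)
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
plt.show()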