In [340]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
pd.set_option('display.max_rows', 500)
from pandas.tools.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix, classification_report

In [341]:
#get the data
#the data has no header row, so read it with header=None to keep the first row of data from being treated as the header

df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',header=None)
df.head()


Out[341]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [342]:
#rename the columns A1..A16 to match the attribute names in the dataset documentation

df.columns = ['A%d' % (i + 1) for i in range(16)]

df.head()


Out[342]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [343]:
#check the column dtypes and non-null counts
#note that numeric-looking columns like A2 and A14 read in as object because missing values are coded as '?'
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [344]:
df.head()


Out[344]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [345]:
#the dataset codes missing values as question marks
#replace them with NaN so pandas can handle them
df.replace('?', np.nan, inplace = True)

In [346]:
#now, impute values for the new NaN entries
#A1 - fill with a randomly chosen category from {'a', 'b'}
#A2 - impute the A2 mean
#A4 - fill with a randomly chosen category from {'u', 'y', 'l', 't'}
#A5 - fill with a randomly chosen category from {'g', 'p', 'gg'}
#A6 - fill with a randomly chosen category from {'c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff'}
#A7 - fill with a randomly chosen category from {'v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o'}
#A14 - impute the A14 mean

#A2 and A14 were read in as strings because of the '?' placeholders;
#cast them to float, then fill missing entries with the column mean
df.A2 = df.A2.astype(float)
df['A2'].fillna(df['A2'].mean(), inplace=True)

df.A14 = df.A14.astype(float)
df['A14'].fillna(df['A14'].mean(), inplace=True)

#np.random.choice draws from the raw column, which still contains NaN;
#drop the NaNs first so the fill value can never itself be NaN
df['A1'].fillna(np.random.choice(df.A1.dropna()), inplace=True)
df['A4'].fillna(np.random.choice(df.A4.dropna()), inplace=True)
df['A5'].fillna(np.random.choice(df.A5.dropna()), inplace=True)
df['A6'].fillna(np.random.choice(df.A6.dropna()), inplace=True)
df['A7'].fillna(np.random.choice(df.A7.dropna()), inplace=True)
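
Note that each fillna call above uses a single random draw for the whole column, so every missing A1 gets the same letter. If per-cell randomness is closer to the intent, a minimal sketch (same df and columns as above; not part of the original run):

#impute each missing cell independently with a draw from that
#column's observed values (a sketch, not what was executed above)
for col in ['A1', 'A4', 'A5', 'A6', 'A7']:
    missing = df[col].isnull()
    df.loc[missing, col] = np.random.choice(df[col].dropna().values, size=missing.sum())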

In [347]:
#spot-check that no NaN values remain in a given column
df[pd.isnull(df.A7)] #repeat for A1, A2, etc.; an empty frame means no NaNs remain


Out[347]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
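
Checking one column at a time is easy to get wrong; a one-liner covers all 16 columns at once (a sketch using the same df):

#count remaining NaN values in every column; all counts should be 0
df.isnull().sum()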

In [348]:
#summary statistics for the numeric columns
#note the scale differences (A15 runs to 100000, A8 only to 28.5) and the heavy right skew of A15 (mean ~1017 vs. median 5)
df.describe()


Out[348]:
A2 A3 A8 A11 A14 A15
count 690.000000 690.000000 690.000000 690.00000 690.000000 690.000000
mean 31.568171 4.758725 2.223406 2.40000 184.014771 1017.385507
std 11.853273 4.978163 3.346513 4.86294 172.159274 5210.102598
min 13.750000 0.000000 0.000000 0.00000 0.000000 0.000000
25% 22.670000 1.000000 0.165000 0.00000 80.000000 0.000000
50% 28.625000 2.750000 1.000000 0.00000 160.000000 5.000000
75% 37.707500 7.207500 2.625000 3.00000 272.000000 395.500000
max 80.250000 28.000000 28.500000 67.00000 2000.000000 100000.000000

In [349]:
#pairwise scatter plots of the numeric columns, with KDE plots on the diagonal
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')


Out[349]:
[6x6 scatter-matrix figure over the numeric columns A2, A3, A8, A11, A14, A15]

In [350]:
#pairwise correlations among the numeric columns; the strongest is only ~0.39 (A2 vs A8), so no column is redundant
df.corr()


Out[350]:
A2 A3 A8 A11 A14 A15
A2 1.000000 0.201316 0.392787 0.185575 -0.077161 0.018539
A3 0.201316 1.000000 0.298902 0.271207 -0.222346 0.123121
A8 0.392787 0.298902 1.000000 0.322330 -0.076389 0.051345
A11 0.185575 0.271207 0.322330 1.000000 -0.119809 0.063692
A14 -0.077161 -0.222346 -0.076389 -0.119809 1.000000 0.065609
A15 0.018539 0.123121 0.051345 0.063692 0.065609 1.000000

In [351]:
#features: every column except the target A16
X = df.loc[:, 'A1':'A15']
#one-hot encode the categorical columns into 0/1 dummy columns
X = pd.get_dummies(X)
X.head()


Out[351]:
A2 A3 A8 A11 A14 A15 A1_a A1_b A4_l A4_u ... A7_z A9_f A9_t A10_f A10_t A12_f A12_t A13_g A13_p A13_s
0 30.83 0.000 1.25 1 202 0 0 1 0 1 ... 0 0 1 0 1 1 0 1 0 0
1 58.67 4.460 3.04 6 43 560 1 0 0 1 ... 0 0 1 0 1 1 0 1 0 0
2 24.50 0.500 1.50 0 280 824 1 0 0 1 ... 0 0 1 1 0 1 0 1 0 0
3 27.83 1.540 3.75 5 100 3 0 1 0 1 ... 0 0 1 0 1 0 1 1 0 0
4 20.17 5.625 1.71 0 120 0 0 1 0 1 ... 0 0 1 1 0 1 0 0 0 1

5 rows × 46 columns
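
get_dummies keeps every level of each categorical, so each group of dummy columns sums to the all-ones column and duplicates the intercept. The L2 penalty in the models below keeps this benign, but dropping one level per feature is tidier. A sketch, assuming a pandas version new enough (0.18+) to have the drop_first flag:

#one fewer dummy column per categorical removes the exact collinearity
X_alt = pd.get_dummies(df.loc[:, 'A1':'A15'], drop_first=True)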


In [352]:
y = df['A16']
y.head()


Out[352]:
0    +
1    +
2    +
3    +
4    +
Name: A16, dtype: object

In [353]:
#split the data into 70/30 train/test sets (random_state fixed for reproducibility)
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=.3, random_state=12)

In [354]:
y_train.shape


Out[354]:
(483,)
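
A quick sanity check on all four pieces of the split (expected: 483 train rows, 207 test rows, 46 feature columns each):

X_train.shape, X_test.shape, y_train.shape, y_test.shape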

In [355]:
#instantiate a Logistic Regression model with sklearn defaults (L2 penalty, C=1.0)
lr_est = LogisticRegression()

In [356]:
#fit training data to the LR model
lr_est.fit(X_train,y_train)


Out[356]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [357]:
#score the test data using the LR model
lr_est.score(X_test,y_test)


Out[357]:
0.86956521739130432
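
For scale, the classification reports further down show the test split holds 119 '-' vs 88 '+', so always predicting '-' scores 119/207 ≈ 0.575. A one-line check (same y_test as above):

#accuracy of a predictor that always guesses the majority class
y_test.value_counts(normalize=True).max()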

In [358]:
#instantiate an SVC with sklearn defaults (RBF kernel, C=1.0)
svc_est = SVC()

In [359]:
#fit training data to SVC
svc_est.fit(X_train,y_train)


Out[359]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [360]:
#score the test data using SVC
svc_est.score(X_test,y_test)


Out[360]:
0.56521739130434778
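
The default RBF SVC barely beats the majority-class baseline. The kernel is distance-based and the unscaled features span wildly different ranges (A15 reaches 100000 while the dummies are 0/1), so a few columns dominate the distances. A minimal sketch of standardizing first, using StandardScaler from sklearn.preprocessing (this cell was not part of the original run):

from sklearn.preprocessing import StandardScaler

#fit the scaler on the training data only, then reuse its statistics on the test set
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

svc_scaled = SVC()
svc_scaled.fit(X_train_s, y_train)
svc_scaled.score(X_test_s, y_test)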

In [361]:
#use grid search to find the best C for the LR model
#search 10 values of C logarithmically spaced between 1e-3 and 1e3

param = {'C': np.logspace(-3,3,10)}
gs_lr = GridSearchCV(LogisticRegression(),param)

In [362]:
#fit the training data to grid search
gs_lr.fit(X_train,y_train)


Out[362]:
GridSearchCV(cv=None,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [363]:
#what is the best score in the range? what is the best C in the range?
gs_lr.grid_scores_


Out[363]:
[mean: 0.77226, std: 0.02972, params: {'C': 0.001},
 mean: 0.79503, std: 0.03514, params: {'C': 0.0046415888336127772},
 mean: 0.83230, std: 0.03085, params: {'C': 0.021544346900318832},
 mean: 0.85714, std: 0.02029, params: {'C': 0.10000000000000001},
 mean: 0.86749, std: 0.01781, params: {'C': 0.46415888336127775},
 mean: 0.86128, std: 0.01276, params: {'C': 2.154434690031882},
 mean: 0.85300, std: 0.01171, params: {'C': 10.0},
 mean: 0.85507, std: 0.01630, params: {'C': 46.415888336127729},
 mean: 0.84886, std: 0.01464, params: {'C': 215.44346900318823},
 mean: 0.84679, std: 0.01630, params: {'C': 1000.0}]
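
A sketch for eyeballing the C sweep, assuming the legacy grid_scores_ entries shown above (each exposes .parameters and .mean_validation_score):

#plot mean CV accuracy against C on a log axis
Cs = [g.parameters['C'] for g in gs_lr.grid_scores_]
means = [g.mean_validation_score for g in gs_lr.grid_scores_]
plt.semilogx(Cs, means, marker='o')
plt.xlabel('C')
plt.ylabel('mean CV accuracy')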

In [364]:
print 'Best Score: ', gs_lr.best_score_
print 'Best Parameter: ', gs_lr.best_params_


Best Score:  0.867494824017
Best Parameter:  {'C': 0.46415888336127775}

In [365]:
#refit using the best C from the grid search (since refit=True, gs_lr.best_estimator_ holds an equivalent model)
lr_est2 = LogisticRegression(C=gs_lr.best_params_['C'])
lr_est2.fit(X_train, y_train)


Out[365]:
LogisticRegression(C=0.46415888336127775, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.0001)

In [366]:
lr_est2.score(X_test,y_test)


Out[366]:
0.87439613526570048

In [367]:
#use grid search to find the best C and gamma for the SVC model (a 10x10 grid, 100 combinations)
param = {'C': np.logspace(-3,3,10),'gamma':np.logspace(-3,3,10)}
gs_svc = GridSearchCV(SVC(),param)

In [368]:
#fit the training data to grid search
gs_svc.fit(X_train,y_train)


Out[368]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [369]:
#what is the best score? what are the best 'C' and 'gamma' parameters?
print "Best Score: ", gs_svc.best_score_
print "Best Params: ", gs_svc.best_params_


Best Score:  0.679089026915
Best Params:  {'C': 46.415888336127729, 'gamma': 0.001}

In [370]:
#Incorporate best parameters (note that gamma landed on the lower edge of the search grid, so widening the range might help)
svc_est2 = SVC(C=gs_svc.best_params_['C'],gamma=gs_svc.best_params_['gamma'])
svc_est2.fit(X_train,y_train)


Out[370]:
SVC(C=46.415888336127729, cache_size=200, class_weight=None, coef0=0.0,
  degree=3, gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [371]:
svc_est2.score(X_test,y_test)


Out[371]:
0.71980676328502413

In [372]:
#generate test-set predictions from the tuned Logistic Regression model
y_pred_lr = lr_est2.predict(X_test)
y_pred_lr


Out[372]:
array(['+', '-', '+', '+', '+', '+', '-', '-', '+', '+', '-', '+', '+',
       '+', '-', '+', '+', '+', '+', '+', '-', '-', '-', '-', '+', '+',
       '-', '-', '-', '-', '-', '+', '-', '-', '-', '-', '-', '+', '+',
       '+', '-', '-', '-', '+', '+', '+', '+', '-', '-', '-', '+', '-',
       '-', '-', '+', '+', '-', '+', '+', '+', '+', '-', '+', '-', '-',
       '+', '-', '+', '-', '-', '+', '-', '+', '-', '-', '-', '-', '-',
       '-', '-', '-', '+', '+', '-', '+', '+', '-', '-', '-', '-', '-',
       '-', '-', '+', '+', '+', '-', '-', '+', '+', '-', '+', '+', '+',
       '+', '+', '-', '-', '+', '-', '-', '+', '-', '+', '+', '+', '-',
       '-', '-', '-', '+', '+', '+', '+', '+', '+', '-', '+', '+', '-',
       '+', '+', '+', '-', '+', '-', '+', '+', '-', '-', '+', '-', '-',
       '+', '-', '-', '-', '-', '+', '+', '+', '-', '-', '+', '-', '-',
       '-', '-', '+', '-', '-', '+', '+', '+', '+', '-', '-', '-', '-',
       '+', '+', '-', '-', '+', '+', '+', '+', '-', '+', '-', '-', '-',
       '-', '-', '-', '-', '-', '+', '+', '-', '+', '-', '+', '+', '-',
       '-', '-', '-', '+', '+', '-', '-', '-', '-', '-', '-', '+'], dtype=object)

In [373]:
# Confusion matrix: rows are actual classes ('+', then '-'), columns are predictions;
# the off-diagonal counts are the type I and type II errors
confusion_matrix(y_test, y_pred_lr)


Out[373]:
array([[ 79,   9],
       [ 17, 102]])
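
Reading it off: 79 true '+' predictions, 9 '+' cases missed, 17 '-' cases flagged as '+', 102 true '-'. A quick arithmetic check that the report below follows from these counts:

#precision and recall for the '+' class, from the matrix entries above
tp, fn, fp = 79.0, 9.0, 17.0
print 'precision(+):', tp / (tp + fp)  #79/96 ~ 0.82
print 'recall(+):   ', tp / (tp + fn)  #79/88 ~ 0.90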

In [374]:
# Examine Precision and Recall
print classification_report(y_test, y_pred_lr)


             precision    recall  f1-score   support

          +       0.82      0.90      0.86        88
          -       0.92      0.86      0.89       119

avg / total       0.88      0.87      0.87       207


In [375]:
# Examine the fitted coefficient for each feature (raw logistic-regression weights, not significance tests)
pd.DataFrame(zip(X.columns, np.transpose(lr_est2.coef_)))


Out[375]:
0 1
0 A2 [0.00164160881722]
1 A3 [0.00372130533697]
2 A8 [-0.0674108539767]
3 A11 [-0.117578342644]
4 A14 [0.000899223885982]
5 A15 [-0.000381436133813]
6 A1_a [0.134045707796]
7 A1_b [0.0141140719734]
8 A4_l [-8.26766816225e-11]
9 A4_u [-0.0982532843361]
10 A4_y [0.284476451655]
11 A5_g [-0.0982532843361]
12 A5_gg [-8.26766816225e-11]
13 A5_p [0.284476451655]
14 A6_aa [0.264304219253]
15 A6_c [0.0780341776233]
16 A6_cc [-0.291145073587]
17 A6_d [0.118735746169]
18 A6_e [-0.00485595052372]
19 A6_ff [0.38934808296]
20 A6_i [0.340410416629]
21 A6_j [0.0481354040562]
22 A6_k [0.00385512768941]
23 A6_m [0.0864884964476]
24 A6_q [-0.261008883898]
25 A6_r [0.00209217890661]
26 A6_w [-0.109797764088]
27 A6_x [-0.478373010401]
28 A7_bb [-0.00723261032221]
29 A7_dd [-0.00114877704273]
30 A7_ff [0.383078113818]
31 A7_h [-0.375284853624]
32 A7_j [-0.0590038279147]
33 A7_n [-0.0749672688104]
34 A7_o [-8.26766816225e-11]
35 A7_v [0.209890531533]
36 A7_z [0.110891859683]
37 A9_f [1.68681221868]
38 A9_t [-1.50058905145]
39 A10_f [0.406442047267]
40 A10_t [-0.220218880031]
41 A12_f [0.0144328074521]
42 A12_t [0.171790359784]
43 A13_g [0.289181852674]
44 A13_p [-0.165070577055]
45 A13_s [0.0621118916167]
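
The bracketed values are an artifact of np.transpose turning coef_ into a column vector. A sketch that flattens the weights and ranks features by absolute magnitude (same lr_est2 and X as above):

#coef_[0] is the single weight row for this binary problem;
#argsort on the absolute values puts the most influential features first
order = np.argsort(-np.abs(lr_est2.coef_[0]))
pd.DataFrame({'feature': X.columns[order], 'coef': lr_est2.coef_[0][order]})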

In [376]:
#now generate test-set predictions from the tuned SVC model and build its confusion matrix
y_pred_svc = svc_est2.predict(X_test)
y_pred_svc


Out[376]:
array(['+', '-', '+', '+', '+', '+', '-', '+', '+', '+', '-', '+', '+',
       '+', '-', '-', '+', '-', '+', '+', '-', '-', '+', '-', '+', '-',
       '+', '-', '-', '-', '-', '+', '+', '+', '-', '-', '-', '+', '+',
       '+', '+', '-', '+', '+', '+', '+', '+', '-', '-', '-', '-', '-',
       '-', '+', '+', '+', '-', '-', '-', '+', '+', '+', '-', '-', '-',
       '+', '-', '+', '+', '-', '+', '-', '+', '+', '-', '-', '+', '-',
       '-', '-', '-', '-', '-', '-', '-', '+', '-', '-', '-', '-', '-',
       '-', '-', '+', '+', '-', '-', '-', '+', '+', '-', '+', '-', '+',
       '-', '+', '-', '-', '+', '+', '+', '-', '+', '+', '+', '+', '+',
       '+', '+', '-', '+', '-', '-', '+', '+', '+', '+', '-', '+', '-',
       '+', '+', '-', '-', '-', '-', '-', '+', '-', '-', '-', '-', '-',
       '-', '-', '+', '-', '+', '+', '-', '+', '-', '-', '+', '-', '-',
       '-', '-', '-', '-', '-', '+', '+', '+', '-', '-', '-', '-', '-',
       '+', '+', '-', '+', '-', '-', '+', '+', '+', '+', '-', '+', '+',
       '-', '+', '+', '+', '-', '+', '+', '-', '+', '-', '+', '+', '-',
       '+', '-', '-', '+', '-', '-', '-', '+', '-', '+', '+', '+'], dtype=object)

In [377]:
# Confusion matrix for the SVC model (same layout as above); both error counts are noticeably higher than the Logistic Regression's
confusion_matrix(y_test, y_pred_svc)


Out[377]:
array([[65, 23],
       [35, 84]])

In [378]:
# Examine Precision and Recall
print classification_report(y_test,y_pred_svc)


             precision    recall  f1-score   support

          +       0.65      0.74      0.69        88
          -       0.79      0.71      0.74       119

avg / total       0.73      0.72      0.72       207


In [379]:
#Of the two models, Logistic Regression is clearly the better fit for this data:
#0.874 test accuracy versus 0.720 for the tuned SVC, with higher precision and recall on both classes
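
A single 70/30 split can be noisy, so a cross-validated comparison is a fairer last word. A sketch using the same tuned estimators (cross_val_score lives in the same legacy sklearn.cross_validation module imported at the top; this cell was not part of the original run):

from sklearn.cross_validation import cross_val_score

#5-fold cross-validated accuracy over the full data for both tuned models
print 'LR: ', cross_val_score(lr_est2, X, y, cv=5).mean()
print 'SVC:', cross_val_score(svc_est2, X, y, cv=5).mean()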