In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats 
from sklearn.cross_validation import train_test_split

pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 4)
np.set_printoptions(precision = 4)

%matplotlib inline

In [3]:
credit = pd.read_csv('./crx.data', header = None,
                     names = ['A1','A2','A3','A4','A5','A6','A7','A8','A9','A10','A11','A12','A13','A14','A15','A16'])

In [4]:
credit.head()


Out[4]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [5]:
credit.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [6]:
# Recode A16 plus minus into new variable with 1 and 0.
def recodeA16(sign):
    if sign == '+':
        return 1
    return 0
credit['A16R'] = credit['A16'].map(recodeA16).astype(float)
credit = credit.drop('A16', axis = 1)

In [7]:
# Deal with missing NaN.
# Transform numeric variables for float, the rest to str.
stringFeat = {'A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'}
numFeat = {'A2', 'A3', 'A8', 'A11', 'A14', 'A15', 'A16R'}
for string in stringFeat:
    credit[string] = credit[string].astype(str)
for number in numFeat:
    credit[number] = credit[number].convert_objects(convert_numeric = True)

In [8]:
# This was my inefficient way of doing the imputations. 
# Check for missing values: A1, A2, A4, A5, A6, A7, and A14 should have some (documentation says so). 
# Some are coded as ?, so need to be replaced with NaN.
# Recode ? as Nan.
# credit.replace('?', np.nan, inplace = True)
# Deal with missing values.
# A1 imputation based on distribution of unique values in A1. 
# Binary mask for NaN.
# A1_missing_mask = credit.A1.isnull()
# Get unique values. 
# A1_choice = list(set(credit[-A1_missing_mask].A1))
# Get counts of unique values.
# A1_a_count = credit[credit.A1 == 'a']['A1'].count().astype(float)
# A1_b_count = credit[credit.A1 == 'b']['A1'].count().astype(float)
# Function to impute A1. 
# def get_A1_impute(number):
#    return np.random.choice(A1_choice, number, p = [A1_a_count / (A1_a_count + A1_b_count), 
#                                                                           A1_b_count / (A1_a_count + A1_b_count)])
# Impute. 
# credit.loc[A1_missing_mask, 'A1'] = get_A1_impute(len(credit[pd.isnull(credit.A1)]))
# Deal with A4, A5, A6, A7. 
# Masks.
# A4_missing_mask = credit.A4.isnull() # 6 missing
# A5_missing_mask = credit.A5.isnull() # 6 missing
# A6_missing_mask = credit.A6.isnull() # 8 missing
# A7_missing_mask = credit.A7.isnull() # 8 missing
# Compare rows idx A4 through A7. If the same/ subsets, drop for now.
# def get_idx(data, feature):
#    return data[data[feature].isnull()].index.tolist()
# get_idx(credit, 'A4') == get_idx(credit, 'A5')
# get_idx(credit, 'A6') == get_idx(credit, 'A7')
# Check whether A4 and A5 are subsets of A6 and A7.
# set(get_idx(credit,'A4')).issubset(get_idx(credit,'A6'))
# Drop all rows where A6 NaN. Drop A2. 
# creditR = credit.drop(get_idx(credit,'A6'), axis = 0)

In [9]:
# Code from Lab_07. Thank you! 
col_dist = {}
def get_col_dist(col_name):
    excl_null_mask = credit[col_name] != '?'
    row_count = credit[excl_null_mask][col_name].size
    col_data = {}
    col_data['prob'] = (credit[excl_null_mask][col_name].value_counts() / row_count).values
    col_data['values'] = (credit[excl_null_mask][col_name].value_counts() / row_count).index.values
    return col_data
col_dist['A1'] = get_col_dist('A1')
col_dist['A4'] = get_col_dist('A4')
col_dist['A5'] = get_col_dist('A5')
col_dist['A6'] = get_col_dist('A6')
col_dist['A7'] = get_col_dist('A7')

def impute_cols(val, options):
    if val == '?':
        return np.random.choice(options['values'], p=options['prob'])
    return val

def impute_a1(val):
    return impute_cols(val, col_dist['A1'])

def impute_a4(val):
    return impute_cols(val, col_dist['A4'])

def impute_a5(val):
    return impute_cols(val, col_dist['A5'])

def impute_a6(val):
    return impute_cols(val, col_dist['A6'])

def impute_a7(val):
    return impute_cols(val, col_dist['A7'])

credit.A1 = credit.A1.map(impute_a1)
credit.A4 = credit.A4.map(impute_a4)
credit.A5 = credit.A5.map(impute_a5)
credit.A6 = credit.A6.map(impute_a6)
credit.A7 = credit.A7.map(impute_a7)

In [10]:
# A2 imputation.
# Check distribution. Looks like median would be better than mean. 
# Actually, this is snooping. It'd be better to split first, then impute. 
# But I didn't really know how to do it, so I left as is. 
credit.A2.hist(bins = 10)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x109bf3810>

In [11]:
# Impute median in A2 NaNs. 
A2_median = credit['A2'].median()
A2_missing_mask = credit.A2.isnull()
credit.loc[A2_missing_mask, 'A2'] = A2_median

In [12]:
# Impute A14 similar to A2.
# Check distribution. Very skewed. 
credit['A14'].hist(bins=20)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x109e04390>

In [13]:
# Impute A14 NaN with median.
A14_median = credit['A14'].median()
A14_missing_mask = credit['A14'].isnull()
credit.loc[A14_missing_mask, 'A14'] = A14_median

In [14]:
# Get dummies.
creditD = pd.get_dummies(credit)

In [15]:
credit_cols = creditD.columns
credit_cols


Out[15]:
Index([u'A2', u'A3', u'A8', u'A11', u'A14', u'A15', u'A16R', u'A1_a', u'A1_b', u'A4_l', u'A4_u', u'A4_y', u'A5_g', u'A5_gg', u'A5_p', u'A6_aa', u'A6_c', u'A6_cc', u'A6_d', u'A6_e', u'A6_ff', u'A6_i', u'A6_j', u'A6_k', u'A6_m', u'A6_q', u'A6_r', u'A6_w', u'A6_x', u'A7_bb', u'A7_dd', u'A7_ff', u'A7_h', u'A7_j', u'A7_n', u'A7_o', u'A7_v', u'A7_z', u'A9_f', u'A9_t', u'A10_f', u'A10_t', u'A12_f', u'A12_t', u'A13_g', u'A13_p', u'A13_s'], dtype='object')

In [16]:
creditD.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 47 columns):
A2       690 non-null float64
A3       690 non-null float64
A8       690 non-null float64
A11      690 non-null int64
A14      690 non-null float64
A15      690 non-null int64
A16R     690 non-null float64
A1_a     690 non-null float64
A1_b     690 non-null float64
A4_l     690 non-null float64
A4_u     690 non-null float64
A4_y     690 non-null float64
A5_g     690 non-null float64
A5_gg    690 non-null float64
A5_p     690 non-null float64
A6_aa    690 non-null float64
A6_c     690 non-null float64
A6_cc    690 non-null float64
A6_d     690 non-null float64
A6_e     690 non-null float64
A6_ff    690 non-null float64
A6_i     690 non-null float64
A6_j     690 non-null float64
A6_k     690 non-null float64
A6_m     690 non-null float64
A6_q     690 non-null float64
A6_r     690 non-null float64
A6_w     690 non-null float64
A6_x     690 non-null float64
A7_bb    690 non-null float64
A7_dd    690 non-null float64
A7_ff    690 non-null float64
A7_h     690 non-null float64
A7_j     690 non-null float64
A7_n     690 non-null float64
A7_o     690 non-null float64
A7_v     690 non-null float64
A7_z     690 non-null float64
A9_f     690 non-null float64
A9_t     690 non-null float64
A10_f    690 non-null float64
A10_t    690 non-null float64
A12_f    690 non-null float64
A12_t    690 non-null float64
A13_g    690 non-null float64
A13_p    690 non-null float64
A13_s    690 non-null float64
dtypes: float64(45), int64(2)
memory usage: 258.8 KB

In [17]:
# Split.
credit_y = creditD.A16R
credit_X = creditD.drop('A16R', axis = 1)
credit_X_train, credit_X_test, credit_y_train, credit_y_test = train_test_split(credit_X, credit_y, 
                                                                                test_size=0.2, 
                                                                                random_state = 12)

In [18]:
## Exploratory analysis on credit_X_train, credit_y_train. 
credit_XTrain = pd.DataFrame(credit_X_train)
credit_YTrain = pd.DataFrame(credit_y_train, columns = ['A16R'])
creditExpl = credit_XTrain.join(credit_YTrain)

In [19]:
creditExpl.columns = credit_cols

In [20]:
creditExpl.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 552 entries, 0 to 551
Data columns (total 47 columns):
A2       552 non-null float64
A3       552 non-null float64
A8       552 non-null float64
A11      552 non-null float64
A14      552 non-null float64
A15      552 non-null float64
A16R     552 non-null float64
A1_a     552 non-null float64
A1_b     552 non-null float64
A4_l     552 non-null float64
A4_u     552 non-null float64
A4_y     552 non-null float64
A5_g     552 non-null float64
A5_gg    552 non-null float64
A5_p     552 non-null float64
A6_aa    552 non-null float64
A6_c     552 non-null float64
A6_cc    552 non-null float64
A6_d     552 non-null float64
A6_e     552 non-null float64
A6_ff    552 non-null float64
A6_i     552 non-null float64
A6_j     552 non-null float64
A6_k     552 non-null float64
A6_m     552 non-null float64
A6_q     552 non-null float64
A6_r     552 non-null float64
A6_w     552 non-null float64
A6_x     552 non-null float64
A7_bb    552 non-null float64
A7_dd    552 non-null float64
A7_ff    552 non-null float64
A7_h     552 non-null float64
A7_j     552 non-null float64
A7_n     552 non-null float64
A7_o     552 non-null float64
A7_v     552 non-null float64
A7_z     552 non-null float64
A9_f     552 non-null float64
A9_t     552 non-null float64
A10_f    552 non-null float64
A10_t    552 non-null float64
A12_f    552 non-null float64
A12_t    552 non-null float64
A13_g    552 non-null float64
A13_p    552 non-null float64
A13_s    552 non-null float64
dtypes: float64(47)
memory usage: 207.0 KB

In [21]:
creditExpl.head(20)


Out[21]:
A2 A3 A8 A11 A14 A15 A16R A1_a A1_b A4_l ... A7_z A9_f A9_t A10_f A10_t A12_f A12_t A13_g A13_p A13_s
0 23.75 0.710 0.250 1 240 4 1 0 0 1 ... 1 0 0 1 0 1 1 0 0 0
1 25.83 12.835 0.500 0 0 2 0 1 0 1 ... 1 0 1 0 1 0 1 0 0 0
2 25.17 2.875 0.875 0 360 0 1 0 0 1 ... 0 1 1 0 1 0 1 0 0 1
3 69.50 6.000 0.000 0 0 0 1 0 0 1 ... 1 0 1 0 1 0 0 0 1 0
4 32.08 4.000 1.500 0 120 0 0 1 0 0 ... 1 0 1 0 0 1 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
15 29.50 0.460 0.540 4 380 500 1 0 0 1 ... 0 1 0 1 1 0 1 0 0 1
16 18.58 10.290 0.415 0 80 0 0 1 0 1 ... 1 0 1 0 1 0 1 0 0 0
17 17.92 10.210 0.000 0 0 50 1 0 0 1 ... 1 0 1 0 1 0 1 0 0 0
18 23.58 0.835 0.085 0 220 5 0 1 0 1 ... 1 0 1 0 0 1 1 0 0 0
19 18.17 10.250 1.085 0 320 13 0 1 0 1 ... 1 0 1 0 1 0 1 0 0 0

20 rows × 47 columns


In [22]:
numFeat = ['A2', 'A3', 'A8', 'A14', 'A15']
creditExpl[numFeat].describe()


Out[22]:
A2 A3 A8 A14 A15
count 552.000 552.000 552.000 552.000 552.000
mean 31.445 4.796 2.125 187.393 1144.716
std 11.976 4.856 3.065 177.605 5766.012
min 15.170 0.000 0.000 0.000 0.000
25% 22.730 1.040 0.165 80.000 0.000
50% 28.460 2.855 1.000 160.000 4.500
75% 37.330 7.510 2.551 280.000 379.750
max 80.250 26.335 20.000 2000.000 100000.000

In [23]:
# Scatter matrix all data. Clearly non-normal/ skewed. 
pd.tools.plotting.scatter_matrix(creditExpl[numFeat], alpha = 0.2, figsize = (15, 10), diagonal = 'kde');



In [24]:
# Note: If you have seaborn already imported, outliers won't show. If that happends, restart Kernel. 
# Some boxplots: A2, A3, A8, A11, A14, A15.
# Super skewed, outliers. 
fig, axarr = plt.subplots(2, 3, figsize = (15, 15))
axarr[0, 0].boxplot(creditExpl['A2'], 1);
axarr[0, 0].set_title('A2')
axarr[0, 1].boxplot(creditExpl['A3'], 1);
axarr[0, 1].set_title('A3')
axarr[0, 2].boxplot(creditExpl['A8'], 1);
axarr[0, 2].set_title('A8')
axarr[1, 0].boxplot(creditExpl['A11'], 1);
axarr[1, 0].set_title('A11')
axarr[1, 1].boxplot(creditExpl['A14'], 1);
axarr[1, 1].set_title('A14')
axarr[1, 2].boxplot(creditExpl['A15'], 1);
axarr[1, 2].set_title('A15')


Out[24]:
<matplotlib.text.Text at 0x10c69bf90>

In [25]:
# Non-parametric correlations.
creditExpl[numFeat].corr(method = 'spearman')


Out[25]:
A2 A3 A8 A14 A15
A2 1.000 0.068 0.224 -0.000 0.018
A3 0.068 1.000 0.228 -0.315 0.095
A8 0.224 0.228 1.000 -0.047 0.089
A14 -0.000 -0.315 -0.047 1.000 -0.083
A15 0.018 0.095 0.089 -0.083 1.000

In [26]:
# Double-check a few corr for significance. Seem pretty high. 
# Sign. corr would violate assumption of logReg. But scatter plots/ boxplots suggest influential cases.
print 'A2, A8:', stats.spearmanr(creditExpl['A2'], creditExpl['A8'])
print 'A3, A8:', stats.spearmanr(creditExpl['A8'], creditExpl['A3'])
print 'A3, A14:', stats.spearmanr(creditExpl['A3'], creditExpl['A14'])


A2, A8: (0.22406272719179662, 1.036890574115048e-07)
A3, A8: (0.22793218520983996, 6.1464903856487324e-08)
A3, A14: (-0.31469589038511209, 3.729592309652447e-14)

In [27]:
# Means by group (y). 
creditExpl.groupby(creditExpl.A16R)[numFeat].agg('mean')


Out[27]:
A2 A3 A8 A14 A15
A16R
0 31.821 4.688 2.315 195.379 1174.624
1 30.615 5.035 1.707 169.750 1078.640

In [28]:
# Medians by group (y).
creditExpl.groupby(creditExpl.A16R)[numFeat].agg('median')


Out[28]:
A2 A3 A8 A14 A15
A16R
0 28.750 2.73 1.085 160 2
1 26.125 3.00 0.875 140 10

In [29]:
# Build logistic regression.
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()

In [30]:
logReg.fit(credit_X_train, credit_y_train)


Out[30]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [31]:
logReg.intercept_


Out[31]:
array([-0.0454])

In [32]:
logReg.coef_


Out[32]:
array([[ -1.0359e-05,  -1.6605e-02,   9.0979e-02,   1.2233e-01,
         -1.4361e-03,   3.9203e-04,  -1.0910e-01,   6.3675e-02,
          2.8408e-01,   6.3370e-02,  -3.9288e-01,  -3.4432e-03,
          2.8408e-01,  -3.2607e-01,  -4.5956e-01,  -2.0965e-01,
          7.4176e-01,  -2.0231e-01,   9.3729e-03,  -8.9100e-01,
         -6.9002e-01,  -1.0492e-01,   5.1600e-02,  -2.0996e-01,
          3.3490e-01,   3.8309e-03,   3.7734e-01,   1.2032e+00,
          1.1905e-01,   3.7856e-02,  -5.8530e-01,   3.2743e-01,
          1.9429e-01,   2.4943e-01,  -2.2300e-02,  -4.8764e-02,
         -3.1711e-01,  -1.6630e+00,   1.6176e+00,  -3.3007e-01,
          2.8464e-01,   1.1066e-01,  -1.5608e-01,  -5.7705e-01,
          7.6783e-01,  -2.3620e-01]])

In [33]:
# Explained variance (R squared) - about 86%. Pretty high. 
logReg.score(credit_X_test, credit_y_test)


Out[33]:
0.85507246376811596

In [34]:
from sklearn.metrics import confusion_matrix, classification_report

In [35]:
# Confusion Matrix
credit_y_pred = logReg.predict(credit_X_test)
credit_cm = confusion_matrix(credit_y_test, credit_y_pred)
credit_cm


Out[35]:
array([[62, 15],
       [ 5, 56]])

In [36]:
# Plot confusion matrix.
plt.matshow(credit_cm)
plt.title('Confusion matrix for credit data')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')


Out[36]:
<matplotlib.text.Text at 0x10da92d90>

In [37]:
# Get precision and recall
# Precision: better for 0 than 1 credit. Closer to 1 is better. TP/ TP + FP
# Precision = not to label as positive a sample that is negative. 
# Recall (sensitivity): better for 1 than 0. Means algo is more sensitive for 1. TP / TP + FN
# Recall find all the positive samples.
# F measure is weighted harmonic mean of the precision and recall - better closer to 1 than 0. Seems ok.
# Support is number of cases. 
print classification_report(credit_y_test, credit_y_pred)


             precision    recall  f1-score   support

        0.0       0.93      0.81      0.86        77
        1.0       0.79      0.92      0.85        61

avg / total       0.86      0.86      0.86       138


In [38]:
# Look at coefs. But they are not normalized, so don't know their importance. 
# Need to check out/ include preprocessing module to normalize X.  
pd.DataFrame(zip(credit_cols, np.transpose(logReg.coef_)))


Out[38]:
0 1
0 A2 [-1.03593533785e-05]
1 A3 [-0.016604785678]
2 A8 [0.0909788100897]
3 A11 [0.122327498209]
4 A14 [-0.00143610989014]
... ... ...
41 A10_t [0.110658968395]
42 A12_f [-0.156083948483]
43 A12_t [-0.577048002548]
44 A13_g [0.767827860262]
45 A13_p [-0.236204837802]

46 rows × 2 columns


In [39]:
# Look at probabilities.
credit_y_pred_df = pd.DataFrame(logReg.predict_proba(credit_X_test)) 
credit_y_pred_df['Predicted credit'] = credit_y_pred
credit_y_pred_df['True credit'] = credit_y_test
credit_y_pred_df.head(10)


Out[39]:
0 1 Predicted credit True credit
0 0.037 0.963 1 0
1 0.935 0.065 0 0
2 0.231 0.769 1 1
3 0.252 0.748 1 1
4 0.391 0.609 1 1
5 0.011 0.989 1 1
6 0.956 0.044 0 0
7 0.933 0.067 0 1
8 0.062 0.938 1 1
9 0.020 0.980 1 1

In [40]:
# Plot predicted vs true values
import seaborn as sns
sns.regplot(credit_y_pred, credit_y_test, x_jitter=0.15, y_jitter=0.15, color = 'r');



In [41]:
# Calculate Matthews corr coeff as measure of quality of binary classification. Closer to 1 = better. 
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(credit_y_test, credit_y_pred)


Out[41]:
0.71865208690214999

In [42]:
# Cross validation. Values seem to vary a lot, so model seems to be sensitive to train test split. 
import sklearn.cross_validation as cv
cv.cross_val_score(logReg, credit_X, credit_y, )


Out[42]:
array([ 0.7662,  0.8826,  0.8297])

In [43]:
# Do grid search for logistic regression.
from sklearn.grid_search import GridSearchCV
gs_LogReg = GridSearchCV(logReg, {'C': np.logspace(-5, 5, 200)}, n_jobs=4)
gs_LogReg.fit(credit_X_train, credit_y_train);
gs_LogReg.best_params_, gs_LogReg.best_score_, gs_LogReg.best_estimator_# These would be the best params to use.


Out[43]:
({'C': 1231.5506032928261},
 0.87681159420289856,
 LogisticRegression(C=1231.5506032928261, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, penalty='l2',
           random_state=None, tol=0.0001))

In [44]:
gs_LogReg_scores = cv.cross_val_score(gs_LogReg.best_estimator_, credit_X, credit_y, cv=5)
gs_LogReg_scores.min(), gs_LogReg_scores.max(), gs_LogReg_scores.mean() # Scores vary with split - not good.


Out[44]:
(0.61870503597122306, 0.97122302158273377, 0.84368115789818154)

In [45]:
# SVM
from sklearn.svm import LinearSVC

In [46]:
# Linear first.
# Smaller C, larger margin. 
LinSVC = LinearSVC() # basic model

In [47]:
LinSVC.fit(credit_X_train, credit_y_train)


Out[47]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [48]:
LinSVC.score(credit_X_test, credit_y_test) # Pretty good.


Out[48]:
0.81159420289855078

In [49]:
# Confusion Matrix
credit_y_pred2 = LinSVC.predict(credit_X_test)
credit_cm2 = confusion_matrix(credit_y_test, credit_y_pred2)
credit_cm2 # This model works similar to LogReg model.


Out[49]:
array([[59, 18],
       [ 8, 53]])

In [50]:
# Plot confusion matrix.
plt.matshow(credit_cm2)
plt.title('Confusion matrix for credit data')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')


Out[50]:
<matplotlib.text.Text at 0x10ae57250>

In [51]:
print classification_report(credit_y_test, credit_y_pred2)


             precision    recall  f1-score   support

        0.0       0.88      0.77      0.82        77
        1.0       0.75      0.87      0.80        61

avg / total       0.82      0.81      0.81       138


In [52]:
# Plot predicted vs true values
import seaborn as sns
sns.regplot(credit_y_test, credit_y_pred2, x_jitter=0.15, y_jitter=0.15, color = 'r');



In [53]:
# Gridsearch for linear 
from sklearn.grid_search import GridSearchCV
linParameters = {'C': np.logspace(-3,3,10)}

In [54]:
gs_LinSVC = GridSearchCV(LinearSVC(), linParameters)

In [55]:
gs_LinSVC.fit(credit_X_train, credit_y_train) # random_state also seems to set other params. meaning? influence?


Out[55]:
GridSearchCV(cv=None,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [56]:
gs_LinSVC.score(credit_X_test, credit_y_test) # Pretty good too.


Out[56]:
0.84782608695652173

In [57]:
gs_LinSVC.best_params_, gs_LinSVC.best_score_ # Best params to use (C and gamma). Score comparable to LogReg.


Out[57]:
({'C': 0.021544346900318832}, 0.83695652173913049)

In [58]:
gs_LinSVC.grid_scores_ # Everything calculated.


Out[58]:
[mean: 0.74819, std: 0.07767, params: {'C': 0.001},
 mean: 0.78080, std: 0.03614, params: {'C': 0.0046415888336127772},
 mean: 0.83696, std: 0.02662, params: {'C': 0.021544346900318832},
 mean: 0.82971, std: 0.03360, params: {'C': 0.10000000000000001},
 mean: 0.74457, std: 0.06889, params: {'C': 0.46415888336127775},
 mean: 0.75725, std: 0.08210, params: {'C': 2.154434690031882},
 mean: 0.74275, std: 0.07548, params: {'C': 10.0},
 mean: 0.74275, std: 0.05619, params: {'C': 46.415888336127729},
 mean: 0.72826, std: 0.14989, params: {'C': 215.44346900318823},
 mean: 0.71558, std: 0.10542, params: {'C': 1000.0}]

In [59]:
from sklearn.svm import SVC

In [60]:
# Non-linear SVC. 
NlSVC = SVC()

In [61]:
NlSVC.fit(credit_X_train, credit_y_train)


Out[61]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [62]:
NlSVC.score(credit_X_test, credit_y_test) # Pretty low.


Out[62]:
0.55072463768115942

In [63]:
# Confusion Matrix
credit_y_pred3 = NlSVC.predict(credit_X_test)
credit_cm3 = confusion_matrix(credit_y_test, credit_y_pred3)
credit_cm3 # That's pretty bad.


Out[63]:
array([[70,  7],
       [55,  6]])

In [64]:
print classification_report(credit_y_test, credit_y_pred3)


             precision    recall  f1-score   support

        0.0       0.56      0.91      0.69        77
        1.0       0.46      0.10      0.16        61

avg / total       0.52      0.55      0.46       138


In [65]:
# Plot confusion matrix.
plt.matshow(credit_cm3)
plt.title('Confusion matrix for credit data')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')


Out[65]:
<matplotlib.text.Text at 0x10a8d77d0>

In [66]:
sns.regplot(credit_y_test, credit_y_pred3, x_jitter=0.15, y_jitter=0.15, color = 'r'); #Yikes.


Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aaec1d0>

In [67]:
# Params for GridSearch
nlParameters = {'C': np.logspace(-3,3,10), 'gamma': np.logspace(-3,3,10)}

In [68]:
gs_nlSVC = GridSearchCV(SVC(random_state = 12), nlParameters)

In [69]:
gs_nlSVC.fit(credit_X_train, credit_y_train)


Out[69]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=12,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [70]:
gs_nlSVC.score(credit_X_test, credit_y_test) # Ok, but not great.


Out[70]:
0.73188405797101452

In [71]:
gs_nlSVC.best_params_, gs_nlSVC.best_score_ # Confused by this. Large C means smaller margin?


Out[71]:
({'C': 46.415888336127729, 'gamma': 0.001}, 0.67391304347826086)

In [72]:
gs_nlSVC.grid_scores_ # Everything calculated.


Out[72]:
[mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 1000.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 1000.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 1000.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 1000.0},
 mean: 0.63406, std: 0.02818, params: {'C': 0.46415888336127775, 'gamma': 0.001},
 mean: 0.53986, std: 0.01281, params: {'C': 0.46415888336127775, 'gamma': 0.0046415888336127772},
 mean: 0.55616, std: 0.00256, params: {'C': 0.46415888336127775, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 1000.0},
 mean: 0.63768, std: 0.01793, params: {'C': 2.154434690031882, 'gamma': 0.001},
 mean: 0.60326, std: 0.01174, params: {'C': 2.154434690031882, 'gamma': 0.0046415888336127772},
 mean: 0.51630, std: 0.01600, params: {'C': 2.154434690031882, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 2.154434690031882, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 2.154434690031882, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 1000.0},
 mean: 0.65942, std: 0.01356, params: {'C': 10.0, 'gamma': 0.001},
 mean: 0.61957, std: 0.01174, params: {'C': 10.0, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 10.0, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 10.0, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 10.0, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 1000.0},
 mean: 0.67391, std: 0.01174, params: {'C': 46.415888336127729, 'gamma': 0.001},
 mean: 0.61594, std: 0.02600, params: {'C': 46.415888336127729, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 46.415888336127729, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 46.415888336127729, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 46.415888336127729, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 1000.0},
 mean: 0.65036, std: 0.00256, params: {'C': 215.44346900318823, 'gamma': 0.001},
 mean: 0.61594, std: 0.02600, params: {'C': 215.44346900318823, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 215.44346900318823, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 215.44346900318823, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 215.44346900318823, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 1000.0},
 mean: 0.65217, std: 0.00000, params: {'C': 1000.0, 'gamma': 0.001},
 mean: 0.61594, std: 0.02600, params: {'C': 1000.0, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 1000.0, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 1000.0, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 1000.0, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 1000.0}]

In [ ]:
# LogReg and LinearSVC worked pretty well. Non-linear SVC did not - not sure why.