notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats 
from sklearn.cross_validation import train_test_split

pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 4)
np.set_printoptions(precision = 4)

%matplotlib inline



In [3]:

    
credit = pd.read_csv('./crx.data', header = None,
                     names = ['A1','A2','A3','A4','A5','A6','A7','A8','A9','A10','A11','A12','A13','A14','A15','A16'])



In [4]:

    
credit.head()



In [5]:

    
credit.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB



In [6]:

    
# Recode A16 plus minus into new variable with 1 and 0.
def recodeA16(sign):
    if sign == '+':
        return 1
    return 0
credit['A16R'] = credit['A16'].map(recodeA16).astype(float)
credit = credit.drop('A16', axis = 1)



In [7]:

    
# Deal with missing NaN.
# Transform numeric variables for float, the rest to str.
stringFeat = {'A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'}
numFeat = {'A2', 'A3', 'A8', 'A11', 'A14', 'A15', 'A16R'}
for string in stringFeat:
    credit[string] = credit[string].astype(str)
for number in numFeat:
    credit[number] = credit[number].convert_objects(convert_numeric = True)



In [8]:

    
# This was my inefficient way of doing the imputations. 
# Check for missing values: A1, A2, A4, A5, A6, A7, and A14 should have some (documentation says so). 
# Some are coded as ?, so need to be replaced with NaN.
# Recode ? as Nan.
# credit.replace('?', np.nan, inplace = True)
# Deal with missing values.
# A1 imputation based on distribution of unique values in A1. 
# Binary mask for NaN.
# A1_missing_mask = credit.A1.isnull()
# Get unique values. 
# A1_choice = list(set(credit[-A1_missing_mask].A1))
# Get counts of unique values.
# A1_a_count = credit[credit.A1 == 'a']['A1'].count().astype(float)
# A1_b_count = credit[credit.A1 == 'b']['A1'].count().astype(float)
# Function to impute A1. 
# def get_A1_impute(number):
#    return np.random.choice(A1_choice, number, p = [A1_a_count / (A1_a_count + A1_b_count), 
#                                                                           A1_b_count / (A1_a_count + A1_b_count)])
# Impute. 
# credit.loc[A1_missing_mask, 'A1'] = get_A1_impute(len(credit[pd.isnull(credit.A1)]))
# Deal with A4, A5, A6, A7. 
# Masks.
# A4_missing_mask = credit.A4.isnull() # 6 missing
# A5_missing_mask = credit.A5.isnull() # 6 missing
# A6_missing_mask = credit.A6.isnull() # 8 missing
# A7_missing_mask = credit.A7.isnull() # 8 missing
# Compare rows idx A4 through A7. If the same/ subsets, drop for now.
# def get_idx(data, feature):
#    return data[data[feature].isnull()].index.tolist()
# get_idx(credit, 'A4') == get_idx(credit, 'A5')
# get_idx(credit, 'A6') == get_idx(credit, 'A7')
# Check whether A4 and A5 are subsets of A6 and A7.
# set(get_idx(credit,'A4')).issubset(get_idx(credit,'A6'))
# Drop all rows where A6 NaN. Drop A2. 
# creditR = credit.drop(get_idx(credit,'A6'), axis = 0)



In [9]:

    
# Code from Lab_07. Thank you! 
col_dist = {}
def get_col_dist(col_name):
    excl_null_mask = credit[col_name] != '?'
    row_count = credit[excl_null_mask][col_name].size
    col_data = {}
    col_data['prob'] = (credit[excl_null_mask][col_name].value_counts() / row_count).values
    col_data['values'] = (credit[excl_null_mask][col_name].value_counts() / row_count).index.values
    return col_data
col_dist['A1'] = get_col_dist('A1')
col_dist['A4'] = get_col_dist('A4')
col_dist['A5'] = get_col_dist('A5')
col_dist['A6'] = get_col_dist('A6')
col_dist['A7'] = get_col_dist('A7')

def impute_cols(val, options):
    if val == '?':
        return np.random.choice(options['values'], p=options['prob'])
    return val

def impute_a1(val):
    return impute_cols(val, col_dist['A1'])

def impute_a4(val):
    return impute_cols(val, col_dist['A4'])

def impute_a5(val):
    return impute_cols(val, col_dist['A5'])

def impute_a6(val):
    return impute_cols(val, col_dist['A6'])

def impute_a7(val):
    return impute_cols(val, col_dist['A7'])

credit.A1 = credit.A1.map(impute_a1)
credit.A4 = credit.A4.map(impute_a4)
credit.A5 = credit.A5.map(impute_a5)
credit.A6 = credit.A6.map(impute_a6)
credit.A7 = credit.A7.map(impute_a7)



In [10]:

    
# A2 imputation.
# Check distribution. Looks like median would be better than mean. 
# Actually, this is snooping. It'd be better to split first, then impute. 
# But I didn't really know how to do it, so I left as is. 
credit.A2.hist(bins = 10)









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x109bf3810>



In [11]:

    
# Impute median in A2 NaNs. 
A2_median = credit['A2'].median()
A2_missing_mask = credit.A2.isnull()
credit.loc[A2_missing_mask, 'A2'] = A2_median



In [12]:

    
# Impute A14 similar to A2.
# Check distribution. Very skewed. 
credit['A14'].hist(bins=20)









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x109e04390>



In [13]:

    
# Impute A14 NaN with median.
A14_median = credit['A14'].median()
A14_missing_mask = credit['A14'].isnull()
credit.loc[A14_missing_mask, 'A14'] = A14_median



In [14]:

    
# Get dummies.
creditD = pd.get_dummies(credit)



In [15]:

    
credit_cols = creditD.columns
credit_cols









    Out[15]:





Index([u'A2', u'A3', u'A8', u'A11', u'A14', u'A15', u'A16R', u'A1_a', u'A1_b', u'A4_l', u'A4_u', u'A4_y', u'A5_g', u'A5_gg', u'A5_p', u'A6_aa', u'A6_c', u'A6_cc', u'A6_d', u'A6_e', u'A6_ff', u'A6_i', u'A6_j', u'A6_k', u'A6_m', u'A6_q', u'A6_r', u'A6_w', u'A6_x', u'A7_bb', u'A7_dd', u'A7_ff', u'A7_h', u'A7_j', u'A7_n', u'A7_o', u'A7_v', u'A7_z', u'A9_f', u'A9_t', u'A10_f', u'A10_t', u'A12_f', u'A12_t', u'A13_g', u'A13_p', u'A13_s'], dtype='object')



In [16]:

    
creditD.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 47 columns):
A2       690 non-null float64
A3       690 non-null float64
A8       690 non-null float64
A11      690 non-null int64
A14      690 non-null float64
A15      690 non-null int64
A16R     690 non-null float64
A1_a     690 non-null float64
A1_b     690 non-null float64
A4_l     690 non-null float64
A4_u     690 non-null float64
A4_y     690 non-null float64
A5_g     690 non-null float64
A5_gg    690 non-null float64
A5_p     690 non-null float64
A6_aa    690 non-null float64
A6_c     690 non-null float64
A6_cc    690 non-null float64
A6_d     690 non-null float64
A6_e     690 non-null float64
A6_ff    690 non-null float64
A6_i     690 non-null float64
A6_j     690 non-null float64
A6_k     690 non-null float64
A6_m     690 non-null float64
A6_q     690 non-null float64
A6_r     690 non-null float64
A6_w     690 non-null float64
A6_x     690 non-null float64
A7_bb    690 non-null float64
A7_dd    690 non-null float64
A7_ff    690 non-null float64
A7_h     690 non-null float64
A7_j     690 non-null float64
A7_n     690 non-null float64
A7_o     690 non-null float64
A7_v     690 non-null float64
A7_z     690 non-null float64
A9_f     690 non-null float64
A9_t     690 non-null float64
A10_f    690 non-null float64
A10_t    690 non-null float64
A12_f    690 non-null float64
A12_t    690 non-null float64
A13_g    690 non-null float64
A13_p    690 non-null float64
A13_s    690 non-null float64
dtypes: float64(45), int64(2)
memory usage: 258.8 KB



In [17]:

    
# Split.
credit_y = creditD.A16R
credit_X = creditD.drop('A16R', axis = 1)
credit_X_train, credit_X_test, credit_y_train, credit_y_test = train_test_split(credit_X, credit_y, 
                                                                                test_size=0.2, 
                                                                                random_state = 12)



In [18]:

    
## Exploratory analysis on credit_X_train, credit_y_train. 
credit_XTrain = pd.DataFrame(credit_X_train)
credit_YTrain = pd.DataFrame(credit_y_train, columns = ['A16R'])
creditExpl = credit_XTrain.join(credit_YTrain)



In [19]:

    
creditExpl.columns = credit_cols



In [20]:

    
creditExpl.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 552 entries, 0 to 551
Data columns (total 47 columns):
A2       552 non-null float64
A3       552 non-null float64
A8       552 non-null float64
A11      552 non-null float64
A14      552 non-null float64
A15      552 non-null float64
A16R     552 non-null float64
A1_a     552 non-null float64
A1_b     552 non-null float64
A4_l     552 non-null float64
A4_u     552 non-null float64
A4_y     552 non-null float64
A5_g     552 non-null float64
A5_gg    552 non-null float64
A5_p     552 non-null float64
A6_aa    552 non-null float64
A6_c     552 non-null float64
A6_cc    552 non-null float64
A6_d     552 non-null float64
A6_e     552 non-null float64
A6_ff    552 non-null float64
A6_i     552 non-null float64
A6_j     552 non-null float64
A6_k     552 non-null float64
A6_m     552 non-null float64
A6_q     552 non-null float64
A6_r     552 non-null float64
A6_w     552 non-null float64
A6_x     552 non-null float64
A7_bb    552 non-null float64
A7_dd    552 non-null float64
A7_ff    552 non-null float64
A7_h     552 non-null float64
A7_j     552 non-null float64
A7_n     552 non-null float64
A7_o     552 non-null float64
A7_v     552 non-null float64
A7_z     552 non-null float64
A9_f     552 non-null float64
A9_t     552 non-null float64
A10_f    552 non-null float64
A10_t    552 non-null float64
A12_f    552 non-null float64
A12_t    552 non-null float64
A13_g    552 non-null float64
A13_p    552 non-null float64
A13_s    552 non-null float64
dtypes: float64(47)
memory usage: 207.0 KB



In [21]:

    
creditExpl.head(20)









    Out[21]:






  
    
      
      A2
      A3
      A8
      A11
      A14
      A15
      A16R
      A1_a
      A1_b
      A4_l
      ...
      A7_z
      A9_f
      A9_t
      A10_f
      A10_t
      A12_f
      A12_t
      A13_g
      A13_p
      A13_s
    
  
  
    
      0 
       23.75
        0.710
       0.250
       1
       240
         4
       1
       0
       0
       1
      ...
       1
       0
       0
       1
       0
       1
       1
       0
       0
       0
    
    
      1 
       25.83
       12.835
       0.500
       0
         0
         2
       0
       1
       0
       1
      ...
       1
       0
       1
       0
       1
       0
       1
       0
       0
       0
    
    
      2 
       25.17
        2.875
       0.875
       0
       360
         0
       1
       0
       0
       1
      ...
       0
       1
       1
       0
       1
       0
       1
       0
       0
       1
    
    
      3 
       69.50
        6.000
       0.000
       0
         0
         0
       1
       0
       0
       1
      ...
       1
       0
       1
       0
       1
       0
       0
       0
       1
       0
    
    
      4 
       32.08
        4.000
       1.500
       0
       120
         0
       0
       1
       0
       0
      ...
       1
       0
       1
       0
       0
       1
       1
       0
       0
       0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      15
       29.50
        0.460
       0.540
       4
       380
       500
       1
       0
       0
       1
      ...
       0
       1
       0
       1
       1
       0
       1
       0
       0
       1
    
    
      16
       18.58
       10.290
       0.415
       0
        80
         0
       0
       1
       0
       1
      ...
       1
       0
       1
       0
       1
       0
       1
       0
       0
       0
    
    
      17
       17.92
       10.210
       0.000
       0
         0
        50
       1
       0
       0
       1
      ...
       1
       0
       1
       0
       1
       0
       1
       0
       0
       0
    
    
      18
       23.58
        0.835
       0.085
       0
       220
         5
       0
       1
       0
       1
      ...
       1
       0
       1
       0
       0
       1
       1
       0
       0
       0
    
    
      19
       18.17
       10.250
       1.085
       0
       320
        13
       0
       1
       0
       1
      ...
       1
       0
       1
       0
       1
       0
       1
       0
       0
       0
    
  

20 rows × 47 columns



In [22]:

    
numFeat = ['A2', 'A3', 'A8', 'A14', 'A15']
creditExpl[numFeat].describe()



In [23]:

    
# Scatter matrix all data. Clearly non-normal/ skewed. 
pd.tools.plotting.scatter_matrix(creditExpl[numFeat], alpha = 0.2, figsize = (15, 10), diagonal = 'kde');



In [24]:

    
# Note: If you have seaborn already imported, outliers won't show. If that happends, restart Kernel. 
# Some boxplots: A2, A3, A8, A11, A14, A15.
# Super skewed, outliers. 
fig, axarr = plt.subplots(2, 3, figsize = (15, 15))
axarr[0, 0].boxplot(creditExpl['A2'], 1);
axarr[0, 0].set_title('A2')
axarr[0, 1].boxplot(creditExpl['A3'], 1);
axarr[0, 1].set_title('A3')
axarr[0, 2].boxplot(creditExpl['A8'], 1);
axarr[0, 2].set_title('A8')
axarr[1, 0].boxplot(creditExpl['A11'], 1);
axarr[1, 0].set_title('A11')
axarr[1, 1].boxplot(creditExpl['A14'], 1);
axarr[1, 1].set_title('A14')
axarr[1, 2].boxplot(creditExpl['A15'], 1);
axarr[1, 2].set_title('A15')









    Out[24]:





<matplotlib.text.Text at 0x10c69bf90>



In [25]:

    
# Non-parametric correlations.
creditExpl[numFeat].corr(method = 'spearman')



In [26]:

    
# Double-check a few corr for significance. Seem pretty high. 
# Sign. corr would violate assumption of logReg. But scatter plots/ boxplots suggest influential cases.
print 'A2, A8:', stats.spearmanr(creditExpl['A2'], creditExpl['A8'])
print 'A3, A8:', stats.spearmanr(creditExpl['A8'], creditExpl['A3'])
print 'A3, A14:', stats.spearmanr(creditExpl['A3'], creditExpl['A14'])









    



A2, A8: (0.22406272719179662, 1.036890574115048e-07)
A3, A8: (0.22793218520983996, 6.1464903856487324e-08)
A3, A14: (-0.31469589038511209, 3.729592309652447e-14)



In [27]:

    
# Means by group (y). 
creditExpl.groupby(creditExpl.A16R)[numFeat].agg('mean')



In [28]:

    
# Medians by group (y).
creditExpl.groupby(creditExpl.A16R)[numFeat].agg('median')



In [29]:

    
# Build logistic regression.
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()



In [30]:

    
logReg.fit(credit_X_train, credit_y_train)









    Out[30]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)



In [31]:

    
logReg.intercept_









    Out[31]:





array([-0.0454])



In [32]:

    
logReg.coef_









    Out[32]:





array([[ -1.0359e-05,  -1.6605e-02,   9.0979e-02,   1.2233e-01,
         -1.4361e-03,   3.9203e-04,  -1.0910e-01,   6.3675e-02,
          2.8408e-01,   6.3370e-02,  -3.9288e-01,  -3.4432e-03,
          2.8408e-01,  -3.2607e-01,  -4.5956e-01,  -2.0965e-01,
          7.4176e-01,  -2.0231e-01,   9.3729e-03,  -8.9100e-01,
         -6.9002e-01,  -1.0492e-01,   5.1600e-02,  -2.0996e-01,
          3.3490e-01,   3.8309e-03,   3.7734e-01,   1.2032e+00,
          1.1905e-01,   3.7856e-02,  -5.8530e-01,   3.2743e-01,
          1.9429e-01,   2.4943e-01,  -2.2300e-02,  -4.8764e-02,
         -3.1711e-01,  -1.6630e+00,   1.6176e+00,  -3.3007e-01,
          2.8464e-01,   1.1066e-01,  -1.5608e-01,  -5.7705e-01,
          7.6783e-01,  -2.3620e-01]])



In [33]:

    
# Explained variance (R squared) - about 86%. Pretty high. 
logReg.score(credit_X_test, credit_y_test)









    Out[33]:





0.85507246376811596



In [34]:

    
from sklearn.metrics import confusion_matrix, classification_report



In [35]:

    
# Confusion Matrix
credit_y_pred = logReg.predict(credit_X_test)
credit_cm = confusion_matrix(credit_y_test, credit_y_pred)
credit_cm









    Out[35]:





array([[62, 15],
       [ 5, 56]])



In [36]:

    
# Plot confusion matrix.
plt.matshow(credit_cm)
plt.title('Confusion matrix for credit data')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')









    Out[36]:





<matplotlib.text.Text at 0x10da92d90>



In [37]:

    
# Get precision and recall
# Precision: better for 0 than 1 credit. Closer to 1 is better. TP/ TP + FP
# Precision = not to label as positive a sample that is negative. 
# Recall (sensitivity): better for 1 than 0. Means algo is more sensitive for 1. TP / TP + FN
# Recall find all the positive samples.
# F measure is weighted harmonic mean of the precision and recall - better closer to 1 than 0. Seems ok.
# Support is number of cases. 
print classification_report(credit_y_test, credit_y_pred)









    



             precision    recall  f1-score   support

        0.0       0.93      0.81      0.86        77
        1.0       0.79      0.92      0.85        61

avg / total       0.86      0.86      0.86       138



In [38]:

    
# Look at coefs. But they are not normalized, so don't know their importance. 
# Need to check out/ include preprocessing module to normalize X.  
pd.DataFrame(zip(credit_cols, np.transpose(logReg.coef_)))









    Out[38]:






  
    
      
      0
      1
    
  
  
    
      0 
          A2
       [-1.03593533785e-05]
    
    
      1 
          A3
          [-0.016604785678]
    
    
      2 
          A8
          [0.0909788100897]
    
    
      3 
         A11
           [0.122327498209]
    
    
      4 
         A14
        [-0.00143610989014]
    
    
      ...
      ...
      ...
    
    
      41
       A10_t
           [0.110658968395]
    
    
      42
       A12_f
          [-0.156083948483]
    
    
      43
       A12_t
          [-0.577048002548]
    
    
      44
       A13_g
           [0.767827860262]
    
    
      45
       A13_p
          [-0.236204837802]
    
  

46 rows × 2 columns



In [39]:

    
# Look at probabilities.
credit_y_pred_df = pd.DataFrame(logReg.predict_proba(credit_X_test)) 
credit_y_pred_df['Predicted credit'] = credit_y_pred
credit_y_pred_df['True credit'] = credit_y_test
credit_y_pred_df.head(10)









    Out[39]:






  
    
      
      0
      1
      Predicted credit
      True credit
    
  
  
    
      0
       0.037
       0.963
       1
       0
    
    
      1
       0.935
       0.065
       0
       0
    
    
      2
       0.231
       0.769
       1
       1
    
    
      3
       0.252
       0.748
       1
       1
    
    
      4
       0.391
       0.609
       1
       1
    
    
      5
       0.011
       0.989
       1
       1
    
    
      6
       0.956
       0.044
       0
       0
    
    
      7
       0.933
       0.067
       0
       1
    
    
      8
       0.062
       0.938
       1
       1
    
    
      9
       0.020
       0.980
       1
       1



In [40]:

    
# Plot predicted vs true values
import seaborn as sns
sns.regplot(credit_y_pred, credit_y_test, x_jitter=0.15, y_jitter=0.15, color = 'r');



In [41]:

    
# Calculate Matthews corr coeff as measure of quality of binary classification. Closer to 1 = better. 
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(credit_y_test, credit_y_pred)









    Out[41]:





0.71865208690214999



In [42]:

    
# Cross validation. Values seem to vary a lot, so model seems to be sensitive to train test split. 
import sklearn.cross_validation as cv
cv.cross_val_score(logReg, credit_X, credit_y, )









    Out[42]:





array([ 0.7662,  0.8826,  0.8297])



In [43]:

    
# Do grid search for logistic regression.
from sklearn.grid_search import GridSearchCV
gs_LogReg = GridSearchCV(logReg, {'C': np.logspace(-5, 5, 200)}, n_jobs=4)
gs_LogReg.fit(credit_X_train, credit_y_train);
gs_LogReg.best_params_, gs_LogReg.best_score_, gs_LogReg.best_estimator_# These would be the best params to use.









    Out[43]:





({'C': 1231.5506032928261},
 0.87681159420289856,
 LogisticRegression(C=1231.5506032928261, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, penalty='l2',
           random_state=None, tol=0.0001))



In [44]:

    
gs_LogReg_scores = cv.cross_val_score(gs_LogReg.best_estimator_, credit_X, credit_y, cv=5)
gs_LogReg_scores.min(), gs_LogReg_scores.max(), gs_LogReg_scores.mean() # Scores vary with split - not good.









    Out[44]:





(0.61870503597122306, 0.97122302158273377, 0.84368115789818154)



In [45]:

    
# SVM
from sklearn.svm import LinearSVC



In [46]:

    
# Linear first.
# Smaller C, larger margin. 
LinSVC = LinearSVC() # basic model



In [47]:

    
LinSVC.fit(credit_X_train, credit_y_train)









    Out[47]:





LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)



In [48]:

    
LinSVC.score(credit_X_test, credit_y_test) # Pretty good.









    Out[48]:





0.81159420289855078



In [49]:

    
# Confusion Matrix
credit_y_pred2 = LinSVC.predict(credit_X_test)
credit_cm2 = confusion_matrix(credit_y_test, credit_y_pred2)
credit_cm2 # This model works similar to LogReg model.









    Out[49]:





array([[59, 18],
       [ 8, 53]])



In [50]:

    
# Plot confusion matrix.
plt.matshow(credit_cm2)
plt.title('Confusion matrix for credit data')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')









    Out[50]:





<matplotlib.text.Text at 0x10ae57250>



In [51]:

    
print classification_report(credit_y_test, credit_y_pred2)









    



             precision    recall  f1-score   support

        0.0       0.88      0.77      0.82        77
        1.0       0.75      0.87      0.80        61

avg / total       0.82      0.81      0.81       138



In [52]:

    
# Plot predicted vs true values
import seaborn as sns
sns.regplot(credit_y_test, credit_y_pred2, x_jitter=0.15, y_jitter=0.15, color = 'r');



In [53]:

    
# Gridsearch for linear 
from sklearn.grid_search import GridSearchCV
linParameters = {'C': np.logspace(-3,3,10)}



In [54]:

    
gs_LinSVC = GridSearchCV(LinearSVC(), linParameters)



In [55]:

    
gs_LinSVC.fit(credit_X_train, credit_y_train) # random_state also seems to set other params. meaning? influence?









    Out[55]:





GridSearchCV(cv=None,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)



In [56]:

    
gs_LinSVC.score(credit_X_test, credit_y_test) # Pretty good too.









    Out[56]:





0.84782608695652173



In [57]:

    
gs_LinSVC.best_params_, gs_LinSVC.best_score_ # Best params to use (C and gamma). Score comparable to LogReg.









    Out[57]:





({'C': 0.021544346900318832}, 0.83695652173913049)



In [58]:

    
gs_LinSVC.grid_scores_ # Everything calculated.









    Out[58]:





[mean: 0.74819, std: 0.07767, params: {'C': 0.001},
 mean: 0.78080, std: 0.03614, params: {'C': 0.0046415888336127772},
 mean: 0.83696, std: 0.02662, params: {'C': 0.021544346900318832},
 mean: 0.82971, std: 0.03360, params: {'C': 0.10000000000000001},
 mean: 0.74457, std: 0.06889, params: {'C': 0.46415888336127775},
 mean: 0.75725, std: 0.08210, params: {'C': 2.154434690031882},
 mean: 0.74275, std: 0.07548, params: {'C': 10.0},
 mean: 0.74275, std: 0.05619, params: {'C': 46.415888336127729},
 mean: 0.72826, std: 0.14989, params: {'C': 215.44346900318823},
 mean: 0.71558, std: 0.10542, params: {'C': 1000.0}]



In [59]:

    
from sklearn.svm import SVC



In [60]:

    
# Non-linear SVC. 
NlSVC = SVC()



In [61]:

    
NlSVC.fit(credit_X_train, credit_y_train)









    Out[61]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)



In [62]:

    
NlSVC.score(credit_X_test, credit_y_test) # Pretty low.









    Out[62]:





0.55072463768115942



In [63]:

    
# Confusion Matrix
credit_y_pred3 = NlSVC.predict(credit_X_test)
credit_cm3 = confusion_matrix(credit_y_test, credit_y_pred3)
credit_cm3 # That's pretty bad.









    Out[63]:





array([[70,  7],
       [55,  6]])



In [64]:

    
print classification_report(credit_y_test, credit_y_pred3)









    



             precision    recall  f1-score   support

        0.0       0.56      0.91      0.69        77
        1.0       0.46      0.10      0.16        61

avg / total       0.52      0.55      0.46       138



In [65]:

    
# Plot confusion matrix.
plt.matshow(credit_cm3)
plt.title('Confusion matrix for credit data')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')









    Out[65]:





<matplotlib.text.Text at 0x10a8d77d0>



In [66]:

    
sns.regplot(credit_y_test, credit_y_pred3, x_jitter=0.15, y_jitter=0.15, color = 'r'); #Yikes.









    Out[66]:





<matplotlib.axes._subplots.AxesSubplot at 0x10aaec1d0>



In [67]:

    
# Params for GridSearch
nlParameters = {'C': np.logspace(-3,3,10), 'gamma': np.logspace(-3,3,10)}



In [68]:

    
gs_nlSVC = GridSearchCV(SVC(random_state = 12), nlParameters)



In [69]:

    
gs_nlSVC.fit(credit_X_train, credit_y_train)









    Out[69]:





GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=12,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)



In [70]:

    
gs_nlSVC.score(credit_X_test, credit_y_test) # Ok, but not great.









    Out[70]:





0.73188405797101452



In [71]:

    
gs_nlSVC.best_params_, gs_nlSVC.best_score_ # Confused by this. Large C means smaller margin?









    Out[71]:





({'C': 46.415888336127729, 'gamma': 0.001}, 0.67391304347826086)



In [72]:

    
gs_nlSVC.grid_scores_ # Everything calculated.









    Out[72]:





[mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.001, 'gamma': 1000.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.0046415888336127772, 'gamma': 1000.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.021544346900318832, 'gamma': 1000.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.0046415888336127772},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.10000000000000001, 'gamma': 1000.0},
 mean: 0.63406, std: 0.02818, params: {'C': 0.46415888336127775, 'gamma': 0.001},
 mean: 0.53986, std: 0.01281, params: {'C': 0.46415888336127775, 'gamma': 0.0046415888336127772},
 mean: 0.55616, std: 0.00256, params: {'C': 0.46415888336127775, 'gamma': 0.021544346900318832},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 0.10000000000000001},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 0.46415888336127775, 'gamma': 1000.0},
 mean: 0.63768, std: 0.01793, params: {'C': 2.154434690031882, 'gamma': 0.001},
 mean: 0.60326, std: 0.01174, params: {'C': 2.154434690031882, 'gamma': 0.0046415888336127772},
 mean: 0.51630, std: 0.01600, params: {'C': 2.154434690031882, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 2.154434690031882, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 2.154434690031882, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 2.154434690031882, 'gamma': 1000.0},
 mean: 0.65942, std: 0.01356, params: {'C': 10.0, 'gamma': 0.001},
 mean: 0.61957, std: 0.01174, params: {'C': 10.0, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 10.0, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 10.0, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 10.0, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 10.0, 'gamma': 1000.0},
 mean: 0.67391, std: 0.01174, params: {'C': 46.415888336127729, 'gamma': 0.001},
 mean: 0.61594, std: 0.02600, params: {'C': 46.415888336127729, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 46.415888336127729, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 46.415888336127729, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 46.415888336127729, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 46.415888336127729, 'gamma': 1000.0},
 mean: 0.65036, std: 0.00256, params: {'C': 215.44346900318823, 'gamma': 0.001},
 mean: 0.61594, std: 0.02600, params: {'C': 215.44346900318823, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 215.44346900318823, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 215.44346900318823, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 215.44346900318823, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 215.44346900318823, 'gamma': 1000.0},
 mean: 0.65217, std: 0.00000, params: {'C': 1000.0, 'gamma': 0.001},
 mean: 0.61594, std: 0.02600, params: {'C': 1000.0, 'gamma': 0.0046415888336127772},
 mean: 0.51449, std: 0.01117, params: {'C': 1000.0, 'gamma': 0.021544346900318832},
 mean: 0.54348, std: 0.00444, params: {'C': 1000.0, 'gamma': 0.10000000000000001},
 mean: 0.55616, std: 0.00256, params: {'C': 1000.0, 'gamma': 0.46415888336127775},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 2.154434690031882},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 10.0},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 46.415888336127729},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 215.44346900318823},
 mean: 0.55435, std: 0.00000, params: {'C': 1000.0, 'gamma': 1000.0}]



In [ ]:

    
# LogReg and LinearSVC worked pretty well. Non-linear SVC did not - not sure why.

	A2	A3	A8	A14	A15
count	552.000	552.000	552.000	552.000	552.000
mean	31.445	4.796	2.125	187.393	1144.716
std	11.976	4.856	3.065	177.605	5766.012
min	15.170	0.000	0.000	0.000	0.000
25%	22.730	1.040	0.165	80.000	0.000
50%	28.460	2.855	1.000	160.000	4.500
75%	37.330	7.510	2.551	280.000	379.750
max	80.250	26.335	20.000	2000.000	100000.000

	A2	A3	A8	A14	A15
A16R
0	31.821	4.688	2.315	195.379	1174.624
1	30.615	5.035	1.707	169.750	1078.640

	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	A11	A12	A13	A14	A15	A16
0	b	30.83	0.000	u	g	w	v	1.25	t	t	1	f	g	00202	0	+
1	a	58.67	4.460	u	g	q	h	3.04	t	t	6	f	g	00043	560	+
2	a	24.50	0.500	u	g	q	h	1.50	t	f	0	f	g	00280	824	+
3	b	27.83	1.540	u	g	w	v	3.75	t	t	5	t	g	00100	3	+
4	b	20.17	5.625	u	g	w	v	1.71	t	f	0	f	s	00120	0	+

	A2	A3	A8	A11	A14	A15	A16R	A1_a	A1_b	A4_l	...	A7_z	A9_f	A9_t	A10_f	A10_t	A12_f	A12_t	A13_g	A13_p	A13_s
0	23.75	0.710	0.250	1	240	4	1	0	0	1	...	1	0	0	1	0	1	1	0	0	0
1	25.83	12.835	0.500	0	0	2	0	1	0	1	...	1	0	1	0	1	0	1	0	0	0
2	25.17	2.875	0.875	0	360	0	1	0	0	1	...	0	1	1	0	1	0	1	0	0	1
3	69.50	6.000	0.000	0	0	0	1	0	0	1	...	1	0	1	0	1	0	0	0	1	0
4	32.08	4.000	1.500	0	120	0	0	1	0	0	...	1	0	1	0	0	1	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
15	29.50	0.460	0.540	4	380	500	1	0	0	1	...	0	1	0	1	1	0	1	0	0	1
16	18.58	10.290	0.415	0	80	0	0	1	0	1	...	1	0	1	0	1	0	1	0	0	0
17	17.92	10.210	0.000	0	0	50	1	0	0	1	...	1	0	1	0	1	0	1	0	0	0
18	23.58	0.835	0.085	0	220	5	0	1	0	1	...	1	0	1	0	0	1	1	0	0	0
19	18.17	10.250	1.085	0	320	13	0	1	0	1	...	1	0	1	0	1	0	1	0	0	0

	A2	A3	A8	A14	A15
A2	1.000	0.068	0.224	-0.000	0.018
A3	0.068	1.000	0.228	-0.315	0.095
A8	0.224	0.228	1.000	-0.047	0.089
A14	-0.000	-0.315	-0.047	1.000	-0.083
A15	0.018	0.095	0.089	-0.083	1.000

	0	1
0	A2	[-1.03593533785e-05]
1	A3	[-0.016604785678]
2	A8	[0.0909788100897]
3	A11	[0.122327498209]
4	A14	[-0.00143610989014]
...	...	...
41	A10_t	[0.110658968395]
42	A12_f	[-0.156083948483]
43	A12_t	[-0.577048002548]
44	A13_g	[0.767827860262]
45	A13_p	[-0.236204837802]

	0	1	Predicted credit	True credit
0	0.037	0.963	1	0
1	0.935	0.065	0	0
2	0.231	0.769	1	1
3	0.252	0.748	1	1
4	0.391	0.609	1	1
5	0.011	0.989	1	1
6	0.956	0.044	0	0
7	0.933	0.067	0	1
8	0.062	0.938	1	1
9	0.020	0.980	1	1