In [501]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns

# This enables inline Plots
%matplotlib inline

#Limit displayed rows and set display precision
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)

In [505]:
#Read the file into a dataframe
credit_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',header=None,names=["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10","A11","A12","A13","A14","A15","A16"])

In [506]:
#Understanding the columns and the data
credit_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [507]:
# Print the unique values in each column to identify how missing values are encoded; they appear as "?"
for col in range(credit_df.shape[1]):
    print credit_df.iloc[:,col].unique()


['b' 'a' '?']
['30.83' '58.67' '24.50' '27.83' '20.17' '32.08' '33.17' '22.92' '54.42'
 '42.50' '22.08' '29.92' '38.25' '48.08' '45.83' '36.67' '28.25' '23.25'
 '21.83' '19.17' '25.00' '47.75' '27.42' '41.17' '15.83' '47.00' '56.58'
 '57.42' '42.08' '29.25' '42.00' '49.50' '36.75' '22.58' '27.25' '23.00'
 '27.75' '54.58' '34.17' '28.92' '29.67' '39.58' '56.42' '54.33' '41.00'
 '31.92' '41.50' '23.92' '25.75' '26.00' '37.42' '34.92' '34.25' '23.33'
 '23.17' '44.33' '35.17' '43.25' '56.75' '31.67' '23.42' '20.42' '26.67'
 '36.00' '25.50' '19.42' '32.33' '34.83' '38.58' '44.25' '44.83' '20.67'
 '34.08' '21.67' '21.50' '49.58' '27.67' '39.83' '?' '37.17' '25.67'
 '34.00' '49.00' '62.50' '31.42' '52.33' '28.75' '28.58' '22.50' '28.50'
 '37.50' '35.25' '18.67' '54.83' '40.92' '19.75' '29.17' '24.58' '33.75'
 '25.42' '37.75' '52.50' '57.83' '20.75' '39.92' '24.75' '44.17' '23.50'
 '47.67' '22.75' '34.42' '28.42' '67.75' '47.42' '36.25' '32.67' '48.58'
 '33.58' '18.83' '26.92' '31.25' '56.50' '43.00' '22.33' '32.83' '40.33'
 '30.50' '52.83' '46.67' '58.33' '37.33' '23.08' '32.75' '68.67' '28.00'
 '44.00' '25.08' '32.00' '60.58' '40.83' '19.33' '41.33' '56.00' '49.83'
 '22.67' '27.00' '26.08' '18.42' '21.25' '57.08' '22.42' '48.75' '40.00'
 '40.58' '28.67' '33.08' '21.33' '41.75' '34.50' '48.17' '27.58' '24.08'
 '24.83' '36.33' '35.42' '71.58' '39.50' '39.33' '24.33' '60.08' '55.92'
 '53.92' '18.92' '50.08' '65.42' '17.58' '18.08' '19.67' '25.17' '33.50'
 '58.42' '26.17' '42.83' '38.17' '20.50' '48.25' '28.33' '18.75' '18.50'
 '45.00' '40.25' '41.42' '17.83' '18.17' '20.00' '52.17' '50.75' '17.08'
 '18.33' '59.67' '18.00' '37.58' '30.67' '18.58' '16.25' '21.17' '17.67'
 '16.50' '29.50' '21.75' '18.25' '35.75' '16.08' '69.17' '32.92' '16.33'
 '22.17' '57.58' '15.92' '31.75' '19.00' '17.50' '33.67' '30.17' '33.25'
 '25.25' '34.75' '47.33' '39.08' '42.75' '38.92' '62.75' '32.25' '26.75'
 '63.33' '30.75' '16.00' '19.50' '32.42' '30.25' '26.83' '16.92' '24.42'
 '39.42' '23.58' '21.42' '33.00' '26.33' '26.25' '28.17' '20.83' '43.17'
 '56.83' '15.17' '29.83' '31.00' '51.92' '69.50' '19.58' '22.25' '38.42'
 '26.58' '35.00' '29.42' '49.17' '51.83' '58.58' '53.33' '27.17' '25.92'
 '30.58' '17.25' '27.33' '36.50' '29.75' '52.42' '36.17' '34.58' '21.92'
 '36.58' '31.08' '30.42' '21.08' '17.42' '39.17' '26.50' '17.33' '23.75'
 '34.67' '74.83' '45.33' '47.25' '24.17' '39.25' '39.00' '64.08' '31.33'
 '21.00' '13.75' '46.00' '20.25' '60.92' '30.00' '22.83' '45.17' '41.58'
 '55.75' '25.33' '31.83' '33.92' '24.92' '80.25' '30.08' '48.33' '76.75'
 '51.33' '41.92' '29.58' '32.17' '51.42' '42.17' '43.08' '59.50' '65.17'
 '20.33' '48.50' '28.08' '73.42' '51.58' '38.67' '46.08' '20.08' '42.25'
 '16.17' '47.83' '22.00' '38.33' '25.58' '21.58' '36.08' '38.75' '35.58'
 '31.58' '15.75' '17.92' '30.33' '47.17' '25.83' '50.25' '36.42']
[  0.      4.46    0.5     1.54    5.625   4.      1.04   11.585   4.915
   0.83    1.835   6.      6.04   10.5     4.415   0.875   5.875   0.25
   8.585  11.25    1.      8.     14.5     6.5     0.585  13.     18.5
   8.5    14.79    9.79    7.585   5.125  10.75    1.5     1.585  11.75
   9.415   9.17   15.      1.415  13.915  28.      6.75    2.04    0.665
   2.5     3.     11.625   4.5    12.25   16.165   0.79    0.835   4.25
   0.375  25.125   7.5     5.      7.      5.29    1.165   9.75   19.      3.5
   0.625   2.21   12.75   15.5     1.375   3.54   11.      1.75   16.5    12.
   2.25    0.75   12.5     1.25    1.125   7.04   10.335   6.21    6.665
   9.      5.5     0.54    2.75    9.5    13.5     3.75   16.      0.29
   1.665   7.54    0.46   10.     11.5     3.04    2.      0.08    1.71
   3.25    2.54   13.585   8.665   9.25    8.17    2.335  19.5     5.665
   4.625   0.205   0.96    4.04    5.04    3.165   7.625  10.04   10.25
   2.125   9.335   6.625   2.71    9.625  12.54    9.54    8.46   13.75
  21.     10.125  25.085   0.21   21.5    11.125  11.045   1.335   0.085
   1.21    0.165   5.71    5.415  12.625   0.58    0.415   2.415   0.335
   3.125  12.125   2.875  13.665  26.335  10.29    1.29   22.      0.125
   1.085   4.085   4.71    6.165   4.585  11.46   14.585   0.17    1.625
   2.085   5.085   8.125   2.835   1.79    0.705   2.165   2.29   18.125
   3.085  11.665   4.125   1.08   13.335  11.835   4.79    9.96    7.08
  25.21    0.67    3.79   22.29    3.335   0.42    1.46    0.04   12.33
  12.335   0.915  14.     17.75   20.      5.25    4.165  10.915   4.75
  10.415   7.835   0.71    2.46    9.585   3.625   2.665   5.835  12.835
  10.665   7.25   10.21    3.29   10.085   3.375]
['u' 'y' '?' 'l']
['g' 'p' '?' 'gg']
['w' 'q' 'm' 'r' 'cc' 'k' 'c' 'd' 'x' 'i' 'e' 'aa' 'ff' 'j' '?']
['v' 'h' 'bb' 'ff' 'j' 'z' '?' 'o' 'dd' 'n']
[  1.25    3.04    1.5     3.75    1.71    2.5     6.5     0.04    3.96
   3.165   2.165   4.335   1.      5.      0.25    0.96    3.17    0.665
   0.75    0.835   7.875   3.085   0.5     5.165  15.      7.      5.04
   7.96    7.585   0.415   2.      1.835  14.415   4.5     5.335   8.625
  28.5     2.625   0.125   6.04    3.5     0.165   0.875   1.75    0.
   7.415   0.085   5.75    6.      3.      1.585   4.29    1.54    1.46
   1.625  12.5    13.5    10.75    0.375   0.585   0.455   4.      8.5
   9.46    2.25   10.      0.795   1.375   1.29   11.5     6.29   14.
   0.335   1.21    7.375   7.5     3.25   13.      5.5     4.25    0.625
   5.085   2.75    2.375   8.      1.085   2.54    4.165   1.665  11.      9.
   1.335   1.415   1.96    2.585   5.125  15.5     0.71    5.665  18.
   5.25    8.665   2.29   20.      2.46   13.875   2.085   4.58    2.71
   2.04    0.29    4.75    0.46    0.21    0.54    3.335   2.335   1.165
   2.415   2.79    4.625   1.04    6.75    1.875  16.     12.75    5.375
   2.125  17.5     3.125   0.79    8.29 ]
['t' 'f']
['t' 'f']
[ 1  6  0  5  7 10  3 17  2  9  8 15 11 12 40 23  4 20 67 14 16 13 19]
['f' 't']
['g' 's' 'p']
['00202' '00043' '00280' '00100' '00120' '00360' '00164' '00080' '00180'
 '00052' '00128' '00260' '00000' '00320' '00396' '00096' '00200' '00300'
 '00145' '00500' '00168' '00434' '00583' '00030' '00240' '00070' '00455'
 '00311' '00216' '00491' '00400' '00239' '00160' '00711' '00250' '00520'
 '00515' '00420' '?' '00980' '00443' '00140' '00094' '00368' '00288'
 '00928' '00188' '00112' '00171' '00268' '00167' '00075' '00152' '00176'
 '00329' '00212' '00410' '00274' '00375' '00408' '00350' '00204' '00040'
 '00181' '00399' '00440' '00093' '00060' '00395' '00393' '00021' '00029'
 '00102' '00431' '00370' '00024' '00020' '00129' '00510' '00195' '00144'
 '00380' '00049' '00050' '00381' '00150' '00117' '00056' '00211' '00230'
 '00156' '00022' '00228' '00519' '00253' '00487' '00220' '00088' '00073'
 '00121' '00470' '00136' '00132' '00292' '00154' '00272' '00340' '00108'
 '00720' '00450' '00232' '00170' '01160' '00411' '00460' '00348' '00480'
 '00640' '00372' '00276' '00221' '00352' '00141' '00178' '00600' '00550'
 '02000' '00225' '00210' '00110' '00356' '00045' '00062' '00092' '00174'
 '00017' '00086' '00454' '00254' '00028' '00263' '00333' '00312' '00290'
 '00371' '00099' '00252' '00760' '00560' '00130' '00523' '00680' '00163'
 '00208' '00383' '00330' '00422' '00840' '00432' '00032' '00186' '00303'
 '00349' '00224' '00369' '00076' '00231' '00309' '00416' '00465' '00256']
[     0    560    824      3  31285   1349    314   1442    200   2690
    245   1208   1260     11  10000   5000   4000     35    713    551
    500    300    221   2283    100     15    284   1236   5800    730
    400  50000    456  15108   2954      2     20     27    225      1
     38      5    130    147    210  11202   1332     50    258    567
   1000   2510    809    610    150  51100    367    600    247    375
    278    827   2072    582   2300   3065   2200      6   1602   2184
   3376   2000   7544  10561    837  11177    639   2028   1065    540
    158  15000   3000   3257   1655   1430      7    790    396    678
   1187   6590    168   1270   1210    742   8851   7059   1704    857
   6700   2503   9800    196     14  26726  18027     99    444   1200
   2010     13    120     32    722     40    484    204     98   5552
    105   2803    126      4     21    173     10     25     42 100000
    113      8     44   2732    179     16   1062    251    228     67
     12    122   4208   1300    112   1110   1004    286   4500   1212
    195     87     17    184    140     18    146     22     55     70
     60   1058    769   5200     19    316    350   3552    687   1950
     53     41     33     80    351   2100    475    892   4607   2206
   5860     28   1391   2279    591    960    690    234    800    990
   2197     90    340    347    327   4071    109   1249    134   1344
    321    948   2079   2384    458   5298    162   1583     58     59
   1400   1465   8000   4700   1097   3290  13212   5777   5124     23
   4159    918    768    283    108      9     68    587    141    501
    160    390    154    117    246    237    364    537    394    750]
['+' '-']
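
Scanning the full unique-value lists works, but a more compact check is simply to count the '?' markers per column. A minimal sketch (run at this point, while the raw '?' strings are still in place):

# Count the '?' placeholders in each column
for col in credit_df.columns:
    print col, (credit_df[col].astype(str) == '?').sum()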

In [508]:
#Replace '?' with Null(Nan) for all columns
credit_df.replace('?',np.nan,inplace=True)
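
An equivalent alternative, if re-reading the file is acceptable, is to let pandas mark '?' as missing at load time via the na_values argument; a sketch using the same URL and column names as above. Note this would also let pandas parse A2 and A14 as floats directly, making the astype(float) step below unnecessary.

# Treat '?' as NaN while parsing, instead of replacing afterwards
credit_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',
                        header=None, na_values='?',
                        names=["A" + str(i) for i in range(1, 17)])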

In [509]:
credit_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     678 non-null object
A2     678 non-null object
A3     690 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     681 non-null object
A7     681 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    677 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [510]:
#Selecting rows with null values
credit_df[credit_df.isnull().any(axis=1)]
#From the output, we see there are 6 rows where A4, A5, A6 and A7 are all null, and 3 more rows where only A6 and A7
#are null. These rows can be removed from the data set.


Out[510]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
71 b 34.83 4.0 u g d bb 12.5 t f 0 t g NaN 0 -
83 a NaN 3.5 u g d v 3.0 t f 0 t g 00300 0 -
86 b NaN 0.4 u g d v 0.9 t f 0 t s 00928 0 -
92 b NaN 5.0 y p aa v 8.5 t f 0 f g 00000 0 -
97 b NaN 0.5 u g c bb 0.8 t f 0 t s 00320 0 -
202 b 24.83 2.8 u g c v 2.2 t t 6 f g NaN 600 +
206 a 71.58 0.0 NaN NaN NaN NaN 0.0 f f 0 f p NaN 0 +
243 a 18.75 7.5 u g q v 2.7 t t 5 f g NaN 26726 +
248 NaN 24.50 12.8 u g c bb 4.8 t t 2 f g 00073 444 +
254 b NaN 0.6 u g k v 0.2 f f 0 f g 00380 2010 -
270 b 37.58 0.0 NaN NaN NaN NaN 0.0 f f 0 f p NaN 0 +
278 b 24.58 13.5 y p ff ff 0.0 f f 0 f g NaN 0 -
286 a NaN 1.5 u g ff ff 0.0 f t 2 t g 00200 105 -
327 NaN 40.83 3.5 u g i bb 0.5 f f 0 f s 01160 0 -
329 b NaN 4.0 y p i v 0.1 f f 0 t g 00411 0 -
330 b 20.42 0.0 NaN NaN NaN NaN 0.0 f f 0 f p NaN 0 -
346 NaN 32.25 1.5 u g c v 0.2 f f 0 t g 00372 122 -
374 NaN 28.17 0.6 u g aa v 0.0 f f 0 f g 00260 1004 -
406 a 40.33 8.1 y p k v 0.2 f t 2 f g NaN 18 -
445 a NaN 11.2 u g ff ff 0.0 f f 0 f g NaN 5200 -
450 b NaN 3.0 y p i bb 7.0 f f 0 f g 00000 1 -
453 NaN 29.75 0.7 u g w v 0.2 f f 0 t g 00300 0 -
456 b 34.58 0.0 NaN NaN NaN NaN 0.0 f f 0 f p NaN 0 -
479 NaN 26.50 2.7 y p NaN NaN 0.1 f f 0 f s 00080 0 -
489 NaN 45.33 1.0 u g q v 0.1 f f 0 t g 00263 0 -
500 b NaN 4.0 u g x v 5.0 t t 3 t g 00290 2279 +
515 b NaN 10.5 u g x v 6.5 t f 0 f g 00000 0 +
520 NaN 20.42 7.5 u g k v 1.5 t t 1 f g 00160 234 +
539 b 80.25 5.5 u g NaN NaN 0.5 t f 0 f g 00000 340 -
592 b 23.17 0.0 NaN NaN NaN NaN 0.0 f f 0 f p NaN 0 +
598 NaN 20.08 0.1 u g q v 1.0 f t 1 f g 00240 768 +
601 NaN 42.25 1.8 y p NaN NaN 0.0 f f 0 t g 00150 1 -
608 b NaN 0.0 y p d v 4.2 f f 0 t g 00460 0 -
622 a 25.58 0.0 NaN NaN NaN NaN 0.0 f f 0 f p NaN 0 +
626 b 22.00 7.8 y p i bb 0.2 f f 0 t g NaN 0 -
641 NaN 33.17 2.2 y p cc v 3.5 f f 0 t g 00200 141 -
673 NaN 29.50 2.0 y p e h 2.0 f f 0 f g 00256 17 -

In [511]:
# drop rows where A4 is null
credit_df.dropna(how='any',subset = ['A4'],inplace=True)

In [512]:
# drop rows where A6 is null
credit_df.dropna(how='any',subset = ['A6'],inplace=True)
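
The two drops above could also be written as a single call; a sketch that removes any row where either A4 or A6 is missing:

# Drop rows where A4 or A6 is null in one pass
credit_df.dropna(how='any', subset=['A4', 'A6'], inplace=True)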

In [513]:
credit_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 681 entries, 0 to 689
Data columns (total 16 columns):
A1     671 non-null object
A2     669 non-null object
A3     681 non-null float64
A4     681 non-null object
A5     681 non-null object
A6     681 non-null object
A7     681 non-null object
A8     681 non-null float64
A9     681 non-null object
A10    681 non-null object
A11    681 non-null int64
A12    681 non-null object
A13    681 non-null object
A14    674 non-null object
A15    681 non-null int64
A16    681 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 90.4+ KB

In [514]:
#A2 and A14 are numeric but were read as strings because of the '?' markers
#Convert them to float before checking their distributions and imputing the missing rows
credit_df['A2'] = credit_df.A2.astype(float)
credit_df['A14'] = credit_df.A14.astype(float)

In [515]:
#Plot A2 on a histogram
credit_df.A2.hist(bins=20)


Out[515]:
<matplotlib.axes.AxesSubplot at 0x1137f2450>

In [516]:
# Get the mean and standard deviation of A2 over all rows
print 'Mean Age:', credit_df.A2.mean()
print 'Std Age:', credit_df.A2.std()


Mean Age: 31.4518834081
Std Age: 11.7616656339

In [517]:
# Draw from a normal distribution centred on the mean (31.5) with standard deviation 11.8
# We need 12 entries, since that is how many A2 values are missing
# A function is defined so that the same steps can be applied to both the training and test data
def get_a2_impute_values(n):
    return np.random.normal(31.5, 11.8, n)
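
Because np.random.normal draws fresh values on every run, the imputed ages (and therefore the scores further down) change from run to run. Seeding the generator first makes the notebook reproducible; a minimal sketch (42 is an arbitrary choice, and the same seed also covers the A1 draw later):

# Fix the random seed so the imputed values are reproducible across runs
np.random.seed(42)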

In [518]:
# When we look for entries that are null we can create a mask or boolean filter
a2_missing_mask = credit_df.A2.isnull()

In [519]:
# Notice that we have 12 rows that have missing A2 field
credit_df[a2_missing_mask]


Out[519]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
83 a NaN 3.5 u g d v 3.0 t f 0 t g 300 0 -
86 b NaN 0.4 u g d v 0.9 t f 0 t s 928 0 -
92 b NaN 5.0 y p aa v 8.5 t f 0 f g 0 0 -
97 b NaN 0.5 u g c bb 0.8 t f 0 t s 320 0 -
254 b NaN 0.6 u g k v 0.2 f f 0 f g 380 2010 -
286 a NaN 1.5 u g ff ff 0.0 f t 2 t g 200 105 -
329 b NaN 4.0 y p i v 0.1 f f 0 t g 411 0 -
445 a NaN 11.2 u g ff ff 0.0 f f 0 f g NaN 5200 -
450 b NaN 3.0 y p i bb 7.0 f f 0 f g 0 1 -
500 b NaN 4.0 u g x v 5.0 t t 3 t g 290 2279 +
515 b NaN 10.5 u g x v 6.5 t f 0 f g 0 0 +
608 b NaN 0.0 y p d v 4.2 f f 0 t g 460 0 -

In [520]:
# Now we can set these values to the values we picked from Random Normal Distribution
credit_df.loc[credit_df.A2.isnull(), 'A2'] = get_a2_impute_values(n=12)

In [521]:
#A2 does not have missing values anymore
credit_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 681 entries, 0 to 689
Data columns (total 16 columns):
A1     671 non-null object
A2     681 non-null float64
A3     681 non-null float64
A4     681 non-null object
A5     681 non-null object
A6     681 non-null object
A7     681 non-null object
A8     681 non-null float64
A9     681 non-null object
A10    681 non-null object
A11    681 non-null int64
A12    681 non-null object
A13    681 non-null object
A14    674 non-null float64
A15    681 non-null int64
A16    681 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 90.4+ KB

In [522]:
#Imputing Values for A14
credit_df.A14.hist(bins=100)
#The distribution is shown in the histogram below


Out[522]:
<matplotlib.axes.AxesSubplot at 0x1137d5950>

In [523]:
#From the histogram above, a large fraction of A14 values are 0, so we impute the missing values as 0
credit_df.A14[credit_df.A14==0].count()


Out[523]:
131

In [524]:
#Imputing the null A14 values to 0
credit_df.loc[credit_df.A14.isnull(), 'A14'] = 0
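
The same imputation can be written with fillna, which avoids building the boolean mask by hand; a sketch:

# Equivalent: replace missing A14 entries with 0
credit_df['A14'] = credit_df['A14'].fillna(0)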

In [525]:
#Check that the null values were set to 0: the count rises from 131 to 138, matching the 7 missing A14 entries
credit_df.A14[credit_df.A14==0].count()


Out[525]:
138

In [526]:
#Only A1 still has missing values (10 of them)
#Print the distribution of 'a' and 'b' in A1
credit_df.groupby(["A1"]).count()["A2"]


Out[526]:
A1
a     208
b     463
Name: A2, dtype: int64
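
Rather than hard-coding the 0.3/0.7 split used below, the observed proportions of 'a' and 'b' (roughly 0.31/0.69 here) can be computed directly and passed as p to np.random.choice; a sketch:

# Observed proportions of 'a' and 'b' among the non-missing A1 values
credit_df.A1.value_counts() / float(credit_df.A1.count())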

In [527]:
np.random.choice(['a','b'],10,p=[0.3,0.7])


Out[527]:
array(['a', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'a', 'b'], 
      dtype='|S1')

In [528]:
# A1 is categorical, so draw random categories rather than values from a normal distribution
# We need 10 entries, since that is how many A1 values are missing
# A function is defined so that the same steps can be applied to both the training and test data
def get_a1_impute_values(n):
    return np.random.choice(['a','b'],n,p=[0.3,0.7])

In [529]:
# When we look for entries that are null we can create a mask or boolean filter
a1_missing_mask = credit_df.A1.isnull()

In [530]:
# Now set the missing A1 values to categories drawn at random with the observed a/b proportions
credit_df.loc[credit_df.A1.isnull(), 'A1'] = get_a1_impute_values(n=10)

In [531]:
credit_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 681 entries, 0 to 689
Data columns (total 16 columns):
A1     681 non-null object
A2     681 non-null float64
A3     681 non-null float64
A4     681 non-null object
A5     681 non-null object
A6     681 non-null object
A7     681 non-null object
A8     681 non-null float64
A9     681 non-null object
A10    681 non-null object
A11    681 non-null int64
A12    681 non-null object
A13    681 non-null object
A14    681 non-null float64
A15    681 non-null int64
A16    681 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 90.4+ KB

In [293]:
#Finalize the variables to be used for prediction and check their significance.
#Before that, encode the target variable A16 as 0's and 1's
#by adding a new column that replaces '+'/'-' with 1/0.

In [532]:
credit_df["cardapproval"] = credit_df.A16.replace(['+','-'],[1,0])

In [533]:
#Significance of A1
approved_by_a1 = credit_df.groupby('A1').cardapproval.agg(['sum', 'count'])
approved_by_a1['approved_rate'] = approved_by_a1['sum'] / approved_by_a1['count']
approved_by_a1

#The rates are not very different. A1 is not a significant regressor. We can remove it from our model.


Out[533]:
sum count approved_rate
A1
a 97 212 0.5
b 206 469 0.4
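
The same three-line pattern is repeated below for A4, A5, A6, A7, A9, A10, A12 and A13, so it is worth wrapping in a small helper; a sketch (approval_rate_by is a hypothetical name, not part of the original notebook; calling it on 'A1' reproduces the table above):

def approval_rate_by(df, col):
    # Approvals, group sizes and approval rate for each level of the given column
    grouped = df.groupby(col).cardapproval.agg(['sum', 'count'])
    grouped['approved_rate'] = grouped['sum'] / grouped['count']
    return grouped

approval_rate_by(credit_df, 'A1')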

In [534]:
#Significance of A4
approved_by_a4 = credit_df.groupby('A4').cardapproval.agg(['sum', 'count'])
approved_by_a4['approved_rate'] = approved_by_a4['sum'] / approved_by_a4['count']
approved_by_a4

#A4 looks significant: the approval rate differs noticeably across its levels


Out[534]:
sum count approved_rate
A4
l 2 2 1.0
u 256 518 0.5
y 45 161 0.3

In [535]:
#Significance of A5
approved_by_a5 = credit_df.groupby('A5').cardapproval.agg(['sum', 'count'])
approved_by_a5['approved_rate'] = approved_by_a5['sum'] / approved_by_a5['count']
approved_by_a5


Out[535]:
sum count approved_rate
A5
g 256 518 0.5
gg 2 2 1.0
p 45 161 0.3

In [536]:
#From the two tables above, A4 and A5 appear to be perfectly correlated.
credit_df.groupby(['A4'])['A5'].count()

#df.groupby(['column1'])['column2'].sum()


Out[536]:
A4
l       2
u     518
y     161
Name: A5, dtype: int64

In [537]:
#Concatenate A4 and A5 and group by the combined value to check whether the levels map one-to-one
credit_df["A4A5"]=credit_df.A4 + credit_df.A5
credit_df.groupby(['A4A5'])['A5'].count()
#They are correlated. So we can drop A5 from our model.


Out[537]:
A4A5
lgg       2
ug      518
yp      161
Name: A5, dtype: int64
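
A crosstab shows the same relationship without creating a helper column; a sketch:

# Each A4 level pairs with exactly one A5 level, confirming the two columns are redundant
pd.crosstab(credit_df.A4, credit_df.A5)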

In [538]:
#Significance of A6
approved_by_a6 = credit_df.groupby('A6').cardapproval.agg(['sum', 'count'])
approved_by_a6['approved_rate'] = approved_by_a6['sum'] / approved_by_a6['count']
approved_by_a6

#A6 looks significant: the approval rate varies widely across its levels


Out[538]:
sum count approved_rate
A6
aa 19 54 0.4
c 62 137 0.5
cc 29 41 0.7
d 7 30 0.2
e 14 25 0.6
ff 7 53 0.1
i 14 59 0.2
j 3 10 0.3
k 14 51 0.3
m 16 38 0.4
q 51 78 0.7
r 2 3 0.7
w 33 64 0.5
x 32 38 0.8

In [539]:
#Significance of A7
approved_by_a7 = credit_df.groupby('A7').cardapproval.agg(['sum', 'count'])
approved_by_a7['approved_rate'] = approved_by_a7['sum'] / approved_by_a7['count']
approved_by_a7

#A7 looks significant: the approval rate varies across its levels


Out[539]:
sum count approved_rate
A7
bb 25 59 0.4
dd 2 6 0.3
ff 8 57 0.1
h 87 138 0.6
j 3 8 0.4
n 2 4 0.5
o 1 2 0.5
v 169 399 0.4
z 6 8 0.8

In [540]:
#Significance of A9
approved_by_a9 = credit_df.groupby('A9').cardapproval.agg(['sum', 'count'])
approved_by_a9['approved_rate'] = approved_by_a9['sum'] / approved_by_a9['count']
approved_by_a9

#A9 looks highly significant: the approval rate is 0.1 for 'f' versus 0.8 for 't'


Out[540]:
sum count approved_rate
A9
f 19 321 0.1
t 284 360 0.8

In [541]:
#Significance of A10
approved_by_a10 = credit_df.groupby('A10').cardapproval.agg(['sum', 'count'])
approved_by_a10['approved_rate'] = approved_by_a10['sum'] / approved_by_a10['count']
approved_by_a10

#A10 looks significant: the approval rate is 0.2 for 'f' versus 0.7 for 't'


Out[541]:
sum count approved_rate
A10
f 94 386 0.2
t 209 295 0.7

In [542]:
#Significance of A12
approved_by_a12 = credit_df.groupby('A12').cardapproval.agg(['sum', 'count'])
approved_by_a12['approved_rate'] = approved_by_a12['sum'] / approved_by_a12['count']
approved_by_a12

#The rates are not very different. A12 is not a significant regressor. We can remove it from our model.


Out[542]:
sum count approved_rate
A12
f 157 366 0.4
t 146 315 0.5

In [543]:
#Significance of A13
approved_by_a13 = credit_df.groupby('A13').cardapproval.agg(['sum', 'count'])
approved_by_a13['approved_rate'] = approved_by_a13['sum'] / approved_by_a13['count']
approved_by_a13

#The approval rate for 's' differs from 'g' and 'p', so A13 is treated as significant and kept


Out[543]:
sum count approved_rate
A13
g 287 623 0.5
p 1 2 0.5
s 15 56 0.3

In [544]:
#Is there any correlation among the continuous-valued attributes?

credit_df.corr()

#From the table below, there is no strong correlation among the X variables


Out[544]:
A2 A3 A8 A11 A14 A15 cardapproval
A2 1.0 0.2 0.4 0.2 -0.1 0.0 0.2
A3 0.2 1.0 0.3 0.3 -0.2 0.1 0.2
A8 0.4 0.3 1.0 0.3 -0.1 0.0 0.3
A11 0.2 0.3 0.3 1.0 -0.1 0.1 0.4
A14 -0.1 -0.2 -0.1 -0.1 1.0 0.1 -0.1
A15 0.0 0.1 0.0 0.1 0.1 1.0 0.2
cardapproval 0.2 0.2 0.3 0.4 -0.1 0.2 1.0
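
Since seaborn is already imported, the same correlation matrix can also be drawn as a heatmap, which makes the weak correlations easier to see at a glance; a sketch assuming a seaborn version that provides sns.heatmap (0.5 or later):

# Heatmap of pairwise correlations among the numeric columns
sns.heatmap(credit_df.corr(), annot=True)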

In [545]:
#Box plot of A15 to check the spread of its values; the other numeric columns can be inspected the same way
credit_df.boxplot(['A15'])


Out[545]:
{'boxes': [<matplotlib.lines.Line2D at 0x114361090>],
 'caps': [<matplotlib.lines.Line2D at 0x1143573d0>,
  <matplotlib.lines.Line2D at 0x114357a10>],
 'fliers': [<matplotlib.lines.Line2D at 0x114361d10>,
  <matplotlib.lines.Line2D at 0x11436e350>],
 'medians': [<matplotlib.lines.Line2D at 0x1143616d0>],
 'whiskers': [<matplotlib.lines.Line2D at 0x11434bb50>,
  <matplotlib.lines.Line2D at 0x11434bdd0>]}

In [546]:
#A8,A11,A15 seem to have a lot of 0 values. Check the count

print credit_df.A8[credit_df.A8==0].count()
#A8 is significant as only 63 have 0 values
print credit_df.A11[credit_df.A11==0].count()
#We can exclude A11, but we will keep it for now.
print credit_df.A15[credit_df.A15==0].count()
#We can exclude A15, but we will keep it for now.


63
386
288

Logistic Regression


In [547]:
X_data = credit_df.drop(["A5","A16","cardapproval","A4A5"],axis=1)
#Experimented with dropping A11, A1, A15 and A12 as well; removing them does not change the score much

In [548]:
X_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 681 entries, 0 to 689
Data columns (total 14 columns):
A1     681 non-null object
A2     681 non-null float64
A3     681 non-null float64
A4     681 non-null object
A6     681 non-null object
A7     681 non-null object
A8     681 non-null float64
A9     681 non-null object
A10    681 non-null object
A11    681 non-null int64
A12    681 non-null object
A13    681 non-null object
A14    681 non-null float64
A15    681 non-null int64
dtypes: float64(4), int64(2), object(8)
memory usage: 79.8+ KB

In [549]:
X_data = pd.get_dummies(X_data)

In [550]:
y_data = credit_df["cardapproval"]

In [551]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


Out[551]:
0.86131386861313863
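
A single train/test split can be optimistic or pessimistic depending on random_state; cross-validation gives a more stable estimate. A sketch using the same (older) scikit-learn module layout imported at the top of this notebook:

from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validated accuracy of logistic regression on the full feature set
scores = cross_val_score(LogisticRegression(), X_data, y_data, cv=5)
print scores.mean()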

In [552]:
#Confusion Matrix for the logistic regression model
from sklearn.metrics import confusion_matrix, classification_report
y_pred = clf.predict(X_test)
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)


Out[552]:
array([[68, 14],
       [ 5, 50]])

In [555]:
#Interpreting the confusion matrix
def confusion_matrix_df(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    error_rate = (y_pred != y_test).mean()
    print('error rate: %.2f' % error_rate)
    return cm
    
confusion_matrix_df(y_test, y_pred)


error rate: 0.14
Out[555]:
Predicted label 0 1
True label
0 68 14
1 5 50

In [556]:
# Examine Precision and Recall
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.93      0.83      0.88        82
          1       0.78      0.91      0.84        55

avg / total       0.87      0.86      0.86       137


In [449]:
# Let's take a look at Predicted Probabilities
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test
y_pred_df.tail()


Out[449]:
No Yes y_pred y_true
132 0.1 0.9 1 1
133 0.9 0.1 0 0
134 1.0 0.0 0 0
135 0.9 0.1 0 0
136 0.5 0.5 1 0

In [450]:
# Take a look at Predicted vs True Values
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


Out[450]:
<matplotlib.axes.AxesSubplot at 0x10f0db150>
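
Since predict_proba is available, the ranking quality of the classifier can also be summarised with a ROC curve and its AUC; a sketch assuming roc_curve and roc_auc_score are available in this scikit-learn version:

from sklearn.metrics import roc_curve, roc_auc_score

# Probability assigned to the positive (approved) class
y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
print roc_auc_score(y_test, y_score)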

Predict

Create an SVM estimator, using sklearn.svm.LinearSVC


In [558]:
#Initialize a linear SVM with the default parameters (C=1.0)
est = LinearSVC()

In [559]:
est.fit(X_train,y_train)


Out[559]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [560]:
est.score(X_test,y_test)


Out[560]:
0.67153284671532842
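
The low score is at least partly a scaling issue: A14 and A15 are orders of magnitude larger than the dummy columns, and SVMs are sensitive to feature scale. A sketch that standardises the features before fitting (Pipeline and StandardScaler are standard scikit-learn components; the resulting score was not part of the original run):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit a linear SVM on the scaled data
scaled_svc = Pipeline([('scale', StandardScaler()), ('svm', LinearSVC())])
scaled_svc.fit(X_train, y_train)
scaled_svc.score(X_test, y_test)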

GridSearch

Create a parameter grid for C using np.logspace(-3., 3., 10).

Read the example for grid search sklearn.grid_search.GridSearchCV.


In [561]:
import numpy as np

In [562]:
gs = GridSearchCV(LinearSVC(),{'C':np.logspace(-3,3,10)})

In [563]:
gs.fit(X_train,y_train)


Out[563]:
GridSearchCV(cv=None,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [564]:
gs.best_params_,gs.best_score_


Out[564]:
({'C': 0.021544346900318832}, 0.83639705882352944)

In [565]:
est = LinearSVC(C=0.021544346900318832)

In [566]:
est.fit(X_train,y_train)
est.score(X_test,y_test)


Out[566]:
0.81021897810218979

In [567]:
#Confusion matrix for the tuned linear SVM model
from sklearn.metrics import confusion_matrix, classification_report
y_pred = est.predict(X_test)
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)


Out[567]:
array([[61, 21],
       [ 5, 50]])

In [568]:
#Interpreting the confusion matrix
def confusion_matrix_df(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    error_rate = (y_pred != y_test).mean()
    print('error rate: %.2f' % error_rate)
    return cm
    
confusion_matrix_df(y_test, y_pred)


error rate: 0.19
Out[568]:
Predicted label 0 1
True label
0 61 21
1 5 50

Create estimator with Non-Linear kernel

For this you will use sklearn.svm.SVC
Let's first make sure we understand how to read the documentation:

  • What is the default kernel for SVC? kernel='rbf'
  • What other kernels are available in the scikit SVC API? ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable

Implement an SVM classifier using the defaults and fit to our data:


In [569]:
svc1 = SVC()

In [570]:
svc1.fit(X_train,y_train)


Out[570]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [571]:
gs = GridSearchCV(SVC(),{'C':np.logspace(-3,3,10)})

In [572]:
gs.fit(X_train,y_train)


Out[572]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [573]:
gs.best_params_,gs.best_score_


Out[573]:
({'C': 10.0}, 0.55514705882352944)

In [574]:
svc1 = SVC(C=10)
svc1.fit(X_train,y_train)
svc1.score(X_test,y_test)


Out[574]:
0.72262773722627738
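
With the RBF kernel, gamma matters as much as C, and leaving it at its default is likely why the grid search above plateaus around 0.55; a sketch that searches over both (the specific grid values are illustrative, and scaling the features would help here as well):

# Search over both C and gamma for the RBF-kernel SVM
gs_rbf = GridSearchCV(SVC(), {'C': np.logspace(-3, 3, 7),
                              'gamma': np.logspace(-4, 1, 6)})
gs_rbf.fit(X_train, y_train)
gs_rbf.best_params_, gs_rbf.best_score_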

In [579]:
#The scores from the 3 models are as follows
print "Logistic Regression:" 
print clf.score(X_test,y_test)

print "Linear SVM :" 
print est.score(X_test,y_test)

print "Non-Linear Kernel SVM:" 
print svc1.score(X_test,y_test)


Logistic Regression:
0.861313868613
Linear SVM :
0.810218978102
Non-Linear Kernel SVM:
0.722627737226

In [580]:
#Logistic regression has the highest test score, and its confusion matrix (shown above) looks reasonable.
#Hence logistic regression is the best of the three models for this data.

In [ ]: