In [501]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns
# This enables inline Plots
%matplotlib inline
#Limit rows
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)
In [505]:
#Read the file into a dataframe
credit_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',header=None,names=["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10","A11","A12","A13","A14","A15","A16"])
In [506]:
#Understanding the columns and the data
credit_df.info()
In [507]:
# Print the unique values of each column to identify the missing-value marker: it is "?"
for col in range(credit_df.shape[1]):
    print(credit_df.iloc[:, col].unique())
In [508]:
#Replace '?' with Null(Nan) for all columns
credit_df.replace('?',np.nan,inplace=True)
In [509]:
credit_df.info()
In [510]:
#Selecting rows with null values
credit_df[credit_df.isnull().any(axis=1)]
#From the output, we see there are 6 rows where A4, A5, A6, and A7 are null, and 2 rows where A6 and A7 are null.
#They can be removed from the data set.
Out[510]:
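Before dropping, a per-column count of missing values makes the pattern explicit; a minimal check, assuming it runs after the '?'-to-NaN replacement above:
# Number of NaN entries in each column
credit_df.isnull().sum()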
In [511]:
# drop rows where A4 is null
credit_df.dropna(how='any',subset = ['A4'],inplace=True)
In [512]:
# drop rows where A6 is null
credit_df.dropna(how='any',subset = ['A6'],inplace=True)
In [513]:
credit_df.info()
In [514]:
#A2 and A14 are numerical values; impute values for their null rows.
#Cast them to float first so we can check their distributions.
credit_df['A2'] = credit_df.A2.astype(float)
credit_df['A14'] = credit_df.A14.astype(float)
In [515]:
#Plot A2 on a histogram
credit_df.A2.hist(bins=20)
Out[515]:
In [516]:
# Get the Mean and STD of ALL Data
print('Mean Age:', credit_df.A2.mean())
print('Std Age:', credit_df.A2.std())
In [517]:
# Create a Normal Distribution centered on Mean of 31.5 and Standard Dev of 11.8
# Get 12 Entries since that's how many missing entries we have for A2
#function is defined so that we can use the same steps on both the training and test data
def get_a2_impute_values(n):
    return np.random.normal(31.5, 11.8, n)
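A variant of the same idea, sketched as an alternative: deriving the parameters from the column itself and seeding the generator keeps the imputation reproducible. The hardcoded 31.5 and 11.8 above come from the printed mean and std; the function name and seed below are assumptions, not part of the original analysis:
def get_a2_impute_values_seeded(series, n, seed=0):
    # Draw n values from a normal centered on the observed mean/std;
    # pandas mean()/std() skip NaN, and the fixed seed makes reruns repeatable
    rng = np.random.RandomState(seed)
    return rng.normal(series.mean(), series.std(), n)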
In [518]:
# When we look for entries that are null we can create a mask or boolean filter
a2_missing_mask = credit_df.A2.isnull()
In [519]:
# Notice that we have 12 rows with a missing A2 field
credit_df[a2_missing_mask]
Out[519]:
In [520]:
# Now we can set these values to the values we picked from Random Normal Distribution
credit_df.loc[credit_df.A2.isnull(), 'A2'] = get_a2_impute_values(n=12)
In [521]:
#A2 does not have missing values anymore
credit_df.info()
In [522]:
#Imputing Values for A14
credit_df.A14.hist(bins=100)
#See the histogram below
Out[522]:
In [523]:
#From the graph above, many values of A14 are 0. Hence we can impute the null values to 0.
credit_df.A14[credit_df.A14==0].count()
Out[523]:
In [524]:
#Imputing the null A14 values to 0
credit_df.loc[credit_df.A14.isnull(), 'A14'] = 0
In [525]:
#Check that the null values are now 0
credit_df.A14[credit_df.A14==0].count()
Out[525]:
In [526]:
#Only A1 still has null values (10 of them)
#Printing the distribution of a's and b's in the dataframe
credit_df.groupby(["A1"]).count()["A2"]
Out[526]:
In [527]:
np.random.choice(['a','b'],10,p=[0.3,0.7])
Out[527]:
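The p=[0.3, 0.7] above is eyeballed from the groupby counts; a sketch that derives the probabilities from the observed distribution instead, assuming we want the imputed values to match the empirical 'a'/'b' split:
# value_counts(normalize=True) drops NaN and returns proportions summing to 1
a1_probs = credit_df.A1.value_counts(normalize=True)
np.random.choice(a1_probs.index.values, 10, p=a1_probs.values)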
In [528]:
# Create a function to return a random value for A1 as it is a categorical variable and not numerical
# Get 10 Entries since that's how many missing entries we have for A1
#function is defined so that we can use the same steps on both the training and test data
def get_a1_impute_values(n):
    return np.random.choice(['a','b'], n, p=[0.3, 0.7])
In [529]:
# When we look for entries that are null we can create a mask or boolean filter
a1_missing_mask = credit_df.A1.isnull()
In [530]:
# Now we can set these values to draws from the observed categorical distribution
credit_df.loc[credit_df.A1.isnull(), 'A1'] = get_a1_impute_values(n=10)
In [531]:
credit_df.info()
In [293]:
#Finalize the variables required to predict, and check each for significance.
#Before that, change the response y, which is A16 in our case, to 0's and 1's.
#Hence add a new column that acts as the new A16, holding 0's and 1's.
In [532]:
credit_df["cardapproval"] = credit_df.A16.replace(['+','-'],[1,0])
In [533]:
#Significance of A1
approved_by_a1 = credit_df.groupby('A1').cardapproval.agg(['sum', 'count'])
approved_by_a1['approved_rate'] = approved_by_a1['sum'] / approved_by_a1['count']
approved_by_a1
#The rates are not very different. A1 is not a significant regressor. We can remove it from our model.
Out[533]:
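"Not very different" can be checked formally with a chi-square test of independence; a minimal sketch, assuming scipy is available (a large p-value supports dropping A1):
from scipy.stats import chi2_contingency
observed = pd.crosstab(credit_df.A1, credit_df.cardapproval)
chi2, p_value, dof, expected = chi2_contingency(observed)
print('chi2 = %.2f, p = %.3f' % (chi2, p_value))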
In [534]:
#Significance of A4
approved_by_a4 = credit_df.groupby('A4').cardapproval.agg(['sum', 'count'])
approved_by_a4['approved_rate'] = approved_by_a4['sum'] / approved_by_a4['count']
approved_by_a4
#A4 looks significant: the approval rate differs across its values
Out[534]:
In [535]:
#Significance of A5
approved_by_a5 = credit_df.groupby('A5').cardapproval.agg(['sum', 'count'])
approved_by_a5['approved_rate'] = approved_by_a5['sum'] / approved_by_a5['count']
approved_by_a5
Out[535]:
In [536]:
#The two tables above suggest that A4 and A5 are correlated.
credit_df.groupby(['A4'])['A5'].count()
Out[536]:
In [537]:
#Concatenate A4 and A5 and group by the combined value: if each A4 value pairs
#with exactly one A5 value, the two columns are redundant.
credit_df["A4A5"] = credit_df.A4 + credit_df.A5
credit_df.groupby(['A4A5'])['A5'].count()
#They are perfectly paired, so we can drop A5 from our model.
Out[537]:
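A crosstab shows the same pairing without the concatenation trick; a sketch over the same data:
# Each A4 value should map to exactly one A5 value if the two are redundant
pd.crosstab(credit_df.A4, credit_df.A5)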
In [538]:
#Significance of A6
approved_by_a6 = credit_df.groupby('A6').cardapproval.agg(['sum', 'count'])
approved_by_a6['approved_rate'] = approved_by_a6['sum'] / approved_by_a6['count']
approved_by_a6
#A6 looks significant: the approval rate differs across its values
Out[538]:
In [539]:
#Significance of A7
approved_by_a7 = credit_df.groupby('A7').cardapproval.agg(['sum', 'count'])
approved_by_a7['approved_rate'] = approved_by_a7['sum'] / approved_by_a7['count']
approved_by_a7
#A7 looks significant: the approval rate differs across its values
Out[539]:
In [540]:
#Significance of A9
approved_by_a9 = credit_df.groupby('A9').cardapproval.agg(['sum', 'count'])
approved_by_a9['approved_rate'] = approved_by_a9['sum'] / approved_by_a9['count']
approved_by_a9
#A9 looks significant: the approval rate differs across its values
Out[540]:
In [541]:
#Significance of A10
approved_by_a10 = credit_df.groupby('A10').cardapproval.agg(['sum', 'count'])
approved_by_a10['approved_rate'] = approved_by_a10['sum'] / approved_by_a10['count']
approved_by_a10
#A10 looks significant: the approval rate differs across its values
Out[541]:
In [542]:
#Significance of A12
approved_by_a12 = credit_df.groupby('A12').cardapproval.agg(['sum', 'count'])
approved_by_a12['approved_rate'] = approved_by_a12['sum'] / approved_by_a12['count']
approved_by_a12
#The rates are not very different. A12 is not a significant regressor. We can remove it from our model.
Out[542]:
In [543]:
#Significance of A13
approved_by_a13 = credit_df.groupby('A13').cardapproval.agg(['sum', 'count'])
approved_by_a13['approved_rate'] = approved_by_a13['sum'] / approved_by_a13['count']
approved_by_a13
#A13 looks significant as well
Out[543]:
In [544]:
#Is there any correlation between the continuous-valued attributes?
credit_df.corr()
#From the table below, there does not seem to be much correlation among the X's
Out[544]:
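Since seaborn is already imported, a heatmap gives a quick visual check of the same correlation matrix; a minimal sketch (numeric_only=True is needed on newer pandas, which no longer silently drops the object columns):
# annot prints the correlation value inside each cell
sns.heatmap(credit_df.corr(numeric_only=True), annot=True, cmap='coolwarm')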
In [545]:
#Box plot of A15 to check the spread of its values
credit_df.boxplot(['A15'])
Out[545]:
In [546]:
#A8, A11, and A15 seem to have a lot of 0 values. Check the counts.
print(credit_df.A8[credit_df.A8 == 0].count())
#A8 is significant, as only 63 rows have 0 values
print(credit_df.A11[credit_df.A11 == 0].count())
#We could exclude A11, but we will keep it for now.
print(credit_df.A15[credit_df.A15 == 0].count())
#We could exclude A15, but we will keep it for now.
In [547]:
X_data = credit_df.drop(["A5","A16","cardapproval","A4A5"],axis=1)
#Experimented with dropping A1, A11, A12, and A15; removing them does not change the score much
In [548]:
X_data.info()
In [549]:
X_data = pd.get_dummies(X_data)
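One caveat with get_dummies: if the train and test sets were encoded separately, their columns could disagree whenever a category is missing from one side. Here the split happens after encoding, so it is safe; a sketch of the alignment step you would need otherwise (X_train_raw and X_test_raw are hypothetical frames, not variables from this notebook):
train_enc = pd.get_dummies(X_train_raw)
test_enc = pd.get_dummies(X_test_raw)
# reindex forces the test columns to match training, filling unseen categories with 0
test_enc = test_enc.reindex(columns=train_enc.columns, fill_value=0)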
In [550]:
y_data = credit_df["cardapproval"]
In [551]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[551]:
In [552]:
#Confusion Matrix for the logistic regression model
from sklearn.metrics import confusion_matrix, classification_report
y_pred = clf.predict(X_test)
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)
Out[552]:
In [555]:
#Interpreting the confusion matrix
def confusion_matrix_df(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    error_rate = (y_pred != y_test).mean()
    print('error rate: %.2f' % error_rate)
    return cm
confusion_matrix_df(y_test, y_pred)
Out[555]:
In [556]:
# Examine Precision and Recall
print(classification_report(y_test, y_pred))
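The report's precision and recall can be recomputed by hand from the confusion matrix, which makes the definitions concrete; a sketch for the positive class (sklearn's 2x2 matrix unravels as tn, fp, fn, tp):
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
precision = tp / float(tp + fp)  # of predicted approvals, how many were real
recall = tp / float(tp + fn)     # of real approvals, how many we caught
print('precision = %.2f, recall = %.2f' % (precision, recall))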
In [449]:
# Let's take a look at Predicted Probabilities
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test.values  # .values avoids index misalignment with the 0..n-1 index of y_pred_df
y_pred_df.tail()
Out[449]:
In [450]:
# Take a look at Predicted vs True Values
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[450]:
Next, fit a linear SVM: sklearn.svm.LinearSVC
In [558]:
#Initialize a linear SVM with the default parameters (C=1.0)
est = LinearSVC()
In [559]:
est.fit(X_train,y_train)
Out[559]:
In [560]:
est.score(X_test,y_test)
Out[560]:
Tune the regularization parameter C using np.logspace(-3., 3., 10).
Read the example for grid search: sklearn.model_selection.GridSearchCV.
In [562]:
gs = GridSearchCV(LinearSVC(),{'C':np.logspace(-3,3,10)})
In [563]:
gs.fit(X_train,y_train)
Out[563]:
In [564]:
gs.best_params_,gs.best_score_
Out[564]:
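Rather than copying the best C by hand into the next cell, gs.best_estimator_ already holds a model refit on the full training set (GridSearchCV refits by default); a one-line alternative:
gs.best_estimator_.score(X_test, y_test)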
In [565]:
#Refit with the best C found by the grid search above
est = LinearSVC(C=0.021544346900318832)
In [566]:
est.fit(X_train,y_train)
est.score(X_test,y_test)
Out[566]:
In [567]:
#Confusion Matrix for the linear SVM model
from sklearn.metrics import confusion_matrix, classification_report
y_pred = est.predict(X_test)
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)
Out[567]:
In [568]:
#Interpreting the confusion matrix, reusing the helper defined above
confusion_matrix_df(y_test, y_pred)
Out[568]:
For the non-linear kernel you will use sklearn.svm.SVC.
First, implement an SVM classifier using the defaults and fit it to our data:
In [569]:
svc1 = SVC()
In [570]:
svc1.fit(X_train,y_train)
Out[570]:
In [571]:
gs = GridSearchCV(SVC(),{'C':np.logspace(-3,3,10)})
In [572]:
gs.fit(X_train,y_train)
Out[572]:
In [573]:
gs.best_params_,gs.best_score_
Out[573]:
In [574]:
svc1 = SVC(C=10)
svc1.fit(X_train,y_train)
svc1.score(X_test,y_test)
Out[574]:
In [579]:
#The scores from the 3 models are as follows
print("Logistic Regression:")
print(clf.score(X_test, y_test))
print("Linear SVM:")
print(est.score(X_test, y_test))
print("Non-linear kernel SVM:")
print(svc1.score(X_test, y_test))
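A single held-out split can be noisy; a hedged sketch comparing the three models with 5-fold cross-validation on the training data instead (cross_val_score clones each estimator, so the fitted models can be passed directly):
from sklearn.model_selection import cross_val_score
for name, model in [('Logistic Regression', clf), ('Linear SVM', est), ('Kernel SVM', svc1)]:
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))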
In [580]:
#The highest score is for logistic regression, and its confusion matrix (run above) looks good.
#Hence the best model for this data set is logistic regression.
In [ ]: