In [175]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.lda import LDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
# This enables inline Plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 2)
In [176]:
# Shell command: show the working directory (sanity check for the relative data path below).
!pwd
In [189]:
#I pull in the data twice. One set I filled using normal distributions for continuous values and histograms/random integers
# for discrete values which I replaced with strings to replicate the distribution. One set I just removed all ? data.
# A1-A15 are anonymized features; A16 is the approval label ('+'/'-').
# NOTE(review): this looks like the UCI Credit Approval (crx) dataset — confirm provenance.
col_names=[ "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15","A16"]
# `credit`: raw load — '?' kept as literal strings, to be filled manually below.
credit = pd.read_csv('.//data/crx.data', names=col_names)
# `credit1`: '?' parsed as NaN so the rows can simply be dropped.
credit1 = pd.read_csv('.//data/crx.data', names=col_names, na_values='?')
In [192]:
# Preview the NaN-parsed frame.
credit1.head()
Out[192]:
In [191]:
# Listwise deletion: drop every row containing any missing value.
# NOTE(review): the pre-drop credit1 is overwritten and unrecoverable after this cell.
credit1=credit1.dropna()
In [85]:
# Preview the manually-filled frame.
credit.head()
Out[85]:
In [9]:
# Column dtypes and non-null counts.
credit.info()
In [10]:
# Inspect the distinct A1 values (includes the '?' placeholder).
credit['A1'].unique()
Out[10]:
In [11]:
credit['A1'].value_counts().plot(kind='bar');
In [12]:
# NOTE(review): np.random.randint(0,2) is evaluated ONCE before replace() runs,
# so every '?' in A1 receives the same 0-or-1 value — per-row draws were likely
# intended; confirm.
credit['A1'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [13]:
# Map the numeric fill values back onto the column's category labels.
credit['A1'].replace(to_replace=0, value='a',inplace=True)
In [14]:
credit['A1'].replace(to_replace=1, value='b',inplace=True)
# randint(0,2) only yields 0 or 1, so this mapping of 2 can never match.
credit['A1'].replace(to_replace=2, value='b',inplace=True)
In [15]:
# Verify no '?' remains in A1.
credit['A1'].unique()
Out[15]:
In [16]:
# Temporarily mark missing A2 values with 0 so the column can be cast to float.
credit['A2'].replace(to_replace='?', value= 0,inplace=True)
In [17]:
credit['A2'].unique()
credit['A2']=list(map(float, credit['A2']))
#credit['A2'].replace(to_replace='?', value=credit['A2'].mean(),inplace=True)
In [18]:
listremove=credit['A2']
In [19]:
data=[]
for i in listremove:
if i != '0':
data.append(i)
mean = sum(data)/len(data)
std=np.std(data)
In [20]:
# Replace the 0 placeholders in A2 with a draw from N(mean, std).
# NOTE(review): np.random.normal(...) is evaluated once, so every placeholder
# receives the SAME draw — per-row draws were likely intended; confirm.
credit['A2'].replace(to_replace=0, value=np.random.normal(mean,std),inplace=True)
In [21]:
# Inspect distinct A4 values (includes '?').
credit['A4'].unique()
Out[21]:
In [21]:
In [22]:
credit['A4'].value_counts().plot(kind='bar');
In [23]:
# NOTE(review): randint(0,2) is evaluated once and yields only 0 or 1, and both
# map to 'u' below — so every '?' in A4 effectively becomes 'u'.
credit['A4'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [24]:
credit['A4'].replace(to_replace=0, value='u' ,inplace=True)
credit['A4'].replace(to_replace=1, value='u' ,inplace=True)
# Dead mapping: randint(0,2) never produces 2.
credit['A4'].replace(to_replace=2, value='y' ,inplace=True)
# Collapse the rare 'l' category into 'y'.
credit['A4'].replace(to_replace='l', value='y' ,inplace=True)
In [25]:
credit['A5'].unique()
Out[25]:
In [26]:
credit['A5'].value_counts().plot(kind='bar');
In [27]:
# Same single-draw fill pattern for A5 ('?' -> 0 or 1 -> 'g').
credit['A5'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [28]:
credit['A5'].replace(to_replace=0, value='g' ,inplace=True)
credit['A5'].replace(to_replace=1, value='g' ,inplace=True)
# Dead mapping: randint(0,2) never produces 2.
credit['A5'].replace(to_replace=2, value='p' ,inplace=True)
In [29]:
credit['A6'].unique()
Out[29]:
In [30]:
credit['A6'].value_counts().plot(kind='bar');
In [31]:
# Fill '?' in A6 with a single 0/1 draw, mapped to 'c' or 'q' below.
credit['A6'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [32]:
credit['A6'].replace(to_replace=0, value='c' ,inplace=True)
credit['A6'].replace(to_replace=1, value='q' ,inplace=True)
# Dead mapping: randint(0,2) never produces 2.
credit['A6'].replace(to_replace=2, value='w' ,inplace=True)
In [33]:
# Re-plot A6 to confirm the fill did not distort the distribution.
credit['A6'].value_counts().plot(kind='bar');
In [34]:
credit['A7'].unique()
Out[34]:
In [35]:
credit['A7'].value_counts().plot(kind='bar');
In [36]:
# Fill '?' in A7 with a single draw from 0..6 (upper bound 7 is exclusive).
credit['A7'].replace(to_replace='?', value=np.random.randint(0,7),inplace=True)
In [37]:
credit['A7'].replace(to_replace=0, value='v' ,inplace=True)
credit['A7'].replace(to_replace=1, value='v' ,inplace=True)
credit['A7'].replace(to_replace=2, value='v' ,inplace=True)
credit['A7'].replace(to_replace=3, value='h' ,inplace=True)
credit['A7'].replace(to_replace=4, value='h' ,inplace=True)
credit['A7'].replace(to_replace=5, value='bb' ,inplace=True)
credit['A7'].replace(to_replace=6, value='ff' ,inplace=True)
# Dead mapping: randint(0,7) never produces 7, so 'z' is never assigned here.
credit['A7'].replace(to_replace=7, value='z' ,inplace=True)
In [38]:
# Spot-check the remaining columns for '?' placeholders.
credit['A8'].unique()
#no missing values
Out[38]:
In [39]:
credit['A9'].unique()
Out[39]:
In [40]:
credit['A10'].unique()
Out[40]:
In [41]:
credit['A11'].unique()
Out[41]:
In [42]:
credit['A12'].unique()
Out[42]:
In [43]:
credit['A13'].unique()
Out[43]:
In [44]:
credit['A14'].unique()
Out[44]:
In [45]:
credit['A15'].unique()
Out[45]:
In [46]:
# Check the element type of A15.
type(credit['A15'][1])
Out[46]:
In [47]:
# A16 is the label column: '+' / '-'.
credit['A16'].unique()
Out[47]:
In [48]:
# Copy the raw label so A16 is preserved alongside the numeric target.
credit['Approval']=credit['A16']
In [49]:
# Binary target: '+' (approved) -> 1, '-' (denied) -> 0.
credit['Approval']=credit['Approval'].replace('+',1)
credit['Approval']=credit['Approval'].replace('-',0)
credit['Approval']
Out[49]:
In [194]:
# Same target encoding for the dropna frame.
credit1['Approval']=credit1['A16']
credit1['Approval']=credit1['Approval'].replace('+',1)
credit1['Approval']=credit1['Approval'].replace('-',0)
credit1['Approval']
Out[194]:
In [50]:
#Col A1 significant? Slightly
# Per-category approval rate: with Approval in {0,1}, sum/count = fraction approved.
approved_by_a1 = credit.groupby('A1').Approval.agg(['sum', 'count'])
approved_by_a1['creditrating'] = approved_by_a1['sum'] / approved_by_a1['count']
approved_by_a1
Out[50]:
In [51]:
counta=0
for i in credit['A16']:
if i=='+':
counta = counta+1
counta
Out[51]:
In [52]:
countb=0
for i in credit['A16']:
if i=='-':
countb = countb+1
countb
Out[52]:
In [53]:
#Col A4 significant? Yes
# Approval rate per A4 category (sum of 1s / count = fraction approved).
approved_by_a4 = credit.groupby('A4').Approval.agg(['sum', 'count'])
approved_by_a4['creditrating'] = approved_by_a4['sum'] / approved_by_a4['count']
approved_by_a4
Out[53]:
In [54]:
# Approval rate per A5 category.
approved_by_a5 = credit.groupby('A5').Approval.agg(['sum', 'count'])
approved_by_a5['creditrating'] = approved_by_a5['sum'] / approved_by_a5['count']
approved_by_a5
Out[54]:
In [55]:
# Approval rate per A6 category.
approved_by_a6 = credit.groupby('A6').Approval.agg(['sum', 'count'])
approved_by_a6['creditrating'] = approved_by_a6['sum'] / approved_by_a6['count']
approved_by_a6
Out[55]:
In [ ]:
# I start fitting and scoring my data to several models below including logistic regression, LDA, nearest neighbor, SVC,
#and Linear SVC. I found logistic regression to return the best score with a grid search on the parameter C.
#My confusion matrix shows I predicted the disapproval very well and approval moderately well.
#When I recheck my model using a dataset that threw away all of the ? data rather than filling it, my model does even better.
#This makes me question whether the idea to fill data is a good idea or not.
In [56]:
from sklearn.cross_validation import train_test_split
In [57]:
# Feature matrix: every column except the A16 label and A14.
# NOTE(review): A14 is excluded here — confirm whether that was intentional.
X_data = credit[["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A15"]]
In [58]:
# One-hot encode the categorical columns.
X_data = pd.get_dummies(X_data)
In [119]:
y_data=credit["Approval"]
In [120]:
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
In [195]:
# Same feature set and encoding for the dropna version of the data.
X_data1 = credit1[["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A15"]]
X_data1 = pd.get_dummies(X_data1)
In [197]:
y_data1=credit1["Approval"]
In [198]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_data1, y_data1, random_state=12, test_size=0.2)
In [121]:
from sklearn.linear_model import LogisticRegression
In [122]:
# Baseline logistic regression with a hand-picked regularization strength.
clf = LogisticRegression(C=1.5)
In [123]:
clf.fit(X_train, y_train)
Out[123]:
In [124]:
#looks like logistic regression is the best
# Accuracy on the held-out 20%.
clf.score(X_test, y_test)
Out[124]:
In [155]:
# Grid-search C over 100 log-spaced values between 1e-3 and 1e3.
param={'C':np.logspace(-3,3,100)}
lin_grid = GridSearchCV(LogisticRegression(),param)
lin_grid.fit(X_train,y_train)
Out[155]:
In [156]:
lin_grid.score(X_test, y_test)
Out[156]:
In [200]:
# Same grid search on the dropna dataset for comparison.
param={'C':np.logspace(-3,3,100)}
lin_grid1 = GridSearchCV(LogisticRegression(),param)
lin_grid1.fit(X_train1,y_train1)
Out[200]:
In [201]:
lin_grid1.score(X_test1, y_test1)
Out[201]:
In [125]:
from sklearn.metrics import confusion_matrix, classification_report
In [126]:
# Hard class predictions from the baseline logistic regression.
y_pred = clf.predict(X_test)
In [157]:
confusion_matrix(y_test, y_pred)
Out[157]:
In [166]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [128]:
print classification_report(y_test, y_pred)
In [207]:
# Predictions from the grid-searched logistic regression (filled dataset).
y_pred2 = lin_grid.predict(X_test)
In [208]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred2)
print(cm)
# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [202]:
# Predictions from the grid-searched model trained on the dropna dataset.
y_pred1 = lin_grid1.predict(X_test1)
In [204]:
# Compute confusion matrix
cm = confusion_matrix(y_test1, y_pred1)
print(cm)
# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [129]:
pd.DataFrame(zip(X_data.columns, np.transpose(clf.coef_)))
Out[129]:
In [158]:
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test
y_pred_df.head()
Out[158]:
In [130]:
In [131]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)
Out[131]:
In [132]:
svc.score(X_test,y_test)
#not as good as logistic regression
Out[132]:
In [133]:
linest=LinearSVC(C=1e-3)
linest.fit(X_train, y_train)
Out[133]:
In [134]:
#not as good as logistic regression
linest.score(X_train, y_train)
Out[134]:
In [135]:
lda = LDA()
lda.fit(X_train, y_train)
Out[135]:
In [136]:
lda.score(X_test, y_test)
#linear discriminant analysis not quite as good as logistic regression
Out[136]:
In [137]:
# 1-nearest-neighbor classifier.
neighbor = KNeighborsClassifier(n_neighbors=1)
neighbor.fit(X_train, y_train)
Out[137]:
In [138]:
neighbor.score(X_test, y_test)
#nearest neighbor not as good as logistic regression
Out[138]:
In [139]:
# Sanity-check the split shapes.
X_train.shape
Out[139]:
In [140]:
y_train.shape
Out[140]:
In [141]:
# Grid-search the RBF SVM over C and gamma (10x10 log-spaced grid).
param={'C':np.logspace(-3,3,10), 'gamma':np.logspace(-3,3,10)}
svc_grid = GridSearchCV(SVC(),param)
svc_grid.fit(X_train,y_train)
Out[141]:
In [142]:
svc_grid.score(X_test,y_test)
Out[142]:
In [162]:
# Grid-search LinearSVC over 500 log-spaced C values.
param={'C':np.logspace(-3,3,500)}
linsvc_grid = GridSearchCV(LinearSVC(),param)
linsvc_grid.fit(X_train,y_train)
Out[162]:
In [160]:
linsvc_grid.score(X_test,y_test)
Out[160]:
In [82]:
In [ ]: