In [12]:
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# note: in scikit-learn >= 0.18 these two live in sklearn.model_selection
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

%matplotlib inline

# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 8)

In [13]:
print 'Pandas Version: ', pd.__version__
print 'Numpy Version: ', np.__version__
print 'Python Version: ', sys.version


Pandas Version:  0.15.2
Numpy Version:  1.9.1
Python Version:  2.7.9 |Continuum Analytics, Inc.| (default, Dec 12 2014, 14:56:35) 
[GCC 4.2.1 (Apple Inc. build 5577)]

In [16]:
df = pd.read_csv('../data/credit-screening/crx_data.csv', header=None)

In [17]:
# show every cell containing the '?' missing-value placeholder
df[df.values=='?']


Out[17]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
71 b 34.83 4.000 u g d bb 12.500 t f 0 t g ? 0 -
83 a ? 3.500 u g d v 3.000 t f 0 t g 00300 0 -
86 b ? 0.375 u g d v 0.875 t f 0 t s 00928 0 -
92 b ? 5.000 y p aa v 8.500 t f 0 f g 00000 0 -
97 b ? 0.500 u g c bb 0.835 t f 0 t s 00320 0 -
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
622 a 25.58 0.000 ? ? ? ? 0.000 f f 0 f p ? 0 +
622 a 25.58 0.000 ? ? ? ? 0.000 f f 0 f p ? 0 +
626 b 22.00 7.835 y p i bb 0.165 f f 0 t g ? 0 -
641 ? 33.17 2.250 y p cc v 3.500 f f 0 t g 00200 141 -
673 ? 29.50 2.000 y p e h 2.000 f f 0 f g 00256 17 -

67 rows × 16 columns
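
The 67 rows above correspond to the 67 individual '?' cells in the frame: boolean indexing with a 2-D mask repeats a row once per matching cell, which is why row 622 (five missing fields) appears several times. A per-column tally, not run in this session, would be:

(df == '?').sum()   # number of '?' placeholders in each column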


In [18]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null object
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null int64
15    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [19]:
# replace '?' with NaN and cast the two affected numeric columns
df.replace('?', np.nan, inplace=True)
df[1] = df[1].astype(float)
df[13] = df[13].astype(float)

#rename columns with information from metadata
df.columns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16']

In [20]:
# convert the target into 1/0: '+' -> 1, '-' -> 0
df['a17'] = df.a16.map({'+': 1, '-': 0})

In [21]:
# histograms of each continuous column, split by the target class (a17)
continuous_columns = ['a2','a3','a8','a11','a14','a15']

for i in continuous_columns:
    df[i].hist(by=df.a17,sharey=True, bins=20, figsize=(10,3))



In [22]:
# bar charts of the mean of a17 within each level of each categorical column;
# a8 is continuous (see above), so it is excluded here
categorical_columns = ['a1','a4','a5','a6','a7','a9','a10','a12','a13']

for i in categorical_columns:
    df.groupby(i).a17.agg(['mean']).plot(kind='bar', legend=False)



In [23]:
# fill missing continuous values with random draws from N(mean, sd)
continuous_columns = ['a2','a3','a8','a11','a14','a15']

def impute_continuous_values(mean, sd, n):
    return np.random.normal(mean, sd, n)

for i in continuous_columns:
    df.loc[df[i].isnull(), i] = impute_continuous_values(df[i].mean(),
                                                         df[i].std(),
                                                         df[i].isnull().sum())
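
One caveat with drawing imputed values from N(mean, sd): the draws can land outside the observed range (e.g. a negative age for a2). A clipped variant, shown here only as an untested sketch with a hypothetical helper name, would bound the draws:

def impute_clipped(series, n):
    # hypothetical alternative: draw from N(mean, sd), then clip to the observed range
    draws = np.random.normal(series.mean(), series.std(), n)
    return np.clip(draws, series.min(), series.max())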

In [24]:
# transformer that fills object (categorical) columns with their most frequent
# value and numeric columns with their mean
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """
    Impute missing values.
    Columns of dtype object are imputed with the most frequent value
    in the column.
    Columns of other types are imputed with the mean of the column.
    """

    def fit(self, X, y=None):
        # learn one fill value per column
        self.fill = pd.Series([X[c].value_counts().index[0]
                               if X[c].dtype == np.dtype('O') else X[c].mean()
                               for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
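
Note that fit() learns the fill values from whatever frame it is given. Since the imputer below is fit on the full dataset before the train/test split, the test rows leak into the fill statistics; to avoid that, one could fit on the training rows only, roughly like this (a sketch with hypothetical X_train_df/X_test_df splits, not run in this notebook):

imputer = DataFrameImputer().fit(X_train_df)    # learn fills from training rows only
X_train_filled = imputer.transform(X_train_df)
X_test_filled = imputer.transform(X_test_df)    # reuse the training-set fills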

In [25]:
# features: columns a1..a15 (a16 is the raw label); impute, then one-hot encode the categoricals
X = df.iloc[:, :15]
X = pd.get_dummies(DataFrameImputer().fit_transform(X))
y = df.a17

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X.values,y.values, test_size=0.2, random_state=12)
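
train_test_split in this scikit-learn version has no stratify option, so the class balance of the two halves is left to chance. If preserving the +/- proportions mattered, StratifiedShuffleSplit (available in sklearn.cross_validation here) could be used instead; a sketch, not run in this session:

from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(y.values, n_iter=1, test_size=0.2, random_state=12)
for train_idx, test_idx in sss:
    X_train, X_test = X.values[train_idx], X.values[test_idx]
    y_train, y_test = y.values[train_idx], y.values[test_idx]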

In [27]:
# grid search over regularization type and strength for logistic regression
params = {'penalty': ['l1', 'l2'],
          'C': np.logspace(-3, 3, 100)}

log_est = GridSearchCV(LogisticRegression(), params, n_jobs=-1)

In [28]:
log_est.fit(X_train,y_train)


Out[28]:
GridSearchCV(cv=None,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-03,   1.14976e-03, ...,   8.69749e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [29]:
log_est.best_params_


Out[29]:
{'C': 4.3287612810830618, 'penalty': 'l1'}
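
To see how sensitive the cross-validated score is to C and the penalty, the per-candidate results can be inspected via grid_scores_ (the attribute in this scikit-learn version; 0.18+ replaced it with cv_results_). A sketch:

for params, mean_score, cv_scores in log_est.grid_scores_:
    print params, round(mean_score, 4)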

In [30]:
log_est.score(X_test,y_test)


Out[30]:
0.86956521739130432
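
For context, a majority-class baseline would score about 77/138 ≈ 0.56 on this test split (the supports in the classification report below are 77 and 61), so the model is well above chance:

1 - y_test.mean()   # proportion of the majority class (0) in the test split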

In [31]:
y_pred = log_est.predict(X_test)

In [32]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualize the confusion matrix (rendered inline via %matplotlib inline)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[64 13]
 [ 5 56]]
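
As a sanity check, the accuracy reported by score() in In [30] can be recovered from the confusion matrix:

(cm[0, 0] + cm[1, 1]) / float(cm.sum())   # (64 + 56) / 138 ≈ 0.8696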

In [33]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.93      0.83      0.88        77
          1       0.81      0.92      0.86        61

avg / total       0.88      0.87      0.87       138


In [34]:
# jittered scatter of predicted vs. true labels
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b12b090>

In [35]:
# grid search over C and gamma for an RBF-kernel SVM
param_grid = {'C': np.logspace(-3, 3, 10),
              'kernel': ['rbf'],
              'gamma': np.logspace(-3, 3, 10)}

svm_est = GridSearchCV(SVC(), param_grid, n_jobs=-1)

In [36]:
svm_est.fit(X_train,y_train)


Out[36]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'kernel': ['rbf'], 'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [37]:
svm_est.best_params_


Out[37]:
{'C': 46.415888336127729, 'gamma': 0.001, 'kernel': 'rbf'}
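
The selected gamma (0.001) sits on the lower boundary of the searched grid, so the true optimum may lie outside it; extending the gamma range downward would be a reasonable follow-up, e.g.:

param_grid = {'C': np.logspace(-3, 3, 10),
              'kernel': ['rbf'],
              'gamma': np.logspace(-6, 0, 10)}   # search below 1e-3 as well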

In [38]:
y_pred = svm_est.predict(X_test)

In [39]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualize the confusion matrix (rendered inline via %matplotlib inline)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[54 23]
 [15 46]]

In [40]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.78      0.70      0.74        77
          1       0.67      0.75      0.71        61

avg / total       0.73      0.72      0.73       138


In [41]:
# jittered scatter of predicted vs. true labels
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x1097c4790>
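
The RBF SVM trails the logistic model here, which is unsurprising on an unscaled design matrix: kernel distances get dominated by the large-valued columns (e.g. a15). A natural next step, sketched but not run here using the make_pipeline import from the first cell, is to standardize the features inside the grid search:

from sklearn.preprocessing import StandardScaler

svm_scaled = GridSearchCV(make_pipeline(StandardScaler(), SVC(kernel='rbf')),
                          {'svc__C': np.logspace(-3, 3, 10),
                           'svc__gamma': np.logspace(-6, 0, 10)},
                          n_jobs=-1)
svm_scaled.fit(X_train, y_train)
print svm_scaled.score(X_test, y_test)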
