In [12]:
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# note: in scikit-learn >= 0.18 these two live in sklearn.model_selection
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

%matplotlib inline

# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 8)

In [13]:
print 'Pandas Version: ', pd.__version__
print 'Numpy Version: ', np.__version__
print 'Python Version: ', sys.version


Pandas Version:  0.15.2
Numpy Version:  1.9.1
Python Version:  2.7.9 |Continuum Analytics, Inc.| (default, Dec 12 2014, 14:56:35) 
[GCC 4.2.1 (Apple Inc. build 5577)]

In [16]:
df = pd.read_csv('../data/credit-screening/crx_data.csv', header=None)

In [17]:
# show every cell containing the '?' missing-value placeholder
df[df.values=='?']


Out[17]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
71 b 34.83 4.000 u g d bb 12.500 t f 0 t g ? 0 -
83 a ? 3.500 u g d v 3.000 t f 0 t g 00300 0 -
86 b ? 0.375 u g d v 0.875 t f 0 t s 00928 0 -
92 b ? 5.000 y p aa v 8.500 t f 0 f g 00000 0 -
97 b ? 0.500 u g c bb 0.835 t f 0 t s 00320 0 -
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
622 a 25.58 0.000 ? ? ? ? 0.000 f f 0 f p ? 0 +
622 a 25.58 0.000 ? ? ? ? 0.000 f f 0 f p ? 0 +
626 b 22.00 7.835 y p i bb 0.165 f f 0 t g ? 0 -
641 ? 33.17 2.250 y p cc v 3.500 f f 0 t g 00200 141 -
673 ? 29.50 2.000 y p e h 2.000 f f 0 f g 00256 17 -

67 rows × 16 columns
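
The 67 rows above correspond to the 67 individual '?' cells in the frame: boolean indexing with a 2-D mask repeats a row once per matching cell, which is why row 622 (five missing fields) appears several times. A per-column tally, not run in this session, would be:

(df == '?').sum()   # number of '?' placeholders in each column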


In [18]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null object
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null int64
15    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [19]:
# replace '?' with NaN and cast the two affected numeric columns
df.replace('?', np.nan, inplace=True)
df[1] = df[1].astype(float)
df[13] = df[13].astype(float)

#rename columns with information from metadata
df.columns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16']

In [20]:
# convert the target into 1/0: '+' -> 1, '-' -> 0
df['a17'] = df.a16.map({'+': 1, '-': 0})

In [21]:
# histograms of each continuous column, split by the target class (a17)
continuous_columns = ['a2','a3','a8','a11','a14','a15']

for i in continuous_columns:
    df[i].hist(by=df.a17,sharey=True, bins=20, figsize=(10,3))



In [22]:
# bar charts of the mean of a17 within each level of each categorical column;
# a8 is continuous (see above), so it is excluded here
categorical_columns = ['a1','a4','a5','a6','a7','a9','a10','a12','a13']

for i in categorical_columns:
    df.groupby(i).a17.agg(['mean']).plot(kind='bar', legend=False)



In [23]:
# fill missing continuous values with random draws from N(mean, sd)
continuous_columns = ['a2','a3','a8','a11','a14','a15']

def impute_continuous_values(mean, sd, n):
    return np.random.normal(mean, sd, n)

for i in continuous_columns:
    df.loc[df[i].isnull(), i] = impute_continuous_values(df[i].mean(),
                                                         df[i].std(),
                                                         df[i].isnull().sum())
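
One caveat with drawing imputed values from N(mean, sd): the draws can land outside the observed range (e.g. a negative age for a2). A clipped variant, shown here only as an untested sketch with a hypothetical helper name, would bound the draws:

def impute_clipped(series, n):
    # hypothetical alternative: draw from N(mean, sd), then clip to the observed range
    draws = np.random.normal(series.mean(), series.std(), n)
    return np.clip(draws, series.min(), series.max())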

In [24]:
# transformer that fills object (categorical) columns with their most frequent
# value and numeric columns with their mean
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """
    Impute missing values.
    Columns of dtype object are imputed with the most frequent value
    in the column.
    Columns of other types are imputed with the mean of the column.
    """

    def fit(self, X, y=None):
        # learn one fill value per column
        self.fill = pd.Series([X[c].value_counts().index[0]
                               if X[c].dtype == np.dtype('O') else X[c].mean()
                               for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
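
Note that fit() learns the fill values from whatever frame it is given. Since the imputer below is fit on the full dataset before the train/test split, the test rows leak into the fill statistics; to avoid that, one could fit on the training rows only, roughly like this (a sketch with hypothetical X_train_df/X_test_df splits, not run in this notebook):

imputer = DataFrameImputer().fit(X_train_df)    # learn fills from training rows only
X_train_filled = imputer.transform(X_train_df)
X_test_filled = imputer.transform(X_test_df)    # reuse the training-set fills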

In [25]:
# features: columns a1..a15 (a16 is the raw label); impute, then one-hot encode the categoricals
X = df.iloc[:, :15]
X = pd.get_dummies(DataFrameImputer().fit_transform(X))
y = df.a17

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X.values,y.values, test_size=0.2, random_state=12)
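
train_test_split in this scikit-learn version has no stratify option, so the class balance of the two halves is left to chance. If preserving the +/- proportions mattered, StratifiedShuffleSplit (available in sklearn.cross_validation here) could be used instead; a sketch, not run in this session:

from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(y.values, n_iter=1, test_size=0.2, random_state=12)
for train_idx, test_idx in sss:
    X_train, X_test = X.values[train_idx], X.values[test_idx]
    y_train, y_test = y.values[train_idx], y.values[test_idx]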

In [27]:
# grid search over regularization type and strength for logistic regression
params = {'penalty': ['l1', 'l2'],
          'C': np.logspace(-3, 3, 100)}

log_est = GridSearchCV(LogisticRegression(), params, n_jobs=-1)

In [28]:
log_est.fit(X_train,y_train)


Out[28]:
GridSearchCV(cv=None,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-03,   1.14976e-03, ...,   8.69749e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [29]:
log_est.best_params_


Out[29]:
{'C': 4.3287612810830618, 'penalty': 'l1'}
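
To see how sensitive the cross-validated score is to C and the penalty, the per-candidate results can be inspected via grid_scores_ (the attribute in this scikit-learn version; 0.18+ replaced it with cv_results_). A sketch:

for params, mean_score, cv_scores in log_est.grid_scores_:
    print params, round(mean_score, 4)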

In [30]:
log_est.score(X_test,y_test)


Out[30]:
0.86956521739130432
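
For context, a majority-class baseline would score about 77/138 ≈ 0.56 on this test split (the supports in the classification report below are 77 and 61), so the model is well above chance:

1 - y_test.mean()   # proportion of the majority class (0) in the test split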

In [31]:
y_pred = log_est.predict(X_test)

In [32]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualize the confusion matrix (rendered inline via %matplotlib inline)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[64 13]
 [ 5 56]]
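
As a sanity check, the accuracy reported by score() in In [30] can be recovered from the confusion matrix:

(cm[0, 0] + cm[1, 1]) / float(cm.sum())   # (64 + 56) / 138 ≈ 0.8696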

In [33]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.93      0.83      0.88        77
          1       0.81      0.92      0.86        61

avg / total       0.88      0.87      0.87       138


In [34]:
# jittered scatter of predicted vs. true labels
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b12b090>

In [35]:
# grid search over C and gamma for an RBF-kernel SVM
param_grid = {'C': np.logspace(-3, 3, 10),
              'kernel': ['rbf'],
              'gamma': np.logspace(-3, 3, 10)}

svm_est = GridSearchCV(SVC(), param_grid, n_jobs=-1)

In [36]:
svm_est.fit(X_train,y_train)


Out[36]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'kernel': ['rbf'], 'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [37]:
svm_est.best_params_


Out[37]:
{'C': 46.415888336127729, 'gamma': 0.001, 'kernel': 'rbf'}
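
The selected gamma (0.001) sits on the lower boundary of the searched grid, so the true optimum may lie outside it; extending the gamma range downward would be a reasonable follow-up, e.g.:

param_grid = {'C': np.logspace(-3, 3, 10),
              'kernel': ['rbf'],
              'gamma': np.logspace(-6, 0, 10)}   # search below 1e-3 as well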

In [38]:
y_pred = svm_est.predict(X_test)

In [39]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualize the confusion matrix (rendered inline via %matplotlib inline)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[54 23]
 [15 46]]

In [40]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.78      0.70      0.74        77
          1       0.67      0.75      0.71        61

avg / total       0.73      0.72      0.73       138


In [41]:
# jittered scatter of predicted vs. true labels
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x1097c4790>
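
The RBF SVM trails the logistic model here, which is unsurprising on an unscaled design matrix: kernel distances get dominated by the large-valued columns (e.g. a15). A natural next step, sketched but not run here using the make_pipeline import from the first cell, is to standardize the features inside the grid search:

from sklearn.preprocessing import StandardScaler

svm_scaled = GridSearchCV(make_pipeline(StandardScaler(), SVC(kernel='rbf')),
                          {'svc__C': np.logspace(-3, 3, 10),
                           'svc__gamma': np.logspace(-6, 0, 10)},
                          n_jobs=-1)
svm_scaled.fit(X_train, y_train)
print svm_scaled.score(X_test, y_test)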
