In [12]:
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 8)
In [13]:
# Record library/interpreter versions for reproducibility.
# Python 2 print statements are syntax errors under Python 3 — use print().
print('Pandas Version: ', pd.__version__)
print('Numpy Version: ', np.__version__)
print('Python Version: ', sys.version)
In [16]:
# Load the raw credit-screening data; the CSV ships with no header row,
# so pandas assigns integer column labels 0..15.
crx_path = '../data/credit-screening/crx_data.csv'
df = pd.read_csv(crx_path, header=None)
In [17]:
# Peek at the cells that use '?' as the missing-value marker
# (everything else shows as NaN in the masked view).
df[df == '?']
Out[17]:
In [18]:
# Dtypes and non-null counts per column — columns containing '?' load as object.
df.info()
In [19]:
# Replace the '?' missing-value marker with NaN so pandas treats it as missing.
df.replace('?', np.nan, inplace=True)
# Columns 1 and 13 were read as object because of the '?' entries; cast them
# to float now that the markers are NaN.  Use the builtin float: the np.float
# alias was deprecated in NumPy 1.20 and removed in 1.24.
df[1] = df[1].astype(float)
df[13] = df[13].astype(float)
# Rename columns with information from metadata
df.columns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8',
              'a9', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16']
In [20]:
#convert y into 1 and 0
def converter(category):
    """Translate the raw approval label into a binary target.

    '+' (approved) -> 1, '-' (rejected) -> 0.
    Any other value raises KeyError, same as the original mapping.
    """
    label_map = {'+': 1, '-': 0}
    return label_map[category]
# Derived binary target: a17 = 1 for approved ('+'), 0 for rejected ('-').
df['a17'] = df.a16.map(converter)
In [21]:
# Histograms of each continuous feature, split by the binary target a17.
continuous_columns = ['a2', 'a3', 'a8', 'a11', 'a14', 'a15']
for col in continuous_columns:
    df[col].hist(by=df.a17, sharey=True, bins=20, figsize=(10, 3))
In [22]:
# Approval rate (mean of the 0/1 target a17) per level of each categorical
# feature.
# NOTE(review): 'a8' is also listed in continuous_columns above — its
# inclusion here may be unintended; confirm against the dataset metadata.
categorical_columns = ['a1', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a12', 'a13']
for col in categorical_columns:
    df.groupby(col).a17.agg(['mean']).plot(kind='bar', legend=False)
In [23]:
# Fill missing continuous values with random draws from N(column mean, column sd).
continuous_columns = ['a2', 'a3', 'a8', 'a11', 'a14', 'a15']
def impute_continuous_values(mean, sd, n):
return np.random.normal(mean, sd, n)
for col in continuous_columns:
    missing = df[col].isnull()
    # Count missing rows via the boolean mask's .sum() instead of building
    # the throwaway subset df[df[col].isnull()] just to take its length.
    # mean()/std() skip NaN, so the draw parameters come from observed data.
    df.loc[missing, col] = impute_continuous_values(df[col].mean(),
                                                    df[col].std(),
                                                    missing.sum())
In [24]:
#class that imputes categorical variables with most frequent values and imputes continuous variables with mean
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame.

    Columns of dtype object are imputed with the most frequent value in the
    column; columns of other dtypes are imputed with the column mean.

    Fix: the original put this docstring inside an empty __init__, where it
    documented nothing; the no-op __init__ is dropped (interface unchanged).
    """

    def fit(self, X, y=None):
        # value_counts() sorts by frequency descending, so index[0] is the
        # modal value of an object column.
        self.fill = pd.Series(
            [X[c].value_counts().index[0] if X[c].dtype == np.dtype('O')
             else X[c].mean()
             for c in X],
            index=X.columns)
        return self

    def transform(self, X, y=None):
        # Fill per-column using the Series computed in fit().
        return X.fillna(self.fill)
In [25]:
# Feature matrix: the first 15 positional columns (a1..a15); a16 is the raw
# label and a17 the derived 0/1 target.  DataFrame.ix was removed in
# pandas 1.0 — .iloc performs the same positional slice.
X = df.iloc[:, :15]
# Impute any remaining missing values, then one-hot encode the categoricals.
X = pd.get_dummies(DataFrameImputer().fit_transform(X))
y = df.a17
In [26]:
# Hold out 20% of rows for testing; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.values,y.values, test_size=0.2, random_state=12)
In [27]:
# Grid over regularisation type and strength (100 log-spaced C values).
params = {'penalty': ['l1', 'l2'],
          'C': np.logspace(-3, 3, 100)}
# Pin solver='liblinear': it supports both l1 and l2 penalties, whereas the
# lbfgs default in newer scikit-learn releases rejects penalty='l1'.
log_est = GridSearchCV(LogisticRegression(solver='liblinear'), params, n_jobs=-1)
In [28]:
# Fit the logistic-regression grid search on the training data.
log_est.fit(X_train,y_train)
Out[28]:
In [29]:
# Best hyper-parameter combination found by the grid search.
log_est.best_params_
Out[29]:
In [30]:
# Score of the best estimator on the held-out test set.
log_est.score(X_test,y_test)
Out[30]:
In [31]:
# Predict test-set labels with the best logistic-regression model.
y_pred = log_est.predict(X_test)
In [32]:
# Confusion matrix for the logistic-regression predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Render the counts as a matrix image: rows = true label, cols = predicted.
plt.matshow(cm)
plt.colorbar()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [33]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
# print() replaces the Python 2 print statement (a syntax error under Py3).
print(classification_report(y_test, y_pred))
In [34]:
# Jittered scatter with a regression fit of predicted vs. true labels.
# Positional data arguments were removed in seaborn 0.12 — pass x=/y=
# explicitly (keywords also work on older seaborn versions).
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[34]:
In [35]:
# RBF-kernel SVM: grid over C and gamma, 10 log-spaced values each.
param_grid = {'C': np.logspace(-3, 3, 10),
              'gamma': np.logspace(-3, 3, 10),
              'kernel': ['rbf']}
svm_est = GridSearchCV(SVC(), param_grid, n_jobs=-1)
In [36]:
# Fit the SVM grid search on the training data.
svm_est.fit(X_train,y_train)
Out[36]:
In [37]:
# Best hyper-parameter combination found by the SVM grid search.
svm_est.best_params_
Out[37]:
In [38]:
# Predict test-set labels with the best SVM model (overwrites the earlier
# logistic-regression y_pred).
y_pred = svm_est.predict(X_test)
In [39]:
# Confusion matrix for the SVM predictions.  The original computed
# confusion_matrix twice and discarded the first result; compute it once.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Render the counts as a matrix image: rows = true label, cols = predicted.
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [40]:
# Per-class precision/recall/F1 for the SVM predictions.
# print() replaces the Python 2 print statement (a syntax error under Py3).
print(classification_report(y_test, y_pred))
In [41]:
# Jittered scatter with a regression fit of predicted vs. true labels.
# Positional data arguments were removed in seaborn 0.12 — pass x=/y=
# explicitly (keywords also work on older seaborn versions).
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[41]:
In [ ]:
In [ ]:
In [ ]: