In [340]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
pd.set_option('display.max_rows', 500)
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix, classification_report
In [341]:
#get the data
#the data has no header row, so import with header=None to keep the first row of data from being read as the header
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',header=None)
df.head()
Out[341]:
In [342]:
#build the new column names ('A1' through 'A16') and assign them in one pass
df.columns = ['A%d' % (i + 1) for i in range(16)]
df.head()
df.head()
Out[342]:
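As an aside (a sketch, not part of the original run), read_csv can assign the column names directly via its names parameter, which makes the rename step unnecessary:

#sketch: name the columns at read time instead of renaming afterwards
cols = ['A%d' % (i + 1) for i in range(16)]
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',
                 header=None, names=cols)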
In [343]:
#check dtypes and non-null counts per column to spot missing values
df.info()
In [344]:
df.head()
Out[344]:
In [345]:
#there are a bunch of question marks (missing data)
#replace them with NaN so pandas treats them as missing
df.replace('?', np.nan, inplace = True)
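Alternatively (a sketch, assuming the file is re-read from the URL), read_csv can mark '?' as missing at load time via na_values, which skips the replace step entirely:

#sketch: treat '?' as missing while reading
cols = ['A%d' % (i + 1) for i in range(16)]
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',
                 header=None, names=cols, na_values='?')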
In [346]:
#now, impute values for the new 'Nan' values
#A1 - randomly impute 'a' or 'b'
#A2 - impute A2 mean
#A4 - randomly impute 'u' or 'l' or 'y' or 't'
#A5 - randomly impute 'g' or 'p' or 'gg'
#A6 - randomly impute 'c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff'
#A7 - randomly impute 'v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o'
#A14 - impute A14 mean
#switch data type from object to float
df.A2 = df.A2.astype(float)
df['A2'].fillna(df['A2'].mean(), inplace=True)
df.A14 = df.A14.astype(float)
df['A14'].fillna(df['A14'].mean(), inplace=True)
#for the categorical columns, sample one observed (non-null) value and fill with it
#note: fillna with a scalar fills every missing cell in the column with that same draw
df['A1'].fillna(np.random.choice(df['A1'].dropna()), inplace=True)
df['A4'].fillna(np.random.choice(df['A4'].dropna()), inplace=True)
df['A5'].fillna(np.random.choice(df['A5'].dropna()), inplace=True)
df['A6'].fillna(np.random.choice(df['A6'].dropna()), inplace=True)
df['A7'].fillna(np.random.choice(df['A7'].dropna()), inplace=True)
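If each missing cell should get its own independent random draw rather than one shared value, a per-row variant (a sketch, not part of the original run) looks like this:

#sketch: fill every missing cell with an independent draw from the observed values
for col in ['A1', 'A4', 'A5', 'A6', 'A7']:
    observed = df[col].dropna().values
    mask = df[col].isnull()
    df.loc[mask, col] = np.random.choice(observed, size=mask.sum())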
In [347]:
#test to make sure there are no more NaN values
df[pd.isnull(df.A7)] #spot-check A7; repeat for A1, A2, etc.
Out[347]:
In [348]:
#describe the data
df.describe()
Out[348]:
In [349]:
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
Out[349]:
In [350]:
df.corr()
Out[350]:
In [351]:
X = df.loc[:,'A1':'A15']
X = pd.get_dummies(X)
X.head()
Out[351]:
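One optional refinement (an aside, not applied in this run): get_dummies emits one indicator per category, so each categorical contributes a redundant, perfectly collinear column. drop_first=True removes one level per variable:

#sketch: drop one level per categorical to avoid perfectly collinear dummies
X_alt = pd.get_dummies(df.loc[:,'A1':'A15'], drop_first=True)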
In [352]:
y = df['A16']
y.head()
Out[352]:
In [353]:
#Split the data into train/test sets
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=.3, random_state=12)
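One hedged refinement: if the +/- classes are imbalanced, a stratified split keeps the class ratio the same in train and test. A drop-in sketch:

#sketch: stratify the split on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=12, stratify=y)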
In [354]:
y_train.shape
Out[354]:
In [355]:
#call Logistic Regression model
lr_est = LogisticRegression()
In [356]:
#fit training data to the LR model
lr_est.fit(X_train,y_train)
Out[356]:
In [357]:
#score the test data using the LR model
lr_est.score(X_test,y_test)
Out[357]:
In [358]:
#call SVC
svc_est = SVC()
In [359]:
#fit training data to SVC
svc_est.fit(X_train,y_train)
Out[359]:
In [360]:
#score the test data using SVC
svc_est.score(X_test,y_test)
Out[360]:
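A likely reason the untuned SVC trails here (an assumption, not verified in this run): the RBF kernel is sensitive to feature scale, and the continuous columns (A2, A3, A14, A15) span very different ranges. A sketch of a standardized pipeline:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#sketch: standardize features before the RBF-kernel SVC
svc_scaled = make_pipeline(StandardScaler(), SVC())
svc_scaled.fit(X_train, y_train)
svc_scaled.score(X_test, y_test)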
In [361]:
#use grid search to find the best C for the LR model
#search over ten log-spaced values from 1e-3 to 1e3
param = {'C': np.logspace(-3,3,10)}
gs_lr = GridSearchCV(LogisticRegression(),param)
In [362]:
#fit the training data to grid search
gs_lr.fit(X_train,y_train)
Out[362]:
In [363]:
#what is the mean CV score for each C in the range?
pd.DataFrame(gs_lr.cv_results_)[['param_C', 'mean_test_score']]
Out[363]:
In [364]:
print('Best Score: ', gs_lr.best_score_)
print('Best Parameter: ', gs_lr.best_params_)
In [365]:
#Incorporate best parameters
lr_est2 = LogisticRegression(C=gs_lr.best_params_['C'])
lr_est2.fit(X_train, y_train)
Out[365]:
In [366]:
lr_est2.score(X_test,y_test)
Out[366]:
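A small note (an aside, not from the original run): GridSearchCV refits the best model on the full training set by default, so the manual re-instantiation above can be skipped:

#sketch: score with the refit best estimator directly
gs_lr.best_estimator_.score(X_test, y_test)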
In [367]:
#use grid search to find the best C and gamma for the SVC model
param = {'C': np.logspace(-3,3,10),'gamma':np.logspace(-3,3,10)}
gs_svc = GridSearchCV(SVC(),param)
In [368]:
#fit the training data to grid search
gs_svc.fit(X_train,y_train)
Out[368]:
In [369]:
#which is the best score? The best 'C' and 'gamma' parameters?
print "Best Score: ", gs_svc.best_score_
print "Best Params: ", gs_svc.best_params_
In [370]:
#Incorporate best parameters
svc_est2 = SVC(C=gs_svc.best_params_['C'],gamma=gs_svc.best_params_['gamma'])
svc_est2.fit(X_train,y_train)
Out[370]:
In [371]:
svc_est2.score(X_test,y_test)
Out[371]:
In [372]:
#Create a confusion matrix for the Logistic Regression model
y_pred_lr = lr_est2.predict(X_test)
y_pred_lr
Out[372]:
In [373]:
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred_lr)
Out[373]:
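The raw array does not say which axis is which. A labeled version (a sketch; sklearn orders rows and columns by sorted class label) can help:

#sketch: label the confusion matrix rows (true class) and columns (predicted class)
labels = sorted(y.unique())
pd.DataFrame(confusion_matrix(y_test, y_pred_lr, labels=labels),
             index=['true %s' % l for l in labels],
             columns=['pred %s' % l for l in labels])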
In [374]:
# Examine Precision and Recall
print(classification_report(y_test, y_pred_lr))
In [375]:
# Examine the coefficient on each dummy variable (direction and magnitude, not formal significance)
pd.DataFrame(list(zip(X.columns, lr_est2.coef_.ravel())), columns=['feature', 'coef'])
Out[375]:
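To see which dummies matter most, sorting by absolute coefficient is a quick follow-up (a sketch, not part of the original run; the coefficients are not on a common scale unless the features are standardized):

#sketch: rank features by absolute coefficient size
coefs = pd.Series(lr_est2.coef_.ravel(), index=X.columns)
coefs.reindex(coefs.abs().sort_values(ascending=False).index).head(10)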
In [376]:
#Now run the confusion matrix for the SVC model
y_pred_svc = svc_est2.predict(X_test)
y_pred_svc
Out[376]:
In [377]:
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred_svc)
Out[377]:
In [378]:
# Examine Precision and Recall
print(classification_report(y_test, y_pred_svc))
In [379]:
#Of the two models, Logistic Regression seems like the better model for this data
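To make that comparison explicit, a small recap cell (a sketch, not part of the original run) can print both tuned test scores side by side:

#sketch: tuned test accuracies, side by side
print('Tuned LR test accuracy: ', lr_est2.score(X_test, y_test))
print('Tuned SVC test accuracy:', svc_est2.score(X_test, y_test))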