In [1]:
#Importing all the required packages
# show plots in the notebook
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt
In [2]:
# Load the raw credit-screening records.
# pd.read_csv is used instead of DataFrame.from_csv, which was deprecated in
# pandas 0.21 and removed in 1.0; index_col=None keeps the default row index
# exactly as before.
# NOTE(review): the path is an absolute, machine-specific location -- adjust it
# (or make it relative to the notebook) before running elsewhere.
inputDataset = pd.read_csv("/Users/poj871/IUCourse/Fall2016/Project/CreditScreening.csv", index_col=None)
# Quick sanity check: first two rows and the non-null count per column.
print(inputDataset.head(2))
print(inputDataset.count())
In [3]:
# Remove incomplete records: first the rows containing real NaN values ...
inputDataset = inputDataset.dropna()
# ... then the rows where a feature holds the literal '?' placeholder.
# The columns are processed one after the other, in this order.
columnsWithPlaceholder = [
    'Male',
    'Age',
    'Married',
    'BankCustomer',
    'EducationalLevel',
    'Ethnicity',
    'ZipCode',
]
for columnName in columnsWithPlaceholder:
    placeholderRows = inputDataset[inputDataset[columnName] == '?'].index
    inputDataset = inputDataset.drop(placeholderRows)
# Non-null count per column after cleaning.
print(inputDataset.count())
In [4]:
# Columns holding discrete labels. They are stored as pandas categoricals so
# that a later step can replace them with their integer category codes.
categoricalColumns = [
    'Male',
    'Married',
    'BankCustomer',
    'EducationalLevel',
    'Ethnicity',
    'PriorDefault',
    'Employed',
    'DriversLicense',
    'Citizen',
    # Class label
    'Approved',
]
for columnName in categoricalColumns:
    inputDataset[columnName] = inputDataset[columnName].astype('category')

# 'Age' and 'ZipCode' are compared against the string '?' during cleaning, so
# they are presumably read as text (object dtype) -- TODO confirm against the
# CSV. Left as text they are skipped (or rejected) by numeric aggregations such
# as groupby().mean() and turn the feature matrix into an object array, so
# convert them to real numbers here. pd.to_numeric raises if any value is still
# non-numeric, and is a no-op for columns that are already numeric.
for columnName in ['Age', 'ZipCode']:
    inputDataset[columnName] = pd.to_numeric(inputDataset[columnName])

print(inputDataset.dtypes)
In [5]:
# Replace every categorical column by the integer code of its category.
cat_columns = inputDataset.select_dtypes(['category']).columns
for categoricalName in cat_columns:
    inputDataset[categoricalName] = inputDataset[categoricalName].cat.codes
# Show the first two rows after encoding.
print(inputDataset.head(2))
In [6]:
# Mean of each column within every 'Approved' class; the result is the cell's
# displayed output.
approvedGroups = inputDataset.groupby('Approved')
approvedGroups.mean()
Out[6]:
In [7]:
# Split the table into features (first 15 columns) and labels (the rest).
datasetValues = inputDataset.values
X = datasetValues[:, :15]
# scikit-learn expects a flat 1-D label vector, stored here as C ints.
y = np.asarray(np.ravel(datasetValues[:, 15:]), dtype="intc")
print(y.dtype)
In [8]:
# Fit a logistic regression on the complete data set (fit returns the
# estimator itself, so construction and fitting can be chained).
logisticRegressionModel = LogisticRegression().fit(X, y)
# Mean accuracy on the very same data the model was fitted on, i.e. a
# training score rather than an estimate of generalisation.
logisticRegressionModel.score(X, y)
Out[8]:
In [9]:
# Hold out 28% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.28, random_state=0
)
# Logistic regression fitted on the training part only.
trainModel = LogisticRegression()
trainModel.fit(X_train, y_train)
Out[9]:
In [10]:
# Class labels predicted by the logistic regression for the held-out rows.
predictedLabels = trainModel.predict(X=X_test)
In [11]:
# Accuracy and confusion matrix of the logistic regression on the test split.
logisticAccuracy = metrics.accuracy_score(y_test, predictedLabels)
logisticConfusion = metrics.confusion_matrix(y_test, predictedLabels)
print(logisticAccuracy)
print("\n")
print(logisticConfusion)
In [12]:
# ROC curve of the logistic regression on the test split.
# roc_curve needs a continuous score per sample. Feeding it the hard 0/1
# predictions (as was done before) yields a single operating point and an
# understated AUC, so the predicted probability of the positive class is used.
# Column 1 of predict_proba belongs to the second entry of trainModel.classes_;
# this assumes the labels are the binary codes 0/1 -- TODO confirm.
positiveClassScores = trainModel.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positiveClassScores)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(false_positive_rate)
print(true_positive_rate)

plt.title('Receiver Operating Characteristic Logistic Regression')
plt.plot(false_positive_rate, true_positive_rate, 'b',
         label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [13]:
# Decision tree fitted on the same training split as the logistic regression.
# The tree builder permutes features randomly when searching for splits, so a
# fixed seed (matching the one used for the train/test split) is required for
# the fitted tree -- and every metric derived from it -- to be reproducible.
decisionTreeModel = tree.DecisionTreeClassifier(random_state=0)
decisionTreeModel.fit(X=X_train, y=y_train)
Out[13]:
In [14]:
# Class labels predicted by the decision tree for the held-out rows.
predictedLabelsDT = decisionTreeModel.predict(X=X_test)
In [15]:
# Accuracy and confusion matrix of the decision tree on the test split.
treeAccuracy = metrics.accuracy_score(y_test, predictedLabelsDT)
treeConfusion = metrics.confusion_matrix(y_test, predictedLabelsDT)
print(treeAccuracy)
print("\n")
print(treeConfusion)
In [17]:
# ROC curve of the decision tree on the test split.
# roc_curve needs a continuous score per sample. Feeding it the hard 0/1
# predictions (as was done before) yields a single operating point, so the
# predicted probability of the positive class is used instead.
# Column 1 of predict_proba belongs to the second entry of
# decisionTreeModel.classes_; this assumes the labels are the binary codes
# 0/1 -- TODO confirm.
positiveClassScoresDT = decisionTreeModel.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positiveClassScoresDT)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(false_positive_rate)
print(true_positive_rate)

plt.title('Receiver Operating Characteristic Decision Trees')
plt.plot(false_positive_rate, true_positive_rate, 'b',
         label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()