Logistic Regression Modelling - Credit Screening dataset http://mlr.cs.umass.edu/ml/datasets/Credit+Approval


In [1]:
#Importing all the required packages

# show plots in the notebook
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt


/Users/poj871/SparkVirtualEnv/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# pd.DataFrame.from_csv is deprecated; read_csv is the idiomatic loader
inputDataset = pd.read_csv("/Users/poj871/IUCourse/Fall2016/Project/CreditScreening.csv")

print inputDataset.head(2)

print inputDataset.count()


  Male    Age  Debt Married BankCustomer EducationalLevel Ethnicity  \
0    b  30.83  0.00       u            g                w         v   
1    a  58.67  4.46       u            g                q         h   

   YearsEmployed PriorDefault Employed  CreditScore DriversLicense Citizen  \
0           1.25            t        t            1              f       g   
1           3.04            t        t            6              f       g   

  ZipCode  Income Approved  
0   00202       0        +  
1   00043     560        +  
Male                690
Age                 690
Debt                690
Married             690
BankCustomer        690
EducationalLevel    690
Ethnicity           690
YearsEmployed       690
PriorDefault        690
Employed            690
CreditScore         690
DriversLicense      690
Citizen             690
ZipCode             690
Income              690
Approved            690
dtype: int64

Drop rows with missing values
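Missing entries in this dataset are encoded as the string '?' rather than NaN, so dropna() by itself removes nothing. A quick way to see how many '?' markers each column holds, sketched here (not an original cell of this notebook):

# Count the '?' markers per column
print (inputDataset == '?').sum()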


In [3]:
# Missing values are encoded as '?', so dropna() alone removes nothing
inputDataset = inputDataset.dropna()

# Drop every row that carries a '?' in any of the affected columns
for col in ['Male', 'Age', 'Married', 'BankCustomer',
            'EducationalLevel', 'Ethnicity', 'ZipCode']:
    inputDataset = inputDataset.drop(inputDataset[inputDataset[col] == '?'].index)

print inputDataset.count()


Male                653
Age                 653
Debt                653
Married             653
BankCustomer        653
EducationalLevel    653
Ethnicity           653
YearsEmployed       653
PriorDefault        653
Employed            653
CreditScore         653
DriversLicense      653
Citizen             653
ZipCode             653
Income              653
Approved            653
dtype: int64

Converting the nominal string columns to categorical dtypes


In [4]:
# Convert each nominal column to pandas' category dtype;
# 'Approved' is the class label
categoricalColumns = ['Male', 'Married', 'BankCustomer', 'EducationalLevel',
                      'Ethnicity', 'PriorDefault', 'Employed',
                      'DriversLicense', 'Citizen', 'Approved']

for col in categoricalColumns:
    inputDataset[col] = inputDataset[col].astype('category')

print inputDataset.dtypes


Male                category
Age                   object
Debt                 float64
Married             category
BankCustomer        category
EducationalLevel    category
Ethnicity           category
YearsEmployed        float64
PriorDefault        category
Employed            category
CreditScore            int64
DriversLicense      category
Citizen             category
ZipCode               object
Income                 int64
Approved            category
dtype: object

Encoding the categorical columns as integer codes, which turns the 'Approved' label into 0 or 1 instead of '+' or '-'


In [5]:
cat_columns = inputDataset.select_dtypes(['category']).columns

inputDataset[cat_columns] = inputDataset[cat_columns].apply(lambda x: x.cat.codes)

print inputDataset.head(2)


   Male    Age  Debt  Married  BankCustomer  EducationalLevel  Ethnicity  \
0     1  30.83  0.00        1             0                12          7   
1     0  58.67  4.46        1             0                10          3   

   YearsEmployed  PriorDefault  Employed  CreditScore  DriversLicense  \
0           1.25             1         1            1               0   
1           3.04             1         1            6               0   

   Citizen ZipCode  Income  Approved  
0        0   00202       0         0  
1        0   00043     560         0  
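cat.codes assigns integer codes following the sorted order of the categories, which is why '+' became 0 and '-' became 1 in the Approved column above. A sketch to confirm the mapping (it would have to run before the cat.codes conversion, while the columns are still categorical):

# Categories are stored in sorted order: '+' maps to 0, '-' maps to 1
print(inputDataset['Approved'].cat.categories)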

Exploring the dataset with groupby: the mean of each feature per class (recall that '+' was encoded as 0)


In [6]:
inputDataset.groupby('Approved').mean()


Out[6]:
              Male      Debt   Married  BankCustomer  EducationalLevel  \
Approved
0         0.679054  5.971943  1.145270      0.310811          6.354730
1         0.697479  3.882325  1.299720      0.599440          5.291317

          Ethnicity  YearsEmployed  PriorDefault  Employed  CreditScore  \
Approved
0          5.054054       3.475186      0.939189  0.685811     4.716216
1          5.109244       1.223725      0.198880  0.235294     0.666667

          DriversLicense   Citizen       Income
Approved
0               0.489865  0.104730  2009.726351
1               0.439776  0.215686   187.974790

Creating the feature and label arrays for the model


In [7]:
# First 15 columns are the features; the last column ('Approved') is the label.
# Note: 'Age' and 'ZipCode' are still strings here; scikit-learn coerces the
# whole matrix to float when fitting.
X = inputDataset.values[:, :15]

y = inputDataset.values[:, 15:]

# Flattening the labels into a 1-D array, as scikit-learn expects
y = np.ravel(y)

# Converting the datatype to integer
y = np.asarray(y, dtype="intc")
print y.dtype


int32

Fitting a logistic regression model on the full dataset; the score below is computed on the same data, so it measures training accuracy


In [8]:
logisticRegressionModel = LogisticRegression()

logisticRegressionModel = logisticRegressionModel.fit(X, y)


logisticRegressionModel.score(X, y)


Out[8]:
0.87748851454823895
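Since this score comes from the very data the model was fitted on, it is an optimistic estimate. A less biased check, sketched here with 5-fold cross-validation (cross_val_score lives in sklearn.model_selection on versions >= 0.18):

from sklearn.cross_validation import cross_val_score

# Mean 5-fold cross-validated accuracy; the default scoring is accuracy
scores = cross_val_score(LogisticRegression(), X, y, cv=5)
print(scores.mean())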

Split the data into training and test sets, then refit the model on the training portion only


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.28, random_state=0)

trainModel = LogisticRegression()

trainModel.fit(X= X_train, y=y_train)


Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
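One caveat worth noting: LogisticRegression applies an L2 penalty by default, which is sensitive to feature scale, and Income here spans thousands while the encoded categorical features are single digits. A sketch of standardizing first (using scikit-learn's StandardScaler; not part of the original notebook):

from sklearn.preprocessing import StandardScaler

# Standardize features so the L2 penalty weighs them comparably
scaler = StandardScaler().fit(X_train)
scaledModel = LogisticRegression().fit(scaler.transform(X_train), y_train)
print(scaledModel.score(scaler.transform(X_test), y_test))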

Predicting the output labels using the model


In [10]:
predictedLabels = trainModel.predict(X_test)

Calculating the accuracy score and confusion matrix


In [11]:
print metrics.accuracy_score(y_test, predictedLabels)

print ("\n")
print metrics.confusion_matrix(y_test, predictedLabels)


0.896174863388


[[76  5]
 [14 88]]

We can see the logistic regression model reaches roughly 90% accuracy (0.896) on the held-out test set: 76 + 88 = 164 of the 183 test samples are classified correctly.
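In scikit-learn's confusion matrix, rows are true classes and columns are predicted classes, so 76 of the 81 class-0 samples and 88 of the 102 class-1 samples were classified correctly. A sketch for pulling out the individual counts:

# Flatten the 2x2 matrix: true0/pred0, true0/pred1, true1/pred0, true1/pred1
c00, c01, c10, c11 = metrics.confusion_matrix(y_test, predictedLabels).ravel()
print((c00 + c11) / float(len(y_test)))  # accuracy recomputed from the matrix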

Plot the ROC using Matplotlib


In [12]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,predictedLabels)
roc_auc = auc(false_positive_rate, true_positive_rate)

print false_positive_rate
print true_positive_rate

plt.title('Receiver Operating Characteristic Logistic Regression')
plt.plot(false_positive_rate, true_positive_rate, 'b',
         label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


[ 0.         0.0617284  1.       ]
[ 0.         0.8627451  1.       ]
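The curve has only three points because roc_curve was given the hard 0/1 predictions, and each distinct score value contributes just one threshold. Passing the predicted probability of class 1 instead would trace the full curve; a sketch reusing the fitted trainModel:

# One ROC threshold per distinct probability value
probScores = trainModel.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, probScores)
print(auc(fpr, tpr))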

Now, using the same training and test split, we fit a decision tree classifier and compare the accuracy, confusion matrix, and ROC area.


In [13]:
decisionTreeModel = tree.DecisionTreeClassifier()

decisionTreeModel.fit(X=X_train, y=y_train)


Out[13]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
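For inspection, the fitted tree can be exported to Graphviz format (a sketch; the output filename creditTree.dot is arbitrary):

# Write the tree structure to a .dot file viewable with Graphviz
tree.export_graphviz(decisionTreeModel, out_file='creditTree.dot',
                     feature_names=list(inputDataset.columns[:15]))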

Predicting the output labels using the decision tree classifier


In [14]:
predictedLabelsDT = decisionTreeModel.predict(X_test)

Accuracy and confusion matrix for the decision tree classifier


In [15]:
print metrics.accuracy_score(y_test, predictedLabelsDT)

print ("\n")
print metrics.confusion_matrix(y_test, predictedLabelsDT)


0.868852459016


[[67 14]
 [10 92]]

Accuracy using the decision tree is roughly 87% (0.869) on the test dataset: 67 + 92 = 159 of the 183 test samples are classified correctly, slightly below logistic regression.

Plotting the ROC


In [17]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,predictedLabelsDT)
roc_auc = auc(false_positive_rate, true_positive_rate)

print false_positive_rate
print true_positive_rate


plt.title('Receiver Operating Characteristic Decision Trees')
plt.plot(false_positive_rate, true_positive_rate, 'b',
         label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


[ 0.          0.17283951  1.        ]
[ 0.          0.90196078  1.        ]
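Comparing the two models on the same split: logistic regression comes out ahead on both accuracy (0.896 vs 0.869) and ROC area (roughly 0.90 vs 0.86), although the decision tree recovers slightly more of class 1 at its operating point (true positive rate 0.90 vs 0.86).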