In [2]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# This enables inline Plots
%matplotlib inline
# Limit the number of rows displayed when a DataFrame is printed
pd.set_option('display.max_rows', 10)
In [3]:
# Create a data frame from the credit approval (CRX) dataset
crx_data = pd.read_csv('../hw2/CRX_Data.csv', header=None)
In [4]:
# Replace the field names
crx_data.rename(columns={0: 'A1', 1: 'A2', 2: 'A3', 3: 'A4', 4: 'A5', 5: 'A6', 6: 'A7', 7: 'A8', 8: 'A9', 9: 'A10', 10: 'A11', 11: 'A12', 12: 'A13', 13: 'A14', 14: 'A15', 15: 'A16'}, inplace=True)
In [5]:
# Check the header of the file
crx_data.head(5)
Out[5]:
In [6]:
# Check for missing values
crx_data.info()
In [7]:
# Replace the '?' placeholders with NaN and convert A2 and A14 to numeric (float) variables
crx_data.A2.replace('?', np.nan, inplace = True)
crx_data.A2 = crx_data.A2.astype(float)
crx_data.A14.replace('?', np.nan, inplace = True)
crx_data.A14 = crx_data.A14.astype(float)
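In [ ]:
# An equivalent, more compact alternative (a sketch, assuming a pandas version that provides
# pd.to_numeric): errors='coerce' turns non-numeric tokens such as '?' into NaN and casts to
# float in a single step.
crx_data.A2 = pd.to_numeric(crx_data.A2, errors='coerce')
crx_data.A14 = pd.to_numeric(crx_data.A14, errors='coerce')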
In [8]:
# Run some descriptive statistics on numeric variables
crx_data.describe()
Out[8]:
In [9]:
# Impute values for A1
# Replace '?' with NaN
crx_data.A1.replace('?', np.nan, inplace = True)
# Get the distribution of values for A1
crx_data.A1.value_counts()
# Create a function that randomly assigns these values with the observed relative frequencies
a1 = ['b', 'a']
p = [0.69, 0.31]
def get_a1_impute_values(n):
    return np.random.choice(a1, n, p=p)
# Get the NULL values for A1
crx_data.loc[crx_data.A1.isnull(), 'A1']
# Set these values to values drawn from the observed distribution of A1
crx_data.loc[crx_data.A1.isnull(), 'A1'] = get_a1_impute_values(n=12)
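In [ ]:
# Rather than hard-coding the category frequencies, the sampling probabilities can be derived
# directly from the observed distribution. A sketch of a reusable helper (get_impute_values is
# a new name introduced here for illustration, not part of the analysis above); the same pattern
# would apply to the other categorical columns.
a1_dist = crx_data.A1.value_counts(normalize=True)
def get_impute_values(dist, n):
    # Sample n values from the observed categories with their observed relative frequencies
    return np.random.choice(dist.index.values, n, p=dist.values)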
In [10]:
# Impute values for A2
# View the values for A2
crx_data.A2.value_counts()
# Get the Mean and Std of A2 Data
print 'Mean A2:', crx_data.A2.mean()
print 'Std A2:', crx_data.A2.std()
# Draw imputation values from a normal distribution with mean 31.57 and standard deviation 11.96
def get_a2_impute_values(n):
    return np.random.normal(31.57, 11.96, n)
# Get the NULL values for A2
crx_data.loc[crx_data.A2.isnull(), 'A2']
# Set these values to the values we picked from Random Normal Distribution
crx_data.loc[crx_data.A2.isnull(), 'A2'] = get_a2_impute_values(n=12)
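In [ ]:
# A normal distribution with mean 31.57 and standard deviation 11.96 can occasionally produce
# negative draws, which would not be meaningful for A2. A quick sanity check (a sketch; any
# negatives could be re-drawn or clipped with np.clip):
(crx_data.A2 < 0).sum()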
In [11]:
# Impute values for A4
# Replace '?' with NaN
crx_data.A4.replace('?', np.nan, inplace = True)
# Get the distribution of values for A4
crx_data.A4.value_counts()
# Create a function that randomly assigns these values with the observed relative frequencies
a4 = ['u', 'y', 'l']
p = [0.76, 0.24, 0.00]
def get_a4_impute_values(n):
    return np.random.choice(a4, n, p=p)
# Get the NaN values for A4
crx_data.loc[crx_data.A4.isnull(), 'A4']
# Set these values to values drawn from the observed distribution of A4
crx_data.loc[crx_data.A4.isnull(), 'A4'] = get_a4_impute_values(n=6)
In [12]:
# Impute values for A5
# Replace '?' with NaN
crx_data.A5.replace('?', np.nan, inplace = True)
# Get the distribution of values for A5
crx_data.A5.value_counts()
# Create a function that randomly assigns these values with the observed relative frequencies
a5 = ['g', 'p', 'gg']
p = [0.76, 0.24, 0.00]
def get_a5_impute_values(n):
    return np.random.choice(a5, n, p=p)
# Get the NaN values for A5
crx_data.loc[crx_data.A5.isnull(), 'A5']
# Set these values to values drawn from the observed distribution of A5
crx_data.loc[crx_data.A5.isnull(), 'A5'] = get_a5_impute_values(n=6)
In [13]:
# Impute values for A6
# Replace '?' with NaN
crx_data.A6.replace('?', np.nan, inplace = True)
# Get the distribution of values for A6
crx_data.A6.value_counts()
# Create a function that randomly assigns these values with the observed relative frequencies
a6 = ['aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm', 'q', 'r', 'w', 'x']
p = [0.08, 0.20, 0.06, 0.04, 0.04, 0.08, 0.09, 0.01, 0.07, 0.06, 0.11, 0.00, 0.09, 0.06]
# Rounding leaves these summing to 0.99, so normalize them to sum to exactly 1
p = np.array(p) / np.sum(p)
def get_a6_impute_values(n):
    return np.random.choice(a6, n, p=p)
# Get the NaN values for A6
crx_data.loc[crx_data.A6.isnull(), 'A6']
# Set these values to values drawn from the observed distribution of A6
crx_data.loc[crx_data.A6.isnull(), 'A6'] = get_a6_impute_values(n=9)
In [14]:
# Impute values for A7
# Replace '?' with NaN
crx_data.A7.replace('?', np.nan, inplace = True)
# Get the distribution of values for A7
crx_data.A7.value_counts()
# Create a function that randomly assigns these values with the observed relative frequencies
a7 = ['v', 'h', 'bb', 'ff', 'z', 'j', 'dd', 'n', 'o']
p = [0.59, 0.20, 0.09, 0.08, 0.01, 0.01, 0.01, 0.01, 0.00]
def get_a7_impute_values(n):
    return np.random.choice(a7, n, p=p)
# Get the NaN values for A7
crx_data.loc[crx_data.A7.isnull(), 'A7']
# Set these values to values drawn from the observed distribution of A7
crx_data.loc[crx_data.A7.isnull(), 'A7'] = get_a7_impute_values(n=9)
In [15]:
# Impute values for A14
# View the values for A14
crx_data.A14.value_counts()
# Get the Mean and Std of A14 Data
print 'Mean A14:', crx_data.A14.mean()
print 'Std A14:', crx_data.A14.std()
# Draw imputation values from a normal distribution with mean 184.01 and standard deviation 173.81
def get_a14_impute_values(n):
    return np.random.normal(184.01, 173.81, n)
# Get the NULL values for A14
crx_data.loc[crx_data.A14.isnull(), 'A14']
# Set these values to the values we picked from Random Normal Distribution
crx_data.loc[crx_data.A14.isnull(), 'A14'] = get_a14_impute_values(n=13)
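In [ ]:
# Sanity check after imputation (a sketch): confirm that no NaN values remain in the imputed
# columns and that the wide normal distribution used for A14 did not produce negative values.
print crx_data[['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']].isnull().sum()
print 'Negative A14 values:', (crx_data.A14 < 0).sum()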
In [16]:
# Convert the approval variable (A16) into boolean indicator columns
for elem in crx_data['A16'].unique():
    crx_data[str(elem)] = crx_data['A16'] == elem
# Rename the fields
crx_data.rename(columns={'+': 'Approved', '-': 'Denied'}, inplace=True)
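In [ ]:
# An equivalent, more direct construction (a sketch): since A16 only takes the values '+' and '-',
# the Approved indicator can also be built in a single step without the loop above.
crx_data['Approved'] = crx_data['A16'] == '+'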
In [75]:
# Bar charts of A1 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A1.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A1.value_counts().plot(kind='bar', ax=ax[1])
Out[75]:
In [63]:
# Bar charts of A4 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A4.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A4.value_counts().plot(kind='bar', ax=ax[1])
Out[63]:
In [64]:
# Bar charts of A5 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A5.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A5.value_counts().plot(kind='bar', ax=ax[1])
Out[64]:
In [65]:
# Bar charts of A6 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A6.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A6.value_counts().plot(kind='bar', ax=ax[1])
Out[65]:
In [66]:
# Bar charts of A7 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A7.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A7.value_counts().plot(kind='bar', ax=ax[1])
Out[66]:
In [67]:
# Bar charts of A9 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A9.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A9.value_counts().plot(kind='bar', ax=ax[1])
Out[67]:
In [69]:
# Bar charts of A10 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A10.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A10.value_counts().plot(kind='bar', ax=ax[1])
Out[69]:
In [70]:
# Bar charts of A12 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A12.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A12.value_counts().plot(kind='bar', ax=ax[1])
Out[70]:
In [71]:
# Bar charts of A13 value counts for approved (left) vs. denied (right) applications
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
crx_data[crx_data.Approved].A13.value_counts().plot(kind='bar', ax=ax[0])
crx_data[~crx_data.Approved].A13.value_counts().plot(kind='bar', ax=ax[1])
Out[71]:
In [57]:
# Import scatter_matrix functionality
from pandas.tools.plotting import scatter_matrix
# Generate a scatterplot matrix with the continuous variables
scat = scatter_matrix(crx_data[['A2', 'A3', 'A8', 'A11', 'A14', 'A15', 'Approved']], figsize=(15,15))
In [27]:
# Generate x data frame from credit card dataset
x_data = crx_data[['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15']]
In [28]:
# Create dummy variables
x_data = pd.get_dummies(x_data)
x_data
Out[28]:
In [29]:
# Generate y data frame from credit card dataset
y_data = crx_data['Approved']
y_data
Out[29]:
In [30]:
# Import train_test_split from scikit-learn
from sklearn.cross_validation import train_test_split
# Divide the dataset into training and test data
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, random_state=12, test_size=0.2)
In [31]:
# Import logistic regression package
from sklearn.linear_model import LogisticRegression
# Create estimator with logistic regression
clf = LogisticRegression()
# Fit the model with the training data
clf.fit(x_train, y_train)
# Score the model using test data
clf.score(x_test, y_test)
Out[31]:
In [32]:
# An accuracy score of 0.86 suggests that the logistic regression model predicts the actual approval outcomes quite well
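In [ ]:
# The 0.86 accuracy comes from a single 80/20 split, so it will vary with random_state.
# A sketch of a more stable estimate via 5-fold cross-validation on the training data
# (cross_val_score comes from the same older scikit-learn API used elsewhere in this notebook):
from sklearn.cross_validation import cross_val_score
cross_val_score(LogisticRegression(), x_train, y_train, cv=5).mean()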
In [33]:
# Import SVC from scikit-learn
from sklearn.svm import SVC
# Create estimator with non-linear kernel
est = SVC()
# Fit the model with the training data
est.fit(x_train, y_train)
# Score the model using test data
est.score(x_test, y_test)
Out[33]:
In [34]:
# An accuracy score of 0.54 suggests that the default SVM does a poor job of predicting approval outcomes,
# only slightly better than chance, and performs much worse than the logistic regression model
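In [ ]:
# RBF-kernel SVMs are sensitive to feature scale, and the continuous columns here (e.g. A14, A15)
# are on very different scales from the dummy variables, which is one plausible reason the default
# SVC scores poorly. A minimal sketch, not part of the original analysis, of standardizing the
# features before fitting:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svc = make_pipeline(StandardScaler(), SVC())
scaled_svc.fit(x_train, y_train)
scaled_svc.score(x_test, y_test)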
In [35]:
# Import GridSearchCV from scikit-learn
from sklearn.grid_search import GridSearchCV
# Establish the search space for parameters of C and Gamma
param = {'C':np.logspace(-3,3,10),'gamma':np.logspace(-3,3,10)}
# Set up the grid search
gs = GridSearchCV(SVC(),param)
# Run the grid search on our model
gs.fit(x_train, y_train)
Out[35]:
In [36]:
# Display the parameters and score for the best fitting model
gs.best_params_,gs.best_score_
Out[36]:
In [37]:
# GridSearchCV finds that the best fitting model has C = 46.4 and gamma = 0.001
# The cross-validation score of 0.67 suggests that this model performs better than the SVC with default parameters
# but still worse than the logistic regression model
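In [ ]:
# Note that gs.best_score_ is a cross-validation score computed on the training data only.
# A sketch of checking the refit best estimator against the held-out test set
# (GridSearchCV refits the best parameters on the full training set by default):
gs.best_estimator_.score(x_test, y_test)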
In [38]:
# Import confusion matrix and classification report from scikit-learn
from sklearn.metrics import confusion_matrix, classification_report
# Predict values of Y based on values of X
y_pred = clf.predict(x_test)
In [39]:
# Confusion Matrix for Type 1 and Type 2 Error
print confusion_matrix(y_test, y_pred)
In [40]:
# The confusion matrix shows that the logistic regression model correctly predicted 63 + 56 = 119 credit card approvals / denials
# but mis-categorized 5 + 14 = 19 approvals / denials, which are the Type I and Type II errors
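In [ ]:
# A labelled view of the same confusion matrix makes the error types easier to read
# (a sketch; the labels argument fixes the row/column order to Approved first):
labels = [True, False]
pd.DataFrame(confusion_matrix(y_test, y_pred, labels=labels),
             index=['Actual Approved', 'Actual Denied'],
             columns=['Predicted Approved', 'Predicted Denied'])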
In [41]:
# Examine Precision and Recall
print classification_report(y_test, y_pred)
In [42]:
# The precision - representing the classifier's ability to not label as positive a sample that is negative - is 0.80
# The recall - representing the classifier's ability to find all the positive samples - is 0.90
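In [ ]:
# For reference, precision and recall for the Approved class can be recomputed directly from the
# confusion-matrix cells (a sketch using the same label ordering as above):
cm = confusion_matrix(y_test, y_pred, labels=[True, False])
tp, fn, fp, tn = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print 'Precision:', tp / float(tp + fp)
print 'Recall:', tp / float(tp + fn)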