In [1]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns # for pretty layout of plots
import matplotlib.pyplot as plt
from pprint import pprint # for pretty printing
from pandas.plotting import scatter_matrix # scatter matrix plot
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import random
import scipy.stats
# This enables inline Plots
%matplotlib inline
In [2]:
# Read the data file. crx.data has no header row, so pass header=None
# (otherwise the first record is silently consumed as column names)
crx = pd.read_csv('crx.data', header=None)
In [3]:
# first check out some sample values
crx.head(5)
Out[3]:
In [4]:
# next see the info
crx.info()
In [5]:
# Replace the '?' placeholder with NaN
crx.replace(to_replace='?', value=np.nan, inplace=True)
crx.info()
In [6]:
# Change the column names to something easier to work with
crx.columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'approved']
In [7]:
# The 2nd column contains floats but is stored as object dtype. Convert it to float
crx['b'] = crx['b'].astype(float)
In [8]:
# Column n is described as continuous, so convert it to float as well
crx['n'] = crx['n'].astype(float)
In [9]:
# Summary statistics for the numerical columns
crx.describe()
Out[9]:
Missing Attribute Values: 37 cases (5%) have one or more missing values. The missing values from particular attributes are:
A1: 12
A2: 12
A4: 6
A5: 6
A6: 9
A7: 9
A14: 13
Attribute Information:
A1: b, a.
A2: continuous.
A3: continuous.
A4: u, y, l, t.
A5: g, p, gg.
A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
A7: v, h, bb, j, n, z, dd, ff, o.
A8: continuous.
A9: t, f.
A10: t, f.
A11: continuous.
A12: t, f.
A13: g, p, s.
A14: continuous.
A15: continuous.
A16: +,- (class attribute)
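As a quick sanity check, the per-column missing counts quoted above can be reproduced from the frame itself (using the renamed a..o columns):
# Count NaNs per column after the '?' -> NaN replacement
crx.isnull().sum()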
In [10]:
# Impute each column that is missing data using an appropriate method
# The first column: find the proportion of 'a' among the non-missing values
a = (crx.a == 'a').sum()
b = (crx.a == 'b').sum()
print(a / float(a + b))
In [11]:
# Roughly 31% of the first column has the value 'a'
# We want a function that returns 'a' 31% of the time and 'b' 69% of the time
def get_a_impute_values():
    my_values = ['a'] * 31 + ['b'] * 69
    return random.choice(my_values)
In [12]:
# Draw a fresh value for each missing entry; a single call would assign
# the same value to all of them
crx.loc[crx.a.isnull(), 'a'] = [get_a_impute_values() for _ in range(crx.a.isnull().sum())]
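The same draw can be vectorized with np.random.choice, which accepts the class probabilities directly; an equivalent sketch of the cell above:
# One call samples 'a'/'b' for every missing slot at once
crx.loc[crx.a.isnull(), 'a'] = np.random.choice(['a', 'b'], size=crx.a.isnull().sum(), p=[0.31, 0.69])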
In [13]:
# Create a Normal Distribution centered on Mean of 31.57 and Standard Dev of 11.97
# Get 12 Entries since that's how many missing entries we have for column b
def get_b_impute_values():
    return np.random.normal(31.57, 11.97, 12)
In [14]:
crx.loc[crx.b.isnull(), 'b'] = get_b_impute_values()
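Since column b looks age-like (the histogram later supports this), note that a normal draw with mean 31.57 and sd 11.97 goes negative roughly 0.4% of the time. A defensive variant of the same idea would clip the draws at the observed minimum:
def get_b_impute_values_clipped(n):
    # Same normal draw, but floored at the observed minimum so no
    # negative "ages" can be imputed (assumes b really is age-like)
    return np.clip(np.random.normal(31.57, 11.97, n), crx.b.min(), None)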
In [15]:
# According to the attribute information, the 4th column has unique values u, y, l, t.
# Since u, y, and l all appear in the dataset but t does not, we assume the 6
# missing values are 't'
crx.loc[crx.d.isnull(), 'd'] = 't'
In [16]:
# Get the counts of the distinct values in column e
g = crx.e[crx.e == 'g'].count()
p = crx.e[crx.e == 'p'].count()
gg = crx.e[crx.e == 'gg'].count()
print(g, p, gg)
In [17]:
# Impute according to the chance of getting the above value
def get_e_impute_values():
    my_values = ['g'] * 518 + ['p'] * 163 + ['gg'] * 2
    return random.choice(my_values)
In [18]:
crx.loc[crx.e.isnull(), 'e'] = [get_e_impute_values() for _ in range(crx.e.isnull().sum())]
In [19]:
# Since column f has a wide range of values, impute uniformly over that set of values
def get_f_impute_values():
    my_values = ['aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm', 'q', 'r', 'w', 'x']
    return random.choice(my_values)
In [23]:
# Impute and plot
crx.loc[crx.f.isnull(), 'f'] = [get_f_impute_values() for _ in range(crx.f.isnull().sum())]
crx.f.value_counts().plot(kind='bar')
Out[23]:
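A reusable alternative for any categorical column is to resample from the observed values, which preserves the empirical frequencies instead of imputing uniformly. A sketch (impute_from_observed is not used above):
def impute_from_observed(df, col):
    # Fill each NaN with a draw (with replacement) from the column's non-null values
    n_missing = df[col].isnull().sum()
    df.loc[df[col].isnull(), col] = df[col].dropna().sample(n_missing, replace=True).values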
In [20]:
# Column g also has a wide range of values so we can impute the same way as we did in column f
def get_g_impute_values():
    my_values = ['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o']
    return random.choice(my_values)
In [21]:
crx.loc[crx.g.isnull(), 'g'] = [get_g_impute_values() for _ in range(crx.g.isnull().sum())]
In [22]:
# Create a Normal Distribution centered on Mean of 183.99 and Standard Dev of 173.93
# Get 13 Entries since that's how many missing entries we have for column n
def get_n_impute_values():
    return np.random.normal(183.99, 173.93, 13)
In [23]:
crx.loc[crx.n.isnull(), 'n'] = get_n_impute_values()
crx.info()
In [24]:
# Let's plot the categorical columns a, d, e, f, g, i, j, l, m, approved
crx.a.value_counts().plot(kind='bar')
Out[24]:
In [25]:
crx.d.value_counts().plot(kind='bar')
Out[25]:
In [26]:
crx.e.value_counts().plot(kind='bar')
Out[26]:
In [27]:
crx.f.value_counts().plot(kind='bar')
Out[27]:
In [28]:
crx.g.value_counts().plot(kind='bar')
Out[28]:
In [29]:
crx.i.value_counts().plot(kind='bar')
Out[29]:
In [30]:
crx.j.value_counts().plot(kind='bar')
Out[30]:
In [31]:
crx.l.value_counts().plot(kind='bar')
Out[31]:
In [32]:
crx.m.value_counts().plot(kind='bar')
Out[32]:
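The nine bar plots above could equally be produced in one cell; a compact sketch:
fig, axes = plt.subplots(3, 3, figsize=(12, 10))
for ax, col in zip(axes.ravel(), ['a', 'd', 'e', 'f', 'g', 'i', 'j', 'l', 'm']):
    crx[col].value_counts().plot(kind='bar', ax=ax, title=col)
plt.tight_layout()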
In [33]:
# Looking at the distribution of values, it looks like column b could be the age of the participant
crx.b.hist()
Out[33]:
In [34]:
# Scatter plot of two of the continuous columns (b, c, h, k, n, o)
crx.plot(kind='scatter', x='b', y='k')
#scipy.stats.pointbiserialr(crx.b, crx.a)
Out[34]:
In [35]:
scatter_matrix(crx, alpha=0.2, figsize=(6, 6), diagonal='kde')
Out[35]:
In [36]:
# Convert the dichotomous (a/b, t/f, +/-) columns to 0s and 1s
crx.loc[crx.a == 'a', 'a'] = 1
crx.loc[crx.a == 'b', 'a'] = 0
crx.loc[crx.i == 't', 'i'] = 1
crx.loc[crx.i == 'f', 'i'] = 0
crx.loc[crx.j == 't', 'j'] = 1
crx.loc[crx.j == 'f', 'j'] = 0
crx.loc[crx.l == 't', 'l'] = 1
crx.loc[crx.l == 'f', 'l'] = 0
crx.loc[crx.approved == '+', 'approved'] = 1
crx.loc[crx.approved == '-', 'approved'] = 0
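Series.map with a dict does the same dichotomous encoding in one line per column. A sketch equivalent to the cell above, left commented out because map sends unmatched values (e.g. the already-encoded 0/1) to NaN, so only one of the two versions should run:
# crx['a'] = crx['a'].map({'a': 1, 'b': 0})
# for col in ['i', 'j', 'l']:
#     crx[col] = crx[col].map({'t': 1, 'f': 0})
# crx['approved'] = crx['approved'].map({'+': 1, '-': 0})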
In [37]:
# Create dummy variables for the multi-valued categorical columns
dummies = pd.get_dummies(crx[['d', 'e', 'f', 'g', 'm']])
crx_dummies = crx.join(dummies)
to_split = crx_dummies.drop(['d', 'e', 'f', 'g', 'm'], axis=1)
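Note that pd.get_dummies can take the whole frame plus a columns= list, doing the join and drop in one step; this single line is equivalent to the three above:
to_split = pd.get_dummies(crx, columns=['d', 'e', 'f', 'g', 'm'])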
In [38]:
# Convert the remaining object columns to float
for col in ['a', 'i', 'j', 'l', 'approved']:
    to_split[col] = to_split[col].astype(float)
In [39]:
# Create a train/test split with a 30% test set
y = to_split.approved
data = to_split.drop('approved', axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)
In [40]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[40]:
In [41]:
# SVM
# Initialize with C=1e-3 and fit the model
SVM = SVC(C=1e-3, kernel='rbf')
SVM.fit(X_train, y_train)
SVM.score(X_test, y_test)
Out[41]:
In [42]:
# SVM
# Try the linear kernel instead
l_SVM = SVC(C=1e-3, kernel='linear')
l_SVM.fit(X_train, y_train)
l_SVM.score(X_test, y_test)
Out[42]:
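Both SVMs above see raw features whose scales differ by orders of magnitude (compare b with n and o), and SVMs are sensitive to feature scale. A sketch of the usual remedy, standardizing inside a pipeline (C=1.0 here is a placeholder, not a tuned value):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale each feature to zero mean / unit variance before the RBF kernel
scaled_svm = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf'))
scaled_svm.fit(X_train, y_train)
scaled_svm.score(X_test, y_test)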
In [43]:
# Grid-search over C on a log scale for a linear SVM
param_grid = {'C': np.logspace(-3, 3, 10)}
gds = GridSearchCV(LinearSVC(), param_grid)
In [44]:
fitted = gds.fit(X_train, y_train)
In [45]:
print(fitted.best_score_)
print(fitted.best_params_)
print(fitted.best_estimator_)
print(fitted.score(X_test, y_test))
In [47]:
y_pred = clf.predict(X_test)
In [48]:
# Confusion Matrix for Type 1 and Type 2 Error
cm = confusion_matrix(y_test, y_pred)
print(cm)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
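For a 2x2 matrix the four cells unpack directly, which makes the Type 1 / Type 2 counts explicit (sklearn orders them tn, fp, fn, tp):
tn, fp, fn, tp = cm.ravel()
print('Type 1 errors (false approvals):', fp)
print('Type 2 errors (false denials):', fn)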
In [49]:
target_names = ['denied', 'approved']
print(classification_report(y_test, y_pred, target_names=target_names))
In [50]:
# Examine the fitted coefficient for each variable
pd.DataFrame(list(zip(data.columns, np.transpose(clf.coef_))))
Out[50]:
In [51]:
# Let's take a look at Predicted Probabilities
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test.values # .values avoids index misalignment with y_pred_df's fresh 0..n index
y_pred_df.tail()
Out[51]:
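Since predict_proba gives a score rather than a hard label, a threshold-free summary such as ROC AUC is a natural complement; a sketch using sklearn.metrics:
from sklearn.metrics import roc_auc_score

# Rank-based measure of how well the 'Yes' probabilities separate the classes
roc_auc_score(y_test, y_pred_df['Yes'])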
In [52]:
# Take a look at Predicted vs True Values
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[52]:
In [ ]: