In [1]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns  # for pretty layout of plots
import matplotlib.pyplot as plt
from pprint import pprint  # for pretty printing
from pandas.plotting import scatter_matrix  # scatter matrix plot
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import random
import scipy.stats

# This enables inline Plots
%matplotlib inline

In [2]:
# Read data from the data file. Note: crx.data ships without a header row, so
# pandas takes the first data row as column names (hence 689 rows below instead
# of the dataset's 690); passing header=None with explicit names would avoid this.
crx = pd.read_csv('crx.data', parse_dates=True)

In [3]:
# first check out some sample values
crx.head(5)


Out[3]:
b 30.83 0 u g w v 1.25 t t.1 01 f g.1 00202 0.1 +
0 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
1 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
2 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
3 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
4 b 32.08 4.000 u g m v 2.50 t f 0 t g 00360 0 +

In [4]:
# next see the info 
crx.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 689 entries, 0 to 688
Data columns (total 16 columns):
b        689 non-null object
30.83    689 non-null object
0        689 non-null float64
u        689 non-null object
g        689 non-null object
w        689 non-null object
v        689 non-null object
1.25     689 non-null float64
t        689 non-null object
t.1      689 non-null object
01       689 non-null int64
f        689 non-null object
g.1      689 non-null object
00202    689 non-null object
0.1      689 non-null int64
+        689 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.5+ KB

In [5]:
# Replace '?' placeholder characters with NaN
crx.replace(to_replace='?', value=np.nan, inplace=True)
crx.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 689 entries, 0 to 688
Data columns (total 16 columns):
b        677 non-null object
30.83    677 non-null object
0        689 non-null float64
u        683 non-null object
g        683 non-null object
w        680 non-null object
v        680 non-null object
1.25     689 non-null float64
t        689 non-null object
t.1      689 non-null object
01       689 non-null int64
f        689 non-null object
g.1      689 non-null object
00202    676 non-null object
0.1      689 non-null int64
+        689 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.5+ KB

In [6]:
# Change the column names to something easier to work with
crx.columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'approved']

In [7]:
# The 2nd column contains floats but is stored as object dtype. Convert it to float.
crx['b'] = crx['b'].astype(float)

In [8]:
# Column n is described as continuous, so convert it to float as well
crx['n'] = crx['n'].astype(float)

In [9]:
# Summary statistics for the numerical columns
crx.describe()


Out[9]:
b c h k n o
count 677.000000 689.000000 689.000000 689.000000 676.000000 689.000000
mean 31.569261 4.765631 2.224819 2.402032 183.988166 1018.862119
std 11.966670 4.978470 3.348739 4.866180 173.934087 5213.743149
min 13.750000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 22.580000 1.000000 0.165000 0.000000 74.500000 0.000000
50% 28.420000 2.750000 1.000000 0.000000 160.000000 5.000000
75% 38.250000 7.250000 2.625000 3.000000 277.000000 396.000000
max 80.250000 28.000000 28.500000 67.000000 2000.000000 100000.000000

Part 1 - Impute Missing Data

Missing Attribute Values: 37 cases (5%) have one or more missing values. The missing values from particular attributes are:

A1:  12
A2:  12
A4:   6
A5:   6
A6:   9
A7:   9
A14: 13

Attribute Information:

A1: b, a.
A2: continuous.
A3: continuous.
A4: u, y, l, t.
A5: g, p, gg.
A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
A7: v, h, bb, j, n, z, dd, ff, o.
A8: continuous.
A9: t, f.
A10:    t, f.
A11:    continuous.
A12:    t, f.
A13:    g, p, s.
A14:    continuous.
A15:    continuous.
A16: +,-         (class attribute)
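
The documented counts can be checked directly against the dataframe. A quick
sketch (assuming the '?'-to-NaN replacement in In [5] and the column renaming
in In [6] have already run):

# Count missing values per column and compare with the documentation above
missing_counts = crx.isnull().sum()
print(missing_counts[missing_counts > 0])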

In [10]:
# Impute each column that is missing data using the appropriate method
# The first column 
a =  crx.b[crx.a == 'a'].count() 
b =  crx.b[crx.a == 'b'].count()
print(float(a) / float(a + b))


0.311278195489

In [11]:
# Roughly 31% of the first column has the value 'a'
# We want a function that returns 'a' about 31% of the time and 'b' about 69% of the time
def get_a_impute_values():
    my_values = ['a'] * 31 + ['b'] * 69
    return random.choice(my_values)

In [12]:
# Draw a fresh value for every missing row (a single call would give
# all 12 missing rows the same value)
crx.loc[crx.a.isnull(), 'a'] = [get_a_impute_values() for _ in range(crx.a.isnull().sum())]
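
An equivalent, vectorized alternative (a sketch, not run here) fills all the
missing entries in one call, with the probabilities made explicit:

# Alternative: draw every missing value at once with np.random.choice
n_missing = crx.a.isnull().sum()
crx.loc[crx.a.isnull(), 'a'] = np.random.choice(['a', 'b'], size=n_missing, p=[0.31, 0.69])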

In [13]:
# Create a Normal Distribution centered on Mean of 31.57 and Standard Dev of 11.97
# Get 12 Entries since that's how many missing entries we have for column b
def get_b_impute_values():
    return np.random.normal(31.57, 11.97, 12)

In [14]:
crx.loc[crx.b.isnull(), 'b'] = get_b_impute_values()
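
One caveat: an unclipped normal can return values below the observed minimum
(or even below zero), which is implausible if b really is an age. A hedged
variant, using the min/max reported by crx.describe():

# Clip sampled values to the observed range of column b
def get_b_impute_values_clipped(size):
    draws = np.random.normal(31.57, 11.97, size)
    return np.clip(draws, 13.75, 80.25)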

In [15]:
# According to the attribute information, the 4th column takes the unique values u, y, l, t.
# Since u, y, and l all appear in the dataset but 't' never does, we assume the
# 6 missing values are 't'
crx.loc[crx.d.isnull(), 'd'] = 't'

In [16]:
# Get the counts of the distinct values in column e
g = crx.e[crx.e == 'g'].count()
p = crx.e[crx.e == 'p'].count()
gg = crx.e[crx.e == 'gg'].count()
print(g, p, gg)


518 163 2

In [17]:
# Impute in proportion to the observed frequencies above
def get_e_impute_values():
    my_values = ['g'] * 518 + ['p'] * 163 + ['gg'] * 2
    return random.choice(my_values)

In [18]:
crx.loc[crx.e.isnull(), 'e'] = [get_e_impute_values() for _ in range(crx.e.isnull().sum())]

In [19]:
# Column f has many distinct values, so we impute by drawing uniformly from that set of values
def get_f_impute_values():
    my_values = ['aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm', 'q', 'r', 'w', 'x']
    return random.choice(my_values)

In [23]:
# Impute and plot
crx.loc[crx.f.isnull(), 'f'] = [get_f_impute_values() for _ in range(crx.f.isnull().sum())]
crx.f.value_counts().plot(kind='bar')


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a2efed0>

In [20]:
# Column g also has a wide range of values, so we impute the same way as for column f
def get_g_impute_values():
    my_values = ['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o']
    return random.choice(my_values)

In [21]:
crx.loc[crx.g.isnull(), 'g'] = [get_g_impute_values() for _ in range(crx.g.isnull().sum())]
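
The frequency-weighted idea used for columns a and e generalizes to any
categorical column. A sketch of a reusable helper (hypothetical, not part of
the original analysis):

# Impute a categorical column by sampling from its observed value frequencies
def impute_categorical(df, col):
    probs = df[col].dropna().value_counts(normalize=True)
    mask = df[col].isnull()
    df.loc[mask, col] = np.random.choice(probs.index, size=mask.sum(), p=probs.values)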

In [22]:
# Create a Normal Distribution centered on Mean of 183.99 and Standard Dev of 173.93
# Get 13 Entries since that's how many missing entries we have for column n
def get_n_impute_values():
    return np.random.normal(183.99, 173.93, 13)

In [23]:
crx.loc[crx.n.isnull(), 'n'] = get_n_impute_values()
crx.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 689 entries, 0 to 688
Data columns (total 16 columns):
a           689 non-null object
b           689 non-null float64
c           689 non-null float64
d           689 non-null object
e           689 non-null object
f           680 non-null object
g           689 non-null object
h           689 non-null float64
i           689 non-null object
j           689 non-null object
k           689 non-null int64
l           689 non-null object
m           689 non-null object
n           689 non-null float64
o           689 non-null int64
approved    689 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 91.5+ KB

Part 2 - Plot and visualize data to see any patterns


In [24]:
# Let's plot the categorical columns: a, d, e, f, g, i, j, l, m, approved
crx.a.value_counts().plot(kind='bar')


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a7aa2d0>

In [25]:
crx.d.value_counts().plot(kind='bar')


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a7f9810>

In [26]:
crx.e.value_counts().plot(kind='bar')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aa3ac50>

In [27]:
crx.f.value_counts().plot(kind='bar')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10abc7f50>

In [28]:
crx.g.value_counts().plot(kind='bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aee6790>

In [29]:
crx.i.value_counts().plot(kind='bar')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x10af82250>

In [30]:
crx.j.value_counts().plot(kind='bar')


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b101e50>

In [31]:
crx.l.value_counts().plot(kind='bar')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b29a990>

In [32]:
crx.m.value_counts().plot(kind='bar')


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b3ae390>
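
Rather than one cell per column, all the categorical bar plots could be drawn
in a single figure. A sketch:

# Plot every categorical column in one grid of subplots
cat_cols = ['a', 'd', 'e', 'f', 'g', 'i', 'j', 'l', 'm', 'approved']
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
for col, ax in zip(cat_cols, axes.ravel()):
    crx[col].value_counts().plot(kind='bar', ax=ax, title=col)
plt.tight_layout()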

In [33]:
# Looking at the distribution of its values, column b could be the age of the participant
crx.b.hist()


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b4becd0>

In [34]:
# The numeric columns are b, c, h, k, n, o; scatter one pair as an example
crx.plot(kind='scatter', x='b', y='k')
#scipy.stats.pointbiserialr(crx.b, crx.a)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b901390>
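
Pairwise scatter plots hint at relationships, but a correlation matrix over
the numeric columns is easier to read off. A sketch:

# Pearson correlations between the numeric columns
print(crx[['b', 'c', 'h', 'k', 'n', 'o']].corr())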

In [35]:
scatter_matrix(crx, alpha=0.2, figsize=(6, 6), diagonal='kde')


Out[35]:
<6x6 array of matplotlib AxesSubplot objects>

Part 3 - Build Models using Logistic Regression and SVM


In [36]:
# convert true/false and dichotomous columns to 0s and 1s

crx.loc[crx.a == 'a', 'a'] = 1
crx.loc[crx.a == 'b', 'a'] = 0

crx.loc[crx.i == 't', 'i'] = 1
crx.loc[crx.i == 'f', 'i'] = 0

crx.loc[crx.j == 't', 'j'] = 1
crx.loc[crx.j == 'f', 'j'] = 0

crx.loc[crx.l == 't', 'l'] = 1
crx.loc[crx.l == 'f', 'l'] = 0

crx.loc[crx.approved == '+', 'approved'] = 1
crx.loc[crx.approved == '-', 'approved'] = 0
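
The same encoding can be written more compactly with Series.map, which sends
any unmatched value to NaN rather than leaving it untouched. A sketch:

# Equivalent binary encoding via Series.map
for col, mapping in [('a', {'a': 1, 'b': 0}), ('i', {'t': 1, 'f': 0}),
                     ('j', {'t': 1, 'f': 0}), ('l', {'t': 1, 'f': 0}),
                     ('approved', {'+': 1, '-': 0})]:
    crx[col] = crx[col].map(mapping)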

In [37]:
# create dummies
dummies = pd.get_dummies(crx[['d', 'e', 'f', 'g', 'm']])
crx_dummies = crx.join(dummies)
to_split = crx_dummies.drop(['d', 'e', 'f', 'g', 'm'], axis=1)
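
Note that get_dummies can also take the whole frame with a columns= argument,
which encodes, joins, and drops the source columns in one step. A sketch:

# One-step alternative to the join/drop above
to_split = pd.get_dummies(crx, columns=['d', 'e', 'f', 'g', 'm'])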

In [38]:
# Convert the remaining object columns to float
for col in ['a', 'i', 'j', 'l', 'approved']:
    to_split[col] = to_split[col].astype(float)

In [39]:
# Create a train/test split with a 30% test set
y = to_split.approved
data = to_split.drop('approved', axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)
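
Because no random_state is set, the split (and every score below) changes on
each run. A reproducible variant (a sketch; the seed value is arbitrary):

# Fix the seed and stratify on the label so both splits keep the same +/- ratio
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.3, random_state=42, stratify=y)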

In [40]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


Out[40]:
0.86956521739130432

In [41]:
# SVM
# Initialize with C=1e-3 and fit the model
SVM = SVC(C=1e-3, kernel='rbf')
SVM.fit(X_train, y_train)
SVM.score(X_test, y_test)


Out[41]:
0.55555555555555558
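
The near-chance RBF score is likely a scaling issue: columns like n and o span
ranges in the hundreds and tens of thousands, which dominates the RBF distance
computation. Standardizing first usually helps; a sketch using a pipeline:

# Standardize features before the RBF SVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_svm = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf'))
scaled_svm.fit(X_train, y_train)
print(scaled_svm.score(X_test, y_test))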

In [42]:
# SVM 
# Try the linear kernel instead
l_SVM = SVC(C=1e-3, kernel='linear')
l_SVM.fit(X_train, y_train)
l_SVM.score(X_test, y_test)


Out[42]:
0.79227053140096615

Part 4 - Use Grid Search to evaluate model parameters


In [43]:
d = {'C': np.logspace(-3, 3, 10)}

gds = GridSearchCV(LinearSVC(), d)

In [44]:
fitted = gds.fit(X_train, y_train)

In [45]:
print(fitted.best_score_)
print(fitted.best_params_)
print(fitted.best_estimator_)
print(fitted.score(X_test, y_test))


0.804979253112
{'C': 0.46415888336127775}
LinearSVC(C=0.46415888336127775, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)
0.753623188406
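
The same machinery extends to the RBF SVC by searching C and gamma jointly.
A sketch (the grid values are illustrative, not tuned):

# Joint grid search over C and gamma for the RBF kernel
param_grid = {'C': np.logspace(-2, 2, 5), 'gamma': np.logspace(-4, 0, 5)}
gds_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid)
gds_rbf.fit(X_train, y_train)
print(gds_rbf.best_params_, gds_rbf.score(X_test, y_test))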

Part 5 - Build a Confusion Matrix to show how well your prediction did


In [47]:
y_pred = clf.predict(X_test)

In [48]:
# Confusion Matrix for Type 1 and Type 2 Error
cm = confusion_matrix(y_test, y_pred)

print(cm)

plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[97 18]
 [ 9 83]]

In [49]:
target_names = ['denied', 'approved']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

     denied       0.92      0.84      0.88       115
   approved       0.82      0.90      0.86        92

avg / total       0.87      0.87      0.87       207
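
Since LogisticRegression exposes predict_proba, a threshold-independent
summary such as ROC AUC complements the report above. A sketch:

# ROC AUC from the predicted probability of the positive (approved) class
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))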


In [50]:
# Examine the coefficients of the variables (note: sklearn does not report significance)
pd.DataFrame(list(zip(data.columns, np.transpose(clf.coef_))))


Out[50]:
0 1
0 a [-0.00535731709431]
1 b [0.00111297656792]
2 c [-0.00760652448945]
3 h [0.0856407014633]
4 i [3.19710717167]
5 j [0.396698245587]
6 k [0.145921066958]
7 l [-0.228300404731]
8 n [-0.00263941283429]
9 o [0.000573564740139]
10 d_l [0.23050819727]
11 d_t [0.117062230616]
12 d_u [-0.375815776567]
13 d_y [-0.779348627285]
14 e_g [-0.258753545951]
15 e_gg [0.23050819727]
16 e_p [-0.779348627285]
17 f_aa [-0.479979022549]
18 f_c [0.152056590077]
19 f_cc [0.804659755035]
20 f_d [-0.326855429067]
21 f_e [0.0613055663145]
22 f_ff [-0.74790512828]
23 f_i [-0.744052240498]
24 f_j [-0.187415632468]
25 f_k [-0.57075351188]
26 f_m [-0.280833086163]
27 f_q [-0.0801215504202]
28 f_r [0.0135430568836]
29 f_w [0.538268277825]
30 f_x [0.980041106721]
31 g_bb [-0.300605882773]
32 g_dd [-0.0719533471077]
33 g_ff [-0.558296871476]
34 g_h [0.0775304935285]
35 g_j [0.0962285646752]
36 g_n [0.223857446088]
37 g_o [-0.035035435899]
38 g_v [-0.232458985822]
39 g_z [-0.00685995718057]
40 m_g [-0.51334748776]
41 m_p [0.0560094810371]
42 m_s [-0.350255969244]
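
Raw logistic-regression coefficients are log-odds; exponentiating them gives
odds ratios, which are easier to interpret. A sketch:

# Read the coefficients as odds ratios
odds = pd.DataFrame({'feature': data.columns, 'odds_ratio': np.exp(clf.coef_[0])})
print(odds.sort_values('odds_ratio', ascending=False).head())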

In [51]:
# Let's take a look at Predicted Probabilities
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = np.asarray(y_test)  # align by position, not by pandas index
y_pred_df.tail()


Out[51]:
No Yes y_pred y_true
202 0.428665 0.571335 1 1
203 0.981830 0.018170 0 0
204 0.958122 0.041878 0 0
205 0.969404 0.030596 0 0
206 0.013344 0.986656 1 1

In [52]:
# Take a look at predicted vs. true values
sns.regplot(x=y_pred, y=np.asarray(y_test), x_jitter=0.1, y_jitter=0.1)


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e27ba90>

In [ ]: