In [1]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns  # for pretty layout of plots
import matplotlib.pyplot as plt
from pprint import pprint  # for pretty printing
from pandas.plotting import scatter_matrix  # scatter matrix plot
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import random
import scipy.stats

# This enables inline Plots
%matplotlib inline

In [2]:
# Read data from the data file. Note: crx.data ships without a header row, so
# pandas takes the first data row as column names (hence 689 rows below instead
# of the dataset's 690); passing header=None with explicit names would avoid this.
crx = pd.read_csv('crx.data', parse_dates=True)

In [3]:
# first check out some sample values
crx.head(5)


Out[3]:
b 30.83 0 u g w v 1.25 t t.1 01 f g.1 00202 0.1 +
0 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
1 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
2 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
3 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
4 b 32.08 4.000 u g m v 2.50 t f 0 t g 00360 0 +

In [4]:
# next see the info 
crx.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 689 entries, 0 to 688
Data columns (total 16 columns):
b        689 non-null object
30.83    689 non-null object
0        689 non-null float64
u        689 non-null object
g        689 non-null object
w        689 non-null object
v        689 non-null object
1.25     689 non-null float64
t        689 non-null object
t.1      689 non-null object
01       689 non-null int64
f        689 non-null object
g.1      689 non-null object
00202    689 non-null object
0.1      689 non-null int64
+        689 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.5+ KB

In [5]:
# Replace '?' placeholder characters with NaN
crx.replace(to_replace='?', value=np.nan, inplace=True)
crx.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 689 entries, 0 to 688
Data columns (total 16 columns):
b        677 non-null object
30.83    677 non-null object
0        689 non-null float64
u        683 non-null object
g        683 non-null object
w        680 non-null object
v        680 non-null object
1.25     689 non-null float64
t        689 non-null object
t.1      689 non-null object
01       689 non-null int64
f        689 non-null object
g.1      689 non-null object
00202    676 non-null object
0.1      689 non-null int64
+        689 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.5+ KB

In [6]:
# Change the column names to something easier to work with
crx.columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'approved']

In [7]:
# The 2nd column contains floats but is stored as object dtype. Convert it to float.
crx['b'] = crx['b'].astype(float)

In [8]:
# Column n is described as continuous, so convert it to float as well
crx['n'] = crx['n'].astype(float)

In [9]:
# Summary statistics for the numerical columns
crx.describe()


Out[9]:
b c h k n o
count 677.000000 689.000000 689.000000 689.000000 676.000000 689.000000
mean 31.569261 4.765631 2.224819 2.402032 183.988166 1018.862119
std 11.966670 4.978470 3.348739 4.866180 173.934087 5213.743149
min 13.750000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 22.580000 1.000000 0.165000 0.000000 74.500000 0.000000
50% 28.420000 2.750000 1.000000 0.000000 160.000000 5.000000
75% 38.250000 7.250000 2.625000 3.000000 277.000000 396.000000
max 80.250000 28.000000 28.500000 67.000000 2000.000000 100000.000000

Part 1 - Impute Missing Data

Missing Attribute Values: 37 cases (5%) have one or more missing values. The missing values from particular attributes are:

A1:  12
A2:  12
A4:   6
A5:   6
A6:   9
A7:   9
A14: 13

Attribute Information:

A1: b, a.
A2: continuous.
A3: continuous.
A4: u, y, l, t.
A5: g, p, gg.
A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
A7: v, h, bb, j, n, z, dd, ff, o.
A8: continuous.
A9: t, f.
A10:    t, f.
A11:    continuous.
A12:    t, f.
A13:    g, p, s.
A14:    continuous.
A15:    continuous.
A16: +,-         (class attribute)
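
The documented counts can be checked directly against the dataframe. A quick
sketch (assuming the '?'-to-NaN replacement in In [5] and the column renaming
in In [6] have already run):

# Count missing values per column and compare with the documentation above
missing_counts = crx.isnull().sum()
print(missing_counts[missing_counts > 0])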

In [10]:
# Impute each column that is missing data using the appropriate method
# The first column 
a =  crx.b[crx.a == 'a'].count() 
b =  crx.b[crx.a == 'b'].count()
print(float(a) / float(a + b))


0.311278195489

In [11]:
# Roughly 31% of the first column has the value 'a'
# We want a function that returns 'a' about 31% of the time and 'b' about 69% of the time
def get_a_impute_values():
    my_values = ['a'] * 31 + ['b'] * 69
    return random.choice(my_values)

In [12]:
# Draw a fresh value for every missing row (a single call would give
# all 12 missing rows the same value)
crx.loc[crx.a.isnull(), 'a'] = [get_a_impute_values() for _ in range(crx.a.isnull().sum())]
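
An equivalent, vectorized alternative (a sketch, not run here) fills all the
missing entries in one call, with the probabilities made explicit:

# Alternative: draw every missing value at once with np.random.choice
n_missing = crx.a.isnull().sum()
crx.loc[crx.a.isnull(), 'a'] = np.random.choice(['a', 'b'], size=n_missing, p=[0.31, 0.69])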

In [13]:
# Create a Normal Distribution centered on Mean of 31.57 and Standard Dev of 11.97
# Get 12 Entries since that's how many missing entries we have for column b
def get_b_impute_values():
    return np.random.normal(31.57, 11.97, 12)

In [14]:
crx.loc[crx.b.isnull(), 'b'] = get_b_impute_values()
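
One caveat: an unclipped normal can return values below the observed minimum
(or even below zero), which is implausible if b really is an age. A hedged
variant, using the min/max reported by crx.describe():

# Clip sampled values to the observed range of column b
def get_b_impute_values_clipped(size):
    draws = np.random.normal(31.57, 11.97, size)
    return np.clip(draws, 13.75, 80.25)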

In [15]:
# According to the attribute information, the 4th column takes the unique values u, y, l, t.
# Since u, y, and l all appear in the dataset but 't' never does, we assume the
# 6 missing values are 't'
crx.loc[crx.d.isnull(), 'd'] = 't'

In [16]:
# Get the counts of the distinct values in column e
g = crx.e[crx.e == 'g'].count()
p = crx.e[crx.e == 'p'].count()
gg = crx.e[crx.e == 'gg'].count()
print(g, p, gg)


518 163 2

In [17]:
# Impute in proportion to the observed frequencies above
def get_e_impute_values():
    my_values = ['g'] * 518 + ['p'] * 163 + ['gg'] * 2
    return random.choice(my_values)

In [18]:
crx.loc[crx.e.isnull(), 'e'] = [get_e_impute_values() for _ in range(crx.e.isnull().sum())]

In [19]:
# Column f has many distinct values, so we impute by drawing uniformly from that set of values
def get_f_impute_values():
    my_values = ['aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm', 'q', 'r', 'w', 'x']
    return random.choice(my_values)

In [23]:
# Impute and plot
crx.loc[crx.f.isnull(), 'f'] = [get_f_impute_values() for _ in range(crx.f.isnull().sum())]
crx.f.value_counts().plot(kind='bar')


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a2efed0>

In [20]:
# Column g also has a wide range of values, so we impute the same way as for column f
def get_g_impute_values():
    my_values = ['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o']
    return random.choice(my_values)

In [21]:
crx.loc[crx.g.isnull(), 'g'] = [get_g_impute_values() for _ in range(crx.g.isnull().sum())]
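
The frequency-weighted idea used for columns a and e generalizes to any
categorical column. A sketch of a reusable helper (hypothetical, not part of
the original analysis):

# Impute a categorical column by sampling from its observed value frequencies
def impute_categorical(df, col):
    probs = df[col].dropna().value_counts(normalize=True)
    mask = df[col].isnull()
    df.loc[mask, col] = np.random.choice(probs.index, size=mask.sum(), p=probs.values)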

In [22]:
# Create a Normal Distribution centered on Mean of 183.99 and Standard Dev of 173.93
# Get 13 Entries since that's how many missing entries we have for column n
def get_n_impute_values():
    return np.random.normal(183.99, 173.93, 13)

In [23]:
crx.loc[crx.n.isnull(), 'n'] = get_n_impute_values()
crx.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 689 entries, 0 to 688
Data columns (total 16 columns):
a           689 non-null object
b           689 non-null float64
c           689 non-null float64
d           689 non-null object
e           689 non-null object
f           680 non-null object
g           689 non-null object
h           689 non-null float64
i           689 non-null object
j           689 non-null object
k           689 non-null int64
l           689 non-null object
m           689 non-null object
n           689 non-null float64
o           689 non-null int64
approved    689 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 91.5+ KB

Part 2 - Plot and visualize data to see any patterns


In [24]:
# Let's plot the categorical columns: a, d, e, f, g, i, j, l, m, approved
crx.a.value_counts().plot(kind='bar')


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a7aa2d0>

In [25]:
crx.d.value_counts().plot(kind='bar')


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a7f9810>

In [26]:
crx.e.value_counts().plot(kind='bar')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aa3ac50>

In [27]:
crx.f.value_counts().plot(kind='bar')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10abc7f50>

In [28]:
crx.g.value_counts().plot(kind='bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aee6790>

In [29]:
crx.i.value_counts().plot(kind='bar')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x10af82250>

In [30]:
crx.j.value_counts().plot(kind='bar')


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b101e50>

In [31]:
crx.l.value_counts().plot(kind='bar')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b29a990>

In [32]:
crx.m.value_counts().plot(kind='bar')


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b3ae390>
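
Rather than one cell per column, all the categorical bar plots could be drawn
in a single figure. A sketch:

# Plot every categorical column in one grid of subplots
cat_cols = ['a', 'd', 'e', 'f', 'g', 'i', 'j', 'l', 'm', 'approved']
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
for col, ax in zip(cat_cols, axes.ravel()):
    crx[col].value_counts().plot(kind='bar', ax=ax, title=col)
plt.tight_layout()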

In [33]:
# Looking at the distribution of its values, column b could be the age of the participant
crx.b.hist()


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b4becd0>

In [34]:
# The numeric columns are b, c, h, k, n, o; scatter one pair as an example
crx.plot(kind='scatter', x='b', y='k')
#scipy.stats.pointbiserialr(crx.b, crx.a)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b901390>
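
Pairwise scatter plots hint at relationships, but a correlation matrix over
the numeric columns is easier to read off. A sketch:

# Pearson correlations between the numeric columns
print(crx[['b', 'c', 'h', 'k', 'n', 'o']].corr())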

In [35]:
scatter_matrix(crx, alpha=0.2, figsize=(6, 6), diagonal='kde')


Out[35]:
<6x6 array of matplotlib AxesSubplot objects>

Part 3 - Build Models using Logistic Regression and SVM


In [36]:
# convert true/false and dichotomous columns to 0s and 1s

crx.loc[crx.a == 'a', 'a'] = 1
crx.loc[crx.a == 'b', 'a'] = 0

crx.loc[crx.i == 't', 'i'] = 1
crx.loc[crx.i == 'f', 'i'] = 0

crx.loc[crx.j == 't', 'j'] = 1
crx.loc[crx.j == 'f', 'j'] = 0

crx.loc[crx.l == 't', 'l'] = 1
crx.loc[crx.l == 'f', 'l'] = 0

crx.loc[crx.approved == '+', 'approved'] = 1
crx.loc[crx.approved == '-', 'approved'] = 0
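
The same encoding can be written more compactly with Series.map, which sends
any unmatched value to NaN rather than leaving it untouched. A sketch:

# Equivalent binary encoding via Series.map
for col, mapping in [('a', {'a': 1, 'b': 0}), ('i', {'t': 1, 'f': 0}),
                     ('j', {'t': 1, 'f': 0}), ('l', {'t': 1, 'f': 0}),
                     ('approved', {'+': 1, '-': 0})]:
    crx[col] = crx[col].map(mapping)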

In [37]:
# create dummies
dummies = pd.get_dummies(crx[['d', 'e', 'f', 'g', 'm']])
crx_dummies = crx.join(dummies)
to_split = crx_dummies.drop(['d', 'e', 'f', 'g', 'm'], axis=1)
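
Note that get_dummies can also take the whole frame with a columns= argument,
which encodes, joins, and drops the source columns in one step. A sketch:

# One-step alternative to the join/drop above
to_split = pd.get_dummies(crx, columns=['d', 'e', 'f', 'g', 'm'])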

In [38]:
# Convert the remaining object columns to float
for col in ['a', 'i', 'j', 'l', 'approved']:
    to_split[col] = to_split[col].astype(float)

In [39]:
# Create a train/test split with a 30% test set
y = to_split.approved
data = to_split.drop('approved', axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)
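
Because no random_state is set, the split (and every score below) changes on
each run. A reproducible variant (a sketch; the seed value is arbitrary):

# Fix the seed and stratify on the label so both splits keep the same +/- ratio
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.3, random_state=42, stratify=y)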

In [40]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


Out[40]:
0.86956521739130432

In [41]:
# SVM
# Initialize with C=1e-3 and fit the model
SVM = SVC(C=1e-3, kernel='rbf')
SVM.fit(X_train, y_train)
SVM.score(X_test, y_test)


Out[41]:
0.55555555555555558
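
The near-chance RBF score is likely a scaling issue: columns like n and o span
ranges in the hundreds and tens of thousands, which dominates the RBF distance
computation. Standardizing first usually helps; a sketch using a pipeline:

# Standardize features before the RBF SVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_svm = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf'))
scaled_svm.fit(X_train, y_train)
print(scaled_svm.score(X_test, y_test))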

In [42]:
# SVM 
# Try the linear kernel instead
l_SVM = SVC(C=1e-3, kernel='linear')
l_SVM.fit(X_train, y_train)
l_SVM.score(X_test, y_test)


Out[42]:
0.79227053140096615

Part 4 - Use Grid Search to evaluate model parameters


In [43]:
d = {'C': np.logspace(-3, 3, 10)}

gds = GridSearchCV(LinearSVC(), d)

In [44]:
fitted = gds.fit(X_train, y_train)

In [45]:
print(fitted.best_score_)
print(fitted.best_params_)
print(fitted.best_estimator_)
print(fitted.score(X_test, y_test))


0.804979253112
{'C': 0.46415888336127775}
LinearSVC(C=0.46415888336127775, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)
0.753623188406
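
The same machinery extends to the RBF SVC by searching C and gamma jointly.
A sketch (the grid values are illustrative, not tuned):

# Joint grid search over C and gamma for the RBF kernel
param_grid = {'C': np.logspace(-2, 2, 5), 'gamma': np.logspace(-4, 0, 5)}
gds_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid)
gds_rbf.fit(X_train, y_train)
print(gds_rbf.best_params_, gds_rbf.score(X_test, y_test))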

Part 5 - Build a Confusion Matrix to show how well your prediction did


In [47]:
y_pred = clf.predict(X_test)

In [48]:
# Confusion Matrix for Type 1 and Type 2 Error
cm = confusion_matrix(y_test, y_pred)

print(cm)

plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[97 18]
 [ 9 83]]

In [49]:
target_names = ['denied', 'approved']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

     denied       0.92      0.84      0.88       115
   approved       0.82      0.90      0.86        92

avg / total       0.87      0.87      0.87       207
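
Since LogisticRegression exposes predict_proba, a threshold-independent
summary such as ROC AUC complements the report above. A sketch:

# ROC AUC from the predicted probability of the positive (approved) class
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))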


In [50]:
# Examine the coefficients of the variables (note: sklearn does not report significance)
pd.DataFrame(list(zip(data.columns, np.transpose(clf.coef_))))


Out[50]:
0 1
0 a [-0.00535731709431]
1 b [0.00111297656792]
2 c [-0.00760652448945]
3 h [0.0856407014633]
4 i [3.19710717167]
5 j [0.396698245587]
6 k [0.145921066958]
7 l [-0.228300404731]
8 n [-0.00263941283429]
9 o [0.000573564740139]
10 d_l [0.23050819727]
11 d_t [0.117062230616]
12 d_u [-0.375815776567]
13 d_y [-0.779348627285]
14 e_g [-0.258753545951]
15 e_gg [0.23050819727]
16 e_p [-0.779348627285]
17 f_aa [-0.479979022549]
18 f_c [0.152056590077]
19 f_cc [0.804659755035]
20 f_d [-0.326855429067]
21 f_e [0.0613055663145]
22 f_ff [-0.74790512828]
23 f_i [-0.744052240498]
24 f_j [-0.187415632468]
25 f_k [-0.57075351188]
26 f_m [-0.280833086163]
27 f_q [-0.0801215504202]
28 f_r [0.0135430568836]
29 f_w [0.538268277825]
30 f_x [0.980041106721]
31 g_bb [-0.300605882773]
32 g_dd [-0.0719533471077]
33 g_ff [-0.558296871476]
34 g_h [0.0775304935285]
35 g_j [0.0962285646752]
36 g_n [0.223857446088]
37 g_o [-0.035035435899]
38 g_v [-0.232458985822]
39 g_z [-0.00685995718057]
40 m_g [-0.51334748776]
41 m_p [0.0560094810371]
42 m_s [-0.350255969244]
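
Raw logistic-regression coefficients are log-odds; exponentiating them gives
odds ratios, which are easier to interpret. A sketch:

# Read the coefficients as odds ratios
odds = pd.DataFrame({'feature': data.columns, 'odds_ratio': np.exp(clf.coef_[0])})
print(odds.sort_values('odds_ratio', ascending=False).head())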

In [51]:
# Let's take a look at Predicted Probabilities
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = np.asarray(y_test)  # align by position, not by pandas index
y_pred_df.tail()


Out[51]:
No Yes y_pred y_true
202 0.428665 0.571335 1 1
203 0.981830 0.018170 0 0
204 0.958122 0.041878 0 0
205 0.969404 0.030596 0 0
206 0.013344 0.986656 1 1

In [52]:
# Take a look at predicted vs. true values
sns.regplot(x=y_pred, y=np.asarray(y_test), x_jitter=0.1, y_jitter=0.1)


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e27ba90>

In [ ]: