Chapter 4: Classification



In [1]:

    
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
%matplotlib inline



In [2]:

    
smarket_df = pd.read_csv("../data/Smarket.csv")
smarket_df.head()



In [3]:

    
# equivalent to the R pairs(df) command.
axes = pd.tools.plotting.scatter_matrix(smarket_df, color="brown")
f=plt.gcf()
f.set_size_inches(10,8)









    



/usr/local/lib/python2.7/dist-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Logistic Regression

According to this blog post, Scikit-Learn's LogisticRegression has some significant differences from R's glm - to reproduce R's glm, one should use statsmodels.api.GLM. But my preference is to work with Scikit-Learn because of its greater breadth, so unless there is something that is only available from statsmodels, I am going to use Scikit-Learn. The numbers below are not identical but close.



In [4]:

    
X = smarket_df[smarket_df.columns[1:-2]]
y = pd.factorize(smarket_df["Direction"])[0]
clf = LogisticRegression()
clf.fit(X, y)
(clf.intercept_, clf.coef_)
y









    Out[4]:





array([0, 0, 1, ..., 0, 1, 1])



In [5]:

    
# return probability for each class. R's predict() only returns the probability of the first 
# class, so we do the same.
probs = clf.predict_proba(X)
[prob[0] for prob in probs[0:5]]









    Out[5]:





[0.50775595757199166,
 0.48208765404825193,
 0.48147907945142998,
 0.51561031224141374,
 0.51134987027174317]



In [6]:

    
ypreds = ["Up" if prob[0] > 0.5 else "Down" for prob in probs]
ypreds[0:5]









    Out[6]:





['Up', 'Down', 'Down', 'Up', 'Up']



In [7]:

    
# R uses table() to tabulate the confusion matrix below
yacts = [str(x) for x in smarket_df["Direction"].values]
confusion_matrix(yacts, ypreds)









    Out[7]:





array([[143, 459],
       [135, 513]])



In [8]:

    
accuracy_score(yacts, ypreds)









    Out[8]:





0.52480000000000004

Make Training and Test Set



In [5]:

    
# Split dataset into training and test sets
smarket_train_df = smarket_df[smarket_df["Year"] < 2005]
smarket_test_df = smarket_df[smarket_df["Year"] >= 2005]
# train Logistic Regression model with training data
clf2 = LogisticRegression()
Xtrain = smarket_train_df[smarket_df.columns[1:-2]]
ytrain = pd.factorize(smarket_train_df["Direction"])[0]
clf2.fit(Xtrain, ytrain)
# test model with test data
Xtest = smarket_test_df[smarket_df.columns[1:-2]]
ytest = pd.factorize(smarket_test_df["Direction"])[0]
ypred = clf2.predict(Xtest)
# calculate confusion matrix and accuracy
confusion_matrix(ytest, ypred)









    Out[5]:





array([[40, 71],
       [52, 89]])



In [6]:

    
accuracy_score(ytest, ypred)









    Out[6]:





0.51190476190476186

Fit smaller model



In [7]:

    
clf3 = LogisticRegression()
Xtrain = smarket_train_df[smarket_df.columns[1:3]]
Xtest = smarket_test_df[smarket_df.columns[1:3]]
clf3.fit(Xtrain, ytrain)
ypred = clf3.predict(Xtest)
confusion_matrix(ytest, ypred)









    Out[7]:





array([[ 76,  35],
       [106,  35]])



In [8]:

    
accuracy_score(ytest, ypred)









    Out[8]:





0.44047619047619047

Linear Discriminant Analysis



In [11]:

    
clf4 = LDA()
clf4.fit(Xtrain, ytrain)
ypred = clf4.predict(Xtest)
ypred[0:5]









    Out[11]:





array([0, 0, 0, 0, 0])



In [14]:

    
dir(clf4)
#clf4.classes_









    Out[14]:





['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_decision_function',
 '_get_param_names',
 'classes_',
 'coef_',
 'decision_function',
 'fit',
 'fit_transform',
 'get_params',
 'intercept_',
 'means_',
 'n_components',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'priors',
 'priors_',
 'scaling',
 'scalings_',
 'score',
 'set_params',
 'transform',
 'xbar_']



In [31]:

    
probs_positive_class = clf4.predict_proba(Xtest)[:,1]
thresholds = np.arange(0,1.1,.1)

thresholds.shape
#for t in thresholds:
#    preds=[clf4.predict_proba > t for t in thresholds]
probs_positive_class.shape[0],thresholds.shape[0]









    Out[31]:





(252,)



In [ ]:

    
clf4.predict_proba > thresh



In [10]:

    
confusion_matrix(ytest, ypred)









    Out[10]:





array([[ 76,  35],
       [106,  35]])



In [15]:

    
accuracy_score(ytest, ypred)









    Out[15]:





0.44047619047619047

K-Nearest Neighbors



In [16]:

    
clf5 = KNeighborsClassifier()
clf5.fit(Xtrain, ytrain)
ypred = clf5.predict(Xtest)
confusion_matrix(ytest, ypred)









    Out[16]:





array([[71, 40],
       [82, 59]])



In [17]:

    
accuracy_score(ytest, ypred)









    Out[17]:





0.51587301587301593

	Year	Lag1	Lag2	Lag3	Lag4	Lag5	Volume	Today	Direction
0	2001	0.381	-0.192	-2.624	-1.055	5.010	1.1913	0.959	Up
1	2001	0.959	0.381	-0.192	-2.624	-1.055	1.2965	1.032	Up
2	2001	1.032	0.959	0.381	-0.192	-2.624	1.4112	-0.623	Down
3	2001	-0.623	1.032	0.959	0.381	-0.192	1.2760	0.614	Up
4	2001	0.614	-0.623	1.032	0.959	0.381	1.2057	0.213	Up