This exercise tries to recreate the results in table 4.1 of the ESL book, using the vowel dataset.
First we import all the necessary libraries.
In [1]:
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn import preprocessing, linear_model
import numpy as np
import scipy as sc
import pandas as pd
import operator
import matplotlib.pylab as plt
Read and store the data.
In [2]:
train = pd.read_csv('vowel.test')
test = pd.read_csv('vowel.train')
Separate the training data (X) from the labels (y). Do the same for the test data.
In [3]:
# Get X and y for training set
y = train['y']
X = train.drop(['row.names', 'y'], axis=1)
# Get Xtest and yetest from test set
ytest = test['y']
Xtest = test.drop(['row.names', 'y'], axis=1)
LDA, QDA and Logistic Regression require the data to be standardized before used. This means that the data are centered around the mean (per column) and also have unit variance. The scaler is used to apply the same transformation in the test data.
In [4]:
# Standardize X and apply the same scaling in the test set
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
Xtest_scaled = scaler.transform(Xtest)
test_err = []
train_err = []
In [6]:
%matplotlib inline
plt.scatter(X['x.1'], X['x.2'], c=y)
plt.xlabel('Coordinate 1 for Training data')
plt.ylabel('Coordinate 2 for Training data')
plt.show()
In [7]:
# LDA
lda = LDA()
y_pred = lda.fit(X_scaled, y, store_covariance=False).predict(X_scaled)
y_pred_test = lda.predict(Xtest_scaled)
train_err.append(1.0 - sum((y_pred - y) == 0)/ float(len(y)))
test_err.append(1.0 - sum((y_pred_test - ytest) == 0)/ float(len(ytest)))
In [8]:
# QDA
qda = QDA()
y_pred = qda.fit(X_scaled, y).predict(X_scaled)
y_pred_test = qda.predict(Xtest_scaled)
train_err.append(1.0 - sum((y_pred - y) == 0)/ float(len(y)))
test_err.append(1.0 - sum((y_pred_test - ytest) == 0)/ float(len(ytest)))
Linear regression requires the y variable to be transformed into an indicator matrix first.
In [9]:
# Binarize y for regression
lb = preprocessing.LabelBinarizer()
lb.fit(y)
y_bin = lb.transform(y)
ytest_bin = lb.transform(ytest)
# Add the column of ones in the training and test data
X_train_one = np.hstack((np.ones((len(X), 1)), X))
X_test_one = np.hstack((np.ones((len(Xtest), 1)), Xtest))
# Calculate beta_hat and f(x)
beta_hat = np.dot(np.linalg.inv(np.dot(X_train_one.T, X_train_one)), np.dot(X_train_one.T, y_bin))
f_x = np.dot(X_train_one, beta_hat)
f_x_test = np.dot(X_test_one, beta_hat)
# Find the index with the max value and assign it to the corresponding class
g_hat = []
for row in f_x:
index, value = max(enumerate(row), key=operator.itemgetter(1))
g_hat.append(index + 1) # Align the class because index starts from 0
g_hat_test = []
for row in f_x_test:
index, value = max(enumerate(row), key=operator.itemgetter(1))
g_hat_test.append(index + 1) # Align the class because index starts from 0
train_err.append(1.0 - sum((g_hat - y) == 0)/ float(len(y)))
test_err.append(1.0 - sum((g_hat_test - ytest) == 0)/ float(len(ytest)))
In the multiclass case, the training algorithm uses a one-vs.-all (OvA) scheme, rather than the “true” multinomial LR and that is why the results differ a bit with the ones in the book.
In [10]:
regr = linear_model.LogisticRegression(penalty='l1', C=1.0)
y_pred = regr.fit(X_scaled, y).predict(X_scaled)
y_pred_test = regr.predict(Xtest_scaled)
train_err.append(1.0 - sum((y_pred - y) == 0)/ float(len(y)))
test_err.append(1.0 - sum((y_pred_test - ytest) == 0)/ float(len(ytest)))
In [11]:
#Table 4.1 in ESL book
print "Method\t\tTraining error\tTest error"
print "-------------------------------------------"
print "Linear R.\t", train_err[2], "\t",test_err[2]
print "LDA \t\t", train_err[0], "\t",test_err[0]
print "QDA \t\t", train_err[1], "\t",test_err[1]
print "Logistic R.\t", train_err[3], "\t",test_err[3]
In [ ]: