In [14]:
# Load libraries
import numpy as np # Math
import scipy.io # Import data
import time
import sklearn.neighbors, sklearn.linear_model, sklearn.svm, sklearn.ensemble, sklearn.naive_bayes, sklearn.metrics # Baseline classification techniques
import matplotlib.pyplot as plt
In [23]:
# Load 400 text documents representing 5 classes
# X_train matrix contains the training data
# y_train vector contains the training labels
# X_test matrix contains the test data
# y_test vector contains the test labels
X_train, y_train, X_test, y_test = np.load('datasets/20news_5classes_400docs.npy', allow_pickle=True)
print('X_train size=',X_train.shape)
print('X_test size=',X_test.shape)
print('y_train size=',y_train.shape)
print('y_test size=',y_test.shape)
In [3]:
t_start = time.process_time()
# Baseline: k-nearest neighbors
neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)
y_predTrain = neigh.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = neigh.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [4]:
t_start = time.process_time()
# Baseline: linear support vector machine
clf = sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [5]:
t_start = time.process_time()
# Baseline: logistic regression
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [6]:
t_start = time.process_time()
# Baseline: random forest
clf = sklearn.ensemble.RandomForestClassifier()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [7]:
t_start = time.process_time()
# Baseline: ridge classifier
clf = sklearn.linear_model.RidgeClassifier()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [8]:
t_start = time.process_time()
# Baseline: Bernoulli naive Bayes
clf = sklearn.naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [9]:
t_start = time.process_time()
# Baseline: multinomial naive Bayes
clf = sklearn.naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
Observe the best result. Which technique performs best?
Do you think the other classification techniques are inherently less effective?
Should you blindly trust black-box data analysis techniques run with their default settings?
Let us consider one classification technique, logistic regression:
model = sklearn.linear_model.LogisticRegression(C=C_value)
and its hyperparameter C, which controls the trade-off between the data-fitting term and the regularization term.
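As intuition for the role of C, here is a minimal sketch (assuming X_train and y_train are loaded as above; the three C values are arbitrary, chosen only to illustrate the trend): a smaller C means stronger regularization, which shrinks the learned weight vector.
In [ ]:
# Sketch: smaller C = stronger regularization = smaller weight norm
for C_value in [0.01, 1.0, 100.0]:
    model = sklearn.linear_model.LogisticRegression(C=C_value)
    model.fit(X_train, y_train)
    print('C =', C_value, '-> ||w|| =', np.linalg.norm(model.coef_))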
In [27]:
num_folds = 5
X_train = X_train.toarray()  # densify the sparse matrix so it can be split
X_train_folds = np.array_split(X_train, num_folds)
Y_train_folds = np.array_split(y_train, num_folds)
Values of the hyperparameter C:
In [28]:
C_choices = [1e-2, 5e-2, 1e-1, 5e-1, 1e0, 5e0, 1e1, 5e1, 1e2, 5e2, 1e3, 5e3]
num_Cs = len(C_choices)
In [31]:
accuracy_tab = np.zeros([num_folds, num_Cs])
for C_idx, C_value in enumerate(C_choices):
    for fold_idx in range(num_folds):
        # Extract the training set for the current fold
        fold_x_train = np.concatenate([X_train_folds[i] for i in range(num_folds) if i != fold_idx])
        fold_y_train = np.concatenate([Y_train_folds[i] for i in range(num_folds) if i != fold_idx])
        # Extract the validation set for the current fold
        fold_x_val = X_train_folds[fold_idx]
        fold_y_val = Y_train_folds[fold_idx]
        # Fit logistic regression on the current fold's training set
        clf = sklearn.linear_model.LogisticRegression(C=C_value)
        clf.fit(fold_x_train, fold_y_train)
        # Evaluate on the held-out validation fold
        y_predVal = clf.predict(fold_x_val)
        accuracy = sklearn.metrics.accuracy_score(fold_y_val, y_predVal)
        # Store the accuracy value
        accuracy_tab[fold_idx, C_idx] = accuracy
print(accuracy_tab)
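For reference, scikit-learn can run this fold loop for you. A minimal sketch using sklearn.model_selection.cross_val_score (not used elsewhere in this lab; the single C value is illustrative, and the folds may be split differently than above):
In [ ]:
# Sketch: built-in cross-validation for one fixed C, roughly equivalent
# to one column of accuracy_tab computed by the manual loop above
import sklearn.model_selection
scores = sklearn.model_selection.cross_val_score(
    sklearn.linear_model.LogisticRegression(C=1.0),
    X_train, y_train, cv=num_folds)
print('per-fold accuracies:', scores, ' mean:', scores.mean())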
In [32]:
# Plot the raw observations
for C_idx, C_value in enumerate(C_choices):
    accuracies_C_idx = accuracy_tab[:, C_idx]
    plt.scatter(np.ones(num_folds) * np.log(C_value), accuracies_C_idx)
# Plot the trend line, with error bars showing one standard deviation
accuracies_mean = np.mean(accuracy_tab, axis=0)
accuracies_std = np.std(accuracy_tab, axis=0)
plt.errorbar(np.log(C_choices), accuracies_mean, yerr=accuracies_std)
# Add text
plt.title('Cross-validation on C')
plt.xlabel('log C')
plt.ylabel('Cross-validation accuracy')
plt.show()
In [ ]:
# Select the C value with the highest mean cross-validation accuracy
idx_best_C = np.argmax(accuracies_mean)
clf = sklearn.linear_model.LogisticRegression(C=C_choices[idx_best_C])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_testset = sklearn.metrics.accuracy_score(y_test, y_pred)
print('best C=', C_choices[idx_best_C])
print('best accuracy=', accuracy_testset)