In [14]:
# Load libraries
import numpy as np # Math
import scipy.io # Import data
import time
import sklearn.neighbors, sklearn.linear_model, sklearn.svm, sklearn.ensemble, sklearn.naive_bayes, sklearn.metrics # Baseline classification techniques
import matplotlib.pyplot as plt
In [23]:
# Load 400 text documents representing 5 classes
# X_train matrix contains the training data
# y_train vector contains the training labels
# X_test matrix contains the test data
# y_test vector contains the test labels
X_train, y_train, X_test, y_test = np.load('datasets/20news_5classes_400docs.npy', allow_pickle=True)
print('X_train size=',X_train.shape)
print('X_test size=',X_test.shape)
print('y_train size=',y_train.shape)
print('y_test size=',y_test.shape)
In [3]:
t_start = time.process_time()
# Baseline: k-nearest neighbors
neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)
y_predTrain = neigh.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = neigh.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [4]:
t_start = time.process_time()
# Baseline: linear support vector machine
clf = sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [5]:
t_start = time.process_time()
# Baseline: logistic regression
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [6]:
t_start = time.process_time()
# Baseline: random forest
clf = sklearn.ensemble.RandomForestClassifier()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [7]:
t_start = time.process_time()
# Baseline: ridge classifier
clf = sklearn.linear_model.RidgeClassifier()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [8]:
t_start = time.process_time()
# Baseline: Bernoulli naive Bayes
clf = sklearn.naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
In [9]:
t_start = time.process_time()
# Baseline: multinomial naive Bayes
clf = sklearn.naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)
y_predTrain = clf.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_predTrain)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
exec_time = time.process_time() - t_start
print('train_accuracy',train_accuracy)
print('test_accuracy',test_accuracy)
print('exec_time',exec_time)
Observe the best result. Which technique performs best?
Do you think the other classification techniques are inherently less effective?
Should you blindly trust black-box data analysis techniques run with their default settings?
Let us consider one classification technique, logistic regression:
model = sklearn.linear_model.LogisticRegression(C=C_value)
and its hyperparameter C, which controls the trade-off between the data-fitting term and the regularization term.
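As intuition for the role of C, here is a minimal sketch (assuming X_train and y_train are loaded as above; the three C values are arbitrary, chosen only to illustrate the trend): a smaller C means stronger regularization, which shrinks the learned weight vector.
In [ ]:
# Sketch: smaller C = stronger regularization = smaller weight norm
for C_value in [0.01, 1.0, 100.0]:
    model = sklearn.linear_model.LogisticRegression(C=C_value)
    model.fit(X_train, y_train)
    print('C =', C_value, '-> ||w|| =', np.linalg.norm(model.coef_))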
In [27]:
num_folds = 5
X_train = X_train.toarray()  # densify the sparse matrix so it can be split
X_train_folds = np.array_split(X_train, num_folds)
Y_train_folds = np.array_split(y_train, num_folds)
Values of the hyperparameter C:
In [28]:
C_choices = [1e-2, 5e-2, 1e-1, 5e-1, 1e0, 5e0, 1e1, 5e1, 1e2, 5e2, 1e3, 5e3]
num_Cs = len(C_choices)
In [31]:
accuracy_tab = np.zeros([num_folds, num_Cs])
for C_idx, C_value in enumerate(C_choices):
    for fold_idx in range(num_folds):
        # Extract the training set for the current fold
        fold_x_train = np.concatenate([X_train_folds[i] for i in range(num_folds) if i != fold_idx])
        fold_y_train = np.concatenate([Y_train_folds[i] for i in range(num_folds) if i != fold_idx])
        # Extract the validation set for the current fold
        fold_x_val = X_train_folds[fold_idx]
        fold_y_val = Y_train_folds[fold_idx]
        # Fit logistic regression on the current fold's training set
        clf = sklearn.linear_model.LogisticRegression(C=C_value)
        clf.fit(fold_x_train, fold_y_train)
        # Evaluate on the held-out validation fold
        y_predVal = clf.predict(fold_x_val)
        accuracy = sklearn.metrics.accuracy_score(fold_y_val, y_predVal)
        # Store the accuracy value
        accuracy_tab[fold_idx, C_idx] = accuracy
print(accuracy_tab)
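For reference, scikit-learn can run this fold loop for you. A minimal sketch using sklearn.model_selection.cross_val_score (not used elsewhere in this lab; the single C value is illustrative, and the folds may be split differently than above):
In [ ]:
# Sketch: built-in cross-validation for one fixed C, roughly equivalent
# to one column of accuracy_tab computed by the manual loop above
import sklearn.model_selection
scores = sklearn.model_selection.cross_val_score(
    sklearn.linear_model.LogisticRegression(C=1.0),
    X_train, y_train, cv=num_folds)
print('per-fold accuracies:', scores, ' mean:', scores.mean())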
In [32]:
# Plot the raw observations
for C_idx, C_value in enumerate(C_choices):
    accuracies_C_idx = accuracy_tab[:, C_idx]
    plt.scatter(np.ones(num_folds) * np.log(C_value), accuracies_C_idx)
# Plot the trend line, with error bars showing one standard deviation
accuracies_mean = np.mean(accuracy_tab, axis=0)
accuracies_std = np.std(accuracy_tab, axis=0)
plt.errorbar(np.log(C_choices), accuracies_mean, yerr=accuracies_std)
# Add text
plt.title('Cross-validation on C')
plt.xlabel('log C')
plt.ylabel('Cross-validation accuracy')
plt.show()
In [ ]:
# Select the C value with the highest mean cross-validation accuracy
idx_best_C = np.argmax(accuracies_mean)
clf = sklearn.linear_model.LogisticRegression(C=C_choices[idx_best_C])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_testset = sklearn.metrics.accuracy_score(y_test, y_pred)
print('best C=', C_choices[idx_best_C])
print('best accuracy=', accuracy_testset)