In [1]:
from __future__ import division
import re
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
#%qtconsole
I found some help with parameters here:
--cache_file train.cache
converts the text training file (train_ALL.vw in the referenced example) to a binary cache file for faster processing.
The next time we go through the model building, VW will read the cache file
and not the text file.
--passes
is the number of passes
--oaa 10
selects the one-against-all (oaa) multiclass learning algorithm with 10 classes (labelled 1 to 10)
-q ii
creates interaction between variables in the two referred to namespaces
which here are the same i.e. 'image' Namespace.
An interaction variable is created from two variables 'A' and 'B'
by multiplying the values of 'A' and 'B'.
-f mnist_ALL.model
refers to file where model will be saved.
-b
refers to the number of bits in the feature table.
The default is 18, but because the interaction features greatly increase the number of features,
the referenced example raises '-b' to 22 (the training command in this notebook uses 19).
-l rate
Adjust the learning rate. Defaults to 0.5
--power_t p
This specifies the power on the learning rate decay, where p is in the range [0,1]. 0 means the learning rate does not decay, which can be helpful when state tracking, while 1 is very aggressive. Defaults to 0.5
In [3]:
# Remove any stale cache so training rebuilds it; -f keeps the cell quiet when the file does not exist yet.
!rm -f train.vw.cache
In [4]:
# Remove any previously saved model; -f keeps the cell quiet when the file does not exist yet.
!rm -f pca_train.model
In [5]:
# Train the VW model on data/mnist_train_pca.vw and save it to pca_train.model.
#   --cache_file train.vw.cache : binary cache of the training data
#   -f pca_train.model          : file the trained model is written to
#   -b 19                       : number of bits in the feature table
#   --oaa 10                    : oaa learning algorithm with 10 classes (1 to 10)
#   -q ii                       : interaction features within the 'i...' namespace
#   --passes 35                 : number of passes
#   -l 0.4 / --power_t 0.6      : learning rate and the power on its decay
# NOTE(review): --early_terminate, --initial_t and --decay_learning_rate are not
# described in this notebook's parameter notes -- confirm against the VW docs.
!vw -d data/mnist_train_pca.vw --cache_file train.vw.cache -f pca_train.model -b 19 --oaa 10 -q ii --passes 35 --early_terminate 3 -l 0.4 --power_t 0.6 --initial_t 0 --decay_learning_rate 1
-t
test-only mode: the data file that follows is used for prediction only, and the model is not updated
-i
specifies the model file created earlier
-p
the file in which to store the class predictions (values 1 to 10)
In [6]:
# Remove old predictions before re-running the model; -f keeps the cell quiet when the file does not exist yet.
!rm -f predict.txt
In [7]:
# Apply the saved model (-i pca_train.model) to data/mnist_test_pca.vw with the -t flag
# and write the class predictions to predict.txt (-p).
!vw -t data/mnist_test_pca.vw -i pca_train.model -p predict.txt
In [8]:
def _read_leading_ints(path):
    """Return the leading integer of every line in ``path`` that starts with digits.

    Lines that do not begin with at least one digit are skipped.

    Parameters
    ----------
    path : str
        Path of the text file to scan.

    Returns
    -------
    list of int
        One value per matching line, in file order.
    """
    leading_digits = re.compile(r'^\d+')
    values = []
    # Text mode: a str pattern cannot be applied to the bytes lines that 'rb'
    # yields on Python 3 (TypeError), while 'r' works on both Python 2 and 3.
    with open(path, 'r') as f:
        for line in f:
            m = leading_digits.match(line)
            if m:
                values.append(int(m.group()))
    return values


# True labels: the leading integer of each line of the VW test file.
y_true = _read_leading_ints("data/mnist_test_pca.vw")

# Predicted labels: the leading integer of each line written by `vw -p`.
y_pred = _read_leading_ints("predict.txt")

# Class labels run from 1 to 10 (NOTE: plus one -- presumably digit d is stored
# as class d + 1; confirm against the script that produced the .vw files).
target_names = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
In [9]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Proportional Confusion matrix: VW on PCA data',
                          cmap=plt.cm.Paired):
    """
    Plot a row-normalised confusion matrix as a heat map.

    Each row of ``cm`` is divided by its own total before plotting, so every
    cell shows the proportion of that row's samples that fell in that column.
    See the scikit-learn documentation for the original, done for the iris
    dataset.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix of counts; rows are plotted on the y axis
        ('True label') and columns on the x axis ('Predicted label').
    target_names : sequence of str
        Tick labels, one per class, in the same order as the rows of ``cm``.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``imshow``.
    """
    counts = np.asarray(cm, dtype=float)

    # keepdims=True gives a column vector, so the division broadcasts row-wise.
    # Dividing by the flat ``cm.sum(axis=1)`` would instead divide column j by
    # the total of row j.
    row_totals = counts.sum(axis=1, keepdims=True)

    # Rows with no samples stay at 0 instead of turning into NaN.
    proportions = np.divide(counts, row_totals,
                            out=np.zeros_like(counts),
                            where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(proportions, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()
# Tabulate true vs. predicted labels and show the raw counts.
cm = confusion_matrix(y_true, y_pred)
print(cm)

# Correctly classified samples lie on the diagonal of the matrix.
n_predictions = len(y_pred)
model_accuracy = sum(np.diag(cm)) / n_predictions
model_misclass = 1 - model_accuracy

report_template = "\nModel accuracy: {0}, model misclass rate: {1}"
print(report_template.format(model_accuracy, model_misclass))

# Draw the proportional confusion matrix.
plot_confusion_matrix(cm, target_names)
In [ ]: