In [3]:
%matplotlib inline
import os, time
import csv
import numpy as np
from scipy import io
from scipy.stats import multivariate_normal
import matplotlib
import matplotlib.pyplot as plt
# Draw axis ticks pointing outward on both axes.
for tick_key in ('xtick.direction', 'ytick.direction'):
    matplotlib.rcParams[tick_key] = 'out'

# Directory that receives the saved figures; created on first use.
fig_directory = '../figures'
if not os.path.exists(fig_directory):
    os.makedirs(fig_directory)

# Figure export settings.
save_fig = True
fig_size = (6, 4.5)
fig_dpi = 200  # passed as `dpi` to savefig
In [2]:
# Load the digit training set from its .mat file; loadmat returns a dict-like
# object that later cells index by variable name ('train_image', 'train_label').
train_data_path = "../data/digit-dataset/train.mat"
train_data = io.loadmat(train_data_path)
In [3]:
def process(image):
    """Flatten an image and scale it to unit Euclidean (L2) norm.

    Parameters
    ----------
    image : numpy.ndarray
        Pixel array of any shape; it is flattened with ``ravel``.

    Returns
    -------
    numpy.ndarray
        1-D array of the flattened pixels divided by their L2 norm.  An
        all-zero image is returned flattened but unscaled, which avoids a
        division by zero.
    """
    raveled = image.ravel()
    norm = np.linalg.norm(raveled)
    if not norm:
        return raveled
    # Reuse the norm computed above instead of evaluating it a second time.
    return raveled / norm
In [4]:
# Images are indexed along the third axis below (one image per slice), so the
# length of a single pixel row gives the number of images; it must agree with
# the number of label rows.
assert len(train_data['train_image'][0][0]) == len(train_data['train_label'])
train_data_size = len(train_data['train_image'][0][0])

# Flatten and normalize every image.  `range` is used instead of the
# Python 2-only `xrange` so the cell also runs on Python 3.
train_images = np.array([process(train_data['train_image'][:, :, i])
                         for i in range(train_data_size)])
# The label of sample i is the first element of label row i.
train_labels = np.array([train_data['train_label'][i][0]
                         for i in range(train_data_size)])
In [5]:
# Map each digit 0-9 to the rows of train_images whose label equals that digit.
grouped_train_images = {
    digit: train_images[train_labels == digit]
    for digit in range(10)
}
In [6]:
# Per-digit Gaussian fit: sample mean, sample covariance, and a
# multivariate normal distribution built from them.
distributions = {}
covs = {}
means = {}
for digit, digit_images in grouped_train_images.items():
    # Samples are rows, features are columns (axis=0 / rowvar=0).
    means[digit] = np.mean(digit_images, axis=0)
    covs[digit] = np.cov(digit_images, rowvar=0)
    # A small multiple of the identity is added to the covariance used for
    # the distribution -- presumably to keep it non-singular; the raw
    # covariance stays in `covs`.
    n_features = covs[digit].shape[0]
    adjusted_cov = covs[digit] + 0.001 * np.identity(n_features)
    distributions[digit] = multivariate_normal(means[digit], adjusted_cov)
In [7]:
# Class priors: the fraction of training labels equal to each digit.
prior = {}
# The total does not change between digits, so compute it once outside the loop.
total_size = train_labels.size
for digit in range(10):
    digit_size = train_labels[train_labels == digit].size
    # float() forces true division under Python 2 as well.
    prior[digit] = digit_size / float(total_size)
In [8]:
def plot_cov_mat_heatmap(digit):
    """Show the covariance matrix of one digit class as a heat map.

    The matrix is read from the module-level ``covs`` dict.  When the
    module-level ``save_fig`` flag is true, the figure is also written to
    ``<fig_directory>/p4_heatmap_<digit>.png`` at ``fig_dpi`` resolution.

    Parameters
    ----------
    digit
        Key into ``covs``; also embedded in the output file name.
    """
    fig = plt.figure()
    plt.grid()
    plt.imshow(covs[digit])
    plt.colorbar()
    # Save before showing, so the file is written while the figure is still
    # guaranteed to be alive whatever backend is active.
    if save_fig:
        fig.savefig('{0}/p4_heatmap_{1}.png'.format(fig_directory, digit),
                    dpi=fig_dpi,
                    bbox_inches='tight')
    plt.show()
In [9]:
# Covariance heat map for digit 0.
plot_cov_mat_heatmap(0)
In [10]:
# Covariance heat map for digit 6.
plot_cov_mat_heatmap(6)
In [11]:
from gaussian_classifier import GaussianClassifier
In [12]:
from gaussian_classifier import SameCovGaussianClassifier
In [13]:
# Shuffle the training set once, with a fixed seed so that re-running the
# notebook draws the same subsets.  Every subset below is a prefix of this
# order, so smaller training sets are nested inside larger ones.
randomize_indices = np.random.RandomState(0).permutation(train_data_size)
data_sizes = [100, 200, 500, 1000, 2000, 5000, 10000, 30000, 60000]
classifiers_by_sizes = {}
same_sov_classifiers_by_sizes = {}
for size in data_sizes:
    subset = randomize_indices[:size]
    images = train_images[subset]
    labels = train_labels[subset]
    # Train $\Sigma_{i}$ classifier
    classifier = GaussianClassifier()
    classifier.fit(images, labels)
    classifiers_by_sizes[size] = classifier
    # Train $\Sigma_{overall}$ classifier
    same_cov_classifier = SameCovGaussianClassifier()
    same_cov_classifier.fit(images, labels)
    same_sov_classifiers_by_sizes[size] = same_cov_classifier
In [14]:
# Load the digit test set and preprocess it the same way as the training set.
test_data_path = "../data/digit-dataset/test.mat"
test_data = io.loadmat(test_data_path)
# Images are indexed along the third axis below, so the length of one pixel
# row gives the number of images; it must agree with the number of label rows.
assert len(test_data['test_image'][0][0]) == len(test_data['test_label'])
test_data_size = len(test_data['test_image'][0][0])
# `range` is used instead of the Python 2-only `xrange` so the cell also runs
# on Python 3.
test_images = np.array([process(test_data['test_image'][:, :, i])
                        for i in range(test_data_size)])
# The label of sample i is the first element of label row i.
test_labels = np.array([test_data['test_label'][i][0]
                        for i in range(test_data_size)])
In [15]:
# Score every trained classifier on the test set; the two lists are kept in
# the same order as data_sizes so they can be plotted against it.
classifiers_cr = []
same_cov_classifiers_cr = []
for size in data_sizes:
    cr = classifiers_by_sizes[size].correct_rate(test_images, test_labels)
    sccr = same_sov_classifiers_by_sizes[size].correct_rate(test_images, test_labels)
    classifiers_cr.append(cr)
    same_cov_classifiers_cr.append(sccr)
    # Single-argument call form: prints the same text on Python 2 and is
    # valid syntax on Python 3 (the bare print statement is not).
    print("Sigma_i: {0} | Sigma_overall: {1} | Size: {2}".format(cr, sccr, size))
In [16]:
def _plot_accuracy_curve(rates, title, file_name):
    """Plot correct rate against training-set size and return the figure.

    Parameters
    ----------
    rates
        Correct rates, one per entry of the module-level ``data_sizes``.
    title
        Figure title.
    file_name
        File name (inside ``fig_directory``) used when ``save_fig`` is true.
    """
    fig = plt.figure()
    plt.plot(data_sizes, rates,
             color='red', marker='o',
             markerfacecolor='blue', linewidth=1.0)
    plt.xlabel('Training set size')
    plt.ylabel('Accuracy')
    plt.title(title)
    # Save before showing, so the file is written while the figure is still
    # guaranteed to be alive whatever backend is active.
    if save_fig:
        fig.savefig('{0}/{1}'.format(fig_directory, file_name),
                    dpi=fig_dpi,
                    bbox_inches='tight')
    plt.show()
    return fig


# Plot correct rate of $\Sigma_{i}$ classifier
fig = _plot_accuracy_curve(classifiers_cr,
                           'Sigma_i classifier',
                           'p4_accuracy_sigmai.png')
# Plot correct rate of $\Sigma_{overall}$ classifier
fig = _plot_accuracy_curve(same_cov_classifiers_cr,
                           'Sigma_overall classifier',
                           'p4_accuracy_sigma_overall.png')
In [17]:
# Load the unlabeled Kaggle digit images and preprocess them like the
# training images.
kaggle_data_path = "../data/digit-dataset/kaggle.mat"
kaggle_data = io.loadmat(kaggle_data_path)
# Images are indexed along the third axis below, so the length of one pixel
# row gives the number of images.
kaggle_data_size = len(kaggle_data['kaggle_image'][0][0])
# `range` is used instead of the Python 2-only `xrange` so the cell also runs
# on Python 3.
kaggle_images = np.array([process(kaggle_data['kaggle_image'][:, :, i])
                          for i in range(kaggle_data_size)])
In [18]:
# Train the classifier used for the Kaggle submission on the first
# kaggle_train_data_size samples of the shuffled training order.
kaggle_train_data_size = 1000
kaggle_train_indices = randomize_indices[:kaggle_train_data_size]
kaggle_classifier = GaussianClassifier()
kaggle_classifier.fit(train_images[kaggle_train_indices],
                      train_labels[kaggle_train_indices])
# Sanity-check on the labeled test set before predicting the Kaggle images.
kaggle_classifier_cr = kaggle_classifier.correct_rate(test_images, test_labels)
# Single-argument call form: prints the same text on Python 2 and is valid
# syntax on Python 3 (the bare print statement is not).
print("Kaggle classifier has correct rate {0} with test data.".format(kaggle_classifier_cr))
kaggle_result = kaggle_classifier.predicate(kaggle_images)
In [4]:
# Directory that receives the CSV result files; created on first use.
output_dir = '../output'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
In [19]:
# Write the digit predictions as a two-column CSV: a header row, then one row
# per prediction with a 1-based Id and the predicted label cast to int.
kaggle_result_path = '{0}/p4_kaggle_result.csv'.format(output_dir)
with open(kaggle_result_path, 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(['Id', 'Category'])
    # `range` is used instead of the Python 2-only `xrange` so the cell also
    # runs on Python 3.
    for i in range(len(kaggle_result)):
        writer.writerow([i + 1, int(kaggle_result[i])])
In [20]:
# Load the spam dataset from its .mat file; later cells read the
# 'training_data', 'training_labels' and 'test_data' entries.
spam_data_path = "../data/spam-dataset/spam_data.mat"
spam_data = io.loadmat(spam_data_path)
In [21]:
spam_training_data = spam_data['training_data']
# The labels are taken from the first row of the stored label array.
spam_training_labels = spam_data['training_labels'][0]
assert len(spam_training_data) == len(spam_training_labels)
spam_training_data_size = len(spam_training_data)
# Shuffle once with a fixed seed so that re-running the notebook produces the
# same training subsets and validation split.
spam_randomize_indices = np.random.RandomState(0).permutation(spam_training_data_size)
In [22]:
# Hold out the last spam_validation_set_size entries of the shuffled order as
# the validation set.
spam_validation_set_size = 1172
spam_validation_indices = spam_randomize_indices[-spam_validation_set_size:]
spam_validation_set_data = spam_training_data[spam_validation_indices]
spam_validation_set_labels = spam_training_labels[spam_validation_indices]
In [23]:
spam_data_sizes = [100, 200, 500, 1000, 2000, 3000, 4000]
spam_classifiers_by_sizes = {}
# The validation set is the tail of the shuffled order, so training subsets
# are drawn only from the entries before that tail.  This is identical to
# taking a plain prefix whenever the prefix does not reach the tail, and it
# stops a large `size` from silently training on validation rows otherwise.
spam_train_pool = spam_randomize_indices[
    :max(len(spam_randomize_indices) - spam_validation_set_size, 0)]
for size in spam_data_sizes:
    subset = spam_train_pool[:size]
    data = spam_training_data[subset]
    labels = spam_training_labels[subset]
    classifier = GaussianClassifier()
    classifier.fit(data, labels)
    spam_classifiers_by_sizes[size] = classifier
In [24]:
# Pick the training size whose classifier scores highest on the validation
# set.  The comparison is strict, so on a tie the smaller size (seen first)
# is kept.
best_cr = float('-inf')
best_size = None
for size in spam_data_sizes:
    cr = spam_classifiers_by_sizes[size].correct_rate(spam_validation_set_data,
                                                      spam_validation_set_labels)
    # Single-argument call form: prints the same text on Python 2 and is
    # valid syntax on Python 3 (the bare print statement is not).
    print("Correct rate: {0} | Size: {1}".format(cr, size))
    if cr > best_cr:
        best_cr = cr
        best_size = size
spam_kaggle_classifier = spam_classifiers_by_sizes[best_size]
In [25]:
# Run the selected spam classifier on the dataset's 'test_data' entry; the
# result is written to CSV in the next cell.
spam_kaggle_data = spam_data['test_data']
spam_kaggle_result = spam_kaggle_classifier.predicate(spam_kaggle_data)
In [26]:
# Write the spam predictions as a two-column CSV: a header row, then one row
# per prediction with a 1-based Id and the predicted label cast to int.
spam_kaggle_result_path = '{0}/p4_spam_kaggle_result.csv'.format(output_dir)
with open(spam_kaggle_result_path, 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(['Id', 'Category'])
    # `range` is used instead of the Python 2-only `xrange` so the cell also
    # runs on Python 3.
    for i in range(len(spam_kaggle_result)):
        writer.writerow([i + 1, int(spam_kaggle_result[i])])
In [26]: