Import libraries


In [3]:
%matplotlib inline

import os
import csv
import numpy as np
from scipy import io
from scipy.stats import multivariate_normal
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams['xtick.direction'] = 'out'
matplotlib.rcParams['ytick.direction'] = 'out'

fig_directory = '../figures'
if not os.path.exists(fig_directory):
    os.makedirs(fig_directory)

save_fig = True
fig_size = (6, 4.5)
fig_dpi = 200

Load data


In [2]:
train_data_path = "../data/digit-dataset/train.mat"
train_data = io.loadmat(train_data_path)

Build data processing function


In [3]:
def process(image):
    """Flatten an image to a 1-D vector and scale it to unit L2 norm."""
    raveled = image.ravel()
    norm = np.linalg.norm(raveled)
    if norm:
        return raveled / norm
    return raveled  # all-zero image: return unchanged to avoid dividing by zero
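Quick sanity check (illustrative, not from the original notebook): every processed image should have unit L2 norm unless it is all zeros.

sample = np.arange(784, dtype=float).reshape(28, 28)
assert abs(np.linalg.norm(process(sample)) - 1.0) < 1e-9
assert np.all(process(np.zeros((28, 28))) == 0)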

Process data


In [4]:
# train_image is stacked along the third axis: shape (height, width, N)
assert train_data['train_image'].shape[2] == len(train_data['train_label'])
train_data_size = train_data['train_image'].shape[2]
train_images = np.array([process(train_data['train_image'][:, :, i]) for i in xrange(train_data_size)])
train_labels = np.array([train_data['train_label'][i][0] for i in xrange(train_data_size)])
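A quick shape check (illustrative; assumes the dataset's 28x28 digit images):

assert train_images.shape == (train_data_size, 28 * 28)
assert train_labels.shape == (train_data_size,)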

Group data based on label classes


In [5]:
grouped_train_images = {}
for digit in range(10):
    grouped_train_images[digit] = train_images[train_labels == digit]
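Sanity check (not in the original notebook): the ten groups should partition the training set.

assert sum(len(grouped_train_images[d]) for d in grouped_train_images) == train_data_size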

MLE


In [6]:
distributions = {}
covs = {}
means = {}
for digit in grouped_train_images:
    means[digit] = np.mean(grouped_train_images[digit], axis=0)
    # np.cov with rowvar=0 treats each row as an observation; note it uses the
    # unbiased (N-1) estimator by default -- pass bias=True for the strict MLE.
    covs[digit] = np.cov(grouped_train_images[digit], rowvar=0)
    # Add a small ridge so the covariance is positive definite
    # (near-constant pixels make the raw estimate singular).
    adjusted_cov = covs[digit] + 0.001 * np.identity(covs[digit].shape[0])
    distributions[digit] = multivariate_normal(means[digit], adjusted_cov)
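For class $k$ with $N_k$ training vectors $x_i$, the maximum-likelihood estimates are $\hat{\mu}_k = \frac{1}{N_k} \sum_i x_i$ and $\hat{\Sigma}_k = \frac{1}{N_k} \sum_i (x_i - \hat{\mu}_k)(x_i - \hat{\mu}_k)^{\top}$.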

Prior distribution


In [7]:
prior = {}
total_size = train_labels.size  # loop-invariant, so compute once
for digit in range(10):
    digit_size = train_labels[train_labels == digit].size
    prior[digit] = digit_size / float(total_size)
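The prior is just the empirical class frequency, $P(y = k) = N_k / N$, so the values should sum to one (quick check, not in the original notebook):

assert abs(sum(prior.values()) - 1.0) < 1e-9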

Plot heatmaps of covariance matrices


In [8]:
def plot_cov_mat_heatmap(digit):
    fig = plt.figure(figsize=fig_size)
    plt.grid()
    plt.imshow(covs[digit])
    plt.colorbar()
    plt.show()
    if save_fig:
        fig.savefig('{0}/p4_heatmap_{1}.png'.format(fig_directory, digit),
                    dpi=fig_dpi,
                    bbox_inches='tight')

In [9]:
plot_cov_mat_heatmap(0)



In [10]:
plot_cov_mat_heatmap(6)


Gaussian classifier using $\Sigma_{i}$ for each class


In [11]:
from gaussian_classifier import GaussianClassifier
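gaussian_classifier.py belongs to the project and is not reproduced in this notebook. For orientation, here is a minimal sketch of what a per-class-covariance (QDA-style) classifier with this interface might look like; the class name and internals are hypothetical -- only the fit / predicate / correct_rate method names mirror how the module is used below.

class GaussianClassifierSketch(object):
    """Hypothetical sketch: one Gaussian N(mu_k, Sigma_k) per class, plus a log prior."""
    def fit(self, images, labels, ridge=0.001):
        self.classes = np.unique(labels)
        self.models = {}
        for k in self.classes:
            X = images[labels == k]
            mu = np.mean(X, axis=0)
            cov = np.cov(X, rowvar=0) + ridge * np.identity(X.shape[1])
            log_prior = np.log(float(X.shape[0]) / images.shape[0])
            self.models[k] = (multivariate_normal(mu, cov), log_prior)
        return self

    def predicate(self, images):  # spelled as in the module's usage below
        # Pick the class maximizing log p(x | k) + log P(k)
        scores = np.column_stack([self.models[k][0].logpdf(images) + self.models[k][1]
                                  for k in self.classes])
        return self.classes[np.argmax(scores, axis=1)]

    def correct_rate(self, images, labels):
        return np.mean(self.predicate(images) == labels)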

Gaussian classifier using $\Sigma_{\mathrm{overall}}$


In [12]:
from gaussian_classifier import SameCovGaussianClassifier
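Again hypothetical: `SameCovGaussianClassifier` presumably differs only in that every class shares one pooled covariance (LDA-style), e.g. a prior-weighted average of the per-class estimates from the MLE cell above.

# Hypothetical sketch of the pooled covariance; prior and covs come from the cells above
sigma_overall = sum(prior[k] * covs[k] for k in covs)
sigma_overall += 0.001 * np.identity(sigma_overall.shape[0])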

Train classifiers with different training set sizes


In [13]:
randomize_indices = np.random.permutation(train_data_size)
data_sizes = [100, 200, 500, 1000, 2000, 5000, 10000, 30000, 60000]

classifiers_by_sizes = {}
same_cov_classifiers_by_sizes = {}

for size in data_sizes:
    images = train_images[randomize_indices[:size]]
    labels = train_labels[randomize_indices[:size]]
    
    # Train the per-class-covariance ($\Sigma_{i}$) classifier
    classifier = GaussianClassifier()
    classifier.fit(images, labels)
    classifiers_by_sizes[size] = classifier
    
    # Train the shared-covariance ($\Sigma_{overall}$) classifier
    same_cov_classifier = SameCovGaussianClassifier()
    same_cov_classifier.fit(images, labels)
    same_cov_classifiers_by_sizes[size] = same_cov_classifier


Finish training with data size 100 , takes 2.08 seconds
Finish training with data size 100 , takes 0.89 seconds
Finish training with data size 200 , takes 1.8 seconds
Finish training with data size 200 , takes 0.87 seconds
Finish training with data size 500 , takes 1.89 seconds
Finish training with data size 500 , takes 1.0 seconds
Finish training with data size 1000 , takes 1.89 seconds
Finish training with data size 1000 , takes 1.04 seconds
Finish training with data size 2000 , takes 1.84 seconds
Finish training with data size 2000 , takes 1.16 seconds
Finish training with data size 5000 , takes 1.9 seconds
Finish training with data size 5000 , takes 1.1 seconds
Finish training with data size 10000 , takes 1.92 seconds
Finish training with data size 10000 , takes 1.16 seconds
Finish training with data size 30000 , takes 2.17 seconds
Finish training with data size 30000 , takes 1.53 seconds
Finish training with data size 60000 , takes 2.61 seconds
Finish training with data size 60000 , takes 1.83 seconds

Load and prepare test data


In [14]:
test_data_path = "../data/digit-dataset/test.mat"
test_data = io.loadmat(test_data_path)

assert test_data['test_image'].shape[2] == len(test_data['test_label'])
test_data_size = test_data['test_image'].shape[2]
test_images = np.array([process(test_data['test_image'][:, :, i]) for i in xrange(test_data_size)])
test_labels = np.array([test_data['test_label'][i][0] for i in xrange(test_data_size)])

Calculate correct rates on the test set


In [15]:
classifiers_cr = []
same_cov_classifiers_cr = []

for size in data_sizes:
    cr = classifiers_by_sizes[size].correct_rate(test_images, test_labels)
    sccr = same_cov_classifiers_by_sizes[size].correct_rate(test_images, test_labels)
    classifiers_cr.append(cr)
    same_cov_classifiers_cr.append(sccr)
    print "Sigma_i: {0} | Sigma_overall: {1} | Size: {2}".format(cr, sccr, size)


Sigma_i: 0.44 | Sigma_overall: 0.17 | Size: 100
Sigma_i: 0.73 | Sigma_overall: 0.15 | Size: 200
Sigma_i: 0.88 | Sigma_overall: 0.15 | Size: 500
Sigma_i: 0.89 | Sigma_overall: 0.1 | Size: 1000
Sigma_i: 0.87 | Sigma_overall: 0.09 | Size: 2000
Sigma_i: 0.67 | Sigma_overall: 0.09 | Size: 5000
Sigma_i: 0.75 | Sigma_overall: 0.15 | Size: 10000
Sigma_i: 0.71 | Sigma_overall: 0.09 | Size: 30000
Sigma_i: 0.65 | Sigma_overall: 0.09 | Size: 60000

In [16]:
# Plot correct rate of $\Sigma_{i}$ classifier
fig = plt.figure(figsize=fig_size)
plt.plot(data_sizes, classifiers_cr,
         color='red', marker='o', 
         markerfacecolor='blue', linewidth=1.0)
plt.xlabel('Training set size')
plt.ylabel('Accuracy')
plt.title(r'$\Sigma_i$ classifier')
plt.show()
if save_fig:
    fig.savefig('{0}/p4_accuracy_sigmai.png'.format(fig_directory), 
                dpi=fig_dpi,
                bbox_inches='tight')

# Plot correct rate of $\Sigma_{overall}$ classifier
fig = plt.figure(figsize=fig_size)
plt.plot(data_sizes, same_cov_classifiers_cr,
         color='red', marker='o', 
         markerfacecolor='blue', linewidth=1.0)
plt.xlabel('Training set size')
plt.ylabel('Accuracy')
plt.title(r'$\Sigma_{\mathrm{overall}}$ classifier')
plt.show()
if save_fig:
    fig.savefig('{0}/p4_accuracy_sigma_overall.png'.format(fig_directory), 
                dpi=fig_dpi,
                bbox_inches='tight')


Load and prepare Kaggle data


In [17]:
kaggle_data_path = "../data/digit-dataset/kaggle.mat"
kaggle_data = io.loadmat(kaggle_data_path)

kaggle_data_size = kaggle_data['kaggle_image'].shape[2]
kaggle_images = np.array([process(kaggle_data['kaggle_image'][:, :, i]) for i in xrange(kaggle_data_size)])

Build Kaggle classifier and predict results


In [18]:
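# A training size of 1000 scored highest (0.89) in the sweep above.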
kaggle_train_data_size = 1000

kaggle_classifier = GaussianClassifier()
kaggle_classifier.fit(train_images[randomize_indices[:kaggle_train_data_size]],
                      train_labels[randomize_indices[:kaggle_train_data_size]])

kaggle_classifier_cr = kaggle_classifier.correct_rate(test_images, test_labels)
print "Kaggle classifier has correct rate {0} with test data.".format(kaggle_classifier_cr)

kaggle_result = kaggle_classifier.predicate(kaggle_images)


Finish training with data size 1000 , takes 2.18 seconds
Kaggle classifier achieves a correct rate of 0.89 on the test data.

Create output directory


In [4]:
output_dir = '../output'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

Output results to a CSV file


In [19]:
kaggle_result_path = '{0}/p4_kaggle_result.csv'.format(output_dir)
with open(kaggle_result_path, 'wb') as fp:  # 'wb' per the Python 2 csv docs
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(['Id', 'Category'])
    for i in xrange(len(kaggle_result)):
        writer.writerow([i+1, int(kaggle_result[i])])
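A quick format check (illustrative, not from the original notebook): the submission file should start with the header row.

with open(kaggle_result_path) as fp:
    print fp.readline().strip()  # expected: Id,Category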

Kaggle submission for the spam dataset


In [20]:
spam_data_path = "../data/spam-dataset/spam_data.mat"
spam_data = io.loadmat(spam_data_path)

In [21]:
spam_training_data = spam_data['training_data']
spam_training_labels = spam_data['training_labels'][0]
assert len(spam_training_data) == len(spam_training_labels)
spam_training_data_size = len(spam_training_data)
spam_randomize_indices = np.random.permutation(spam_training_data_size)

In [22]:
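# Hold out the last 1172 permuted examples for validation; the training cells
# below use the first `size` indices, so train and validation stay disjoint as
# long as size + 1172 does not exceed the dataset size.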
spam_validation_set_size = 1172
spam_validation_set_data = spam_training_data[spam_randomize_indices[-spam_validation_set_size:]]
spam_validation_set_labels = spam_training_labels[spam_randomize_indices[-spam_validation_set_size:]]

In [23]:
spam_data_sizes = [100, 200, 500, 1000, 2000, 3000, 4000]

spam_classifiers_by_sizes = {}

for size in spam_data_sizes:
    data = spam_training_data[spam_randomize_indices[:size]]
    labels = spam_training_labels[spam_randomize_indices[:size]]
    classifier = GaussianClassifier()
    classifier.fit(data, labels)
    spam_classifiers_by_sizes[size] = classifier


Finish training with data size 100 , takes 0.0 seconds
Finish training with data size 200 , takes 0.0 seconds
Finish training with data size 500 , takes 0.0 seconds
Finish training with data size 1000 , takes 0.0 seconds
Finish training with data size 2000 , takes 0.0 seconds
Finish training with data size 3000 , takes 0.0 seconds
Finish training with data size 4000 , takes 0.0 seconds

In [24]:
best_cr = float('-inf')
best_size = None
for size in spam_data_sizes:
    cr = spam_classifiers_by_sizes[size].correct_rate(spam_validation_set_data, 
                                                      spam_validation_set_labels)
    print "Correct rate: {0} | Size: {1}".format(cr, size)
    if cr > best_cr:
        best_cr = cr
        best_size = size

spam_kaggle_classifier = spam_classifiers_by_sizes[best_size]


Correct rate: 0.77 | Size: 100
Correct rate: 0.78 | Size: 200
Correct rate: 0.78 | Size: 500
Correct rate: 0.78 | Size: 1000
Correct rate: 0.77 | Size: 2000
Correct rate: 0.78 | Size: 3000
Correct rate: 0.78 | Size: 4000

In [25]:
spam_kaggle_data = spam_data['test_data']
spam_kaggle_result = spam_kaggle_classifier.predicate(spam_kaggle_data)

In [26]:
spam_kaggle_result_path = '{0}/p4_spam_kaggle_result.csv'.format(output_dir)
with open(spam_kaggle_result_path, 'wb') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(['Id', 'Category'])
    for i in xrange(len(spam_kaggle_result)):
        writer.writerow([i+1, int(spam_kaggle_result[i])])
