Baseline

This file provides a baseline accuracy and cross entropy for a given training, validation, and test file.


In [1]:
TRAINING_FILE = 'train170MB_lb_indx_0.8c_tokenized_lb_byline.txt'
VALIDATION_FILE = 'valid70MB_lb_indx_0.8c_tokenized_lb_byline.txt'
TEST_FILE = 'test80MB_lb_indx_08c_tokenized_lb_byline.txt'

In [2]:
import numpy as np


class NGramModel:
    
    def __init__(self, number_of_tokens, n=3):
        self.n = n
        self.number_of_tokens = number_of_tokens
        self._build_tables()

    def _build_tables(self):
        self.tables = list()
        for n in range(1, self.n + 1):
            self.tables.append(np.zeros(tuple([self.number_of_tokens for _ in range(n)]), dtype=np.int64))

    def _get_ngrams(self, words, n):
        return [tuple(words[i:i + n]) for i in range(len(words) - n + 1) if i + n <= len(words)]

    def _predict(self, ngram, k=1, threshold=0):
        column = self.tables[len(ngram)][ngram][1:]
        predictions = np.argpartition(column, len(column) - k)[- k:]
        if (sum(column[predictions]) < threshold) and (len(ngram) > 0):
            return self._predict(ngram[1:], k, threshold)
        else:
            return predictions + 1

    def _sum_log_probability(self, ngram, threshold=0, epsilon=1e-9):
        column = self.tables[len(ngram)][ngram][1:]
        if (sum(column) < threshold) and (len(ngram) > 0):
            return self._sum_log_probability(ngram[1:], threshold)
        else:
            sum_probability = column / sum(column)
            sum_probability[column == 0] = epsilon
            return sum(np.log(sum_probability)) + np.log(epsilon)

    def _sum_log_cross_entropy_one_file(self, file, threshold=0, epsilon=1e-9):
        sum_log_cross_entropy = 0
        for i in range(len(file)):
            if file[i] == 0:
                sum_log_cross_entropy += np.log(epsilon) * self.number_of_tokens
            else:
                ngram = tuple(file[max(i - (self.n - 1), 0):i])
                sum_log_cross_entropy += self._sum_log_probability(ngram, threshold, epsilon)
        return sum_log_cross_entropy

    def _test_one_file(self, file, k=1, threshold=0):
        correct = 0
        for i in range(len(file)):
            if file[i] != 0:
                ngram = tuple(file[max(i - (self.n - 1), 0):i])
                predictions = self._predict(ngram, k, threshold)
                correct += int(file[i] in predictions)
        return correct
    
    def cross_entropy(self, files, threshold=0, epsilon=1e-9):
        cross_entropy = 0
        for file in files:
            cross_entropy += self._sum_log_cross_entropy_one_file(file, threshold)
        return - cross_entropy / (sum([len(file) for file in files]) * self.number_of_tokens)

    def fit(self, files):
        for file in files:
            for n in range(1, self.n + 1):
                ngrams = self._get_ngrams(file, n)
                for ngram in ngrams:
                    self.tables[n - 1][ngram] += 1

    def test(self, files, k=1, threshold=0):
        correct = 0
        for file in files:
            correct += self._test_one_file(file, k, threshold)
        return correct / sum([len(file) for file in files])

In [3]:
with open(TRAINING_FILE, 'r') as infile:
    training_files = list()
    for line in infile:
        training_files.append([int(word) for word in line.split()])
        
with open(VALIDATION_FILE, 'r') as infile:
    validation_files = list()
    for line in infile:
        validation_files.append([int(word) for word in line.split()])
        
with open(TEST_FILE, 'r') as infile:
    test_files = list()
    for line in infile:
        test_files.append([int(word) for word in line.split()])

In [4]:
largest_token = 0
for file in training_files:
    for token in file:
        largest_token = max(largest_token, token)

In [5]:
print('n,k,accuracy')
for n in [2, 3, 5]:
    # build validation model
    validation_model = NGramModel(largest_token + 1, n)
    validation_model.fit(training_files)
    
    # build full model
    full_model = NGramModel(largest_token + 1, n)
    full_model.fit(training_files + validation_files)
    
    for k in [1, 3, 5, 10]:
        
        # find best threshold
        best_accuracy = - 1
        best_threshold = - 1
        for threshold in [0, 1, 10, 30, 50, 100]:
            accuracy = validation_model.test(validation_files, k, threshold)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_threshold = threshold
                
        # get test accuracy
        accuracy = full_model.test(test_files, k, best_threshold)
        
        # print results
        print('{0},{1},{2:.4f}'.format(n, k, accuracy))


n,k,accuracy
2,1,0.3078
2,3,0.5306
2,5,0.6034
2,10,0.6686
3,1,0.3922
3,3,0.6005
3,5,0.6544
3,10,0.6953
5,1,0.4583
5,3,0.6351
5,5,0.6755
5,10,0.7073

In [6]:
print('n,cross_entropy')
for n in [2, 3, 5]:
    # build validation model
    validation_model = NGramModel(largest_token + 1, n)
    validation_model.fit(training_files)
    
    # build full model
    full_model = NGramModel(largest_token + 1, n)
    full_model.fit(training_files + validation_files)

    # find best threshold
    best_cross_entropy = np.inf
    best_threshold = - 1
    for threshold in [0, 1, 10, 30, 50, 100]:
        cross_entropy = validation_model.cross_entropy(validation_files, threshold)
        if cross_entropy > best_cross_entropy:
            best_cross_entropy = cross_entropy
            best_threshold = threshold
                
    # get test accuracy
    cross_entropy = full_model.cross_entropy(test_files, best_threshold)
        
    # print results
    print('{0},{1:.4f}'.format(n, cross_entropy))


n,cross_entropy
2,15.5712
/Users/dashley/.miniconda3/envs/CMPUT499/lib/python3.6/site-packages/ipykernel/__main__.py:32: RuntimeWarning: invalid value encountered in true_divide
3,17.6383
5,19.3946