This notebook computes a baseline top-k accuracy and cross-entropy for a count-based n-gram model, given tokenized training, validation, and test files. When a context's counts fall below a threshold (tuned on the validation set), the model backs off to a shorter context.
In [1]:
TRAINING_FILE = 'train170MB_lb_indx_0.8c_tokenized_lb_byline.txt'
VALIDATION_FILE = 'valid70MB_lb_indx_0.8c_tokenized_lb_byline.txt'
TEST_FILE = 'test80MB_lb_indx_0.8c_tokenized_lb_byline.txt'
In [2]:
import numpy as np

class NGramModel:
    """Count-based n-gram model over integer token IDs; token 0 is reserved.

    Counts live in dense numpy tables, one per order from 1 to n, so memory
    grows as number_of_tokens ** n and is only feasible for small
    vocabularies. When a context's counts fall below a threshold, prediction
    and scoring back off to a shorter context.
    """

    def __init__(self, number_of_tokens, n=3):
        self.n = n
        self.number_of_tokens = number_of_tokens
        self._build_tables()

    def _build_tables(self):
        # One dense count table per order: shapes (V,), (V, V), ..., (V,) * n.
        self.tables = [np.zeros((self.number_of_tokens,) * order, dtype=np.int64)
                       for order in range(1, self.n + 1)]

    def _get_ngrams(self, words, n):
        # All contiguous n-grams of the token list.
        return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

    def _predict(self, ngram, k=1, threshold=0):
        # Next-token counts for this context; index 0 (reserved) is dropped.
        column = self.tables[len(ngram)][ngram][1:]
        predictions = np.argpartition(column, len(column) - k)[-k:]
        # Back off to a shorter context if the top-k counts are too sparse.
        if column[predictions].sum() < threshold and len(ngram) > 0:
            return self._predict(ngram[1:], k, threshold)
        # Shift by one to undo the [1:] slice above.
        return predictions + 1

    def _sum_log_probability(self, ngram, threshold=0, epsilon=1e-9):
        # Sum of log-probabilities over the whole vocabulary for this context
        # (this notebook's cross-entropy definition): zero counts are floored
        # at epsilon, and the trailing log(epsilon) stands in for the dropped
        # token 0.
        column = self.tables[len(ngram)][ngram][1:]
        total = column.sum()
        if total < threshold and len(ngram) > 0:
            return self._sum_log_probability(ngram[1:], threshold, epsilon)
        if total > 0:
            probability = column / total
        else:
            # Unseen context at threshold 0: every class gets epsilon below.
            probability = np.zeros(len(column))
        probability[column == 0] = epsilon
        return np.log(probability).sum() + np.log(epsilon)

    def _sum_log_cross_entropy_one_file(self, file, threshold=0, epsilon=1e-9):
        sum_log_cross_entropy = 0
        for i in range(len(file)):
            if file[i] == 0:
                # Reserved token: assign epsilon probability to every class.
                sum_log_cross_entropy += np.log(epsilon) * self.number_of_tokens
            else:
                # Context is the up-to-(n - 1) preceding tokens.
                ngram = tuple(file[max(i - (self.n - 1), 0):i])
                sum_log_cross_entropy += self._sum_log_probability(ngram, threshold, epsilon)
        return sum_log_cross_entropy

    def _test_one_file(self, file, k=1, threshold=0):
        correct = 0
        for i in range(len(file)):
            if file[i] != 0:
                ngram = tuple(file[max(i - (self.n - 1), 0):i])
                predictions = self._predict(ngram, k, threshold)
                correct += int(file[i] in predictions)
        return correct

    def cross_entropy(self, files, threshold=0, epsilon=1e-9):
        cross_entropy = 0
        for file in files:
            cross_entropy += self._sum_log_cross_entropy_one_file(file, threshold, epsilon)
        return -cross_entropy / (sum(len(file) for file in files) * self.number_of_tokens)

    def fit(self, files):
        for file in files:
            for n in range(1, self.n + 1):
                for ngram in self._get_ngrams(file, n):
                    self.tables[n - 1][ngram] += 1

    def test(self, files, k=1, threshold=0):
        # Positions holding the reserved token 0 are never predicted, so they
        # count against accuracy in the denominator.
        correct = 0
        for file in files:
            correct += self._test_one_file(file, k, threshold)
        return correct / sum(len(file) for file in files)
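As a quick sanity check (my own illustration, not part of the original runs; the toy corpus below is made up), the model can be fit on a few short token lists and its count tables inspected directly:

toy_files = [[1, 2, 3, 1, 2], [2, 3, 1]]
toy_model = NGramModel(number_of_tokens=4, n=2)  # token 0 is reserved
toy_model.fit(toy_files)
print(toy_model.tables[1][1])         # counts of tokens following 1: [0 0 2 0]
print(toy_model._predict((1,), k=1))  # top-1 prediction after token 1: [2]
print(toy_model.test(toy_files, k=1))
print(toy_model.cross_entropy(toy_files))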
In [3]:
def load_token_files(path):
    # Each line of the file holds one tokenized file: space-separated
    # integer token IDs.
    with open(path, 'r') as infile:
        return [[int(word) for word in line.split()] for line in infile]

training_files = load_token_files(TRAINING_FILE)
validation_files = load_token_files(VALIDATION_FILE)
test_files = load_token_files(TEST_FILE)
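To make the expected input format concrete (a throwaway example I am adding; 'toy_corpus.txt' is hypothetical, not one of the datasets above), the loader turns each line into one list of token IDs:

with open('toy_corpus.txt', 'w') as outfile:
    outfile.write('1 2 3 1 2\n2 3 1\n')
print(load_token_files('toy_corpus.txt'))  # [[1, 2, 3, 1, 2], [2, 3, 1]]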
In [4]:
# Scan every split: a token ID beyond the table size would raise an
# IndexError when fitting the full model or evaluating on the test set.
largest_token = 0
for file in training_files + validation_files + test_files:
    for token in file:
        largest_token = max(largest_token, token)
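Because the count tables are dense, memory grows as (largest_token + 1) ** n. A back-of-envelope feasibility check (my own sketch, assuming the int64 counters built in _build_tables, 8 bytes each):

vocab = largest_token + 1
for n in [2, 3, 5]:
    print('n={0}: {1:.2f} GiB for the order-{0} table alone'.format(
        n, vocab ** n * 8 / 2 ** 30))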
In [5]:
print('n,k,accuracy')
for n in [2, 3, 5]:
    # model for threshold selection, fit on training data only
    validation_model = NGramModel(largest_token + 1, n)
    validation_model.fit(training_files)
    # model for the final test score, fit on training + validation data
    full_model = NGramModel(largest_token + 1, n)
    full_model.fit(training_files + validation_files)
    for k in [1, 3, 5, 10]:
        # pick the backoff threshold that maximizes validation accuracy
        best_accuracy = -1
        best_threshold = -1
        for threshold in [0, 1, 10, 30, 50, 100]:
            accuracy = validation_model.test(validation_files, k, threshold)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_threshold = threshold
        # report test accuracy at the selected threshold
        accuracy = full_model.test(test_files, k, best_threshold)
        print('{0},{1},{2:.4f}'.format(n, k, accuracy))
In [6]:
print('n,cross_entropy')
for n in [2, 3, 5]:
    # model for threshold selection, fit on training data only
    validation_model = NGramModel(largest_token + 1, n)
    validation_model.fit(training_files)
    # model for the final test score, fit on training + validation data
    full_model = NGramModel(largest_token + 1, n)
    full_model.fit(training_files + validation_files)
    # pick the backoff threshold that minimizes validation cross-entropy
    best_cross_entropy = np.inf
    best_threshold = -1
    for threshold in [0, 1, 10, 30, 50, 100]:
        cross_entropy = validation_model.cross_entropy(validation_files, threshold)
        if cross_entropy < best_cross_entropy:  # lower is better
            best_cross_entropy = cross_entropy
            best_threshold = threshold
    # report test cross-entropy at the selected threshold
    cross_entropy = full_model.cross_entropy(test_files, best_threshold)
    print('{0},{1:.4f}'.format(n, cross_entropy))