This notebook runs 10-fold cross-validation of the CLTK's Latin part-of-speech taggers (unigram, bigram, trigram, 1-2-3-gram backoff, and TnT) on the Perseus Latin treebank. The final results are found at the bottom.


In [1]:
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import AffixTagger
from nltk.tag import BigramTagger
from nltk.tag import tnt
from nltk.tag import TrigramTagger
from nltk.tag import UnigramTagger
from nltk.tokenize import wordpunct_tokenize
import math
import os
import pandas as pd
import random
from statistics import mean
from statistics import stdev

In [2]:
# Path to the full Perseus Latin treebank POS training file.
# NOTE(review): assumes the latin_treebank_perseus repo is checked out
# directly under the user's home directory — confirm before running.
full_training_set_rel = '~/latin_treebank_perseus/latin_training_set.pos'
full_training_set = os.path.expanduser(full_training_set_rel)

In [3]:
# Per-fold accuracy accumulators, one list per tagger type; filled by the
# cross-validation loop below and summarized at the end of the notebook.
unigram_accuracies = []
bigram_accuracies = []
trigram_accuracies = []
backoff_accuracies = []
tnt_accuracies = []

with open(full_training_set) as f:
    training_set_string = f.read()

# Sentences in the .pos file are separated by blank lines. Filter out empty
# segments (e.g. produced by a trailing newline) so they do not end up as
# bogus "sentences" inside a fold.
pos_set = [sent for sent in training_set_string.split('\n\n') if sent.strip()]

sentence_count = len(pos_set)  # 3473
tenth = math.ceil(sentence_count / 10)  # fold size for 10-fold CV

# NOTE(review): no random seed is set, so the folds (and hence the reported
# accuracies) differ between runs; call random.seed(...) here first if
# reproducible folds are needed.
random.shuffle(pos_set)

def chunks(l, n):
    """Yield successive n-sized chunks from l.
    http://stackoverflow.com/a/312464
    """
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n

# a list of 10 lists
ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

# Directory for the per-fold train/test files (NLTK trainers read tagged
# corpora from disk). Created once here — it is loop-invariant, so there is
# no need to re-check it on every fold.
local_dir_rel = '~/cltk_data/user_data'
local_dir = os.path.expanduser(local_dir_rel)
if not os.path.isdir(local_dir):
    os.makedirs(local_dir)

# 10-fold cross-validation: each part serves once as the held-out test set.
for counter, test_set in enumerate(ten_parts):
    # training data = every part except this fold's test part
    # (index comparison is more robust than the previous identity check
    # `x is not ten_parts[counter]`)
    training_set_lists = [part for i, part in enumerate(ten_parts) if i != counter]

    # flatten the remaining parts into one list of sentence strings
    # ( http://stackoverflow.com/a/952952 )
    training_set = [item for sublist in training_set_lists for item in sublist]

    # save shuffled sets to file (as NLTK trainers expect)
    test_path = os.path.join(local_dir, 'test_latin.pos')
    with open(test_path, 'w') as f:
        f.write('\n\n'.join(test_set))

    train_path = os.path.join(local_dir, 'train_latin.pos')
    with open(train_path, 'w') as f:
        f.write('\n\n'.join(training_set))

    # read POS corpora back in as tagged sentences
    train_reader = TaggedCorpusReader(local_dir, 'train_latin.pos')
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_latin.pos')
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    # make and evaluate unigram tagger
    unigram_tagger = UnigramTagger(train_sents)
    unigram_accuracy = unigram_tagger.evaluate(test_sents)
    unigram_accuracies.append(unigram_accuracy)
    print('Unigram:', unigram_accuracy)

    # make and evaluate bigram tagger
    bigram_tagger = BigramTagger(train_sents)
    bigram_accuracy = bigram_tagger.evaluate(test_sents)
    bigram_accuracies.append(bigram_accuracy)
    print('Bigram:', bigram_accuracy)

    # make and evaluate trigram tagger
    trigram_tagger = TrigramTagger(train_sents)
    trigram_accuracy = trigram_tagger.evaluate(test_sents)
    trigram_accuracies.append(trigram_accuracy)
    print('Trigram:', trigram_accuracy)

    # make and evaluate 1, 2, 3-gram backoff tagger:
    # trigram falls back to bigram, which falls back to unigram
    tagger1 = UnigramTagger(train_sents)
    tagger2 = BigramTagger(train_sents, backoff=tagger1)
    tagger3 = TrigramTagger(train_sents, backoff=tagger2)
    backoff_accuracy = tagger3.evaluate(test_sents)
    backoff_accuracies.append(backoff_accuracy)
    print('1, 2, 3-gram backoff:', backoff_accuracy)

    # make and evaluate TnT (Trigrams'n'Tags) tagger
    tnt_tagger = tnt.TnT()
    tnt_tagger.train(train_sents)
    tnt_accuracy = tnt_tagger.evaluate(test_sents)
    tnt_accuracies.append(tnt_accuracy)
    print('TnT:', tnt_accuracy)


Loop #0
Unigram: 0.6875379939209726
Bigram: 0.11043566362715299
Trigram: 0.07720364741641338
1, 2, 3-gram backoff: 0.6942249240121581
TnT: 0.7075987841945289
Loop #1
Unigram: 0.6730807777109641
Bigram: 0.10924032872319102
Trigram: 0.079374624173181
1, 2, 3-gram backoff: 0.6808979755462017
TnT: 0.6963319302465424
Loop #2
Unigram: 0.6664298401420959
Bigram: 0.09218472468916519
Trigram: 0.07158081705150977
1, 2, 3-gram backoff: 0.672291296625222
TnT: 0.6895204262877442
Loop #3
Unigram: 0.6941284403669725
Bigram: 0.10128440366972477
Trigram: 0.0781651376146789
1, 2, 3-gram backoff: 0.7012844036697248
TnT: 0.7157798165137614
Loop #4
Unigram: 0.6781718186990328
Bigram: 0.09008154750616347
Trigram: 0.06523800493077944
1, 2, 3-gram backoff: 0.6874644414944054
TnT: 0.7022567798217334
Loop #5
Unigram: 0.6814953271028037
Bigram: 0.09719626168224299
Trigram: 0.07009345794392523
1, 2, 3-gram backoff: 0.6919626168224299
TnT: 0.7050467289719626
Loop #6
Unigram: 0.6717732669475297
Bigram: 0.10857908847184987
Trigram: 0.07659900421294523
1, 2, 3-gram backoff: 0.6746457296055152
TnT: 0.6853695901953275
Loop #7
Unigram: 0.6757990867579908
Bigram: 0.09899543378995433
Trigram: 0.07452054794520548
1, 2, 3-gram backoff: 0.6825570776255708
TnT: 0.7000913242009132
Loop #8
Unigram: 0.6858373922817534
Bigram: 0.11558636193330836
Trigram: 0.08542525290370925
1, 2, 3-gram backoff: 0.6970775571375046
TnT: 0.7122517796927689
Loop #9
Unigram: 0.6787082649151615
Bigram: 0.09797482211275314
Trigram: 0.07206714103265828
1, 2, 3-gram backoff: 0.6819923371647509
TnT: 0.6942163838715563

In [4]:
# Summarize the per-fold accuracies as mean and sample standard deviation
# per tagger. One loop replaces five copy-pasted stanzas, and — importantly —
# avoids the previous variable name `tnt`, which shadowed the imported
# nltk.tag.tnt module and would break a re-run of the training cell above.
final_accuracies_list = []
for tagger_name, accuracies in [
    ('unigram', unigram_accuracies),
    ('bigram', bigram_accuracies),
    ('trigram', trigram_accuracies),
    ('1, 2, 3-gram backoff', backoff_accuracies),
    ('tnt', tnt_accuracies),
]:
    final_accuracies_list.append(
        {tagger_name: {'mean': mean(accuracies), 'sd': stdev(accuracies)}}
    )

In [5]:
final_dict = {}
for x in final_accuracies_list:
    final_dict.update(x)

df = pd.DataFrame(final_dict)
df


Out[5]:
1, 2, 3-gram backoff bigram tnt trigram unigram
mean 0.686440 0.102156 0.700846 0.075027 0.679296
sd 0.009606 0.008422 0.009724 0.005629 0.008234