In [5]:
# Importing the simple data sample
import lxmls.readers.simple_sequence as ssr

simple = ssr.SimpleSequence()

In [6]:
print "train\n", simple.train
print "test\n", simple.test

print "x"
for sequence in simple.train.seq_list:
    print sequence.x

print "y"
for sequence in simple.train.seq_list:
    print sequence.y


train
[walk/rainy walk/sunny shop/sunny clean/sunny , walk/rainy walk/rainy shop/rainy clean/sunny , walk/sunny shop/sunny shop/sunny clean/sunny ]
test
[walk/rainy walk/sunny shop/sunny clean/sunny , clean/sunny walk/sunny tennis/sunny walk/sunny ]
x
[0, 0, 1, 2]
[0, 0, 1, 2]
[0, 1, 1, 2]
y
[0, 1, 1, 1]
[0, 0, 0, 1]
[1, 1, 1, 1]

In [7]:
import lxmls.sequences.hmm as hmmc

# Supervised training
hmm = hmmc.HMM(simple.x_dict, simple.y_dict) 
hmm.train_supervised(simple.train)

In [8]:
print "Initial Probabilities:\n", hmm.initial_probs
print "Transition Probabilities:\n", hmm.transition_probs
print "Final Probabilities:\n", hmm.final_probs
print "Emission Probabilities:\n", hmm.emission_probs


Initial Probabilities:
[ 0.66666667  0.33333333]
Transition Probabilities:
[[ 0.5    0.   ]
 [ 0.5    0.625]]
Final Probabilities:
[ 0.     0.375]
Emission Probabilities:
[[ 0.75   0.25 ]
 [ 0.25   0.375]
 [ 0.     0.375]
 [ 0.     0.   ]]

In [27]:
# Calculate the Trellis Score (Log Probabilities) for the first sequence
initial_scores, transition_scores, final_scores, emission_scores = hmm.compute_scores(simple.train.seq_list[0])

In [10]:
print "initial scores:\n", initial_scores
print "transition_scores:\n", transition_scores
print "final_scores:\n", final_scores
print "emission_scores:\n", emission_scores


initial scores:
[-0.40546511 -1.09861229]
transition_scores:
[[[-0.69314718        -inf]
  [-0.69314718 -0.47000363]]

 [[-0.69314718        -inf]
  [-0.69314718 -0.47000363]]

 [[-0.69314718        -inf]
  [-0.69314718 -0.47000363]]]
final_scores:
[       -inf -0.98082925]
emission_scores:
[[-0.28768207 -1.38629436]
 [-0.28768207 -1.38629436]
 [-1.38629436 -0.98082925]
 [       -inf -0.98082925]]

In [11]:
# Log Sum
from lxmls.sequences.log_domain import *

In [12]:
# Calculating the Log Likelihood using the Forward Algorithm
log_likelihood, forward = hmm.decoder.run_forward(initial_scores, transition_scores, final_scores, emission_scores)
print 'Log-Likelihood =', log_likelihood
log_likelihood, backward = hmm.decoder.run_backward(initial_scores, transition_scores, final_scores, emission_scores)
print 'Log-Likelihood =', log_likelihood


Log-Likelihood = -5.06823232601
Log-Likelihood = -5.06823232601

In [16]:
# Computing the node posteriors for the first sample sequence
initial_scores, transition_scores, final_scores, emission_scores = hmm.compute_scores(simple.train.seq_list[0])
state_posteriors, _, _ = hmm.compute_posteriors(initial_scores, transition_scores, final_scores, emission_scores)

print state_posteriors


[[ 0.95738152  0.04261848]
 [ 0.75281282  0.24718718]
 [ 0.26184794  0.73815206]
 [ 0.          1.        ]]

In [24]:
y_pred = hmm.posterior_decode(simple.test.seq_list[0])
print "Prediction test 0:", y_pred
print "Truth 0:", simple.test.seq_list[0]


Prediction test 0: walk/rainy walk/rainy shop/sunny clean/sunny 
Truth 0: walk/rainy walk/sunny shop/sunny clean/sunny 

In [28]:
# This will cause problems because tennis was never seen!
y_pred = hmm.posterior_decode(simple.test.seq_list[1])
print "Prediction test 1:", y_pred
print "Truth 1:", simple.test.seq_list[1]


Prediction test 1: clean/sunny walk/sunny tennis/sunny walk/sunny 
Truth 1: clean/sunny walk/sunny tennis/sunny walk/sunny 

In [26]:
# So we will smooth instead
hmm.train_supervised(simple.train, smoothing=0.1)
y_pred = hmm.posterior_decode(simple.test.seq_list[0])
print "Prediction test 0 with smoothing:", y_pred
print "Truth test 0:", simple.test.seq_list[0]

y_pred = hmm.posterior_decode(simple.test.seq_list[1])
print "Prediction test 1 with smoothing:", y_pred
print "Truth test 1:", simple.test.seq_list[1]


Prediction test 0 with smoothing: walk/rainy walk/rainy shop/sunny clean/sunny 
Truth test 0: walk/rainy walk/sunny shop/sunny clean/sunny 
Prediction test 1 with smoothing: clean/sunny walk/sunny tennis/sunny walk/sunny 
Truth test 1: clean/sunny walk/sunny tennis/sunny walk/sunny 

In [29]:
y_pred, score = hmm.viterbi_decode(simple.test.seq_list[0])
print "Viterbi decoding Prediction test 0 with smoothing"
print y_pred, score
print "Truth test 0"
print simple.test.seq_list[0]

y_pred, score = hmm.viterbi_decode(simple.test.seq_list[1])
print "Viterbi decoding Prediction test 1 with smoothing"
print y_pred, score
print "Truth test 1"
print simple.test.seq_list[1]


Viterbi decoding Prediction test 0 with smoothing
walk/rainy walk/rainy shop/sunny clean/sunny  -6.02050124698
Truth test 0
walk/rainy walk/sunny shop/sunny clean/sunny 
Viterbi decoding Prediction test 1 with smoothing
clean/sunny walk/sunny tennis/sunny walk/sunny  -11.713974074
Truth test 1
clean/sunny walk/sunny tennis/sunny walk/sunny 

In [32]:
import lxmls.readers.pos_corpus as pcc
import lxmls.sequences.confusion_matrix as cm

corpus = pcc.PostagCorpus()
train_seq = corpus.read_sequence_list_conll("data/train-02-21.conll",max_sent_len=15,max_nr_sent=1000)
test_seq = corpus.read_sequence_list_conll("data/test-23.conll",max_sent_len=15,max_nr_sent=1000)
dev_seq = corpus.read_sequence_list_conll("data/dev-22.conll",max_sent_len=15,max_nr_sent=1000)
hmm = hmmc.HMM(corpus.word_dict, corpus.tag_dict)
hmm.train_supervised(train_seq)
hmm.print_transition_matrix()

viterbi_pred_train = hmm.viterbi_decode_corpus(train_seq)
posterior_pred_train = hmm.posterior_decode_corpus(train_seq)
eval_viterbi_train =   hmm.evaluate_corpus(train_seq, viterbi_pred_train)
eval_posterior_train = hmm.evaluate_corpus(train_seq, posterior_pred_train)
print "Train Set Accuracy: Posterior Decode %.3f, Viterbi Decode: %.3f"%(eval_posterior_train,eval_viterbi_train)

viterbi_pred_test = hmm.viterbi_decode_corpus(test_seq)
posterior_pred_test = hmm.posterior_decode_corpus(test_seq)
eval_viterbi_test =   hmm.evaluate_corpus(test_seq,viterbi_pred_test)
eval_posterior_test = hmm.evaluate_corpus(test_seq,posterior_pred_test)
print "Test Set Accuracy: Posterior Decode %.3f, Viterbi Decode: %.3f"%(eval_posterior_test,eval_viterbi_test)

best_smothing = hmm.pick_best_smoothing(train_seq, dev_seq, [10,1,0.1,0])


hmm.train_supervised(train_seq, smoothing=best_smothing)
viterbi_pred_test = hmm.viterbi_decode_corpus(test_seq)
posterior_pred_test = hmm.posterior_decode_corpus(test_seq)
eval_viterbi_test =   hmm.evaluate_corpus(test_seq, viterbi_pred_test)
eval_posterior_test = hmm.evaluate_corpus(test_seq, posterior_pred_test)
print "Best Smoothing %f --  Test Set Accuracy: Posterior Decode %.3f, Viterbi Decode: %.3f"%(best_smothing,eval_posterior_test,eval_viterbi_test)

confusion_matrix = cm.build_confusion_matrix(test_seq.seq_list, viterbi_pred_test, 
                                             len(corpus.tag_dict), hmm.get_num_states())

cm.plot_confusion_bar_graph(confusion_matrix, corpus.tag_dict, 
                            range(hmm.get_num_states()), 'Confusion matrix')


Train Set Accuracy: Posterior Decode 0.985, Viterbi Decode: 0.985
Test Set Accuracy: Posterior Decode 0.350, Viterbi Decode: 0.509
Smoothing 10.000000 --  Train Set Accuracy: Posterior Decode 0.731, Viterbi Decode: 0.691
Smoothing 10.000000 -- Test Set Accuracy: Posterior Decode 0.712, Viterbi Decode: 0.675
Smoothing 1.000000 --  Train Set Accuracy: Posterior Decode 0.887, Viterbi Decode: 0.865
Smoothing 1.000000 -- Test Set Accuracy: Posterior Decode 0.818, Viterbi Decode: 0.792
Smoothing 0.100000 --  Train Set Accuracy: Posterior Decode 0.968, Viterbi Decode: 0.965
Smoothing 0.100000 -- Test Set Accuracy: Posterior Decode 0.851, Viterbi Decode: 0.842
Smoothing 0.000000 --  Train Set Accuracy: Posterior Decode 0.985, Viterbi Decode: 0.985
Smoothing 0.000000 -- Test Set Accuracy: Posterior Decode 0.370, Viterbi Decode: 0.526
Best Smoothing 0.100000 --  Test Set Accuracy: Posterior Decode 0.837, Viterbi Decode: 0.827

In [ ]: