In [133]:
# CSE5525 NLP Homework 2 Group 1
import nltk
import numpy
import itertools
import math
from nltk.corpus import treebank
# Probability floor used to avoid taking log(0).
epsilon = 1e-20

# Fetch the tagged Penn Treebank sample once and carve it into the splits:
# sentences 0-3499 for training (further halved into two sets), the rest for testing.
_treebank_tagged_sents = treebank.tagged_sents()
full_training_set = _treebank_tagged_sents[:3500]
training_set1 = full_training_set[:1750]
training_set2 = full_training_set[1750:]
test_set = _treebank_tagged_sents[3500:]
# Step 2: Retrieve P(W_i | T_i) and P(T_i| T_i-1)
#
# Sample Usage 1:
# print set1_cpd_tags['DT'].prob('JJ')
# meaning: print the probability of an adjective (JJ) following a determiner (DT), estimated from training set 1
#
# Sample Usage 2:
# print full_cpd_word_tag['DT'].prob('the')
# meaning: print the probability of the word 'the' given the determiner tag (DT), estimated from the full training set
#
# PS. cpd stands for Conditional Probability Distribution
# PPS. Laplace smoothing for unseen cases is currently disabled; the distributions are plain MLE estimates
def _build_hmm_tables(tagged_sents):
    """Build the count and probability tables of an HMM tagger from tagged sentences.

    Each sentence is wrapped in a ('<s>', '<s>') ... ('</s>', '</s>') pair so
    that sentence boundaries take part in the tag-bigram statistics. Word/tag
    pairs are flipped to (tag, word) because the tag is the conditioning event.

    Parameters:
        tagged_sents: iterable of sentences, each an iterable of (word, tag) pairs.

    Returns:
        A 7-tuple
        (tag_word_pairs, tags, words,
         cfd_word_tag, cfd_tags, cpd_word_tag, cpd_tags) where
        - tag_word_pairs is the flat list of (tag, word) pairs incl. boundary markers,
        - tags / words are the two columns of that list,
        - cfd_word_tag counts words per tag, cfd_tags counts next-tag per tag,
        - cpd_word_tag / cpd_tags are the matching MLE probability distributions.
    """
    tag_word_pairs = []
    for sent in tagged_sents:
        tag_word_pairs.append(('<s>', '<s>'))
        tag_word_pairs.extend([(tag, word) for (word, tag) in sent])
        tag_word_pairs.append(('</s>', '</s>'))

    tags = [tag for (tag, word) in tag_word_pairs]
    words = [word for (tag, word) in tag_word_pairs]

    cfd_word_tag = nltk.ConditionalFreqDist(tag_word_pairs)
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(tags))

    # NOTE: the loops that set every zero tag/tag and tag/word count to 1 were
    # quoted out in the original notebook, so no smoothing is applied here and
    # both distributions are plain maximum-likelihood estimates.
    cpd_word_tag = nltk.ConditionalProbDist(cfd_word_tag, nltk.MLEProbDist)
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

    return (tag_word_pairs, tags, words,
            cfd_word_tag, cfd_tags, cpd_word_tag, cpd_tags)


# Full Training Set
(full_training_set_words, full_tags, full_words,
 full_cfd_word_tag, full_cfd_tags,
 full_cpd_word_tag, full_cpd_tags) = _build_hmm_tables(full_training_set)
full_tag_set = set(full_tags)

# Training Set 1
(set1_training_set_words, set1_tags, set1_words,
 set1_cfd_word_tag, set1_cfd_tags,
 set1_cpd_word_tag, set1_cpd_tags) = _build_hmm_tables(training_set1)
In [5]:
import numpy
In [127]:
for tag_1 in set(full_tags): # Laplace smoothing
for tag_2 in set(full_tags):
print full_cfd_tags[tag_1][tag_2],
print
In [49]:
# Number of distinct tags in training set 1 (set1_tags also contains the
# '<s>' and '</s>' sentence-boundary markers, so they are counted too).
len(set(set1_tags))
Out[49]:
In [46]:
for tag_1 in set(full_tags):
# A_table[dict_tags[tag_1]][dict_tags[tag_2]] = set1_cpd_tags[tag_1].prob(tag_2)
#print tag_1, ' ', tag_2, ' ', full_cpd_tags[tag_1].prob(tag_2)
print tag_1,' ', full_cfd_word_tag[tag_1]
In [145]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    """Return the most likely state sequence for ``obs`` under an HMM.

    Scores are accumulated in log space. Any probability below a small floor
    (1e-20) is clamped to that floor before its log is taken, so unseen
    transitions/emissions are heavily penalised instead of raising on log(0).
    The special state "<s>" is only allowed at position 0.

    Parameters:
        obs:     sequence of observations (e.g. the words of one sentence).
        states:  iterable of state labels (e.g. the tag set).
        start_p: mapping ``state -> probability`` of starting in that state.
        trans_p: mapping ``prev_state -> dist`` where ``dist.prob(state)``
                 gives the transition probability.
        emit_p:  mapping ``state -> dist`` where ``dist.prob(observation)``
                 gives the emission probability.

    Returns:
        A list with one state per observation (the best path, recovered by
        following back-pointers from the best final state). An empty ``obs``
        yields an empty list; an empty ``states`` yields a list of ''.
    """
    epsilon = 0.00000000000000000001
    # Dedicated "log(0)" stand-in. It is kept separate from any running
    # maximum so that a forbidden state can never inherit a real score.
    neg_inf = -1.0 / epsilon

    def floored_log(p):
        # Clamp tiny/zero probabilities so math.log never sees 0.
        if p < epsilon:
            p = epsilon
        return math.log(p)

    if len(obs) == 0:
        return []
    states = list(states)
    if not states:
        return [''] * len(obs)

    # V[t][y]: best log score of any path ending in state y at time t.
    # backptr[t][y]: the state at time t-1 on that best path.
    V = [{}]
    backptr = [{}]
    for y in states:
        V[0][y] = floored_log(start_p[y]) + floored_log(emit_p[y].prob(obs[0]))
        backptr[0][y] = None

    for t in range(1, len(obs)):
        V.append({})
        backptr.append({})
        for y in states:
            if y == "<s>":
                # The sentence-start marker may only appear at t == 0.
                V[t][y] = neg_inf
                backptr[t][y] = states[0]
                continue
            best_prev = None
            best_score = None
            for y0 in states:
                # V[t-1][y0] is already a log score: add it, do not log it again.
                score = V[t - 1][y0] + floored_log(trans_p[y0].prob(y))
                if best_score is None or score > best_score:
                    best_score = score
                    best_prev = y0
            V[t][y] = floored_log(emit_p[y].prob(obs[t])) + best_score
            backptr[t][y] = best_prev

    # Pick the best final state (first one wins on ties) and trace back.
    last = len(obs) - 1
    best_final = states[0]
    for y in states[1:]:
        if V[last][y] > V[last][best_final]:
            best_final = y

    path = [best_final]
    for t in range(last, 0, -1):
        path.append(backptr[t][path[-1]])
    path.reverse()
    return path
In [148]:
C_table = {}
for tag in full_tag_set:
C_table[tag] = 0.00000000000000000001
C_table['<s>'] = 1.0
print C_table
In [149]:
test_obs = ['<s>', 'Pierre', 'Viken' , ',' , '61' , "years", "old", "will", "join", "the", "board", "as", "a", "nonexecutive", "director", "Nov.", "29",".", "</s>"]
#test_obs = ['Pierre', 'Viken']
print viterbi(test_obs, full_tag_set, C_table, set1_cpd_tags, set1_cpd_word_tag )
In [150]:
In [157]:
# One observation sequence per training sentence: the words only (tags
# dropped), with '<s>' put in front and '</s>' appended at the end.
full_obs_set = [
    ['<s>'] + [word for (word, tag) in sent] + ['</s>']
    for sent in full_training_set
]
In [164]:
# Display the observation sequence at index 3 (words of one training
# sentence wrapped in '<s>' ... '</s>').
full_obs_set[3]
Out[164]:
In [163]:
print viterbi(full_obs_set[3], full_tag_set, C_table, set1_cpd_tags, set1_cpd_word_tag )
In [167]:
for i in range(len(full_obs_set)):
print i,
viterbi(full_obs_set[i], full_tag_set, C_table, set1_cpd_tags, set1_cpd_word_tag )
In [1]:
# Run the external script step4_5/main_step45.py via IPython's %run magic.
%run "step4_5/main_step45.py"
In [ ]: