In [1]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()
In [2]:
#returns: string
#arg: string
#removes punctuation and converts the string to lowercase
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    i = 0
    j = 0
    #convert each word to lowercase and strip punctuation from it
    for word in temp_l:
        j = 0
        for l in word:
            if l in string.punctuation:
                #keep the apostrophe in possessives like "john's"
                if l == "'":
                    if j + 1 < len(word) and word[j + 1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l, " ")
            j += 1
        temp_l[i] = word.lower()
        i = i + 1
    #punctuation is replaced by spaces so that e.g. "here---so" becomes "here so"
    content = " ".join(temp_l)
    return content
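In [ ]:
#Quick sanity check of removePunctuations (illustrative only; the sample
#sentence is made up): punctuation becomes spaces, case is lowered, and the
#apostrophe in possessives like "john's" is kept.
print(removePunctuations("Hello, World! It's John's book---okay?").split())
#expected: ['hello', 'world', "it's", "john's", 'book', 'okay']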
In [3]:
#returns: int
#arg: string,dict,dict,dict,dict
#loads the corpus for the dataset and makes the frequency count of bigram, trigram and quadgram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):
    w1 = ''    #for storing the 3rd last word of a line, to be used for the next token set
    w2 = ''    #for storing the 2nd last word of a line, to be used for the next token set
    w3 = ''    #for storing the last word of a line, to be used for the next token set
    token = []
    #total no. of words in the corpus
    word_len = 0
    #open the corpus file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            #split the line into word tokens
            temp_l = line.split()
            i = 0
            j = 0
            #does the same as removePunctuations(), inlined here for performance:
            #converts each word to lowercase and strips punctuation from it
            for word in temp_l:
                j = 0
                for l in word:
                    if l in string.punctuation:
                        #keep the apostrophe in possessives like "john's"
                        if l == "'":
                            if j + 1 < len(word) and word[j + 1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l, " ")
                    j += 1
                temp_l[i] = word.lower()
                i = i + 1
            #punctuation is replaced by spaces so that e.g. "here---so" becomes "here so"
            content = " ".join(temp_l)
            token = content.split()
            word_len = word_len + len(token)
            if not token:
                continue
            #since we read line by line, n-grams spanning a line break would be missed,
            #so prepend the last words of the previous line before pairing
            #add the last word of the previous line for bigram pairing
            if w3 != '':
                token.insert(0, w3)
            #tokens for bigrams
            temp0 = list(ngrams(token, 2))
            #add the 2nd last word of the previous line for trigram pairing
            if w2 != '':
                token.insert(0, w2)
            #tokens for trigrams
            temp1 = list(ngrams(token, 3))
            #add the 3rd last word of the previous line for quadgram pairing
            if w1 != '':
                token.insert(0, w1)
            #add new words to the vocabulary and update their frequencies
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word] += 1
            #tokens for quadgrams
            temp2 = list(ngrams(token, 4))
            #count the frequency of the bigrams
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1
            #count the frequency of the trigrams
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1
            #count the frequency of the quadgrams
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1
            #store the last three words for pairing with the next line
            #(guarded so that very short lines do not raise an IndexError)
            n = len(token)
            if n >= 3:
                w1 = token[n - 3]
            if n >= 2:
                w2 = token[n - 2]
            w3 = token[n - 1]
    return word_len
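In [ ]:
#Minimal usage sketch for loadCorpus, assuming a throwaway file (the file name
#and its two lines are made up). The counting dicts must be defaultdict(int)
#so that the `+= 1` updates work for unseen keys. Note that the quadgram
#'the quick brown fox' is counted twice because the words carried over from
#the first line let it pair across the line break as well.
with open('toy_corpus.txt', 'w') as f:
    f.write("the quick brown fox jumps over the lazy dog\n")
    f.write("the quick brown fox runs away\n")
toy_bi, toy_tri = defaultdict(int), defaultdict(int)
toy_quad, toy_vocab = defaultdict(int), defaultdict(int)
n_words = loadCorpus('toy_corpus.txt', toy_bi, toy_tri, toy_quad, toy_vocab)
print(n_words)                          #15 words in total
print(toy_quad['the quick brown fox'])  #2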
In [4]:
#returns: void
#arg: dict,dict,dict,dict,dict,dict,dict,int
#creates a dict storing probable words with their probabilities for a trigram sentence
def findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, tri_nc_dict, k):
    for quad_sen in quad_dict:
        quad_token = quad_sen.split()
        #trigram sentence for key
        tri_sen = ' '.join(quad_token[:3])
        #find the probability, smoothing low counts with Good-Turing
        #(each n-gram order uses its own Nc table)
        quad_count = quad_dict[quad_sen]
        tri_count = tri_dict[tri_sen]
        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount(quad_count, k, quad_nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, tri_nc_dict)
        prob = quad_count / tri_count
        #add the trigram to the quadgram probability dict
        #quad_prob_dict is a dict of lists
        if tri_sen not in quad_prob_dict:
            quad_prob_dict[tri_sen] = []
        quad_prob_dict[tri_sen].append([prob, quad_token[-1]])
In [5]:
#returns: void
#arg: dict,dict,dict,dict,dict,dict,int
#creates a dict storing probable words with their probabilities for a bigram sentence
def findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, bi_nc_dict, k):
    #create a dictionary of probable words with their probabilities for trigram
    #probabilities: the key is a bigram and the value is a list of [prob, word] pairs
    for tri in tri_dict:
        tri_token = tri.split()
        #bigram sentence for key
        bi_sen = ' '.join(tri_token[:2])
        #find the probability, smoothing low counts with Good-Turing
        tri_count = tri_dict[tri]
        bi_count = bi_dict[bi_sen]
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, tri_nc_dict)
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount(bi_count, k, bi_nc_dict)
        prob = tri_count / bi_count
        #add the bigram sentence to the trigram probability dict
        #tri_prob_dict is a dict of lists
        if bi_sen not in tri_prob_dict:
            tri_prob_dict[bi_sen] = []
        tri_prob_dict[bi_sen].append([prob, tri_token[-1]])
In [6]:
#returns: void
#arg: dict,dict,dict,dict,dict,int
#creates a dict storing probable words with their probabilities for a unigram
def findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, uni_nc_dict, k):
    #create a dictionary of probable words with their probabilities for bigram
    #probabilities: the key is a unigram and the value is a list of [prob, word] pairs
    for bi in bi_dict:
        bi_token = bi.split()
        #unigram for key
        unigram = bi_token[0]
        #find the probability, smoothing low counts with Good-Turing
        bi_count = bi_dict[bi]
        uni_count = vocab_dict[unigram]
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount(bi_count, k, bi_nc_dict)
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount(uni_count, k, uni_nc_dict)
        prob = bi_count / uni_count
        #add the unigram to the bigram probability dict
        #bi_prob_dict is a dict of lists
        if unigram not in bi_prob_dict:
            bi_prob_dict[unigram] = []
        bi_prob_dict[unigram].append([prob, bi_token[-1]])
In [7]:
#returns: void
#arg: dict,dict,dict
#sorts the probable words for each key in descending order of probability
def sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    for key in bi_prob_dict:
        if len(bi_prob_dict[key]) > 1:
            bi_prob_dict[key] = sorted(bi_prob_dict[key], reverse=True)
    for key in tri_prob_dict:
        if len(tri_prob_dict[key]) > 1:
            tri_prob_dict[key] = sorted(tri_prob_dict[key], reverse=True)
    for key in quad_prob_dict:
        if len(quad_prob_dict[key]) > 1:
            #keep only the top 2 candidates for each quadgram key
            quad_prob_dict[key] = sorted(quad_prob_dict[key], reverse=True)[:2]
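In [ ]:
#Illustrative call of sortProbWordDict on a hand-built dict (made-up values):
#sorted() on [prob, word] pairs orders by probability first and breaks ties
#on the word string.
demo = {'the cat': [[0.1, 'sat'], [0.8, 'ran'], [0.1, 'ate']]}
sortProbWordDict({}, demo, {})
print(demo['the cat'])  #[[0.8, 'ran'], [0.1, 'sat'], [0.1, 'ate']]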
In [8]:
#returns: string
#arg: void
#for taking input from the user
def takeInput():
    cond = False
    #keep asking until at least 3 words are given
    while not cond:
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter at least 3 words !")
        else:
            cond = True
            #keep only the last 3 words
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen
In [9]:
#returns: int
#arg: list,dict,dict,dict,dict,int,list,int,dict,dict,dict,dict
#computes the prediction score for the test data
def computeTestScore(test_token, bi_dict, tri_dict, quad_dict,
                     vocab_dict, token_len, param, k, quad_nc_dict, tri_nc_dict,
                     bi_nc_dict, uni_nc_dict):
    #increment the score when a correct prediction is made, else count it as wrong
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/Good_Turing_Interpolated_Score.txt', 'w') as w:
        for sent in test_token:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]
            #find the most probable words from the bigram, trigram and quadgram prob dicts
            #note: the prob dicts are taken from the enclosing notebook scope
            word_choice = chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
            result = doInterpolatedPredictionGT(sen, bi_dict, tri_dict, quad_dict,
                        vocab_dict, token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                        bi_nc_dict, uni_nc_dict)
            if result and result[1] == correct_word:
                score += 1
            else:
                wrong += 1
            total += 1
        stats = ('Total Word Predictions: ' + str(total) + '\n' + 'Correct Predictions: ' + str(score) +
                 '\n' + 'Wrong Predictions: ' + str(wrong) + '\n' + 'ACCURACY: ' + str((score/total)*100) + '%')
        w.write(stats)
    #print stats
    print(stats)
    return score
In [10]:
#returns: float
#arg: list,dict,dict,dict,dict,int,list,int,dict,dict,dict,dict
#computes the perplexity of the test data
def computePerplexity(test_quadgrams, bi_dict, tri_dict, quad_dict,
                      vocab_dict, token_len, param, k, quad_nc_dict, tri_nc_dict,
                      bi_nc_dict, uni_nc_dict):
    perplexity = float(1.0)
    n = len(test_quadgrams)
    for quad in test_quadgrams:
        quad_token = list(quad)
        quad_sen = ' '.join(quad_token)
        tri_sen = ' '.join(quad_token[0:3])
        #use .get so the defaultdicts are not polluted with unseen test keys
        quad_count = quad_dict.get(quad_sen, 0)
        tri_count = tri_dict.get(tri_sen, 0)
        #smooth low counts with Good-Turing
        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount(quad_count, k, quad_nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, tri_nc_dict)
        prob = quad_count / tri_count
        #perplexity is the inverse probability normalised by the no. of tokens:
        #PP = (product of 1/P(w_i | context))^(1/n)
        if prob != 0:
            perplexity = perplexity * ((1.0/prob) ** (1./n))
    with open('Test_Scores/Good_Turing_Interpolated_Score.txt', 'a') as w:
        w.write('\nPerplexity: ' + str(perplexity))
    return perplexity
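In [ ]:
#Sanity check of the perplexity formula used above (no model needed; the
#vocabulary size is made up): for a uniform distribution that assigns
#probability 1/V to every word, the perplexity should come out to exactly V.
V_toy = 50
probs = [1.0 / V_toy] * 100
pp = 1.0
for p in probs:
    pp *= (1.0 / p) ** (1.0 / len(probs))
print(round(pp, 6))  #50.0 (up to floating point error)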
In [11]:
## Regression related stuff
#calculate the best fit line for simple linear regression
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style

#finds the slope for the best fit line
def findBestFitSlope(x, y):
    m = ((mean(x)*mean(y) - mean(x*y)) /
         (mean(x)**2 - mean(x**2)))
    return m

#finds the intercept for the best fit line
def findBestFitIntercept(x, y, m):
    c = mean(y) - m*mean(x)
    return c
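In [ ]:
#Quick check of the best fit helpers on points that lie exactly on y = 2x + 1
#(synthetic data, illustrative only): the recovered slope and intercept should
#be 2 and 1.
x_demo = np.array([1, 2, 3, 4], dtype=np.float64)
y_demo = 2 * x_demo + 1
m_demo = findBestFitSlope(x_demo, y_demo)
print(m_demo, findBestFitIntercept(x_demo, y_demo, m_demo))  #2.0 1.0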
In [12]:
## Find the count Nc for quadgrams and trigrams where c > 5
#returns: dict
#arg: dict, int, int, int, int
#token_len : no. of distinct n-gram types observed in the corpus
def findFrequencyOfFrequencyCount(ngram_dict, k, n, V, token_len):
    #for keeping the count Nc for each count value c
    nc_dict = {}
    #N0 is the no. of possible n-grams that were never seen: V^n - (observed n-gram types)
    nc_dict[0] = V**n - token_len
    #find the count Nc up to c = k+1, we take k = 5
    for key in ngram_dict:
        if ngram_dict[key] <= k + 1:
            if ngram_dict[key] not in nc_dict:
                nc_dict[ngram_dict[key]] = 1
            else:
                nc_dict[ngram_dict[key]] += 1
    #if every Nc value from 1 to k+1 is already present we are done
    val_present = True
    for i in range(1, k + 2):
        if i not in nc_dict:
            val_present = False
            break
    if val_present:
        return nc_dict
    #otherwise fill in the missing Nc values up to c = k+1 using regression
    #we fit [ log(Nc) = m*log(c) + a ] on a few observed (c, Nc) data points
    data_pts = {}
    i = 0
    #collect the first few distinct count values c
    for key in ngram_dict:
        if ngram_dict[key] not in data_pts:
            data_pts[ngram_dict[key]] = 0
            i += 1
            if i > 5:
                break
    #now find Nc for those c values
    for key in ngram_dict:
        if ngram_dict[key] in data_pts:
            data_pts[ngram_dict[key]] += 1
    #make x, y coordinates for the regression in log space
    x_coor = [np.log(item) for item in data_pts]
    y_coor = [np.log(data_pts[item]) for item in data_pts]
    x = np.array(x_coor, dtype=np.float64)
    y = np.array(y_coor, dtype=np.float64)
    #find the slope and intercept of the best fit line
    slope_m = findBestFitSlope(x, y)
    intercept_c = findBestFitIntercept(x, y, slope_m)
    #estimate the missing Nc values from the fitted line (undo the log)
    for i in range(1, (k + 2)):
        if i not in nc_dict:
            nc_dict[i] = np.exp(slope_m * np.log(i) + intercept_c)
    return nc_dict
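In [ ]:
#Illustrative run of findFrequencyOfFrequencyCount on made-up bigram counts
#over a 3-word vocabulary: N0 = V^n - (observed types) = 3^2 - 4 = 5, and the
#observed counts 1, 1, 2, 3 give N1 = 2, N2 = 1, N3 = 1 (the missing values up
#to c = k+1 are then filled in by the log-log regression).
toy_counts = {'a b': 1, 'b c': 1, 'c a': 2, 'a a': 3}
toy_nc = findFrequencyOfFrequencyCount(toy_counts, 5, 2, 3, len(toy_counts))
print(toy_nc[0], toy_nc[1], toy_nc[2], toy_nc[3])  #5 2 1 1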
In [13]:
#for finding the adjusted count c* in Good Turing Smoothing (Katz's version
#with counts above threshold k taken as reliable):
#c* = ( (c+1)*N(c+1)/N(c) - c*(k+1)*N(k+1)/N(1) ) / ( 1 - (k+1)*N(k+1)/N(1) )
def findGoodTuringAdjustCount(c, k, nc_dict):
    adjust_count = ( ((c + 1) * (nc_dict[c + 1] / nc_dict[c]) - c * (k + 1) * nc_dict[k + 1] / nc_dict[1]) /
                     (1 - (k + 1) * nc_dict[k + 1] / nc_dict[1]) )
    return adjust_count
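In [ ]:
#Worked example of the adjusted count (the Nc values below are made up): with
#c = 1, N1 = 100, N2 = 50 and N6 = 1, the numerator is 2*(50/100) - 1*6*(1/100)
#= 0.94 and the denominator is 1 - 6*(1/100) = 0.94, so c* = 1.0 exactly.
demo_nc = {1: 100, 2: 50, 6: 1}
print(findGoodTuringAdjustCount(1, 5, demo_nc))  #1.0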
In [14]:
#finds the lambda values required for doing Interpolation
#returns: list
#arg: int, dict, dict, dict, dict
def estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict):
    max_prob = float('-inf')
    curr_prob = 0.0
    parameters = [0.0, 0.0, 0.0, 0.0]
    i = 1
    #load the held out data
    with open('held_out_corpus.txt', 'r') as file:
        held_out_data = file.read()
    #remove punctuation and clean up the text
    held_out_data = removePunctuations(held_out_data)
    held_out_data = held_out_data.split()
    #make quadgram tokens for parameter estimation
    quad_token_heldout = list(ngrams(held_out_data, 4))
    #for storing the stats
    f = open('interpolation_prob_stats.txt', 'w')
    #grid search over the lambda values in steps of 0.1
    l1 = 0
    l4 = 0
    while l1 <= 1.0:
        l2 = 0
        while l2 <= 1.0:
            l3 = 0
            while l3 <= 1.0:
                #skip when the first three lambdas are all zero or their sum exceeds 1
                if (l1 == 0 and l2 == 0 and l3 == 0) or ((l1 + l2 + l3) > 1):
                    l3 += 0.1
                    i += 1
                    continue
                #the lambdas must sum to 1, so l4 is determined by the other three
                l4 = 1 - (l1 + l2 + l3)
                curr_prob = 0
                qc = [0]
                bc = [0]
                tc = [0]
                #find the probability of the held out set using the current lambda values
                for quad in quad_token_heldout:
                    #take the log of the probability to avoid underflow
                    curr_prob += log10( interpolatedProbability(quad, token_len, vocab_dict, bi_dict, tri_dict,
                                            quad_dict, qc, bc, tc, l1, l2, l3, l4) )
                if curr_prob > max_prob:
                    max_prob = curr_prob
                    parameters[0] = l1
                    parameters[1] = l2
                    parameters[2] = l3
                    parameters[3] = l4
                l3 += 0.1
                i += 1
            l2 += 0.1
        l1 += 0.1
    f.write('\n\n\nL1: ' + str(parameters[0]) + ' L2: ' + str(parameters[1]) + ' L3: ' + str(parameters[2]) +
            ' L4: ' + str(parameters[3]) + ' MAX PROB: ' + str(max_prob) + '\n')
    f.close()
    return parameters
In [15]:
#pick the top most probable words from the bi, tri and quad prob dicts as word prediction candidates
#returns: list of [float,string]
#arg: string,dict,dict,dict
def chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict):
    word_choice = []
    token = sen.split()
    #last word as key into the bigram prob dict
    if token[-1] in bi_prob_dict:
        word_choice += bi_prob_dict[token[-1]][:1]
    #last two words as key into the trigram prob dict
    if ' '.join(token[1:]) in tri_prob_dict:
        word_choice += tri_prob_dict[' '.join(token[1:])][:1]
    #all three words as key into the quadgram prob dict
    if ' '.join(token) in quad_prob_dict:
        word_choice += quad_prob_dict[' '.join(token)][:1]
    return word_choice
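In [ ]:
#Illustrative call of chooseWords with tiny hand-built prob dicts (all names
#and numbers made up): candidates come from the last word, the last two words
#and all three words of the input.
demo_bi = {'sat': [[0.5, 'on']]}
demo_tri = {'cat sat': [[0.6, 'on']]}
demo_quad = {'the cat sat': [[0.7, 'on']]}
print(chooseWords('the cat sat', demo_bi, demo_tri, demo_quad))
#[[0.5, 'on'], [0.6, 'on'], [0.7, 'on']]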
In [16]:
#For doing word prediction using Interpolation
#returns: [float,string] for the best candidate, or '' if there is none
def doInterpolatedPredictionGT(sen, bi_dict, tri_dict, quad_dict,
                               vocab_dict, token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                               bi_nc_dict, uni_nc_dict):
    pred = ''
    max_prob = 0.0
    #for each word choice find the interpolated probability and pick the best
    for word in word_choice:
        key = sen + ' ' + word[1]
        quad_token = key.split()
        #find the Good Turing smoothed quadgram probability
        #(.get avoids inserting zero counts for unseen keys into the defaultdicts)
        quad_count = quad_dict.get(key, 0)
        tri_count = tri_dict.get(' '.join(quad_token[0:3]), 0)
        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount(quad_count, k, quad_nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, tri_nc_dict)
        quad_prob = quad_count / tri_count
        #find the Good Turing smoothed trigram probability
        tri_count = tri_dict.get(' '.join(quad_token[1:4]), 0)
        bi_count = bi_dict.get(' '.join(quad_token[1:3]), 0)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, tri_nc_dict)
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount(bi_count, k, bi_nc_dict)
        tri_prob = tri_count / bi_count
        #find the Good Turing smoothed bigram probability
        bi_count = bi_dict.get(' '.join(quad_token[2:4]), 0)
        uni_count = vocab_dict.get(quad_token[2], 0)
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount(bi_count, k, bi_nc_dict)
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount(uni_count, k, uni_nc_dict)
        bi_prob = bi_count / uni_count
        #find the Good Turing smoothed unigram probability
        uni_count = vocab_dict.get(quad_token[3], 0)
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount(uni_count, k, uni_nc_dict)
        uni_prob = uni_count / token_len
        #interpolate: P = l1*P(quad) + l2*P(tri) + l3*P(bi) + l4*P(uni)
        prob = ( param[0]*quad_prob
                 + param[1]*tri_prob
                 + param[2]*bi_prob
                 + param[3]*uni_prob )
        if prob > max_prob:
            max_prob = prob
            pred = word
    #pred is a [probability, word] pair, or '' if no candidate was found
    return pred
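In [ ]:
#Numeric illustration of the interpolation step inside the prediction (the
#probabilities and lambdas below are made up): the final score is a weighted
#mix of the four n-gram probabilities, with the weights summing to 1.
l = [0.7, 0.1, 0.1, 0.1]
p_quad, p_tri, p_bi, p_uni = 0.4, 0.3, 0.2, 0.05
print(round(l[0]*p_quad + l[1]*p_tri + l[2]*p_bi + l[3]*p_uni, 3))  #0.335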
In [17]:
#returns: void
#arg: string,string,dict,dict,dict,dict,dict
#Used for testing the Language Model
def trainCorpus(train_file, test_file, bi_dict, tri_dict, quad_dict, vocab_dict, prob_dict):
    score = 0
    #load the training corpus for the dataset
    start_time0 = time.time()
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time0))
    start_time1 = time.time()
    #create the different Nc dictionaries for the ngrams
    #threshold value
    k = 5
    V = len(vocab_dict)
    quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
    tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
    bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
    uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))
    #create the quadgram probability dictionary
    #note: the prob dicts are taken from the notebook scope
    findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, tri_nc_dict, k)
    #create the trigram probability dictionary
    findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, bi_nc_dict, k)
    #create the bigram probability dictionary
    findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, uni_nc_dict, k)
    #sort the probability dictionaries of quad, tri and bi grams
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    #Run only when the lambda values need to be re-estimated, as this can take some time
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    #found earlier using the held out data
    param = [0.0, 0.0, 0.7999999999999999, 0.20000000000000007]
    print("---Processing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))
    ### TESTING WITH TEST CORPUS
    test_data = ''
    #Now load the test corpus
    with open(test_file, 'r') as file:
        test_data = file.read()
    #remove punctuation from the test data
    test_data = removePunctuations(test_data)
    test_token = test_data.split()
    #split the test data into 4 word lists
    test_quadgrams = list(ngrams(test_token, 4))
    #choose the most probable words for prediction
    start_time2 = time.time()
    score = computeTestScore(test_quadgrams, bi_dict, tri_dict, quad_dict,
                vocab_dict, token_len, param, k, quad_nc_dict, tri_nc_dict,
                bi_nc_dict, uni_nc_dict)
    print('Score:', score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time2))
    start_time3 = time.time()
    perplexity = computePerplexity(test_quadgrams, bi_dict, tri_dict, quad_dict,
                    vocab_dict, token_len, param, k, quad_nc_dict, tri_nc_dict,
                    bi_nc_dict, uni_nc_dict)
    print('Perplexity:', perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time3))
In [20]:
def main():
    #variable declaration
    vocab_dict = defaultdict(int)   #for storing the different words with their frequencies
    bi_dict = defaultdict(int)      #for keeping count of sentences of two words
    tri_dict = defaultdict(int)     #for keeping count of sentences of three words
    quad_dict = defaultdict(int)    #for keeping count of sentences of four words
    quad_prob_dict = OrderedDict()
    tri_prob_dict = OrderedDict()
    bi_prob_dict = OrderedDict()
    train_file = 'corpusfile.txt'
    #load the corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    #create the different Nc dictionaries for the ngrams
    #threshold value
    k = 5
    V = len(vocab_dict)
    quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
    tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
    bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
    uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))
    #create the quadgram probability dictionary
    findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, tri_nc_dict, k)
    #create the trigram probability dictionary
    findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, bi_nc_dict, k)
    #create the bigram probability dictionary
    findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, uni_nc_dict, k)
    #sort the probability dictionaries of quad, tri and bi grams
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    #Run only when the lambda values need to be re-estimated, as this can take some time
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    #found earlier using the held out data
    param = [0.7, 0.1, 0.1, 0.1]
    ##WORD PREDICTION
    start_time2 = time.time()
    #take user input
    input_sen = takeInput()
    #find the most probable words from the bigram, trigram and quadgram prob dicts
    word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
    prediction = doInterpolatedPredictionGT(input_sen, bi_dict, tri_dict, quad_dict,
                    vocab_dict, token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                    bi_nc_dict, uni_nc_dict)
    if prediction:
        print('Word Prediction:', prediction[1])
    print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))
In [21]:
if __name__ == '__main__':
    main()
In [ ]:
#variable declaration
vocab_dict = defaultdict(int) #for storing the different words with their frequencies
bi_dict = defaultdict(int) #for keeping count of sentences of two words
tri_dict = defaultdict(int) #for keeping count of sentences of three words
quad_dict = defaultdict(int) #for keeping count of sentences of four words
quad_prob_dict = OrderedDict()
tri_prob_dict = OrderedDict()
bi_prob_dict = OrderedDict()
print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))
In [ ]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#train on the training corpus and evaluate on the test corpus
#(trainCorpus prints the score and perplexity itself and returns nothing)
trainCorpus(train_file, test_file, bi_dict, tri_dict, quad_dict, vocab_dict, quad_prob_dict)
In [ ]:
train_file = 'corpusfile.txt'
#load the corpus for the dataset
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
In [ ]:
#create the different Nc dictionaries for ngrams
#threshold value
k = 5
V = len(vocab_dict)
quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))
In [ ]:
#create quadgram probability dictionary
findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, tri_nc_dict, k)
In [ ]:
#create trigram probability dictionary
findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, bi_nc_dict, k)
In [ ]:
#create bigram probability dictionary
findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, uni_nc_dict, k)
In [ ]:
#sort the probability dictionaries of quad,tri and bi grams
sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
In [ ]:
#Do only when required to find the lambda value as this can take some time
#param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
#found earlier using Held out data
param = [0.7,0.1,0.1,0.1]
In [ ]:
#FOR DEBUGGING ONLY
writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict)
In [ ]:
##WORD PREDICTION
start_time2 = time.time()
#take user input
input_sen = takeInput()
#find the most probable words from the bigram, trigram and quadgram prob dicts
word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
prediction = doInterpolatedPredictionGT(input_sen, bi_dict, tri_dict, quad_dict,
                vocab_dict, token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                bi_nc_dict, uni_nc_dict)
if prediction:
    print('Word Prediction:', prediction[1])
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))