In [ ]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()
In [ ]:
#returns: string
#arg: string
#removes punctuation, changes to lowercase, retains the apostrophe mark
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    i = 0
    j = 0
    #change each word to lowercase and remove punctuation from it
    for word in temp_l:
        j = 0
        for l in word:
            if l in string.punctuation:
                #keep the apostrophe in possessives like "john's"
                if l == "'":
                    if j + 1 < len(word) and word[j+1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l, " ")
            j += 1
        temp_l[i] = word.lower()
        i = i + 1
    #rejoin the words so that e.g. "here---so" becomes "here so" once the
    #punctuation has been replaced by spaces
    content = " ".join(temp_l)
    return content
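In [ ]:
#quick sanity check for removePunctuations (the sample sentence is made up):
#punctuation becomes whitespace, case is folded and the apostrophe in
#possessives like "john's" survives
print(removePunctuations("Here---so THIS is John's example sentence!").split())
#expected: ['here', 'so', 'this', 'is', "john's", 'example', 'sentence']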
In [ ]:
#returns: int (total no. of words in the corpus)
#arg: string,dict,dict,dict,dict
#loads the corpus for the dataset and makes the frequency count of bigram, trigram and quadgram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):
    w1 = ''    #for storing the 3rd last word of the previous line
    w2 = ''    #for storing the 2nd last word of the previous line
    w3 = ''    #for storing the last word of the previous line
    token = []
    #total no. of words in the corpus
    word_len = 0

    #open the corpus file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            #split the string into word tokens
            temp_l = line.split()
            i = 0
            j = 0

            #same as removePunctuations(), inlined here for performance reasons:
            #change each word to lowercase and remove punctuation from it
            for word in temp_l:
                j = 0
                for l in word:
                    if l in string.punctuation:
                        if l == "'":
                            if j + 1 < len(word) and word[j+1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l, " ")
                    j += 1
                temp_l[i] = word.lower()
                i = i + 1

            #rejoin the words so that e.g. "here---so" becomes "here so"
            content = " ".join(temp_l)
            token = content.split()
            word_len = word_len + len(token)
            if not token:
                continue

            #since we read line by line, n-grams spanning a line break would be
            #missed, so prepend the last words of the previous line
            #add the last word of the previous line for bigram pairing
            if w3 != '':
                token.insert(0, w3)
            temp0 = list(ngrams(token, 2))

            #add the 2nd last word of the previous line for trigram pairing
            if w2 != '':
                token.insert(0, w2)
            temp1 = list(ngrams(token, 3))

            #add the 3rd last word of the previous line for quadgram pairing
            if w1 != '':
                token.insert(0, w1)

            #add new unique words to the vocabulary set if available
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word] += 1

            #tokens for quadgrams
            temp2 = list(ngrams(token, 4))

            #count the frequency of the bigrams
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1
            #count the frequency of the trigrams
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1
            #count the frequency of the quadgrams
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1

            #store the last three words for pairing with the next line
            n = len(token)
            w1 = token[n-3]
            w2 = token[n-2]
            w3 = token[n-1]
    return word_len
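In [ ]:
#minimal sketch of how loadCorpus fills the count dictionaries, using a tiny
#made-up corpus written to a temporary file (the file contents are
#illustrative, not part of the real dataset)
import tempfile, os

demo_bi, demo_tri = defaultdict(int), defaultdict(int)
demo_quad, demo_vocab = defaultdict(int), defaultdict(int)
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write('the cat sat on the mat\nthe cat sat on the hat\n')
    demo_path = f.name
demo_len = loadCorpus(demo_path, demo_bi, demo_tri, demo_quad, demo_vocab)
os.remove(demo_path)
print('tokens:', demo_len)                                      #12
print("count('the cat'):", demo_bi['the cat'])                  #2
print("count('the cat sat on'):", demo_quad['the cat sat on'])  #2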
In [ ]:
#creates a dict storing probable words with their probabilities for a trigram sentence
#Add-1 smoothing used
#returns: void
#arg: dict,dict,dict,dict,dict
def findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict):
    V = len(vocab_dict)
    #use the fourth word of each quadgram as the probable word and calculate
    #its probability; Add-1 smoothing is applied in the calculation
    for quad_sen in quad_dict:
        quad_token = quad_sen.split()
        #trigram sentence for the key
        tri_sen = ' '.join(quad_token[:3])
        #find the probability, with Add-1 smoothing
        prob = (quad_dict[quad_sen] + 1) / (tri_dict[tri_sen] + V)
        #create the entry for the trigram sentence if it is not present yet
        if tri_sen not in quad_prob_dict:
            quad_prob_dict[tri_sen] = []
        #append the probable word with its probability
        quad_prob_dict[tri_sen].append([prob, quad_token[-1]])
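In [ ]:
#worked example of the Add-1 estimate used above, with made-up counts:
#P(w4 | w1 w2 w3) = (count(w1 w2 w3 w4) + 1) / (count(w1 w2 w3) + V)
#e.g. with count("one of the best") = 3, count("one of the") = 10 and a
#vocabulary of V = 1000 words:
print((3 + 1) / (10 + 1000))   #= 4/1010, approximately 0.00396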
In [ ]:
#for creating the prob dict for trigram probabilities
#creates a dict storing probable words with their probabilities for a bigram context
#Add-1 smoothing used
#returns: void
#arg: dict,dict,dict,dict
def findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict):
    #vocabulary size
    V = len(vocab_dict)
    #the key is a bigram and the value is a list of [probability, word] pairs
    for tri in tri_dict:
        tri_token = tri.split()
        #bigram sentence for the key
        bi_sen = ' '.join(tri_token[:2])
        #find the probability, with Add-1 smoothing
        prob = (tri_dict[tri] + 1) / (bi_dict[bi_sen] + V)
        #create the entry for the bigram sentence if it is not present yet
        if bi_sen not in tri_prob_dict:
            tri_prob_dict[bi_sen] = []
        #append the probable word with its probability
        tri_prob_dict[bi_sen].append([prob, tri_token[-1]])
In [ ]:
#for creating the prob dict for bigram probabilities
#creates a dict storing probable words with their probabilities for a unigram context
#Add-1 smoothing used
#returns: void
#arg: dict,dict,dict
def findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict):
    V = len(vocab_dict)
    #the key is a unigram and the value is a list of [probability, word] pairs
    for bi in bi_dict:
        bi_token = bi.split()
        #unigram for the key
        unigram = bi_token[0]
        #find the probability, with Add-1 smoothing
        prob = (bi_dict[bi] + 1) / (vocab_dict[unigram] + V)
        #create the entry for the unigram if it is not present yet
        if unigram not in bi_prob_dict:
            bi_prob_dict[unigram] = []
        #append the probable word with its probability
        bi_prob_dict[unigram].append([prob, bi_token[-1]])
In [ ]:
#finds the lambda values required for interpolation via a grid search
#over the held out data
#arg: int, dict, dict, dict, dict
#returns: list
def estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict):
    max_prob = float('-inf')
    curr_prob = 0.0
    parameters = [0.0, 0.0, 0.0, 0.0]

    #load the held out data
    with open('held_out_corpus.txt', 'r') as file:
        held_out_data = file.read()
    #remove punctuations and do the other cleaning
    held_out_data = removePunctuations(held_out_data)
    held_out_data = held_out_data.split()
    #make quadgram tokens for the parameter estimation
    quad_token_heldout = list(ngrams(held_out_data, 4))

    #grid search over the lambda values in steps of 0.1
    l1 = 0
    l4 = 0
    while l1 <= 1.0:
        l2 = 0
        while l2 <= 1.0:
            l3 = 0
            while l3 <= 1.0:
                #skip when the first three lambdas are all zero or when their sum exceeds 1
                if (l1 == 0 and l2 == 0 and l3 == 0) or ((l1 + l2 + l3) > 1):
                    l3 += 0.1
                    continue
                #the remaining probability mass goes to lambda 4
                l4 = 1 - (l1 + l2 + l3)
                curr_prob = 0
                qc = [0]
                bc = [0]
                tc = [0]
                #log probability of the held out set under the current lambdas;
                #the log is taken to avoid underflow
                for quad in quad_token_heldout:
                    curr_prob += log10(interpolatedProbability(quad, token_len, vocab_dict,
                                       bi_dict, tri_dict, quad_dict, qc, bc, tc,
                                       l1, l2, l3, l4))
                if curr_prob > max_prob:
                    max_prob = curr_prob
                    parameters = [l1, l2, l3, l4]
                l3 += 0.1
            l2 += 0.1
        l1 += 0.1
    return parameters
In [ ]:
#returns: float
#arg: tuple,int,dict,dict,dict,dict,list,list,list,float,float,float,float
#calculates the interpolated probability of the given quadgram; each
#component model uses Add-1 smoothing
def interpolatedProbability(quad_token, token_len, vocab_dict, bi_dict, tri_dict, quad_dict,
                            qc, bc, tc,
                            l1=0.25, l2=0.25, l3=0.25, l4=0.25):
    V = len(vocab_dict)
    sen = ' '.join(quad_token)
    #keep hit statistics for the quad, tri and bigram models; checked before
    #the lookups below, since indexing a defaultdict inserts missing keys
    if sen in quad_dict:
        qc[0] += 1
    if ' '.join(quad_token[1:4]) in tri_dict:
        tc[0] += 1
    if ' '.join(quad_token[2:4]) in bi_dict:
        bc[0] += 1
    prob = (
          l1 * ((quad_dict[sen] + 1) / (tri_dict[' '.join(quad_token[0:3])] + V))
        + l2 * ((tri_dict[' '.join(quad_token[1:4])] + 1) / (bi_dict[' '.join(quad_token[1:3])] + V))
        + l3 * ((bi_dict[' '.join(quad_token[2:4])] + 1) / (vocab_dict[quad_token[2]] + V))
        + l4 * ((vocab_dict[quad_token[3]] + 1) / (token_len + V))
    )
    #log10(1) = 0 adds nothing to the sum, but log10(0) is undefined,
    #so guard against non-positive probabilities
    if prob <= 0:
        return 1
    return prob
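In [ ]:
#minimal sketch of the interpolated estimate with made-up counts, mirroring
#the formula in interpolatedProbability: each component is an Add-1 estimate
#and the lambdas weight the quad, tri, bi and unigram models
toy_vocab = defaultdict(int, {'the': 40, 'cat': 10, 'sat': 8, 'on': 12})
toy_bi = defaultdict(int, {'cat sat': 6, 'sat on': 5})
toy_tri = defaultdict(int, {'the cat sat': 4, 'cat sat on': 3})
toy_quad = defaultdict(int, {'the cat sat on': 2})
qc, bc, tc = [0], [0], [0]
p = interpolatedProbability(('the', 'cat', 'sat', 'on'), 100, toy_vocab,
                            toy_bi, toy_tri, toy_quad, qc, bc, tc,
                            l1=0.7, l2=0.1, l3=0.1, l4=0.1)
print(p)   #0.7*(3/8) + 0.1*(4/10) + 0.1*(6/12) + 0.1*(13/104), approximately 0.365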
In [ ]:
#sorts the probable words acc. to their probabilities, highest first
#returns: void
#arg: dict, dict, dict
def sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    #sort the bigram dict
    for key in bi_prob_dict:
        if len(bi_prob_dict[key]) > 1:
            bi_prob_dict[key] = sorted(bi_prob_dict[key], reverse=True)
    #sort the trigram dict
    for key in tri_prob_dict:
        if len(tri_prob_dict[key]) > 1:
            tri_prob_dict[key] = sorted(tri_prob_dict[key], reverse=True)
    #sort the quadgram dict, keeping only the top two candidates
    for key in quad_prob_dict:
        if len(quad_prob_dict[key]) > 1:
            quad_prob_dict[key] = sorted(quad_prob_dict[key], reverse=True)[:2]
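In [ ]:
#small illustration of sortProbWordDict with a made-up entry: the candidate
#lists are sorted by probability, highest first
demo_tri_prob = {'the cat': [[0.1, 'ran'], [0.4, 'sat'], [0.2, 'slept']]}
sortProbWordDict({}, demo_tri_prob, {})
print(demo_tri_prob['the cat'])   #[[0.4, 'sat'], [0.2, 'slept'], [0.1, 'ran']]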
In [ ]:
#picks the most probable words from the bi, tri and quad prob dicts as word prediction candidates
#returns: list of [float,string]
#arg: string,dict,dict,dict
def chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict):
    word_choice = []
    token = sen.split()
    #top candidate given the last word
    if token[-1] in bi_prob_dict:
        word_choice += bi_prob_dict[token[-1]][:1]
    #top candidate given the last two words
    if ' '.join(token[1:]) in tri_prob_dict:
        word_choice += tri_prob_dict[' '.join(token[1:])][:1]
    #top candidate given all three words
    if ' '.join(token) in quad_prob_dict:
        word_choice += quad_prob_dict[' '.join(token)][:1]
    return word_choice
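In [ ]:
#small illustration of chooseWords with hand-built (made-up) probability
#dicts: up to one candidate comes from each of the three models
toy_bi_prob = {'sat': [[0.3, 'on']]}
toy_tri_prob = {'cat sat': [[0.4, 'on']]}
toy_quad_prob = {'the cat sat': [[0.5, 'on'], [0.2, 'there']]}
print(chooseWords('the cat sat', toy_bi_prob, toy_tri_prob, toy_quad_prob))
#[[0.3, 'on'], [0.4, 'on'], [0.5, 'on']]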
In [ ]:
#does the prediction for the sentence using interpolation
#uses Add-1 smoothing
#returns: string
#arg: string,dict,dict,dict,dict,int,list,list
def doInterpolatedPredictionAdd1(sen, bi_dict, tri_dict, quad_dict,
                                 vocab_dict, token_len, word_choice, param):
    pred = ''
    max_prob = 0.0
    V = len(vocab_dict)
    #find the interpolated probability of each candidate word and keep the best
    for word in word_choice:
        key = sen + ' ' + word[1]
        quad_token = key.split()
        prob = (
              param[0] * ((quad_dict[key] + 1) / (tri_dict[' '.join(quad_token[0:3])] + V))
            + param[1] * ((tri_dict[' '.join(quad_token[1:4])] + 1) / (bi_dict[' '.join(quad_token[1:3])] + V))
            + param[2] * ((bi_dict[' '.join(quad_token[2:4])] + 1) / (vocab_dict[quad_token[2]] + V))
            + param[3] * ((vocab_dict[quad_token[3]] + 1) / (token_len + V))
        )
        if prob > max_prob:
            max_prob = prob
            pred = word
    #return pred itself to get the word together with its probability
    if pred:
        return pred[1]
    else:
        return ''
In [ ]:
#takes input from the user
#returns: string
#arg: void
def takeInput():
    cond = False
    while not cond:
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter at least 3 words!")
        else:
            cond = True
            #keep only the last three words
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen
In [ ]:
#computes the score for the test data, i.e. no. of right predictions vs. no. of wrong predictions
#returns: int
#arg: list,dict,dict,dict,dict,dict,dict,dict,int,list
def computeTestScore(test_token, bi_dict, tri_dict, quad_dict, vocab_dict,
                     bi_prob_dict, tri_prob_dict, quad_prob_dict, token_len, param):
    #increment the score when a correct prediction is made
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/add1_smoothing_score.txt', 'w') as w:
        for sent in test_token:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]
            #select probable word candidates for the prediction
            word_choice = chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
            result = doInterpolatedPredictionAdd1(sen, bi_dict, tri_dict, quad_dict,
                                                  vocab_dict, token_len, word_choice, param)
            if result == correct_word:
                score += 1
            else:
                wrong += 1
            total += 1
        stats = ('Total Word Predictions: ' + str(total) + '\n' +
                 'Correct Predictions: ' + str(score) + '\n' +
                 'Wrong Predictions: ' + str(wrong) + '\n' +
                 'ACCURACY: ' + str((score/total)*100) + '%')
        w.write(stats)
    #print the stats
    print(stats)
    return score
In [ ]:
#returns: float
#arg: list,int,dict,dict,dict,dict
#computes the perplexity of the language model on the test data
def computePerplexity(test_quadgrams, token_len, tri_dict, quad_dict, vocab_dict, prob_dict):
    perplexity = float(1.0)
    #normalise by the no. of test quadgrams
    n = len(test_quadgrams)
    V = len(vocab_dict)
    for item in test_quadgrams:
        sen_token = list(item)
        tri_sen = ' '.join(sen_token[0:3])
        quad_sen = ' '.join(sen_token)
        #Add-1 smoothed probability of the quadgram
        prob = (quad_dict[quad_sen] + 1) / (tri_dict[tri_sen] + V)
        #perplexity is the inverse probability of the test set, normalised
        #by the number of quadgrams
        perplexity = perplexity * (prob ** (-1./n))
    #append, so the accuracy stats written earlier are not overwritten
    with open('Test_Scores/add1_smoothing_score.txt', 'a') as w:
        w.write('\nPerplexity: ' + str(perplexity))
    return perplexity
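In [ ]:
#sanity check of the perplexity formula used above (made-up numbers): for n
#quadgrams of equal probability p, the product of p**(-1/n) over all of them
#reduces to 1/p
pp = 1.0
n, p = 5, 0.25
for _ in range(n):
    pp *= p ** (-1./n)
print(pp)   #4.0, up to floating point error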
In [ ]:
#returns: int
#arg: string,string,dict,dict,dict,dict,dict
#trains the language model and evaluates it on the test corpus
#note: fills the global bi_prob_dict, tri_prob_dict and quad_prob_dict
def trainCorpus(train_file, test_file, bi_dict, tri_dict, quad_dict, vocab_dict, prob_dict):
    score = 0
    #load the training corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))
    start_time1 = time.time()

    #lambdas for interpolation, found earlier using estimateParameters()
    param = [0.7, 0.1, 0.1, 0.1]
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)

    #create the trigram probability dictionary
    findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict)
    #create the bigram probability dictionary
    findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict)
    #create the quadgram probability dictionary
    findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)
    #sort the probability dictionaries
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    gc.collect()
    print("---Preprocessing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))

    ### TESTING WITH THE TEST CORPUS
    #load the test corpus
    with open(test_file, 'r') as file:
        test_data = file.read()
    #remove punctuations from the test data
    test_data = removePunctuations(test_data)
    test_token = test_data.split()
    #split the test data into quadgrams
    test_quadgrams = list(ngrams(test_token, 4))

    #score the word predictions
    start_time2 = time.time()
    score = computeTestScore(test_quadgrams, bi_dict, tri_dict, quad_dict, vocab_dict,
                             bi_prob_dict, tri_prob_dict, quad_prob_dict, token_len, param)
    print('Score:', score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time2))

    start_time3 = time.time()
    perplexity = computePerplexity(test_quadgrams, token_len, tri_dict, quad_dict, vocab_dict, prob_dict)
    print('Perplexity:', perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time3))
    return token_len
In [ ]:
def main():
    #variable declaration
    vocab_dict = defaultdict(int)       #for storing the words with their frequencies
    bi_dict = defaultdict(int)          #for keeping the count of two-word sequences
    tri_dict = defaultdict(int)         #for keeping the count of three-word sequences
    quad_dict = defaultdict(int)        #for keeping the count of four-word sequences
    quad_prob_dict = defaultdict(list)  #for storing the probable words for quadgram sentences
    tri_prob_dict = defaultdict(list)   #for storing the probable words for trigram sentences
    bi_prob_dict = defaultdict(list)    #for storing the probable words for bigram sentences

    train_file = 'corpusfile.txt'
    #load the corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    #lambdas for interpolation, found earlier using estimateParameters()
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    param = [0.7, 0.1, 0.1, 0.1]
    #create the bigram probability dictionary
    findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict)
    #create the trigram probability dictionary
    findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict)
    #create the quadgram probability dictionary
    findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)
    #sort the probability dictionaries
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    #take user input
    input_sen = takeInput()

    ### PREDICTION
    #choose the most probable words for the prediction
    word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
    prediction = doInterpolatedPredictionAdd1(input_sen, bi_dict, tri_dict, quad_dict,
                                              vocab_dict, token_len, word_choice, param)
    print('Word Prediction:', prediction)
In [ ]:
if __name__ == '__main__':
    main()
In [ ]:
#variable declaration
vocab_dict = defaultdict(int)       #for storing the words with their frequencies
bi_dict = defaultdict(int)          #for keeping the count of two-word sequences
tri_dict = defaultdict(int)         #for keeping the count of three-word sequences
quad_dict = defaultdict(int)        #for keeping the count of four-word sequences
quad_prob_dict = defaultdict(list)  #for storing the probable words for quadgram sentences
tri_prob_dict = defaultdict(list)   #for storing the probable words for trigram sentences
bi_prob_dict = defaultdict(list)    #for storing the probable words for bigram sentences
In [ ]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#train the model and evaluate it on the test corpus
token_len = trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,quad_prob_dict)
In [ ]:
train_file = 'corpusfile.txt'
#load the corpus for the dataset
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
In [ ]:
#estimate the lambdas for interpolation
#param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
param = [0.8,0.2,0.0,0.0]
In [ ]:
#create bigram Probability Dictionary
findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict)
In [ ]:
#create trigram Probability Dictionary
findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict)
In [ ]:
#create quadgram Probability Dictionary
findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)
In [ ]:
#sort the probability dictionaries
sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
In [ ]:
#FOR DEBUGGING ONLY
#writeProbDicts is not defined in this notebook, so the call is left commented out
#writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict)
In [ ]:
#take user input
input_sen = takeInput()
### PREDICTION
start_time2 = time.time()
#choose most probable words for prediction
word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
prediction = doInterpolatedPredictionAdd1(input_sen, bi_dict, tri_dict, quad_dict, vocab_dict,token_len, word_choice, param)
#prediction = doPrediction(input_sen,prob_dict)
print('Word Prediction:',prediction)
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))