In [1]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()
In [2]:
#returns: string
#arg: string
#removes punctuation, changes to lowercase, retains the apostrophe mark before 's'
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    i = 0
    #change each word to lowercase and replace punctuation in it with spaces
    for word in temp_l:
        j = 0
        for l in word:
            if l in string.punctuation:
                #keep an apostrophe that is followed by 's' (e.g. "john's")
                if l == "'":
                    if j + 1 < len(word) and word[j+1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l, " ")
            j += 1
        temp_l[i] = word.lower()
        i = i + 1
    #punctuation is replaced with a space (not deleted) so that e.g. "here---so"
    #becomes "here so" once the caller splits the returned string
    content = " ".join(temp_l)
    return content
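In [ ]:
#Illustrative sanity check for removePunctuations() (a toy sketch; the sample sentence is made up).
#Punctuation becomes a space, apostrophes followed by 's' survive, and everything is lowercased;
#callers split() the result, which collapses the leftover extra spaces.
print(removePunctuations("Here---so, it's John's BOOK!"))
#-> "here   so  it's john's book " (extra spaces disappear after split())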
In [3]:
#returns: int (total no. of words in the corpus)
#arg: string, dict, dict, dict, dict
#loads the corpus for the dataset and makes the frequency count of bigram, trigram and quadgram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):
    w1 = ''    #for storing the 3rd last word, to be used for the next token set
    w2 = ''    #for storing the 2nd last word, to be used for the next token set
    w3 = ''    #for storing the last word, to be used for the next token set
    token = []
    #total no. of words in the corpus
    word_len = 0
    #open the corpus file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            #split the string into word tokens
            temp_l = line.split()
            i = 0
            #does the same as the removePunctuations() function, inlined here for performance:
            #changes each word to lowercase and replaces punctuation in it with spaces
            for word in temp_l:
                j = 0
                for l in word:
                    if l in string.punctuation:
                        if l == "'":
                            if j + 1 < len(word) and word[j+1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l, " ")
                    j += 1
                temp_l[i] = word.lower()
                i = i + 1
            #punctuation becomes a space, so e.g. "here---so" splits into "here so"
            content = " ".join(temp_l)
            token = content.split()
            word_len = word_len + len(token)
            if not token:
                continue
            #add the last word from the previous line, since reading line by line
            #would otherwise miss the bigrams that span a line break
            if w3 != '':
                token.insert(0, w3)
            temp0 = list(ngrams(token, 2))
            #for trigrams, also prepend the 2nd last word from the previous line
            if w2 != '':
                token.insert(0, w2)
            #tokens for trigrams
            temp1 = list(ngrams(token, 3))
            #insert the 3rd last word from the previous line for quadgram pairing
            if w1 != '':
                token.insert(0, w1)
            #add new unique words to the vocabulary, or bump their frequency
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word] += 1
            #tokens for quadgrams
            temp2 = list(ngrams(token, 4))
            #count the frequency of the bigram sentences
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1
            #count the frequency of the trigram sentences
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1
            #count the frequency of the quadgram sentences
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1
            #store the last three words for pairing with the next line
            #(guard against lines shorter than three tokens)
            n = len(token)
            if n >= 3:
                w1 = token[n-3]
                w2 = token[n-2]
                w3 = token[n-1]
    return word_len
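In [ ]:
#Minimal usage sketch for loadCorpus() on a throwaway two-line corpus (the file name
#'toy_corpus.txt' is made up for illustration).
with open('toy_corpus.txt', 'w') as f:
    f.write("the quick brown fox\njumps over the lazy dog\n")
toy_bi, toy_tri = defaultdict(int), defaultdict(int)
toy_quad, toy_vocab = defaultdict(int), defaultdict(int)
n_tokens = loadCorpus('toy_corpus.txt', toy_bi, toy_tri, toy_quad, toy_vocab)
print(n_tokens)                            #9 tokens in total
print(toy_quad['quick brown fox jumps'])   #quadgrams span line breaks, so this count is 1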
In [4]:
#sorts the probable words according to their probabilities
#returns: void
#arg: dict
def sortProbWordDict(prob_dict):
    for key in prob_dict:
        if len(prob_dict[key]) > 1:
            #keep only the top 2 most probable words
            prob_dict[key] = sorted(prob_dict[key], reverse=True)[:2]
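In [ ]:
#Tiny illustration of sortProbWordDict(): each candidate list is sorted by probability,
#descending, and truncated to the top 2 entries in place (hand-built toy dict).
demo = {'a b c': [[0.1, 'x'], [0.5, 'y'], [0.3, 'z']]}
sortProbWordDict(demo)
print(demo)   #{'a b c': [[0.5, 'y'], [0.3, 'z']]}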
In [5]:
#returns: string
#arg: string, dict
#does prediction for the given sentence
def doPrediction(sen, prob_dict):
    if sen in prob_dict:
        return prob_dict[sen][0][1]
    else:
        return ""
In [6]:
#returns: string
#arg: void
#takes input from the user and keeps only the last three cleaned words
def takeInput():
    cond = False
    while not cond:
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter at least 3 words!")
        else:
            cond = True
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen
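In [ ]:
#takeInput() keeps only the last three cleaned words; the same trimming can be checked
#non-interactively (toy sentence made up for illustration):
sen = removePunctuations('What is the weather LIKE today?')
print(' '.join(sen.split()[-3:]))   #weather like today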
In [7]:
#returns: int
#arg: list, dict, dict, dict, dict
#computes the prediction score for the test data
def computeTestScore(test_sent, tri_dict, quad_dict, vocab_dict, prob_dict):
    #increment the score for a correct prediction, count the wrong ones otherwise
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/Knesser_Ney_Interpolated_Score.txt', 'w') as w:
        for sent in test_sent:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]
            result = doPrediction(sen, prob_dict)
            if result == correct_word:
                score += 1
            else:
                wrong += 1
            total += 1
        w.write('Total Word Predictions: ' + str(total) + '\n' + 'Correct Predictions: ' + str(score) +
                '\n' + 'Wrong Predictions: ' + str(wrong) + '\n' + 'ACCURACY: ' + str((score/total)*100) + '%')
    #print stats
    print('Total Word Predictions: ' + str(total) + '\n' + 'Correct Predictions: ' + str(score) +
          '\n' + 'Wrong Predictions: ' + str(wrong) + '\n' + 'ACCURACY: ' + str((score/total)*100) + '%')
    return score
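In [ ]:
#Quick check of the scoring loop with two toy 4-word test tuples (the Test_Scores/
#directory that computeTestScore() writes into is created first).
import os
os.makedirs('Test_Scores', exist_ok=True)
demo_prob = {'how are you': [[0.6, 'doing']]}
demo_test = [('how', 'are', 'you', 'doing'), ('how', 'are', 'you', 'today')]
computeTestScore(demo_test, {}, {}, {}, demo_prob)   #1 correct, 1 wrong -> 50% accuracy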
In [8]:
#returns: float
#arg: list, int, dict, dict, dict, dict
#computes the perplexity of the model over the quadgram counts
def computePerplexity(test_quadgrams, token_len, tri_dict, quad_dict, vocab_dict, prob_dict):
    perplexity = float(1.0)
    n = token_len
    for item in quad_dict:
        sen_token = item.split()
        sen = ' '.join(sen_token[0:3])
        prob = quad_dict[item] / tri_dict[sen]
        #PP = (product over all items of 1/P(w|context)) ** (1/n), accumulated incrementally
        perplexity = perplexity * ((1.0/prob) ** (1.0/n))
    with open('Test_Scores/Knesser_Ney_Interpolated_Score.txt', 'a') as w:
        w.write('\nPerplexity: ' + str(perplexity))
    return perplexity
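In [ ]:
#Worked check of the perplexity formula PP = (prod over i of 1/P(w_i|context))**(1/N),
#accumulated incrementally as in computePerplexity() (made-up probabilities):
probs = [0.25, 0.5, 0.125]
pp = 1.0
for p in probs:
    pp *= (1.0/p) ** (1.0/len(probs))
print(pp)   #(4 * 2 * 8)**(1/3) = 4.0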
In [9]:
#creates the dictionaries required for computing the interpolated Kneser-Ney probability
#returns: dict, dict
#arg: dict, int
def createKNDict(ngram_dict, n):
    #for the Kneser-Ney probability formula we need two important counts:
    #the no. of distinct ngrams that end with Wn, and the no. of distinct ngrams
    #that start with the context Wn-1. The formula is split into two parts; the
    #first part can be found in constant time, and the second is precomputed here.
    #for storing the count of ngrams ending with Wn, key: unigram
    first_dict = {}
    #for storing the count of ngrams having Wn-1 as their starting part, key: (n-1)-gram sentence
    sec_dict = {}
    for key in ngram_dict:
        #split the key sentence into tokens
        ngram_token = key.split()
        #take the first n-1 words as the context (e.g. for a quadgram, words 0..2)
        n_1gram_sen = ' '.join(ngram_token[:n-1])
        #n_1gram_sen starts sec_dict[n_1gram_sen] distinct ngrams in ngram_dict
        if n_1gram_sen not in sec_dict:
            sec_dict[n_1gram_sen] = 1
        else:
            sec_dict[n_1gram_sen] += 1
        if ngram_token[-1] not in first_dict:
            first_dict[ngram_token[-1]] = 1
        else:
            first_dict[ngram_token[-1]] += 1
    return first_dict, sec_dict
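In [ ]:
#What createKNDict() produces on a toy bigram table: first_dict counts how many distinct
#ngrams END with each word (the Kneser-Ney continuation count), sec_dict how many distinct
#ngrams START with each context (counts are made up for illustration).
toy_bigrams = {'the cat': 2, 'the dog': 1, 'a cat': 3}
first, sec = createKNDict(toy_bigrams, 2)
print(first)   #{'cat': 2, 'dog': 1} -- 'cat' ends two distinct bigrams
print(sec)     #{'the': 2, 'a': 1}   -- 'the' starts two distinct bigrams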
In [10]:
#finds the interpolated Kneser-Ney probability for prediction
#returns: void (fills prob_dict)
#arg: dict, dict, dict, dict, dict
def computeKnesserNeyProb(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict):
    d = 0.75
    #first create the dicts storing the count of ngrams ending with Wn and of
    #ngrams starting with the context Wn-1
    #for quadgrams
    quad_first_dict, quad_sec_dict = createKNDict(quad_dict, 4)
    #for trigrams
    tri_first_dict, tri_sec_dict = createKNDict(tri_dict, 3)
    #for bigrams
    bi_first_dict, bi_sec_dict = createKNDict(bi_dict, 2)
    #now find the probability for each observed quadgram, keyed by its trigram prefix
    for quad in quad_dict:
        quad_token = quad.split()
        #W1 W2 W3
        tri_sen = ' '.join(quad_token[:3])
        #the formula has been divided into parts for easier understanding
        quad_prob1 = max(quad_dict[quad] - d, 0) / tri_dict[tri_sen]
        quad_prob2 = d / tri_dict[tri_sen] * quad_sec_dict[tri_sen]
        tri_prob1 = max(tri_first_dict[quad_token[-1]] - d, 0) / len(tri_dict)
        tri_prob2 = (d / len(tri_dict)) * tri_sec_dict[' '.join(quad_token[1:3])]
        bi_prob1 = max(bi_first_dict[quad_token[-1]] - d, 0) / len(bi_dict)
        bi_prob2 = (d / len(bi_dict)) * bi_sec_dict[quad_token[-2]]
        uni_prob = bi_first_dict[quad_token[-1]] / len(bi_dict)
        prob = quad_prob1 + quad_prob2*(tri_prob1 + tri_prob2*(bi_prob1 + bi_prob2*uni_prob))
        if tri_sen not in prob_dict:
            prob_dict[tri_sen] = []
        prob_dict[tri_sen].append([prob, quad_token[-1]])
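In [ ]:
#Hand check of the nested interpolation used above (all counts are made up): each level
#contributes max(count - d, 0)/denominator plus a back-off weight of
#d/denominator * (no. of distinct continuations) times the lower-order probability.
d = 0.75
higher = max(3 - d, 0) / 10      #discounted highest-order term, count 3 out of 10
backoff_weight = d / 10 * 2      #2 distinct words were seen after this context
lower = 0.05                     #assumed lower-order probability
print(higher + backoff_weight * lower)   #0.225 + 0.15*0.05 = 0.2325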
In [11]:
def computeKnesserNeyProb1(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict):
    d = 0.75
    #first create the dicts storing the count of ngrams ending with Wn and of
    #ngrams starting with the context Wn-1
    #for quadgrams
    quad_first_dict, quad_sec_dict = createKNDict(quad_dict, 4)
    #for trigrams
    tri_first_dict, tri_sec_dict = createKNDict(tri_dict, 3)
    #for bigrams
    bi_first_dict, bi_sec_dict = createKNDict(bi_dict, 2)
    #score every vocabulary word for every observed trigram and keep the best two;
    #this is O(|tri_dict| * |vocab|), unlike computeKnesserNeyProb() above, which
    #only scores the quadgrams actually observed in the corpus
    for tri in tri_dict:
        max_prob1 = 0.0
        max_prob2 = 0.0
        word1 = ''
        word2 = ''
        for word in vocab_dict:
            quad = tri + ' ' + word
            quad_token = quad.split()
            #the formula has been divided into parts for easier understanding;
            #.get() lookups avoid KeyErrors (and defaultdict pollution) for unseen combinations
            quad_prob1 = max(quad_dict.get(quad, 0) - d, 0) / tri_dict[tri]
            quad_prob2 = d / tri_dict[tri] * quad_sec_dict.get(tri, 0)
            tri_prob1 = max(tri_first_dict.get(word, 0) - d, 0) / len(tri_dict)
            tri_prob2 = (d / len(tri_dict)) * tri_sec_dict.get(' '.join(quad_token[1:3]), 0)
            bi_prob1 = max(bi_first_dict.get(word, 0) - d, 0) / len(bi_dict)
            bi_prob2 = (d / len(bi_dict)) * bi_sec_dict.get(quad_token[-2], 0)
            uni_prob = bi_first_dict.get(word, 0) / len(bi_dict)
            curr_prob = quad_prob1 + quad_prob2*(tri_prob1 + tri_prob2*(bi_prob1 + bi_prob2*uni_prob))
            #track the two best continuations, demoting the old best when it is beaten
            if curr_prob > max_prob1:
                max_prob2, word2 = max_prob1, word1
                max_prob1, word1 = curr_prob, word
            elif curr_prob > max_prob2:
                max_prob2, word2 = curr_prob, word
        prob_dict[tri] = [[max_prob1, word1], [max_prob2, word2]]
In [12]:
#returns: void
#arg: string, string, dict, dict, dict, dict, dict
#used for testing the Language Model
def trainCorpus(train_file, test_file, bi_dict, tri_dict, quad_dict, vocab_dict, prob_dict):
    score = 0
    #load the training corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))
    start_time1 = time.time()
    #compute the Kneser-Ney probabilities
    computeKnesserNeyProb1(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict)
    #sort the probable words by their probability
    sortProbWordDict(prob_dict)
    gc.collect()
    print("---Preprocessing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))
    ### TESTING WITH TEST CORPUS
    #now load the test corpus
    with open(test_file, 'r') as file:
        test_data = file.read()
    #remove punctuation from the test data
    test_data = removePunctuations(test_data)
    #split the test data into lists of 4 words
    test_token = test_data.split()
    test_quadgrams = list(ngrams(test_token, 4))
    start_time1 = time.time()
    score = computeTestScore(test_quadgrams, tri_dict, quad_dict, vocab_dict, prob_dict)
    print('Score:', score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time1))
    start_time2 = time.time()
    perplexity = computePerplexity(test_quadgrams, token_len, tri_dict, quad_dict, vocab_dict, prob_dict)
    print('Perplexity:', perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time2))
In [13]:
def main():
    #variable declarations
    vocab_dict = defaultdict(int)   #for storing the different words with their frequencies
    bi_dict = defaultdict(int)      #for keeping count of sentences of two words
    tri_dict = defaultdict(int)     #for keeping count of sentences of three words
    quad_dict = defaultdict(int)    #for keeping count of sentences of four words
    prob_dict = defaultdict(list)   #for storing the probable words with probabilities for prediction
    train_file = 'corpusfile.txt'
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    #compute the interpolated Kneser-Ney probabilities
    computeKnesserNeyProb(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict)
    #sort the probable words by their probability
    sortProbWordDict(prob_dict)
    #take user input
    input_sen = takeInput()
    ### PREDICTION
    prediction = doPrediction(input_sen, prob_dict)
    print('Word Prediction:', prediction)
In [ ]:
if __name__ == '__main__':
    main()
In [14]:
#variable declaration
vocab_dict = defaultdict(int) #for storing the different words with their frequencies
bi_dict = defaultdict(int) #for keeping count of sentences of two words
tri_dict = defaultdict(int) #for keeping count of sentences of three words
quad_dict = defaultdict(int) #for keeping count of sentences of four words
prob_dict = defaultdict(list) #for storing probability of probable words for prediction
#load the corpus for the dataset
#loadCorpus('corpusfile.txt',bi_dict,tri_dict,quad_dict,vocab_dict)
print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))
In [ ]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#train the model on the training corpus and evaluate it on the test corpus
trainCorpus(train_file, test_file, bi_dict, tri_dict, quad_dict, vocab_dict, prob_dict)
In [ ]:
train_file = 'corpusfile.txt'
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
In [ ]:
#compute the interpolated Kneser-Ney probabilities for the observed quadgrams
computeKnesserNeyProb(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict)
In [ ]:
#sort the probable words by their probability
sortProbWordDict(prob_dict)
In [ ]:
#take user input
input_sen = takeInput()
### PREDICTION
start_time2 = time.time()
prediction = doPrediction(input_sen,prob_dict)
print('Word Prediction:',prediction)
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))
In [ ]:
#FOR DEBUGGING ONLY
#note: writeProbDicts() is not defined in this notebook and is assumed to exist elsewhere
writeProbDicts(prob_dict)