Word prediction

Language Model based on n-gram Probabilistic Model

Interpolated Knesser Ney Used

Highest Order n-gram used is Quadgram

Import corpus


In [1]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()

Do preprocessing:

Remove the punctuations and lowercase the tokens


In [2]:
#returns: string
#arg: string
#remove punctuations, change to lowercase ,retain the apostrophe mark
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    #print(temp_l)
    i = 0
    j = 0
    
    #changes the word to lowercase and removes punctuations from it
    for word in temp_l :
        j = 0
        #print(len(word))
        for l in word :
            if l in string.punctuation:
                if l == "'":
                    if j+1<len(word) and word[j+1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l," ")
                #print(j,word[j])
            j += 1

        temp_l[i] = word.lower()
        i=i+1   

    #spliting is being done here beacause in sentences line here---so after punctuation removal it should 
    #become "here so"   
    content = " ".join(temp_l)

    return content

Tokenize and load the corpus data


In [3]:
#returns : void
#arg: string,dict,dict,dict,dict
#loads the corpus for the dataset and makes the frequency count of quadgram ,bigram and trigram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):

    w1 = ''    #for storing the 3rd last word to be used for next token set
    w2 = ''    #for storing the 2nd last word to be used for next token set
    w3 = ''    #for storing the last word to be used for next token set
    token = []
    #total no. of words in the corpus
    word_len = 0

    #open the corpus file and read it line by line
    with open(file_path,'r') as file:
        for line in file:

            #split the string into word tokens
            temp_l = line.split()
            i = 0
            j = 0
            
            #does the same as the removePunctuations() function,implicit declratation for performance reasons
            #changes the word to lowercase and removes punctuations from it
            for word in temp_l :
                j = 0
                #print(len(word))
                for l in word :
                    if l in string.punctuation:
                        if l == "'":
                            if j+1<len(word) and word[j+1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l," ")
                        #print(j,word[j])
                    j += 1

                temp_l[i] = word.lower()
                i=i+1   

            #spliting is being done here beacause in sentences line here---so after punctuation removal it should 
            #become "here so"   
            content = " ".join(temp_l)

            token = content.split()
            word_len = word_len + len(token)  

            if not token:
                continue

            #add the last word from previous line
            if w3!= '':
                token.insert(0,w3)

            temp0 = list(ngrams(token,2))

            #since we are reading line by line some combinations of word might get missed for pairing
            #for trigram
            #first add the previous words
            if w2!= '':
                token.insert(0,w2)

            #tokens for trigrams
            temp1 = list(ngrams(token,3))

            #insert the 3rd last word from previous line for quadgram pairing
            if w1!= '':
                token.insert(0,w1)

            #add new unique words to the vocaulary set if available
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word]+= 1
                  
            #tokens for quadgrams
            temp2 = list(ngrams(token,4))

            #count the frequency of the bigram sentences
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1

            #count the frequency of the trigram sentences
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1

            #count the frequency of the quadgram sentences
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1


            #then take out the last 3 words
            n = len(token)

            #store the last few words for the next sentence pairing
            w1 = token[n -3]
            w2 = token[n -2]
            w3 = token[n -1]
    return word_len

Some Misc functions for debugging

Sort the probable words


In [4]:
#for sorting the probable word acc. to their probabilities

#returns: void
#arg: dict
def sortProbWordDict(prob_dict):
   
    for key in prob_dict:
        if len(prob_dict[key])>1:
            #only at most top 2 most probable words have been taken
            prob_dict[key] = sorted(prob_dict[key],reverse = True)[:2]

Driver function for doing the prediction


In [5]:
#returns: string
#arg: string,dict
#does prediction for the the sentence
def doPrediction(sen, prob_dict):
    if sen in prob_dict:
        return prob_dict[sen][0][1]
    else:
        return ""

For Taking input from the User


In [6]:
#returns: string
#arg: void
#for taking input from user
def takeInput():
    cond = False
    #take input
    while(cond == False):
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter atleast 3 words !")
        else:
            cond = True
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen

Test Score ,Perplexity Calculation:

For computing the Test Score


In [7]:
#return:int
#arg:list,dict,dict,dict,dict
#computes the score for test data
def computeTestScore(test_sent,tri_dict,quad_dict,vocab_dict,prob_dict):
    #increment the score value if correct prediction is made else decrement its value
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/Knesser_Ney_Interpolated_Score.txt','w') as w:
        for sent in test_sent:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]
           
            result = doPrediction(sen,prob_dict)
            if result == correct_word:
                score+=1
            else:
                wrong += 1

            total += 1
            
        w.write('Total Word Prdictions: '+str(total) + '\n' +'Correct Prdictions: '+str(score) +
                '\n'+'Wrong Prdictions: '+str(wrong) + '\n'+'ACCURACY: '+str((score/total)*100)+'%' )
        #print stats
        print('Total Word Prdictions: '+str(total) + '\n' +'Correct Prdictions: '+str(score) +
                '\n'+'Wrong Prdictions: '+str(wrong) + '\n'+'ACCURACY:'+str((score/total)*100)+'%' )
    return score

For Computing the Perplexity


In [8]:
#return:float
#arg:list,int,dict,dict,dict,dict
#computes the score for test data
def computePerplexity(test_quadgrams,token_len,tri_dict,quad_dict,vocab_dict,prob_dict):
    
    perplexity = float(1.0)
    n = token_len

    for item in quad_dict:
        sen_token = item.split()
        sen = ' '.join(sen_token[0:3])
        prob = quad_dict[item]/tri_dict[sen]
        perplexity = perplexity * ( prob**(1./n))
    with open('Test_Scores/Knesser_Ney_Interpolated_Score.txt','a') as w:
        w.write('\nPerplexity: '+str(perplexity))
    return perplexity

For creating Dictionaries required for Knesser Ney Probability Calculation

This function makes dictionaries which keeps count of ngrams where Wn is followed by Wn-1 and also count of ngrams where Wn-1 preceeds the ngram


In [9]:
#creates the dictionaries required for computing Interpolated Knesser Ney probability
#return:dict,dict
#arg:dict, int
def createKNDict(ngram_dict, n):
    #for knesser ney probability formula we need to find to important things 
    #first is for P(Wn|Wn-1) if find no. of ngrams which ends with Wn and no. of ngrams which starts 
    #with Wn-1
    #so we divide the formula into two parts ,first part can be found in constant time
    #and second term is found here
    i = 0
    d = 0.75
    #for storing count of ngram ending with Wn,key:unigram
    first_dict = {}
    #for storing count of ngram having Wn-1 as its starting part, key: trigram sentence
    sec_dict = {}
    
    for key in ngram_dict:
        #split the key sentence into tokens 
        ngram_token = key.split()
        #since the indexing is from 0 ,so for quadgram we need to create a sentence of three words
        #so start from 0 to 2,so we subtract 1,similarly for trigram from 0 to 1 
        n_1gram_sen = ' '.join(ngram_token[:n-1])
        
        #n_1gram_sen is the word that  stars in sec_dict[n_1gram_sen] number of times in ngram_dict 
        if n_1gram_sen not in sec_dict:
            sec_dict[ n_1gram_sen ] = 1
        else:
            sec_dict[ n_1gram_sen ] += 1
            
        if ngram_token[-1] not in first_dict:
            first_dict[ ngram_token[-1] ] = 1
        else:
            first_dict[ ngram_token[-1] ] += 1
    
    return first_dict, sec_dict

For Computing Knesser Ney probability for various words


In [10]:
#Finds the Knesser Ney probability for prediction
#return:dict,dict
#arg:dict, int
def computeKnesserNeyProb(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict ):
        d = 0.75
        #first create the dict for storing the count of Wn-1 followed by Wn and for
        #ngrams preceding Wn-1
        
        #for quadgram
        quad_first_dict, quad_sec_dict = createKNDict(quad_dict, 4)
        
        #for trigram
        tri_first_dict, tri_sec_dict = createKNDict(tri_dict, 3)
        
        #for bigram
        bi_first_dict, bi_sec_dict = createKNDict(bi_dict, 2)
       
        #now find the probability for the trigram sentences
        for quad in quad_dict:
            quad_token = quad.split()
            #W1,W2,W3
            tri_sen = ' '.join(quad_token[:3])
            
            #the forumula has been divied into parts for easier understanding
            quad_prob1 = max( quad_dict[quad] - d, 0) / tri_dict[tri_sen]
            quad_prob2 = d/tri_dict[tri_sen] * ( quad_sec_dict[tri_sen] )
            
            tri_prob1 = max( tri_first_dict[quad_token[-1]] - d, 0) / len(tri_dict)
            tri_prob2 = ( d/len(tri_dict) )* ( tri_sec_dict[' '.join(quad_token[1:3])] )
            
            bi_prob1 = max( bi_first_dict[quad_token[-1]] - d, 0) / len(bi_dict)
            bi_prob2 = ( d/len(bi_dict) ) * ( bi_sec_dict[quad_token[-2]])
            uni_prob = bi_first_dict[quad_token[-1]] / len(bi_dict)
            
            prob = quad_prob1 + quad_prob2*( tri_prob1 + tri_prob2*( bi_prob1 + bi_prob2* uni_prob ) )
           
            if tri_sen not in prob_dict:
                prob_dict[tri_sen] = []
                prob_dict[tri_sen].append([prob,quad_token[-1]])
            else:
                prob_dict[tri_sen].append([prob,quad_token[-1]])

In [11]:
def computeKnesserNeyProb1(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict ):
        d = 0.75
        
        #first create the dict for storing the count of Wn-1 followed by Wn and for
        #ngrams preceding Wn-1
        
        #for quadgram
        quad_first_dict, quad_sec_dict = createKNDict(quad_dict, 4)
        
        #for trigram
        tri_first_dict, tri_sec_dict = createKNDict(tri_dict, 3)
        
        #for bigram
        bi_first_dict, bi_sec_dict = createKNDict(bi_dict, 2)
        
        #now find the probability for the trigram sentences
        for tri in tri_dict:
            max_prob1 = 0.0
            max_prob2 = 0.0
            curr_prob = 0.0
            word1 = ''
            word2 = ''
            for word in vocab_dict:
                quad = tri + ' ' + word
                quad_token = quad.split()
                #W1,W2,W3
                tri_sen = ' '.join(quad_token[:3])

                #the forumula has been divied into parts for easier understanding
                quad_prob1 = max( quad_dict[quad] - d, 0) / tri_dict[tri_sen]
                quad_prob2 = d/tri_dict[tri_sen] * ( quad_sec_dict[tri_sen] )

                tri_prob1 = max( tri_first_dict[quad_token[-1]] - d, 0) / len(tri_dict)
                tri_prob2 = ( d/len(tri_dict) )* ( tri_sec_dict[' '.join(quad_token[1:3])] )

                bi_prob1 = max( bi_first_dict[quad_token[-1]] - d, 0) / len(bi_dict)
                bi_prob2 = ( d/len(bi_dict) ) * ( bi_sec_dict[quad_token[-2]])
                uni_prob = bi_first_dict[quad_token[-1]] / len(bi_dict)

                curr_prob = quad_prob1 + quad_prob2*( tri_prob1 + tri_prob2*( bi_prob1 + bi_prob2* uni_prob ) )
                
                if curr_prob > max_prob1:
                    max_prob1 = curr_prob
                    word1 = quad_token[-1]
                else:
                    if curr_prob > max_prob2:
                        max_prob2 = curr_prob
                        word2 = quad_token[-1]
                    
            
            prob_dict[tri] = []
            prob_dict[tri].append([max_prob1,word1])
            prob_dict[tri].append([max_prob2,word2])

Driver Function for Testing the Language Model


In [12]:
#return: void
#arg:string,string,dict,dict,dict,dict,dict
#Used for testing the Language Model
def trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,prob_dict):
      
    score = 0
    #load the training corpus for the dataset
    #token_len = loadCorpus1(bi_dict,tri_dict,quad_dict,vocab_dict)
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))

    start_time1 = time.time()
    #compute the Knesser Ney probabilities
    computeKnesserNeyProb1(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict )
    #sort the probable words by their probability
    sortProbWordDict(prob_dict)
    gc.collect()
    print("---Preprocessing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))
    

### TESTING WITH TEST CORPUS
    test_data = ''
    #Now load the test corpus
    #!!!!!!!!!!!!!ONLY FOR THIS CORPUS!!!!!!!!    
    with open('test_corpus.txt','r') as file :
        for line in file:
            line_token = line.split()
            test_data = test_data + ' '+ ' '.join(line_token[1:])
           
    ### TESTING WITH TEST CORPUS
    test_data = ''
    #Now load the test corpus
    with open('test_corpus.txt','r') as file :
        test_data = file.read()
        
    #remove punctuations from the test data
    test_data = removePunctuations(test_data)
    test_token = test_data.split()

    #split the test data into 4 words list
    test_token = test_data.split()
    test_quadgrams = list(ngrams(test_token,4))

    #print(len(test_token))
    start_time1 = time.time()
    score = computeTestScore(test_quadgrams,tri_dict,quad_dict,vocab_dict,prob_dict)
    print('Score:',score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time1))

    start_time2 = time.time()
    perplexity = computePerplexity(test_token,token_len,tri_dict,quad_dict,vocab_dict,prob_dict)
    print('Perplexity:',perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time2))

main function


In [13]:
def main():
#variable declaration
vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
bi_dict = defaultdict(int)             #for keeping count of sentences of two words
tri_dict = defaultdict(int)            #for keeping count of sentences of three words
quad_dict = defaultdict(int)           #for keeping count of sentences of four words
prob_dict = defaultdict(list)         #for storing probability of probable words for prediction

train_file = 'corpusfile.txt'
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

#compute the Knesser Ney probabilities for bigrams,trigrams and unigrams
computeKnesserNeyProb(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict )

#sort the probable words by their probability
sortProbWordDict(prob_dict)

#take user input 
input_sen = takeInput()

### PREDICTION
prediction = doPrediction(input_sen,prob_dict)
print('Word Prediction:',prediction)


  File "<ipython-input-13-7c9be7dbc273>", line 3
    vocab_dict = defaultdict(int)          #for storing the different words with their frequencies
             ^
IndentationError: expected an indented block

In [ ]:
if __name__ == '__main__':
    main()

For Debugging Purpose Only

Ignore running the cells below if not debugging


In [14]:
#variable declaration
vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
bi_dict = defaultdict(int)             #for keeping count of sentences of two words
tri_dict = defaultdict(int)            #for keeping count of sentences of three words
quad_dict = defaultdict(int)           #for keeping count of sentences of four words
prob_dict = defaultdict(list)         #for storing probability of probable words for prediction

#load the corpus for the dataset
#loadCorpus('corpusfile.txt',bi_dict,tri_dict,quad_dict,vocab_dict)
print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))


---Preprocessing Time for Corpus loading: 2.9048519134521484 seconds ---

For Testing the Language Model

Calculates % Accuracy and Perplexity
NOTE : If this is run then no need to run the cells following it


In [ ]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#load the corpus for the dataset
token_len = trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,prob_dict)


---Processing Time for Corpus Loading: 16.079257488250732 seconds ---

In [ ]:
train_file = 'corpusfile.txt'
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

In [ ]:
#compute the Knesser Ney probabilities for bigrams,trigrams and unigrams
computeKnesserNeyProb(vocab_dict, bi_dict, tri_dict, quad_dict, prob_dict )

In [ ]:
#sort the probable words by their probability
sortProbWordDict(prob_dict)

In [ ]:
#take user input 
input_sen = takeInput()

### PREDICTION
start_time2 = time.time()
prediction = doPrediction(input_sen,prob_dict)
print('Word Prediction:',prediction)
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))

In [ ]:
#FOR DEBUGGING ONLY
writeProbDicts(prob_dict)