Word prediction

Language Model based on n-gram Probabilistic Model

Add-1 Smoothing Used with Interpolation

Highest Order n-gram used is Quadgram

Import corpus


In [ ]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()

Do preprocessing:

Remove the punctuations and lowercase the tokens


In [ ]:
#returns: string
#arg: string
#remove punctuations, change to lowercase ,retain the apostrophe mark
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    #print(temp_l)
    i = 0
    j = 0
    
    #changes the word to lowercase and removes punctuations from it
    for word in temp_l :
        j = 0
        #print(len(word))
        for l in word :
            if l in string.punctuation:
                if l == "'":
                    if j+1<len(word) and word[j+1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l," ")
                #print(j,word[j])
            j += 1

        temp_l[i] = word.lower()
        i=i+1   

    #spliting is being done here beacause in sentences line here---so after punctuation removal it should 
    #become "here so"   
    content = " ".join(temp_l)

    return content

Tokenize and load the corpus data


In [ ]:
#returns : void
#arg: string,dict,dict,dict,dict
#loads the corpus for the dataset and makes the frequency count of quadgram ,bigram and trigram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):

    w1 = ''    #for storing the 3rd last word to be used for next token set
    w2 = ''    #for storing the 2nd last word to be used for next token set
    w3 = ''    #for storing the last word to be used for next token set
    token = []
    #total no. of words in the corpus
    word_len = 0

    #open the corpus file and read it line by line
    with open(file_path,'r') as file:
        for line in file:

            #split the string into word tokens
            temp_l = line.split()
            i = 0
            j = 0
            
            #does the same as the removePunctuations() function,implicit declaration for performance reasons
            #changes the word to lowercase and removes punctuations from it
            for word in temp_l :
                j = 0
                #print(len(word))
                for l in word :
                    if l in string.punctuation:
                        if l == "'":
                            if j+1<len(word) and word[j+1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l," ")
                        #print(j,word[j])
                    j += 1

                temp_l[i] = word.lower()
                i=i+1   

            #spliting is being done here beacause in sentences line here---so after punctuation removal it should 
            #become "here so"   
            content = " ".join(temp_l)

            token = content.split()
            word_len = word_len + len(token)  

            if not token:
                continue

            #add the last word from previous line
            if w3!= '':
                token.insert(0,w3)

            temp0 = list(ngrams(token,2))

            #since we are reading line by line some combinations of word might get missed for pairing
            #for trigram
            #first add the previous words
            if w2!= '':
                token.insert(0,w2)

            #tokens for trigrams
            temp1 = list(ngrams(token,3))

            #insert the 3rd last word from previous line for quadgram pairing
            if w1!= '':
                token.insert(0,w1)

            #add new unique words to the vocaulary set if available
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word]+= 1
                  
            #tokens for quadgrams
            temp2 = list(ngrams(token,4))

            #count the frequency of the bigram sentences
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1

            #count the frequency of the trigram sentences
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1

            #count the frequency of the quadgram sentences
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1


            #then take out the last 3 words
            n = len(token)

            #store the last few words for the next sentence pairing
            w1 = token[n -3]
            w2 = token[n -2]
            w3 = token[n -1]
    return word_len

Create a Hash Table for Probable words for Trigram sentences


In [ ]:
#creates dict for storing probable words with their probabilities for a trigram sentence
# ADD 1 Smoothing used

#returns: void
#arg: dict,dict,dict,dict,dict
def findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict):
    i = 0
    V = len(vocab_dict)
    
    #using the fourth word of the quadgram sentence as the probable word and calculate its
    #probability,here ADD 1 smoothing has been used during the probability calculation
    for quad_sen in quad_dict:
        quad_token = quad_sen.split()

        #trigram sentence for key
        tri_sen = ' '.join(quad_token[:3])

        #find the probability
        #add 1 smoothing has been used
        prob = ( quad_dict[quad_sen] + 1 ) / ( tri_dict[tri_sen] + V)
        
        #if the trigram sentence is not present in the Dictionary then add it
        if tri_sen not in quad_prob_dict:
            quad_prob_dict[tri_sen] = []
            quad_prob_dict[tri_sen].append([prob,quad_token[-1]])
        #the trigram sentence is present but the probable word is missing,then add it
        else:
            quad_prob_dict[tri_sen].append([prob,quad_token[-1]])
        
   
    prob = None
    quad_token = None
    tri_sen = None

For creating Probability Dictionary for Trigram Probabilities


In [ ]:
#for creating prob dict for trigram probabilities
#creates dict for storing probable words with their probabilities for a trigram sentence
# ADD 1 Smoothing used

#returns: void
#arg: dict,dict,dict,dict
def findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict):
   
    #vocabulary length
    V = len(vocab_dict)
    
    #create a dictionary of probable words with their probabilities for
    #trigram probabilites,key is a bigram and value is a list of prob and word
    for tri in tri_dict:
        tri_token = tri.split()
        #bigram sentence for key
        bi_sen = ' '.join(tri_token[:2])
        
        #find the probability
        #add 1 smoothing has been used
        prob = ( tri_dict[tri] + 1 ) / ( bi_dict[bi_sen] + V)

        #tri_prob_dict is a dict of list
        #if the bigram sentence is not present in the Dictionary then add it
        if bi_sen not in tri_prob_dict:
            tri_prob_dict[bi_sen] = []
            tri_prob_dict[bi_sen].append([prob,tri_token[-1]])
        #the bigram sentence is present but the probable word is missing,then add it
        else:
            tri_prob_dict[bi_sen].append([prob,tri_token[-1]])
            
  
    prob = None
    tri_token = None
    bi_sen = None

For creating Probability Dictionary for Bigram Probabilities


In [ ]:
#for creating prob dict for bigram probabilities
#creates dict for storing probable words with their probabilities for a trigram sentence
# ADD 1 Smoothing used

#returns: void
#arg: dict,dict,dict,dict
def findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict):
    
    V = len(vocab_dict)
    
    #create a dictionary of probable words with their probabilities for bigram probabilites
    for bi in bi_dict:
        bi_token = bi.split()
        #unigram for key
        unigram = bi_token[0]
        
        #find the probability
        #add 1 smoothing has been used
        prob = ( bi_dict[bi] + 1 ) / ( vocab_dict[unigram] + V)

        #bi_prob_dict is a dict of list
        #if the unigram sentence is not present in the Dictionary then add it
        if unigram not in bi_prob_dict:
            bi_prob_dict[unigram] = []
            bi_prob_dict[unigram].append([prob,bi_token[-1]])
        #the unigram sentence is present but the probable word is missing,then add it
        else:
            bi_prob_dict[unigram].append([prob,bi_token[-1]])
    
   
    prob = None
    bi_token = None
    unigram = None

Parameter estimation for Interpolation

For estimating parameters we try to maximise the value of lambdas L1,L2,L3 and L4
We do that by try all possible combinations of lambdas with step size 0.1 and try to maximise the
probabilty of held out data


In [ ]:
#finds the lambda values required for doing Interpolation

#arg: int, dict, dict, dict, dict
#returns: list
def estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict):
    max_prob = -9999999999999999999.0
    curr_prob = 0.0
    parameters = [0.0,0.0,0.0,0.0]
    i = 1
    
    #load the held out data 
    file = open('held_out_corpus.txt','r')
    held_out_data = file.read()
    file.close()
    
    #remove punctuations and other cleaning stuff
    held_out_data = removePunctuations(held_out_data)
    held_out_data = held_out_data.split()
    #make quad tokens for parameter estimation
    quad_token_heldout = list(ngrams(held_out_data,4))
    
    #for storing the stats 
    #f = open('interpolation_prob_stats.txt','w') 
    
    #lambda values1 and 4
    l1 = 0
    l4 = 0

    while l1 <= 1.0:
        l2 = 0
        while l2 <= 1.0:
            l3 = 0
            while l3 <= 1.0:
                
                #when the sum of lambdas is greater than 1 or when all 4 are zero we don't need to check so skip
                if l1 == 0 and l2 == 0 and l3 == 0 or ((l1+l2+l3)>1):
                    l3 += 0.1
                    i += 1
                    continue
                    
                #find lambda 4
                l4 = 1- (l1 + l2 + l3)
                
                curr_prob = 0
                qc = [0]
                bc = [0]
                tc = [0]
                
                #find the probability for the held out set using the current lambda values
                for quad in quad_token_heldout:
                    #take log of prob to avoid underflow 
                    curr_prob += log10( interpolatedProbability(quad,token_len, vocab_dict, bi_dict, tri_dict, 
                                                                quad_dict,qc,bc,tc,l1, l2, l3, l4) )
                
                if curr_prob > max_prob:
                    max_prob = curr_prob
                    parameters[0] = l1
                    parameters[1] = l2
                    parameters[2] = l3
                    parameters[3] = l4
                l3 += 0.1
                i += 1
               
            l2 += 0.1
        l1 += 0.1
    
    #f.write('\n\n\nL1: '+str(parameters[0])+'  L2: '+str(parameters[1])+'  L3: '+str(parameters[2])+'  L4: '+str(parameters[3])+'  MAX PROB: '+str(max_prob)+'\n')        
    #f.close()
    return parameters

For Computing Interpolated Probability


In [ ]:
#returns: float
#arg: list,list,dict,dict,dict,dict,float,float,float,float
#for calculating the interpolated probablity given the Trigram sentence and the given word
def interpolatedProbability(quad_token,token_len, vocab_dict, bi_dict, tri_dict, quad_dict, qc, tc, bc,
                            l1 = 0.25, l2 = 0.25, l3 = 0.25 , l4 = 0.25):
    V = len(vocab_dict)
    
    sen = ' '.join(quad_token)
    prob = (   
              l1*((quad_dict[sen] + 1)/ (tri_dict[' '.join(quad_token[0:3])] + V)) 
            + l2*((tri_dict[' '.join(quad_token[1:4])] + 1) / (bi_dict[' '.join(quad_token[1:3])] + V)) 
            + l3*((bi_dict[' '.join(quad_token[2:4])] + 1) / (vocab_dict[quad_token[2]] + V)) 
            + l4*((vocab_dict[quad_token[3]] + 1) / (token_len + V))
           )
    
    if sen  in quad_dict:
        qc[0] += 1
    if ' '.join(quad_token[1:4]) in tri_dict:
        tc[0] += 1
    if ' '.join(quad_token[2:4])  in bi_dict:
        bc[0] += 1    
        
    #since log10(1) is zero so it doesn't add upto anything but log10(0) is undefined
    if prob <= 0:
        return 1
    
    return prob

Sort the probable words


In [ ]:
#for sorting the probable word acc. to their probabilities

#returns: void
#arg: dict, dict, dict
def sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    #sort bigram dict
    for key in bi_prob_dict:
        if len(bi_prob_dict[key])>1:
            bi_prob_dict[key] = sorted(bi_prob_dict[key],reverse = True)
    
    #sort trigram dict
    for key in tri_prob_dict:
        if len(tri_prob_dict[key])>1:
            tri_prob_dict[key] = sorted(tri_prob_dict[key],reverse = True)
    
    #sort quadgram dict
    for key in quad_prob_dict:
        if len(quad_prob_dict[key])>1:
            quad_prob_dict[key] = sorted(quad_prob_dict[key],reverse = True)[:2]

For choosing prediction word candidates for Word Prediction


In [ ]:
#pick the top most probable words from bi,tri and quad prob dict as word prediction candidates

#returns: list[float,string]
#arg: string,dict,dict,dict
def chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict):
    word_choice = []
    token = sen.split()
    if token[-1] in bi_prob_dict:
        word_choice +=  bi_prob_dict[token[-1]][:1]
        #print('Word Choice bi dict')
    if ' '.join(token[1:]) in tri_prob_dict:
        word_choice +=  tri_prob_dict[' '.join(token[1:])][:1]
        #print('Word Choice tri_dict')
    if ' '.join(token) in quad_prob_dict:
        word_choice += quad_prob_dict[' '.join(token)][:1]
        #print('Word Choice quad_dict')
    
    return word_choice

Finds the Predicted Word using Interpolation with Add -1 Smoothing


In [ ]:
#does prediction for the the sentence using Interpolation
#Uses Add-1 Smoothing
#returns: string
#arg: string,dict,dict,dict,dict,int,list,list
def doInterpolatedPredictionAdd1(sen, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, word_choice, param):
    pred = ''
    max_prob = 0.0
    V = len(vocab_dict)
    #for each word choice find the interpolated probability and decide
    for word in word_choice:
        key = sen + ' ' + word[1]
        quad_token = key.split()
        
        prob = (   
                  param[0]*((quad_dict[key] + 1)/ (tri_dict[' '.join(quad_token[0:3])] + V)) 
                + param[1]*((tri_dict[' '.join(quad_token[1:4])] + 1) / (bi_dict[' '.join(quad_token[1:3])] + V)) 
                + param[2]*((bi_dict[' '.join(quad_token[2:4])] + 1) / (vocab_dict[quad_token[2]] + V)) 
                + param[3]*((vocab_dict[quad_token[3]] + 1) / (token_len + V))
               )
        
        if prob > max_prob:
            max_prob = prob
            pred = word
    #return only pred to get word with its prob
    if pred:
        return pred[1]
    else:
        return ''

For Taking input from the User


In [ ]:
#for taking input from user

#returns: string
#arg: void
def takeInput():
    cond = False
    #take input
    while(cond == False):
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter atleast 3 words !")
        else:
            cond = True
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen

Test Score ,Perplexity Calculation:

For computing the Test Score


In [ ]:
#computes the score for test data i,e no. of right predictions against no. of wrong predictions

#return:int
#arg:list,dict,dict,dict,dict
def computeTestScore(test_token,bi_dict, tri_dict, quad_dict, vocab_dict,
                     bi_prob_dict, tri_prob_dict, quad_prob_dict, token_len,param):
    
    
    #increment the score value if correct prediction is made else decrement its value
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/add1_smoothing_score.txt','w') as w:
        for sent in test_token:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]
            #select probable word candidates for prediction
            word_choice = chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
            result = doInterpolatedPredictionAdd1(sen, bi_dict, tri_dict, quad_dict, vocab_dict,token_len, word_choice, param)
            
            if result == correct_word:
                score+=1
            else:
                wrong += 1
            
            total += 1
        w.write('Total Word Prdictions: '+str(total) + '\n' +'Correct Prdictions: '+str(score) +
                '\n'+'Wrong Prdictions: '+str(wrong) + '\n'+'ACCURACY: '+str((score/total)*100)+'%' )
    #print stats
    print('Total Word Prdictions: '+str(total) + '\n' +'Correct Prdictions: '+str(score) +
                '\n'+'Wrong Prdictions: '+str(wrong) + '\n'+'ACCURACY:'+str((score/total)*100) )
    return score

For Computing the Perplexity


In [ ]:
#return:float
#arg:list,int,dict,dict,dict,dict
#computes the score for test data
def computePerplexity(test_quadgrams,token_len,tri_dict,quad_dict,vocab_dict,prob_dict):
    
    perplexity = float(1.0)
    n = token_len
    V = len(vocab_dict)
    for item in quad_dict:
        sen_token = item.split()
        tri_sen = ' '.join(sen_token[0:3])
        prob = (quad_dict[item] + 1) / (tri_dict[tri_sen] + V)
        perplexity = perplexity * ( prob**(1./n))
    with open('Test_Scores/add1_smoothing_score.txt','w') as w:
        w.write('\nPerplexity: '+str(perplexity))
    return perplexity

Driver Function for Testing the Language Model


In [ ]:
#return: void
#arg:string,string,dict,dict,dict,dict,dict
#Used for testing the Language Model
def trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,prob_dict):
    score = 0
    
    #load the training corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))

    start_time1 = time.time()
    
    #estimate the lambdas for interpolation
    #found earlier usign estimate Function
    param = [0.7,0.1,0.1,0.1]
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    #print(param)
    
    #create trigram Probability Dictionary
    findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict)
    #create bigram Probability Dictionary
    findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict)
    #create quadgram Probability Dictionary
    findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)
    #sort the probability dictionaries
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    gc.collect()
    print("---Preprocessing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))
    
    
    ### TESTING WITH TEST CORPUS
    test_data = ''
    #Now load the test corpus
    with open('test_corpus.txt','r') as file :
        test_data = file.read()

    #remove punctuations from the test data
    test_data = removePunctuations(test_data)
    test_token = test_data.split()

    #split the test data into 4 words list
    test_token = test_data.split()
    test_quadgrams = list(ngrams(test_token,4))
    
    #choose most probable words for prediction
    start_time2 = time.time()
    score = computeTestScore(test_quadgrams,bi_dict, tri_dict, quad_dict, vocab_dict,
                     bi_prob_dict, tri_prob_dict, quad_prob_dict, token_len,param)
    print('Score:',score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time2))

    start_time3 = time.time()
    perplexity = computePerplexity(test_token,token_len,tri_dict,quad_dict,vocab_dict,prob_dict)
    print('Perplexity:',perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time3))

main function


In [ ]:
def main():

    #variable declaration
    vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
    bi_dict = defaultdict(int)             #for keeping count of sentences of two words
    tri_dict = defaultdict(int)            #for keeping count of sentences of three words
    quad_dict = defaultdict(int)           #for keeping count of sentences of four words
    quad_prob_dict = defaultdict(list)         #for storing the probable  words for Quadgram sentences     
    tri_prob_dict = defaultdict(list)          #for storing the probable  words for Trigram sentences     
    bi_prob_dict = defaultdict(list)           #for storing the probable  words for Bigram sentences

    train_file = 'corpusfile.txt'
    #load the corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
   
    #estimate the lambdas for interpolation
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    param = [0.7,0.1,0.1,0.1]
    
    #create bigram Probability Dictionary
    findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict)
    #create trigram Probability Dictionary
    findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict)
    #create quadgram Probability Dictionary
    findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)
    #sort the probability dictionaries
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

    #take user input 
    input_sen = takeInput()


    ### PREDICTION
    #choose most probable words for prediction
    word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
    prediction = doInterpolatedPredictionAdd1(input_sen, bi_dict, tri_dict, quad_dict, vocab_dict,token_len, word_choice, param)
    print('Word Prediction:',prediction)

In [ ]:
if __name__ == '__main__':
    main()

For Debugging Purpose Only

Ignore running the cells below if not debugging


In [ ]:
#variable declaration
vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
bi_dict = defaultdict(int)             #for keeping count of sentences of two words
tri_dict = defaultdict(int)            #for keeping count of sentences of three words
quad_dict = defaultdict(int)           #for keeping count of sentences of four words
quad_prob_dict = defaultdict(list)         #for storing the probable  words for Quadgram sentences     
tri_prob_dict = defaultdict(list)          #for storing the probable  words for Trigram sentences     
bi_prob_dict = defaultdict(list)           #for storing the probable  words for Bigram sentences

For Testing the Language Model

Calculates % Accuracy and Perplexity
NOTE : If this is run then no need to run the cells following it


In [ ]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#load the corpus for the dataset
token_len = trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,quad_prob_dict)

In [ ]:
train_file = 'corpusfile.txt'
#load the corpus for the dataset
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

In [ ]:
#estimate the lambdas for interpolation
#param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
param = [0.8,0.2,0.0,0.0]

In [ ]:
#create bigram Probability Dictionary
findBigramProbAdd1(vocab_dict, bi_dict, bi_prob_dict)

In [ ]:
#create trigram Probability Dictionary
findTrigramProbAdd1(vocab_dict, bi_dict, tri_dict, tri_prob_dict)

In [ ]:
#create quadgram Probability Dictionary
findQuadgramProbAdd1(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)

In [ ]:
#sort the probability dictionaries
sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

In [ ]:
#FOR DEBUGGING ONLY
writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict)

In [ ]:
#take user input 
input_sen = takeInput()


### PREDICTION
start_time2 = time.time()
#choose most probable words for prediction
word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)
prediction = doInterpolatedPredictionAdd1(input_sen, bi_dict, tri_dict, quad_dict, vocab_dict,token_len, word_choice, param)
#prediction = doPrediction(input_sen,prob_dict)
print('Word Prediction:',prediction)
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))