Word prediction

Language Model Based on an n-gram Probabilistic Model

Good-Turing Smoothing Used with Backoff

Highest-Order n-gram Used is the Quadgram (4-gram)

Imports


In [1]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()

Do preprocessing:

Remove punctuation and lowercase the tokens


In [2]:
#returns: string
#arg: string
#removes punctuation and makes the string lowercase
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    i = 0
    j = 0

    #lowercase each word and replace its punctuation characters with spaces,
    #keeping the apostrophe in possessive forms like "emma's"
    for word in temp_l:
        j = 0
        for l in word:
            if l in string.punctuation:
                if l == "'":
                    if j + 1 < len(word) and word[j + 1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l, " ")
            j += 1

        temp_l[i] = word.lower()
        i = i + 1

    #join the words back with spaces; the caller's split() then separates
    #words glued together by punctuation, e.g. "here---so" becomes "here so"
    content = " ".join(temp_l)

    return content
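
A quick sanity check of the cleaner on a made-up sentence (illustration only): possessives keep their apostrophe, and other punctuation becomes spaces that the caller's split() removes.


In [ ]:
print(removePunctuations("Here---so, it's Emma's book!").split())
#expected: ['here', 'so', "it's", "emma's", 'book']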

Tokenize and load the corpus data


In [3]:
#returns: int
#arg: string,dict,dict,dict,dict
#loads the corpus and builds frequency counts of bigram, trigram and quadgram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):

    w1 = ''    #3rd-last word of the previous line, for pairing across lines
    w2 = ''    #2nd-last word of the previous line
    w3 = ''    #last word of the previous line
    token = []
    #total no. of words in the corpus
    word_len = 0

    #open the corpus file and read it line by line
    with open(file_path,'r') as file:
        for line in file:

            #split the string into word tokens
            temp_l = line.split()
            i = 0
            j = 0

            #same as removePunctuations(), inlined here for performance:
            #lowercase each word and replace punctuation with spaces,
            #keeping the apostrophe in possessive forms like "emma's"
            for word in temp_l:
                j = 0
                for l in word:
                    if l in string.punctuation:
                        if l == "'":
                            if j + 1 < len(word) and word[j + 1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l, " ")
                    j += 1

                temp_l[i] = word.lower()
                i = i + 1

            #join the words back with spaces; the split() below then separates
            #words glued together by punctuation, e.g. "here---so" becomes "here so"
            content = " ".join(temp_l)

            token = content.split()
            word_len = word_len + len(token)

            if not token:
                continue

            #add the last word from the previous line
            if w3 != '':
                token.insert(0, w3)

            #tokens for bigrams
            temp0 = list(ngrams(token, 2))

            #since we read line by line, some word combinations would be missed
            #at line boundaries, so prepend the carried-over words
            #for trigrams
            if w2 != '':
                token.insert(0, w2)

            #tokens for trigrams
            temp1 = list(ngrams(token, 3))

            #insert the 3rd-last word from the previous line for quadgram pairing
            if w1 != '':
                token.insert(0, w1)

            #add new unique words to the vocabulary, or bump their frequency
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word] += 1

            #tokens for quadgrams
            temp2 = list(ngrams(token, 4))

            #count the frequency of the bigram sentences
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1

            #count the frequency of the trigram sentences
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1

            #count the frequency of the quadgram sentences
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1

            #store the last few words for pairing with the next line
            #(guards avoid an IndexError when a line has fewer than 3 tokens)
            n = len(token)
            if n >= 3:
                w1 = token[n - 3]
            if n >= 2:
                w2 = token[n - 2]
            w3 = token[n - 1]
    return word_len
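
A minimal sketch of how the loader accumulates counts, using a throwaway two-line file (the file name and contents below are made up for illustration):


In [ ]:
#build counts from a tiny throwaway corpus and inspect them
with open('toy_corpus.txt', 'w') as f:
    f.write("Emma by Jane Austen\nEmma by Jane Austen\n")

toy_bi, toy_tri, toy_quad, toy_vocab = (defaultdict(int), defaultdict(int),
                                        defaultdict(int), defaultdict(int))
print(loadCorpus('toy_corpus.txt', toy_bi, toy_tri, toy_quad, toy_vocab))  #8 words in total
print(toy_quad['emma by jane austen'])   #2: the carried-over words recover the line-2 quadgram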

Create a Hash Table for Probable words for Trigram sentences


In [19]:
#returns: void
#arg: dict,dict,dict,dict,dict,dict,int
#creates a dict storing probable next words with their probabilities for each trigram context
def findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, nc_dict, k):

    V = len(vocab_dict)

    for quad_sen in quad_dict:
        quad_token = quad_sen.split()

        #trigram context used as the key
        tri_sen = ' '.join(quad_token[:3])

        #find the probability using Good-Turing smoothing:
        #counts at or below the threshold k are replaced by their adjusted counts
        #(note: the same Nc table is used for both the quadgram and trigram counts here)
        quad_count = quad_dict[quad_sen]
        tri_count = tri_dict[tri_sen]

        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount(quad_count, k, nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, nc_dict)

        prob = quad_count / tri_count

        #add the entry to the quadgram probability dict (a dict of lists)
        if tri_sen not in quad_prob_dict:
            quad_prob_dict[tri_sen] = []
        quad_prob_dict[tri_sen].append([prob, quad_token[-1]])
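
After this step, quad_prob_dict maps each trigram context to a list of [probability, next word] pairs; e.g. quad_prob_dict['emma by jane'] might hold [[0.92, 'austen'], ...] (toy values for illustration). tri_prob_dict and bi_prob_dict below use the same layout, keyed by bigrams and unigrams respectively.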

Create a Hash Table for Probable words for Bigram sentences


In [20]:
#returns: void
#arg: dict,dict,dict,dict,dict,int
#creates a dict storing probable next words with their probabilities for each bigram context
def findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, nc_dict, k):

    #vocabulary length
    V = len(vocab_dict)

    #create a dictionary of probable words with their probabilities:
    #the key is a bigram context and the value is a list of [prob, word] pairs
    for tri in tri_dict:
        tri_token = tri.split()
        #bigram context used as the key
        bi_sen = ' '.join(tri_token[:2])

        #find the probability using Good-Turing smoothing:
        #counts at or below the threshold k are replaced by their adjusted counts
        tri_count = tri_dict[tri]
        bi_count = bi_dict[bi_sen]

        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, nc_dict)
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount(bi_count, k, nc_dict)

        prob = tri_count / bi_count

        #add the entry to the trigram probability dict (a dict of lists)
        if bi_sen not in tri_prob_dict:
            tri_prob_dict[bi_sen] = []
        tri_prob_dict[bi_sen].append([prob, tri_token[-1]])

Create a Hash Table for Probable words for Unigram


In [21]:
#returns: void
#arg: dict,dict,dict,dict,int
#creates a dict storing probable next words with their probabilities for each unigram context
def findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, nc_dict, k):

    #vocabulary size
    V = len(vocab_dict)

    #create a dictionary of probable words with their probabilities:
    #the key is a unigram and the value is a list of [prob, word] pairs
    for bi in bi_dict:
        bi_token = bi.split()
        #unigram context used as the key
        unigram = bi_token[0]

        #find the probability using Good-Turing smoothing:
        #counts at or below the threshold k are replaced by their adjusted counts
        bi_count = bi_dict[bi]
        uni_count = vocab_dict[unigram]

        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount(bi_count, k, nc_dict)
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount(uni_count, k, nc_dict)

        prob = bi_count / uni_count

        #add the entry to the bigram probability dict (a dict of lists)
        if unigram not in bi_prob_dict:
            bi_prob_dict[unigram] = []
        bi_prob_dict[unigram].append([prob, bi_token[-1]])

Sort the probable words for the various Probability Dictionaries according to their probability


In [7]:
#returns: void
#arg: dict,dict,dict
#sorts each context's probable words in descending order of probability
def sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    for key in bi_prob_dict:
        if len(bi_prob_dict[key]) > 1:
            bi_prob_dict[key] = sorted(bi_prob_dict[key], reverse = True)

    for key in tri_prob_dict:
        if len(tri_prob_dict[key]) > 1:
            tri_prob_dict[key] = sorted(tri_prob_dict[key], reverse = True)

    for key in quad_prob_dict:
        if len(quad_prob_dict[key]) > 1:
            #only the top 2 candidates are kept for quadgram contexts
            quad_prob_dict[key] = sorted(quad_prob_dict[key], reverse = True)[:2]

For Taking input from the User


In [8]:
#returns: string
#arg: void
#takes input from the user
def takeInput():
    cond = False
    #keep asking until at least 3 words are entered
    while cond == False:
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter at least 3 words!")
        else:
            cond = True
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen

Test Score and Perplexity Calculation

For computing the test score: the first three words of each test quadgram form the context, and the fourth word is the target to predict.


In [9]:
#returns: int
#arg: list,dict,dict,dict,dict,dict,dict
#computes the prediction score for the test data
def computeTestScore(test_token, bi_dict, tri_dict, quad_dict, 
                              quad_prob_dict, tri_prob_dict, bi_prob_dict):
    #increment the score for a correct prediction, and the wrong counter otherwise
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/Good_Turing_Backoff_Score.txt','w') as w:
        for sent in test_token:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]

            result = doPredictionBackoffGT(sen, bi_dict, tri_dict, quad_dict, bi_prob_dict, tri_prob_dict,
                                           quad_prob_dict)
            if result:
                if result[1] == correct_word:
                    score += 1
                else:
                    wrong += 1
            else:
                wrong += 1
            total += 1

        w.write('Total Word Predictions: '+str(total) + '\n' +'Correct Predictions: '+str(score) +
                '\n'+'Wrong Predictions: '+str(wrong) + '\n'+'ACCURACY: '+str((score/total)*100)+'%' )
        #print stats
        print('Total Word Predictions: '+str(total) + '\n' +'Correct Predictions: '+str(score) +
                '\n'+'Wrong Predictions: '+str(wrong) + '\n'+'ACCURACY: '+str((score/total)*100)+'%' )
    return score

For Computing the Perplexity


In [10]:
#returns: float
#arg: list,dict,dict,dict,dict,int,int,dict,dict,dict,dict
#computes a perplexity-style score for the model
#note: as written, this iterates over the training quadgram counts (test_quadgrams is
#unused) and accumulates the geometric mean of the quadgram probabilities; the
#conventional perplexity would be the reciprocal of the returned value
def computePerplexity(test_quadgrams, bi_dict, tri_dict, quad_dict, 
                                     vocab_dict, token_len, k, quad_nc_dict, tri_nc_dict,
                                      bi_nc_dict, uni_nc_dict):

    perplexity = float(1.0)
    n = token_len

    for key in quad_dict:
        quad_token = key.split()
        tri_sen = ' '.join(quad_token[0:3])

        quad_count = quad_dict[key]
        tri_count = tri_dict[tri_sen]

        #apply the Good-Turing adjusted counts below the threshold k
        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount(quad_count, k, quad_nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount(tri_count, k, tri_nc_dict)
        prob = quad_count / tri_count

        if prob != 0:
            perplexity = perplexity * (prob ** (1. / n))
    with open('Test_Scores/Good_Turing_Backoff_Score.txt','a') as w:
        w.write('\nPerplexity: ' + str(perplexity))
    return perplexity
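
For reference, the standard perplexity of a word sequence w1 ... wN is PP(W) = P(w1 w2 ... wN)^(-1/N), i.e. the reciprocal of the geometric mean probability that the function above accumulates, so lower values are better. Since this function returns the geometric mean itself, the number it reports is below 1.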

Regression related stuff


In [11]:
## Regression related stuff
#calculate best fit line for simple regression 
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib import style

#finds the slope for the best fit line (least squares)
#x and y must be numpy arrays so that x*y and x**2 are elementwise
def findBestFitSlope(x,y):
    m = (( mean(x)*mean(y) - mean(x*y) ) / 
          ( mean(x)** 2 - mean(x**2)))

    return m
      
#finds the intercept for the best fit line
def findBestFitIntercept(x,y,m):
    c = mean(y) - m*mean(x)
    return c
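
A quick sanity check of the regression helpers on points from a known line (the data here is made up for illustration):


In [ ]:
#points on y = 2x + 1 should give slope 2 and intercept 1
x = np.array([1., 2., 3., 4.], dtype = np.float64)
y = 2*x + 1
m = findBestFitSlope(x, y)
c = findBestFitIntercept(x, y, m)
print(m, c)   #expected: 2.0 1.0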

Find the frequency-of-frequency counts Nc for quadgrams and trigrams for c up to k+1, with k = 5


In [12]:
## Find the count Nc for each frequency c up to k+1, with k = 5
#arg: dict, int, int, int, int
#returns: dict
#token_len : total no. of distinct n-gram types observed
def findFrequencyOfFrequencyCount(ngram_dict, k, n, V, token_len):
    #for keeping the frequency-of-frequency counts Nc
    nc_dict = {}
    #N0, the number of unseen n-grams, is V^n minus the number of observed types
    nc_dict[0] = V**n - token_len
    #find the counts Nc up to c = k+1 (we take k = 5)
    for key in ngram_dict:
        if ngram_dict[key] <= k + 1:
            if ngram_dict[key] not in nc_dict:
                nc_dict[ ngram_dict[key] ] = 1
            else:
                nc_dict[ ngram_dict[key] ] += 1

    #if every Nc value from 1 to k+1 is present then we are done
    val_present = True
    for i in range(1, k + 2):
        if i not in nc_dict:
            val_present = False
            break
    if val_present == True:
        return nc_dict

    #otherwise fill in the missing Nc values using regression, up to c = k+1
    #we use [ log(Nc) = b*log(c) + a ] as the equation

    #collect (c, Nc) data points for the regression
    data_pts = {}
    i = 0
    #take the first few distinct count values c
    for key in ngram_dict:
        if ngram_dict[key] not in data_pts:
                data_pts[ ngram_dict[key] ] = 0
                i += 1
        if i > 5:
            break

    #now get Nc for those c values
    for key in ngram_dict:
        if ngram_dict[key] in data_pts:
            data_pts[ ngram_dict[key] ] += 1

    #make x, y coordinates for the log-log regression
    x_coor = [ np.log(item) for item in data_pts ]
    y_coor = [ np.log( data_pts[item] ) for item in data_pts ]
    x = np.array(x_coor, dtype = np.float64)
    y = np.array(y_coor, dtype = np.float64)

    #find the slope and intercept of the regression line
    slope_m = findBestFitSlope(x,y)
    intercept_c = findBestFitIntercept(x,y,slope_m)

    #fill in the missing Nc values from the fitted line;
    #since the fit is in log-log space, Nc = exp(b*log(c) + a)
    for i in range(1, k + 2):
        if i not in nc_dict:
            nc_dict[i] = np.exp(slope_m * np.log(i) + intercept_c)

    return nc_dict
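
A small sanity check of the Nc table on made-up bigram counts (the counts, vocabulary size and type total below are arbitrary, chosen only to exercise the regression path):


In [ ]:
#counts of 1, 2 and 3 are present, so Nc for c = 4..6 must come from the regression fill-in
toy_counts = {'a b': 3, 'b c': 1, 'c d': 1, 'd e': 2, 'e f': 3}
print(findFrequencyOfFrequencyCount(toy_counts, 5, 2, 10, len(toy_counts)))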

For finding the Good-Turing adjusted count
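
The adjusted count c* below follows Katz's formulation of Good-Turing smoothing with a count threshold k, where Nc is the number of n-grams that occur exactly c times:

c* = ( (c+1)*N(c+1)/N(c) - c*(k+1)*N(k+1)/N(1) ) / ( 1 - (k+1)*N(k+1)/N(1) )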


In [13]:
#returns: float
#arg: int,int,dict
#finds the adjusted count c* used in Good-Turing smoothing (Katz's formulation)
def findGoodTuringAdjustCount(c, k, nc_dict):

    adjust_count = ( ( (( c + 1)*( nc_dict[c + 1] / nc_dict[c])) - ( c * (k+1) * nc_dict[k+1] / nc_dict[1]) ) /
                     ( 1 - (( k + 1)*nc_dict[k + 1] / nc_dict[1]) )
                   )
    return adjust_count
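
A quick numeric check with a made-up Nc table (the values are arbitrary):


In [ ]:
#with N1=100, N2=50, N6=10 and k=5: c* = (2*50/100 - 1*6*10/100) / (1 - 6*10/100) = 0.4/0.4 = 1.0
toy_nc = {1: 100, 2: 50, 3: 30, 4: 20, 5: 15, 6: 10}
print(findGoodTuringAdjustCount(1, 5, toy_nc))   #expected: 1.0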

Driver function for doing the prediction

Find the Word Prediction Using Backoff


In [14]:
#returns: list
#arg: string,dict,dict,dict,dict,dict,dict
#finds the word prediction using backoff
def doPredictionBackoffGT(input_sen, bi_dict, tri_dict, quad_dict, bi_prob_dict, tri_prob_dict, quad_prob_dict):
    #split the input sentence into tokens
    token = input_sen.split()

    #if the input context is found at the highest-order ngram, return its most probable word;
    #otherwise back off to the next lower-order ngram
    if input_sen in quad_prob_dict and quad_prob_dict[input_sen][0][0] > 0:
        pred = quad_prob_dict[input_sen][0]
    elif ' '.join(token[1:]) in tri_prob_dict and tri_prob_dict[' '.join(token[1:])][0][0] > 0:
        pred = tri_prob_dict[' '.join(token[1:])][0]
    elif ' '.join(token[2:]) in bi_prob_dict and bi_prob_dict[' '.join(token[2:])][0][0] > 0:
        pred = bi_prob_dict[' '.join(token[2:])][0]
    else:
        pred = []
    return pred
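
A minimal backoff check with hand-built toy dictionaries (the entries and probabilities are made up): the empty quadgram dict forces the fall-through to the trigram level.


In [ ]:
toy_quad = {}                                   #no quadgram match -> back off
toy_tri  = {'by jane': [[0.9, 'austen']]}       #trigram level answers
toy_bi   = {'jane': [[0.5, 'eyre']]}
print(doPredictionBackoffGT('emma by jane', {}, {}, {}, toy_bi, toy_tri, toy_quad))
#expected: [0.9, 'austen']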

Driver Function for Testing the Language Model


In [15]:
#returns: void
#arg: string,string,dict,dict,dict,dict,dict
#used for testing the Language Model
#note: the probability dicts used below are the notebook-level globals; the prob_dict argument is unused
def trainCorpus(train_file, test_file, bi_dict, tri_dict, quad_dict, vocab_dict, prob_dict):

    test_result = ''
    score = 0
    #load the training corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))

    start_time1 = time.time()

    #create the different Nc dictionaries for the ngrams
    #threshold value
    k = 5
    V = len(vocab_dict)
    quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
    tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
    bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
    uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))

    #create quadgram probability dictionary
    findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, k)
    #create trigram probability dictionary
    findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, k)
    #create bigram probability dictionary
    findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, k)
    #sort the probability dictionaries of quad-, tri- and bigrams
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

    print("---Processing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))


    ### TESTING WITH TEST CORPUS
    test_data = ''
    #now load the test corpus
    with open(test_file, 'r') as file:
        test_data = file.read()

    #remove punctuation from the test data
    test_data = removePunctuations(test_data)

    #split the test data into quadgram tuples
    test_token = test_data.split()
    test_quadgrams = list(ngrams(test_token, 4))

    #choose the most probable words for prediction
    start_time2 = time.time()
    score = computeTestScore(test_quadgrams, bi_dict, tri_dict, quad_dict, 
                              quad_prob_dict, tri_prob_dict, bi_prob_dict)
    print('Score:', score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time2))

    start_time3 = time.time()
    perplexity = computePerplexity(test_quadgrams, bi_dict, tri_dict, quad_dict, 
                                     vocab_dict, token_len, k, quad_nc_dict, tri_nc_dict,
                                      bi_nc_dict, uni_nc_dict)
    print('Perplexity:', perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time3))

main function


In [17]:
def main():
    #variable declaration
    vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
    bi_dict = defaultdict(int)             #for keeping count of sentences of two words
    tri_dict = defaultdict(int)            #for keeping count of sentences of three words
    quad_dict = defaultdict(int)           #for keeping count of sentences of four words
    quad_prob_dict = OrderedDict()              
    tri_prob_dict = OrderedDict()
    bi_prob_dict = OrderedDict()

    #load the corpus for the dataset
    train_file = 'corpusfile.txt'
    #load corpus
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

    #create the different Nc dictionaries for ngrams
    #threshold value
    k = 5
    V = len(vocab_dict)
    quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
    tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
    bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
    uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))

    #create quadgram probability dictionary
    findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, k)
    #create trigram probability dictionary
    findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, k)
    #create bigram probability dictionary
    findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, k)
    #sort the probability dictionaries of quad,tri and bi grams
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

    ##WORD PREDICTION 
    #take user input 
    input_sen = takeInput()

    prediction = doPredictionBackoffGT(input_sen, bi_dict, tri_dict, quad_dict, bi_prob_dict, tri_prob_dict, quad_prob_dict)
    if prediction:
        print('Word Prediction:',prediction[1])

In [18]:
if __name__ == '__main__':
    main()


42164 585760 1416128 1861521
42164 585760 1416128
42164 585760
Enter the string
emma by jane
Word Prediction: austen

For Debugging Purposes Only

If not debugging, uncomment the two cells above and skip the cells below.


In [22]:
#variable declaration
vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
bi_dict = defaultdict(int)             #for keeping count of sentences of two words
tri_dict = defaultdict(int)            #for keeping count of sentences of three words
quad_dict = defaultdict(int)           #for keeping count of sentences of four words
quad_prob_dict = OrderedDict()              
tri_prob_dict = OrderedDict()
bi_prob_dict = OrderedDict()

#load the corpus for the dataset
#loadCorpus('corpusfile.txt',bi_dict,tri_dict,quad_dict,vocab_dict)
print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))


---Preprocessing Time for Corpus loading: 154.0471556186676 seconds ---

For Testing the Language Model

Calculates % accuracy and perplexity.
NOTE: If this is run, there is no need to run the cells following it.


In [23]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#load the corpus for the dataset
token_len = trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,quad_prob_dict)


---Processing Time for Corpus Loading: 170.31629729270935 seconds ---
---Processing Time for Creating Probable Word Dict: 20.3472957611084 seconds ---
Total Word Predictions: 312980
Correct Predictions: 26627
Wrong Predictions: 286353
ACCURACY: 8.50757236884146%
Score: 26627
---Processing Time for computing score: 0.9217593669891357 seconds ---
Perplexity: 0.2295166338575179
---Processing Time for computing Perplexity: 7.355926513671875 seconds ---

In [ ]:
train_file = 'corpusfile.txt'
#load corpus
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

In [ ]:
#create the different Nc dictionaries for ngrams
#threshold value
k = 5
V = len(vocab_dict)
quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))

In [ ]:
#create quadgram probability dictionary
findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, k)

In [ ]:
#create trigram probability dictionary
findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, k)

In [ ]:
#create bigram probability dictionary
findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, k)

In [ ]:
#sort the probability dictionaries of quad,tri and bi grams
sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

In [ ]:
#FOR DEBUGGING ONLY
#note: writeProbDicts is not defined in this notebook; define it before running this cell
writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict)

In [ ]:
##WORD PREDICTION 

start_time2 = time.time()
#take user input 
input_sen = takeInput()

prediction = doPredictionBackoffGT(input_sen, bi_dict, tri_dict, quad_dict, bi_prob_dict, tri_prob_dict, quad_prob_dict)
if prediction:
    print('Word Prediction:',prediction[1])
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))