Word prediction

Language Model based on an n-gram Probabilistic Model

Good-Turing Smoothing Used with Interpolation

Highest-Order n-gram Used is the Quadgram (4-gram)
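
The predictor combines the four n-gram estimates by linear interpolation. Writing the context as w1 w2 w3 and the candidate next word as w4, the interpolated probability has the standard form (the lambda weights are estimated later on held-out data):

P(w4 | w1 w2 w3) = \lambda_1 P(w4 | w1 w2 w3) + \lambda_2 P(w4 | w2 w3) + \lambda_3 P(w4 | w3) + \lambda_4 P(w4), with \lambda_1 + \lambda_2 + \lambda_3 + \lambda_4 = 1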

Import corpus


In [1]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
start_time = time.time()

Do preprocessing:

Remove punctuation and lowercase the tokens


In [2]:
#returns: string
#arg: string
#remove punctuations and make the string lowercase
def removePunctuations(sen):
    #split the string into word tokens
    temp_l = sen.split()
    #print(temp_l)
    i = 0
    j = 0
    
    #change each word to lowercase and replace punctuation with spaces (the apostrophe in 's is kept)
    for word in temp_l :
        j = 0
        #print(len(word))
        for l in word :
            if l in string.punctuation:
                if l == "'":
                    if j+1<len(word) and word[j+1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l," ")
                #print(j,word[j])
            j += 1

        temp_l[i] = word.lower()
        i=i+1   

    #re-joining here (with splitting done later) handles cases like "here---so",
    #which after punctuation removal should become "here so"
    content = " ".join(temp_l)

    return content
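
A quick sanity check of the cleaning behaviour (an illustrative snippet, not part of the original notebook): punctuation becomes whitespace, case is folded, and a possessive 's survives.


In [ ]:
#sanity check for removePunctuations() (illustrative only)
cleaned = removePunctuations("Hello, World---it's Emma's book!")
print(cleaned.split())
#expected: ['hello', 'world', "it's", "emma's", 'book']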

Tokenize and load the corpus data


In [3]:
#returns : int
#arg: string,dict,dict,dict,dict
#loads the corpus for the dataset and builds the frequency counts of bigram, trigram and quadgram strings
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):

    w1 = ''    #for storing the 3rd last word to be used for next token set
    w2 = ''    #for storing the 2nd last word to be used for next token set
    w3 = ''    #for storing the last word to be used for next token set
    token = []
    #total no. of words in the corpus
    word_len = 0

    #open the corpus file and read it line by line
    with open(file_path,'r') as file:
        for line in file:

            #split the string into word tokens
            temp_l = line.split()
            i = 0
            j = 0
            
            #same as the removePunctuations() function, inlined here for performance:
            #change each word to lowercase and replace punctuation with spaces
            for word in temp_l :
                j = 0
                #print(len(word))
                for l in word :
                    if l in string.punctuation:
                        if l == "'":
                            if j+1<len(word) and word[j+1] == 's':
                                j = j + 1
                                continue
                        word = word.replace(l," ")
                        #print(j,word[j])
                    j += 1

                temp_l[i] = word.lower()
                i=i+1   

            #re-joining here (with splitting done below) handles cases like "here---so",
            #which after punctuation removal should become "here so"
            content = " ".join(temp_l)

            token = content.split()
            word_len = word_len + len(token)  

            if not token:
                continue

            #add the last word from previous line
            if w3!= '':
                token.insert(0,w3)

            temp0 = list(ngrams(token,2))

            #since we read line by line, word combinations spanning lines would be missed;
            #for trigrams, first prepend the 2nd last word of the previous line
            if w2!= '':
                token.insert(0,w2)

            #tokens for trigrams
            temp1 = list(ngrams(token,3))

            #insert the 3rd last word from previous line for quadgram pairing
            if w1!= '':
                token.insert(0,w1)

            #add new words to the vocabulary and update their frequencies
            for word in token:
                if word not in vocab_dict:
                    vocab_dict[word] = 1
                else:
                    vocab_dict[word]+= 1
                  
            #tokens for quadgrams
            temp2 = list(ngrams(token,4))

            #count the frequency of the bigram sentences
            for t in temp0:
                sen = ' '.join(t)
                bi_dict[sen] += 1

            #count the frequency of the trigram sentences
            for t in temp1:
                sen = ' '.join(t)
                tri_dict[sen] += 1

            #count the frequency of the quadgram sentences
            for t in temp2:
                sen = ' '.join(t)
                quad_dict[sen] += 1


            #store the last few words for pairing with the next line
            #(guard against lines with fewer than 3 tokens)
            n = len(token)
            if n >= 3:
                w1 = token[n -3]
            if n >= 2:
                w2 = token[n -2]
            w3 = token[n -1]
    return word_len
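
A minimal usage sketch (illustrative; the filename sample_corpus.txt is an assumption):


In [ ]:
#illustrative only: load a small corpus and inspect the resulting counts
bi_dict, tri_dict, quad_dict = defaultdict(int), defaultdict(int), defaultdict(int)
vocab_dict = defaultdict(int)
n_words = loadCorpus('sample_corpus.txt', bi_dict, tri_dict, quad_dict, vocab_dict)
print('words:', n_words, 'vocab:', len(vocab_dict), 'bigrams:', len(bi_dict))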

Create a Hash Table for Probable words for Trigram sentences


In [4]:
#returns: void
#arg: dict,dict,dict,dict,dict,dict,int
#creates dict for storing probable words with their probabilities for a trigram sentence
def findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, nc_dict, k):
    
    i = 0
    V = len(vocab_dict)
   
    for quad_sen in quad_dict:
        quad_token = quad_sen.split()
        
        #trigram sentence for key
        tri_sen = ' '.join(quad_token[:3])

        #find the probability
        #Good-Turing smoothing: counts at or below the threshold k get an adjusted count
        quad_count = quad_dict[quad_sen]
        tri_count = tri_dict[tri_sen]

        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount( quad_count, k, nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount( tri_count, k, nc_dict)

        prob = quad_count / tri_count

        #add the trigram to the quadgram probability dict
        if tri_sen not in quad_prob_dict:
            quad_prob_dict[tri_sen] = []
        quad_prob_dict[tri_sen].append([prob,quad_token[-1]])

    prob = None
    quad_token = None
    tri_sen = None

Create a Hash Table for Probable words for Bigram sentences


In [5]:
#returns: void
#arg: dict,dict,dict,dict,dict,int
#creates dict for storing probable words with their probabilities for a bigram sentence
def findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, nc_dict, k):
    
    #vocabulary length
    V = len(vocab_dict)
    
    #create a dictionary of probable words with their probabilities for
    #trigram probabilities; key is a bigram, value is a list of [prob, word] pairs
    for tri in tri_dict:
        tri_token = tri.split()
        #bigram sentence for key
        bi_sen = ' '.join(tri_token[:2])
        
        #find the probability
        #Good-Turing smoothing: counts at or below the threshold k get an adjusted count
        tri_count = tri_dict[tri]
        bi_count = bi_dict[bi_sen]

        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount( tri_count, k, nc_dict)
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount( bi_count, k, nc_dict)

        prob = tri_count / bi_count

        #add the bigram sentence to the trigram probability dict
        #tri_prob_dict is a dict of lists
        if bi_sen not in tri_prob_dict:
            tri_prob_dict[bi_sen] = []
        tri_prob_dict[bi_sen].append([prob,tri_token[-1]])
            
    prob = None
    tri_token = None
    bi_sen = None

Create a Hash Table for Probable words for Unigram


In [6]:
#returns: void
#arg: dict,dict,dict,dict,int
#creates dict for storing probable words with their probabilities for a unigram
def findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, nc_dict, k):
    
    #vocabulary size
    V = len(vocab_dict)
    
    #create a dictionary of probable words with their probabilities for bigram probabilities
    for bi in bi_dict:
        bi_token = bi.split()
        #unigram for key
        unigram = bi_token[0]
       
        #find the probability
        #Good-Turing smoothing: counts at or below the threshold k get an adjusted count
        bi_count = bi_dict[bi]
        uni_count = vocab_dict[unigram]

        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount( bi_count, k, nc_dict)
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount( uni_count, k, nc_dict)

        prob = bi_count / uni_count

        #add the unigram to the bigram probability dict
        #bi_prob_dict is a dict of lists
        if unigram not in bi_prob_dict:
            bi_prob_dict[unigram] = []
        bi_prob_dict[unigram].append([prob,bi_token[-1]])
    
    prob = None
    bi_token = None
    unigram = None

Sort the probable words for the various Probability Dictionaries according to their probability


In [7]:
#returns: void
#arg: dict
#for sorting the probable word acc. to their probabilities
def sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    for key in bi_prob_dict:
        if len(bi_prob_dict[key])>1:
            bi_prob_dict[key] = sorted(bi_prob_dict[key],reverse = True)
    
    for key in tri_prob_dict:
        if len(tri_prob_dict[key])>1:
            tri_prob_dict[key] = sorted(tri_prob_dict[key],reverse = True)
    
    #for quadgrams, keep only the top 2 candidates after sorting
    for key in quad_prob_dict:
        if len(quad_prob_dict[key])>1:
            quad_prob_dict[key] = sorted(quad_prob_dict[key],reverse = True)[:2]
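
After sorting, each probability dict maps a context string to a descending list of [probability, word] pairs. An illustrative shape (keys and values below are made up):


In [ ]:
#illustrative structure of the probability dicts after sorting (values are hypothetical)
#tri_prob_dict == {'jane austen': [[0.61, 'was'], [0.12, 'had']], ...}
#so the top next-word suggestion for the bigram 'jane austen' would be
#tri_prob_dict['jane austen'][0][1]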

For Taking input from the User


In [8]:
#returns: string
#arg: void
#for taking input from user
def takeInput():
    cond = False
    #keep asking until at least 3 words are entered
    while not cond:
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) < 3:
            print("Please enter at least 3 words!")
        else:
            cond = True
            #keep only the last 3 words as the prediction context
            temp = temp[-3:]
    sen = " ".join(temp)
    return sen

Test Score and Perplexity Calculation

For computing the Test Score


In [9]:
#computes the score for test data
def computeTestScore(test_token, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, param, k, quad_nc_dict, tri_nc_dict,
                              bi_nc_dict, uni_nc_dict ):
    #increment the score for a correct prediction, else increment the wrong count
    score = 0
    wrong = 0
    total = 0
    with open('Test_Scores/Good_Turing_Interpolated_Score.txt','w') as w:
        for sent in test_token:
            sen_token = sent[:3]
            sen = " ".join(sen_token)
            correct_word = sent[3]
            #find the most probable candidate words from the bigram, trigram and quadgram dicts
            #(the *_prob_dict containers are read from the notebook's global scope)
            word_choice = chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)

            result = doInterpolatedPredictionGT(sen, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                              bi_nc_dict, uni_nc_dict )
            if result:
                if result[1] == correct_word:
                    score+=1
                else:
                    wrong += 1
            else:
                wrong += 1
            total += 1
            
        w.write('Total Word Predictions: '+str(total) + '\n' +'Correct Predictions: '+str(score) +
                '\n'+'Wrong Predictions: '+str(wrong) + '\n'+'ACCURACY: '+str((score/total)*100)+'%' )
        #print stats
        print('Total Word Predictions: '+str(total) + '\n' +'Correct Predictions: '+str(score) +
                '\n'+'Wrong Predictions: '+str(wrong) + '\n'+'ACCURACY: '+str((score/total)*100)+'%' )
    return score

For Computing the Perplexity


In [10]:
#return: float
#arg: list,dict,dict,dict,dict,int,list,int,dict,dict,dict,dict
#computes the perplexity of the model over the stored quadgram counts
def computePerplexity(test_quadgrams, bi_dict, tri_dict, quad_dict, 
                                     vocab_dict,token_len, param, k, quad_nc_dict, tri_nc_dict,
                                      bi_nc_dict, uni_nc_dict):
    
    perplexity = float(1.0)
    n = token_len
    
    #iterate over the quadgram counts gathered from the corpus
    for key in quad_dict:
        quad_token = key.split()
        tri_sen = ' '.join(quad_token[0:3])

        quad_count = quad_dict[key]
        tri_count = tri_dict[tri_sen]

        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount( quad_count, k, quad_nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount( tri_count, k, tri_nc_dict)
        prob = quad_count / tri_count
        if prob != 0:
            #perplexity is the inverse probability normalised by the number of tokens,
            #accumulated incrementally as a running product
            perplexity = perplexity * ( (1/prob)**(1./n))
    with open('Test_Scores/Good_Turing_Interpolated_Score.txt','a') as w:
        w.write('\nPerplexity: '+str(perplexity))
    
    return perplexity
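
For reference, the quantity being accumulated is the standard perplexity, computed here as a running product over the quadgram probabilities:

PP(W) = P(w_1 w_2 \dots w_N)^{-1/N} = \left( \prod_{i=1}^{N} \frac{1}{P(w_i \mid w_{i-3} w_{i-2} w_{i-1})} \right)^{1/N}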

In [11]:
## Regression related stuff
#calculate the best fit line for simple linear regression
from statistics import mean
import numpy as np

#finds the slope for the best fit line
#x and y must be numpy arrays so that x*y and x**2 are element-wise
def findBestFitSlope(x,y):
    m = (( mean(x)*mean(y) - mean(x*y) ) / 
          ( mean(x)** 2 - mean(x**2)))

    return m
      
#finds the intercept for the best fit line
def findBestFitIntercept(x,y,m):
    c = mean(y) - m*mean(x)
    return c
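
A quick check of the regression helpers on points with a known fit (illustrative; y = 2x should give slope 2 and intercept 0):


In [ ]:
#sanity check: points on the line y = 2x
x = np.array([1, 2, 3], dtype = np.float64)
y = np.array([2, 4, 6], dtype = np.float64)
m = findBestFitSlope(x,y)
c = findBestFitIntercept(x,y,m)
print(m, c)   #expected: 2.0 0.0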

Find the frequency of frequency counts Nc for the n-grams, for counts c up to k+1 (with k = 5)


In [12]:
## Find the frequency of frequency counts Nc for an n-gram dict, for c up to k+1
#arg: dict, int, int, int, int
#returns: dict
#token_len : total no. of distinct n-gram types observed
def findFrequencyOfFrequencyCount(ngram_dict, k, n, V, token_len):
    #for keeping count of 'c' value i.e Nc
    nc_dict = {}
    #Nc for c = 0 is the number of unseen n-grams: V^n - (observed n-gram types)
    nc_dict[0] = V**n - token_len
    #find the counts Nc up to c = k+1 (we take k = 5)
    for key in ngram_dict:
        if ngram_dict[key] <= k + 1:
            if ngram_dict[key] not in nc_dict:
                nc_dict[ ngram_dict[key]] = 1
            else:
                nc_dict[ ngram_dict[key] ] += 1
    
    #if all Nc values for c = 1..k+1 are present, no regression is needed
    val_present = True
    for i in range(1, k+2):
        if i not in nc_dict:
            val_present = False
            break
    if val_present:
        return nc_dict
    
    #fill in the missing Nc values using simple linear regression in log-log space,
    #using the equation: log(Nc) = b*log(c) + a

    #gather data points (c, Nc) for the regression: take the first few distinct count values
    data_pts = {}
    i = 0
    for key in ngram_dict:
        if ngram_dict[key] not in data_pts:
            data_pts[ ngram_dict[key] ] = 0
            i += 1
        if i > 5:
            break

    #now find Nc for those c values
    for key in ngram_dict:
        if ngram_dict[key] in data_pts:
            data_pts[ ngram_dict[key] ] += 1
    
    #make x ,y coordinates for regression 
    x_coor = [ np.log(item) for item in data_pts ]
    y_coor = [ np.log( data_pts[item] ) for item in data_pts ]
    x = np.array(x_coor, dtype = np.float64)
    y = np.array(y_coor , dtype = np.float64)
   

    #now do regression
    #find the slope and intercept for the regression equation
    slope_m = findBestFitSlope(x,y)
    intercept_c = findBestFitIntercept(x,y,slope_m)

    #fill in the missing Nc terms using the fitted log-log line:
    #log(Nc) = m*log(c) + a  =>  Nc = exp(m*log(c) + a)
    for i in range(1,(k+2)):
        if i not in nc_dict:
            nc_dict[i] = np.exp(slope_m*np.log(i) + intercept_c)
    
    return nc_dict
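
The regression step mirrors the "Simple Good-Turing" idea that the frequency-of-frequency counts roughly follow a power law, N_c \approx A c^b, which is linear in log-log space:

\log N_c = \log A + b \log c

so a least-squares line fitted to the points (\log c, \log N_c) gives the slope b and intercept \log A used to estimate the missing N_c values.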

For finding the Good-Turing adjusted count


In [13]:
#for finding the adjusted count c* in Good Turing Smoothing
def findGoodTuringAdjustCount(c, k, nc_dict):
   
    adjust_count = ( ( (( c + 1)*( nc_dict[c + 1] / nc_dict[c])) - ( c * (k+1) * nc_dict[k+1] / nc_dict[1]) ) /
                     ( 1 - (( k + 1)*nc_dict[k + 1] / nc_dict[1]) )
                   )
    return adjust_count
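
This is the Katz variant of Good-Turing discounting with a count threshold k: counts above k are trusted as-is (see the callers), and counts c <= k are replaced by the adjusted count

c^* = \frac{(c+1)\,\frac{N_{c+1}}{N_c} \;-\; c\,\frac{(k+1)N_{k+1}}{N_1}}{1 \;-\; \frac{(k+1)N_{k+1}}{N_1}}

which is exactly what the expression above computes.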

Parameter estimation

For estimating the parameters we maximise the probability of held-out data over the lambdas l1, l2, l3 and l4.
We do that by trying all valid combinations of l1, l2 and l3 with step size 0.1 (setting l4 = 1 - l1 - l2 - l3) and keeping the combination that maximises the log probability of the held-out data.


In [14]:
#finds the lambda values required for doing Interpolation

#arg: int, dict, dict, dict, dict
#returns: list
def estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict):
    max_prob = float('-inf')
    curr_prob = 0.0
    parameters = [0.0,0.0,0.0,0.0]
    i = 1
    
    #load the held out data 
    file = open('held_out_corpus.txt','r')
    held_out_data = file.read()
    file.close()
    
    #remove punctuations and other cleaning stuff
    held_out_data = removePunctuations(held_out_data)
    held_out_data = held_out_data.split()
    #make quad tokens for parameter estimation
    quad_token_heldout = list(ngrams(held_out_data,4))
    
    #for storing the stats 
    f = open('interpolation_prob_stats.txt','w') 
    
    #initialise lambdas l1 and l4
    l1 = 0
    l4 = 0

    while l1 <= 1.0:
        l2 = 0
        while l2 <= 1.0:
            l3 = 0
            while l3 <= 1.0:
                
                #skip invalid combinations: the first three lambdas all zero, or their
                #sum exceeding 1 (the epsilon guards against floating point drift)
                if (l1 == 0 and l2 == 0 and l3 == 0) or ((l1+l2+l3) > 1 + 1e-9):
                    l3 += 0.1
                    i += 1
                    continue
                    
                #find lambda 4
                l4 = 1- (l1 + l2 + l3)
                
                curr_prob = 0
                qc = [0]
                bc = [0]
                tc = [0]
                
                #find the probability for the held out set using the current lambda values
                for quad in quad_token_heldout:
                    #take log of prob to avoid underflow 
                    curr_prob += log10( interpolatedProbability(quad,token_len, vocab_dict, bi_dict, tri_dict, 
                                                                quad_dict,qc,bc,tc,l1, l2, l3, l4) )
                
                if curr_prob > max_prob:
                    max_prob = curr_prob
                    parameters[0] = l1
                    parameters[1] = l2
                    parameters[2] = l3
                    parameters[3] = l4
                l3 += 0.1
                i += 1
               
            l2 += 0.1
        l1 += 0.1
    
    f.write('\n\n\nL1: '+str(parameters[0])+'  L2: '+str(parameters[1])+'  L3: '+str(parameters[2])+'  L4: '+str(parameters[3])+'  MAX PROB: '+str(max_prob)+'\n')        
    f.close()
    return parameters
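
estimateParameters() calls interpolatedProbability(), which is not defined in this section. Below is a minimal sketch of what it plausibly computes: a straight interpolation of unsmoothed MLE ratios with a small floor so log10() never sees zero. The qc/bc/tc accumulators are kept only for signature compatibility. This is an assumption for illustration, not the notebook's original helper.


In [ ]:
#ASSUMED sketch of the missing interpolatedProbability() helper (not the original)
def interpolatedProbability(quad, token_len, vocab_dict, bi_dict, tri_dict,
                            quad_dict, qc, bc, tc, l1, l2, l3, l4):
    #context strings for each n-gram order
    quad_sen = ' '.join(quad)
    tri_context = ' '.join(quad[:3])
    tri_sen = ' '.join(quad[1:])
    bi_context = ' '.join(quad[1:3])
    bi_sen = ' '.join(quad[2:])

    #unsmoothed MLE ratios for each order (0 when the context is unseen)
    quad_prob = quad_dict[quad_sen] / tri_dict[tri_context] if tri_dict[tri_context] else 0.0
    tri_prob = tri_dict[tri_sen] / bi_dict[bi_context] if bi_dict[bi_context] else 0.0
    bi_prob = bi_dict[bi_sen] / vocab_dict[quad[2]] if vocab_dict[quad[2]] else 0.0
    uni_prob = vocab_dict[quad[3]] / token_len

    #interpolate and floor so that log10() in the caller never receives zero
    prob = l1*quad_prob + l2*tri_prob + l3*bi_prob + l4*uni_prob
    return max(prob, 1e-12)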

For choosing Probable words as Word Prediction candidates


In [15]:
#pick the top most probable words from bi,tri and quad prob dict as word prediction candidates

#returns: list[float,string]
#arg: string,dict,dict,dict
def chooseWords(sen, bi_prob_dict, tri_prob_dict, quad_prob_dict):
    word_choice = []
    token = sen.split()
    if token[-1] in bi_prob_dict:
        word_choice +=  bi_prob_dict[token[-1]][:1]
        #print('Word Choice bi dict')
    if ' '.join(token[1:]) in tri_prob_dict:
        word_choice +=  tri_prob_dict[' '.join(token[1:])][:1]
        #print('Word Choice tri_dict')
    if ' '.join(token) in quad_prob_dict:
        word_choice += quad_prob_dict[' '.join(token)][:1]
        #print('Word Choice quad_dict')
    
    return word_choice
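
An illustrative call (the returned candidates depend on the corpus; the values shown are hypothetical):


In [ ]:
#illustrative: candidates for the context 'emma by jane', one per probability dict
word_choice = chooseWords('emma by jane', bi_prob_dict, tri_prob_dict, quad_prob_dict)
print(word_choice)   #e.g. [[0.02, 'and'], [0.35, 'austen'], [0.81, 'austen']]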

Driver function for doing the prediction

Do word Prediction using Interpolation


In [16]:
#For doing word prediction using Interpolation
def doInterpolatedPredictionGT(sen, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                              bi_nc_dict, uni_nc_dict ):
    
    
    pred = ''
    max_prob = 0.0
    V = len(vocab_dict)
    #for each word choice find the interpolated probability and decide
    for word in word_choice:
        key = sen + ' ' + word[1]
        quad_token = key.split()
        
        #find the Good-Turing smoothed quadgram probability
        quad_count = quad_dict[key]
        tri_count = tri_dict[' '.join(quad_token[0:3])]
        
        if quad_count <= k:
            quad_count = findGoodTuringAdjustCount( quad_count, k, quad_nc_dict)
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount( tri_count, k, tri_nc_dict)
        quad_prob = quad_count / tri_count
        
        #find the Good-Turing smoothed trigram probability
        tri_count = tri_dict[' '.join(quad_token[1:4])]
        bi_count = bi_dict[' '.join(quad_token[1:3])]
        
        if tri_count <= k:
            tri_count = findGoodTuringAdjustCount( tri_count, k, tri_nc_dict)
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount( bi_count, k, bi_nc_dict)
        tri_prob = tri_count / bi_count
        
        #find the Good-Turing smoothed bigram probability
        bi_count = bi_dict[' '.join(quad_token[2:4])]
        uni_count = vocab_dict[quad_token[2]]
        
        if bi_count <= k:
            bi_count = findGoodTuringAdjustCount( bi_count, k, bi_nc_dict)
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount( uni_count, k, uni_nc_dict)
        bi_prob = bi_count / uni_count
        
        #find the Good-Turing smoothed unigram probability
        uni_count = vocab_dict[quad_token[3]]
        
        if uni_count <= k:
            uni_count = findGoodTuringAdjustCount( uni_count, k, uni_nc_dict)
        uni_prob = uni_count / token_len
        
        prob = (   
                  param[0]*( quad_prob ) 
                + param[1]*( tri_prob ) 
                + param[2]*( bi_prob ) 
                + param[3]*(uni_prob)
               )
       
        if prob > max_prob:
            max_prob = prob
            pred = word
    #pred is a [probability, word] pair; return '' when no prediction was made
    if pred:
        return pred
    else:
        return ''

Driver Function for Testing the Language Model


In [17]:
#return: void
#arg:string,string,dict,dict,dict,dict,dict
#Used for testing the Language Model
def trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,prob_dict):
      
    test_result = ''
    score = 0
    #load the training corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))

    start_time1 = time.time()
    
    #create the different Nc dictionaries for ngrams
    #threshold value
    k = 5
    V = len(vocab_dict)
    quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
    tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
    bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
    uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))

    #create quadgram probability dictionary (the *_prob_dict containers are notebook globals)
    findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, k)
    #create trigram probability dictionary
    findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, k)
    #create bigram probability dictionary
    findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, k)
    #sort the probability dictionaries of quad,tri and bi grams
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    #Do only when required to find the lambda value as this can take some time
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    #found earlier using Held out data
    param = [0.0,0.0,0.7999999999999999,0.20000000000000007]
    print("---Processing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))
    
    
    ### TESTING WITH TEST CORPUS
    test_data = ''
    #now load the test corpus
    with open(test_file,'r') as file :
        test_data = file.read()

    #remove punctuations from the test data
    test_data = removePunctuations(test_data)

    #split the test data into quadgram tuples
    test_token = test_data.split()
    test_quadgrams = list(ngrams(test_token,4))
    
    #choose most probable words for prediction
    start_time2 = time.time()
    score = computeTestScore(test_quadgrams, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, param, k, quad_nc_dict, tri_nc_dict,
                              bi_nc_dict, uni_nc_dict )
    print('Score:',score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time2))

    start_time3 = time.time()
    perplexity = computePerplexity(test_quadgrams, bi_dict, tri_dict, quad_dict, 
                                     vocab_dict,token_len, param, k, quad_nc_dict, tri_nc_dict,
                                      bi_nc_dict, uni_nc_dict)
    print('Perplexity:',perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time3))

main function


In [20]:
def main():
    #variable declaration
    vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
    bi_dict = defaultdict(int)             #for keeping count of sentences of two words
    tri_dict = defaultdict(int)            #for keeping count of sentences of three words
    quad_dict = defaultdict(int)           #for keeping count of sentences of four words
    quad_prob_dict = OrderedDict()              
    tri_prob_dict = OrderedDict()
    bi_prob_dict = OrderedDict()

    train_file = 'corpusfile.txt'
    #load the corpus for the dataset
    token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

    #create the different Nc dictionaries for ngrams
    #threshold value
    k = 5
    V = len(vocab_dict)
    quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
    tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
    bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
    uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))

    #create quadgram probability dictionary
    findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, k)
    #create trigram probability dictionary
    findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, k)
    #create bigram probability dictionary
    findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, k)
    #sort the probability dictionaries of quad,tri and bi grams
    sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)
    #Do only when required to find the lambda value as this can take some time
    #param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
    #found earlier using Held out data
    param = [0.7,0.1,0.1,0.1]

    ##WORD PREDICTION 

    start_time2 = time.time()
    #take user input 
    input_sen = takeInput()
    #find the most probable candidate words from the bigram, trigram and quadgram dicts
    word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)


    prediction = doInterpolatedPredictionGT(input_sen, bi_dict, tri_dict, quad_dict, 
                                 vocab_dict,token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                                  bi_nc_dict, uni_nc_dict )
    if prediction:
        print('Word Prediction:',prediction[1])
    print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))

In [21]:
if __name__ == '__main__':
    main()


Enter the string
emma by jane
Word Prediction: austen
---Time for Prediction Operation: 7.338468313217163 seconds ---

For Debugging Purposes Only

Run the two cells above for normal use; the cells below re-run the pipeline step by step and can be skipped if not debugging


In [ ]:
#variable declaration
vocab_dict = defaultdict(int)          #for storing the different words with their frequencies    
bi_dict = defaultdict(int)             #for keeping count of sentences of two words
tri_dict = defaultdict(int)            #for keeping count of sentences of three words
quad_dict = defaultdict(int)           #for keeping count of sentences of four words
quad_prob_dict = OrderedDict()              
tri_prob_dict = OrderedDict()
bi_prob_dict = OrderedDict()

print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))

For Testing the Language Model

Calculates % accuracy and perplexity
NOTE: if this cell is run, there is no need to run the cells following it


In [ ]:
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#train on the training corpus and evaluate on the test corpus
trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,quad_prob_dict)

In [ ]:
train_file = 'corpusfile.txt'
#load the corpus for the dataset
token_len = loadCorpus(train_file, bi_dict, tri_dict, quad_dict, vocab_dict)

In [ ]:
#create the different Nc dictionaries for ngrams
#threshold value
k = 5
V = len(vocab_dict)
quad_nc_dict = findFrequencyOfFrequencyCount(quad_dict, k, 4, V, len(quad_dict))
tri_nc_dict = findFrequencyOfFrequencyCount(tri_dict, k, 3, V, len(tri_dict))
bi_nc_dict = findFrequencyOfFrequencyCount(bi_dict, k, 2, V, len(bi_dict))
uni_nc_dict = findFrequencyOfFrequencyCount(vocab_dict, k, 1, V, len(vocab_dict))

In [ ]:
#create quadgram probability dictionary
findQuadgramProbGT(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict, quad_nc_dict, k)

In [ ]:
#create trigram probability dictionary
findTrigramProbGT(vocab_dict, bi_dict, tri_dict, tri_prob_dict, tri_nc_dict, k)

In [ ]:
#create bigram probability dictionary
findBigramProbGT(vocab_dict, bi_dict, bi_prob_dict, bi_nc_dict, k)

In [ ]:
#sort the probability dictionaries of quad,tri and bi grams
sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

In [ ]:
#Do only when required to find the lambda value as this can take some time
#param = estimateParameters(token_len, vocab_dict, bi_dict, tri_dict, quad_dict)
#found earlier using Held out data
param = [0.7,0.1,0.1,0.1]

In [ ]:
#FOR DEBUGGING ONLY
writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict)
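
writeProbDicts() is not defined in this section; below is a minimal sketch of what such a debug helper might look like, dumping each probability dict to a text file. The filenames and output format are assumptions, not the notebook's original helper.


In [ ]:
#ASSUMED sketch of the missing writeProbDicts() debug helper (not the original)
def writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    #dump each probability dict to its own text file for inspection
    for fname, prob_dict in [('bi_prob_dict.txt', bi_prob_dict),
                             ('tri_prob_dict.txt', tri_prob_dict),
                             ('quad_prob_dict.txt', quad_prob_dict)]:
        with open(fname, 'w') as f:
            for key in prob_dict:
                f.write(key + ' : ' + str(prob_dict[key]) + '\n')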

In [ ]:
##WORD PREDICTION 

start_time2 = time.time()
#take user input 
input_sen = takeInput()
#find the most probable candidate words from the bigram, trigram and quadgram dicts
word_choice = chooseWords(input_sen, bi_prob_dict, tri_prob_dict, quad_prob_dict)


prediction = doInterpolatedPredictionGT(input_sen, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, word_choice, param, k, quad_nc_dict, tri_nc_dict,
                              bi_nc_dict, uni_nc_dict )
if prediction:
    print('Word Prediction:',prediction[1])
print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))