Word prediction based on Quadgram

This program reads the corpus line by line, so it is slower than the version that reads the whole corpus in one go. Only one line of the corpus is loaded into memory at a time. It also uses encoded keys, which makes it even more memory-efficient.

Import modules

In [18]:
#import the modules necessary
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import nltk
import string
import time

# Wall-clock timestamp taken at start-up; nothing in the code visible here reads it.
start_time = time.time()

Do preprocessing:

Encode keys for dictionary storage

In [19]:
def encodeKey(s, index, vocab_dict):
    """Encode a sequence of words as a compact dictionary key.

    Every distinct word is mapped to an integer index. The key is the
    sequence of those indices, each followed by a '#' (for example
    ``'0#1#0#'``).

    Args:
        s: iterable of word tokens to encode.
        index: one-element list holding the next unused index; it is
            advanced in place whenever a new word is registered.
        vocab_dict: mapping word -> index; unseen words are added to it.

    Returns:
        The encoded key string (empty when ``s`` is empty).
    """
    pieces = []
    for token in s:
        if token not in vocab_dict:
            # First sighting of this word: hand out the next free index.
            vocab_dict[token] = index[0]
            index[0] += 1
        pieces.append(str(vocab_dict[token]))
    return ''.join(piece + '#' for piece in pieces)

Remove punctuation and lowercase the tokens

In [20]:
def removePunctuations(sen):
    """Return ``sen`` lowercased with every punctuation character removed.

    Each character in ``string.punctuation`` is replaced by a space before
    the text is re-split, so a run such as "here---so" becomes the two
    words "here so" rather than being glued together. The resulting words
    are joined with single spaces.

    Args:
        sen: the input sentence.

    Returns:
        The cleaned sentence; an empty string when no words remain.
    """
    # One translation pass replaces all punctuation; the previous per-word
    # loop never advanced its index and so only ever rewrote the first slot.
    to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    words = sen.translate(to_space).lower().split()
    return " ".join(words)

Tokenize the corpus data

In [21]:
def loadCorupus(filename, tri_dict, quad_dict, vocab_dict, index):
    """Read a corpus file line by line and count its trigrams and quadgrams.

    Each line is lowercased, its punctuation is replaced by spaces, and it
    is split into word tokens. The last words of the previous non-empty
    line are prepended so that n-grams spanning a line break are counted
    too: two words for trigrams, three for quadgrams.

    Args:
        filename: path of the corpus text file.
        tri_dict: mapping encoded trigram key -> count, updated in place
            (expected to default missing keys to 0, e.g. defaultdict(int)).
        quad_dict: mapping encoded quadgram key -> count, updated in place
            (same expectation as ``tri_dict``).
        vocab_dict: mapping word -> index, extended by ``encodeKey``.
        index: one-element list with the next free vocabulary index.

    Returns:
        None.
    """
    to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    # Up to the last three words seen so far, carried across line breaks.
    carry = []

    with open(filename, 'r') as file:
        # read the data line by line
        for line in file:
            token = line.translate(to_space).lower().split()

            # Blank (or punctuation-only) lines contribute nothing and must
            # not disturb the carried-over words.
            if not token:
                continue

            # Trigrams need the previous two words, quadgrams the previous
            # three; anything more would recount n-grams of the last line.
            tri_tokens = carry[-2:] + token
            quad_tokens = carry[-3:] + token

            # count the frequency of the trigram sentences
            for t in ngrams(tri_tokens, 3):
                tri_dict[encodeKey(t, index, vocab_dict)] += 1

            # count the frequency of the quadgram sentences
            for t in ngrams(quad_tokens, 4):
                quad_dict[encodeKey(t, index, vocab_dict)] += 1

            # Keep the last three words for the next line; slicing copes
            # with lines that have fewer than three words in total.
            carry = quad_tokens[-3:]

Find the probability

In [22]:
def findprobability(s, w, tri_dict, quad_dict):
    """Estimate how likely word ``w`` is to follow the context ``s``.

    The estimate is count(s + w) / count(s), with the numerator looked up
    in ``quad_dict`` and the denominator in ``tri_dict``.

    Args:
        s: encoded key of the context.
        w: encoded key of the candidate next word.
        tri_dict: mapping encoded context key -> count.
        quad_dict: mapping encoded (context + word) key -> count.

    Returns:
        The ratio as a float, or 0 when the context has a zero or missing
        count.
    """
    extended_key = s + w
    joint_count = quad_dict.get(extended_key, 0)
    context_count = tri_dict.get(s, 0)
    # An unseen context gives no evidence at all; also avoids dividing by zero.
    return joint_count / context_count if context_count != 0 else 0

Decode key

In [23]:
def decodeKey(s, vocab_dict):
    """Turn an encoded key back into its space-separated words.

    Args:
        s: encoded key, i.e. vocabulary indices each followed by '#'
            (for example ``'0#1#2#'``).
        vocab_dict: mapping word -> index used when the key was encoded.

    Returns:
        The decoded words joined by single spaces; an empty string when
        the key holds no indices.

    Raises:
        KeyError: if the key contains an index that is not in ``vocab_dict``.
        ValueError: if a segment of the key is not an integer.
    """
    # Invert the mapping once so each index is resolved by value, not by
    # its position in the dictionary.
    index_to_word = {idx: word for word, idx in vocab_dict.items()}

    # Every index is terminated by '#', so whatever follows the last '#'
    # is not an index and is dropped.
    segments = s.split('#')[:-1]

    words = [index_to_word[int(c)] for c in segments if c.strip()]
    return ' '.join(words)

Driver function for doing the prediction

In [24]:
def doPrediction(sen, tri_dict, quad_dict, vocab_dict, index):
    """Print the most probable next word for the sentence ``sen``.

    The sentence is lowercased, stripped of punctuation and encoded. Every
    word in the vocabulary is then scored with ``findprobability``, and the
    word with the highest probability is printed. If no word scores above
    zero, the fallback word 'apple' is printed.

    Args:
        sen: the context sentence.
        tri_dict: mapping encoded context key -> count.
        quad_dict: mapping encoded (context + word) key -> count.
        vocab_dict: mapping word -> index; words of ``sen`` that are not
            yet known are added to it by ``encodeKey``.
        index: one-element list with the next free vocabulary index.

    Returns:
        None; the prediction is written to standard output.
    """
    # remove punctuations and make it lowercase
    to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    context_words = sen.translate(to_space).lower().split()

    # encode the sentence before checking
    context_key = encodeKey(context_words, index, vocab_dict)

    max_prob = 0
    # when there is no probable word available
    right_word = 'apple'

    # Every word here is already in the vocabulary, so encodeKey does not
    # add entries while the dictionary is being iterated.
    for word in vocab_dict:
        # encode the word before checking
        candidate_key = encodeKey([word], index, vocab_dict)
        prob = findprobability(context_key, candidate_key, tri_dict, quad_dict)
        if prob > max_prob:
            max_prob = prob
            # Keep the plain word so no decoding step is needed afterwards.
            right_word = word

    print('Word Prediction is :', right_word)

In [25]:
def main(corpus_file='corpusfile.txt'):
    """Load the corpus, read a sentence from the user and predict the next word.

    The user is prompted until the cleaned input has at least three words.
    Only the last three words are used as the prediction context.

    Args:
        corpus_file: path of the corpus text file.
            NOTE(review): the original corpus path is not visible in this
            file; the default here is a placeholder and must be confirmed.

    Returns:
        None.
    """
    tri_dict = defaultdict(int)
    quad_dict = defaultdict(int)
    vocab_dict = OrderedDict()   # for mapping of words with their index ==> key:word value:index of key in dict
    index = [0]   # list for assigning index value to keys

    # The count tables must be filled before any prediction can be made.
    loadCorupus(corpus_file, tri_dict, quad_dict, vocab_dict, index)

    # take input
    while True:
        sen = input('Enter the string\n')
        sen = removePunctuations(sen)
        temp = sen.split()
        if len(temp) >= 3:
            break
        print("Please enter atleast 3 words !")

    # The trigram context is the last three words of the input.
    sen = " ".join(temp[-3:])

    doPrediction(sen, tri_dict, quad_dict, vocab_dict, index)

# Run the interactive predictor only when executed as a script.
if __name__ == '__main__':
    main()

Enter the string
emma by jane
Word Prediction is : austen