Word prediction based on Pentagram

This program reads the corpus line by line, so it is slower than the version that reads the whole corpus in one go. It reads the corpus one line at a time and loads each line into memory.

Import the required libraries


In [5]:
#%%timeit
from nltk.util import ngrams
from collections import defaultdict
import nltk
import string

Do preprocessing:

Tokenize the corpus data

Remove punctuation and lowercase the tokens


In [6]:
# N-gram frequency tables, keyed by the space-joined words of each n-gram.
quad_dict = defaultdict(int)    # count of every 4-word sequence (the prediction context)
penta_dict = defaultdict(int)   # count of every 5-word sequence (context + following word)
vocab_dict = defaultdict(int)   # number of occurrences of each distinct word in the corpus

# Up to four trailing words of the text read so far, so that n-grams which
# span a line break are still counted.
carry = []
# Words of the line currently being processed. Defined up front so the name
# exists even for an empty corpus; a later cell empties this list in place.
token = []

# Maps every punctuation character to a space, so "a--b" yields "a" and "b".
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

with open('mycorpus.txt', 'r') as file:
    for line in file:
        # Replace punctuation with spaces, lowercase, then split on whitespace.
        token = line.translate(punct_to_space).lower().split()
        if not token:
            continue    # blank or punctuation-only line: nothing to count

        # Count only this line's own words; the carried-over words were
        # already counted when their line was processed.
        for word in token:
            vocab_dict[word] += 1

        # A quadgram needs at most three words of left context ...
        for gram in ngrams(carry[-3:] + token, 4):
            quad_dict[' '.join(gram)] += 1
        # ... and a pentagram at most four.
        for gram in ngrams(carry + token, 5):
            penta_dict[' '.join(gram)] += 1

        # Slicing (unlike indexing with n-4 .. n-1) is safe when fewer than
        # four words have been read so far.
        carry = (carry + token)[-4:]

# Last four words read, left-padded with '' when the corpus held fewer.
# These names are kept so any later cell that reads them still works.
w1, w2, w3, w4 = [''] * (4 - len(carry)) + carry

Find the probability


In [12]:
def findprobability(s,w):
    """Return the relative frequency of word `w` following context `s`.

    `s` is looked up in `quad_dict` and `s + ' ' + w` in `penta_dict`; the
    result is the pentagram count divided by the quadgram count.

    Returns 0 when `s` has no recorded count, otherwise a float ratio.
    """
    # .get() avoids creating entries in the defaultdict tables on a miss.
    context_count = quad_dict.get(s, 0)
    if context_count == 0:
        return 0

    continuation_count = penta_dict.get(s + ' ' + w, 0)
    return continuation_count / context_count

Driver function for doing the prediction


In [13]:
#%%timeit
# Empty the leftover per-line token list in place; it is not needed for prediction.
token.clear()
def doPrediction(sen):
    """Print the most probable next word for the context sentence `sen`.

    `sen` is normalised by replacing every punctuation character with a
    space, lowercasing, and splitting on whitespace. The last four resulting
    words form the context, so longer input is still usable. Every word in
    `vocab_dict` is scored with `findprobability` and the highest-scoring
    one is printed.

    Returns None; the prediction is only printed.
    """
    # Same normalisation as the original per-word loop: punctuation becomes a
    # space (so "a--b" yields two words), then lowercase, then split.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    words = sen.translate(punct_to_space).lower().split()

    # Only a four-word context can match a quad_dict key; with more words,
    # keep the most recent four instead of failing to match anything.
    context = ' '.join(words[-4:])

    max_prob = 0
    # Placeholder printed when no vocabulary word has a non-zero probability.
    right_word = 'apple'

    for word in vocab_dict:
        prob = findprobability(context, word)
        # Strict '>' keeps the first word encountered among equal scores.
        if prob > max_prob:
            max_prob = prob
            right_word = word

    print('Word Prediction is :',right_word)

In [14]:
# Prompt for the context words on stdin and print the predicted next word.
sen = input('Enter four words\n')
doPrediction(sen)


Enter four words
herself--so disgustingly decorated
Word Prediction is : with