In [5]:
#%%timeit
from nltk.util import ngrams
from collections import defaultdict
import nltk
import string
In [6]:
# N-gram statistics gathered from the corpus. Keys are the words of the n-gram
# joined by single spaces; values are occurrence counts.
quad_dict = defaultdict(int)   # 4-word sequences
penta_dict = defaultdict(int)  # 5-word sequences
vocab_dict = defaultdict(int)  # every distinct word -> number of occurrences

CORPUS_PATH = 'mycorpus.txt'

# Translation table that turns every punctuation character into a space, so a
# word such as "don't" is tokenised as "don" and "t".
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Trailing words (at most four) of the text read so far. They are prepended to
# the next line so that n-grams spanning a line break are counted as well.
context = []
# Bound before the loop so the name exists even when the corpus has no words.
token = []

with open(CORPUS_PATH, 'r') as corpus_file:
    for line in corpus_file:
        words = line.translate(_PUNCT_TO_SPACE).lower().split()
        if not words:
            continue

        # Count vocabulary on this line's own words only; the carried-over
        # context was already counted when its own line was read.
        for word in words:
            vocab_dict[word] += 1

        # A quadgram needs three words of history, a pentagram needs four.
        # Giving the quadgrams only three carried words avoids re-counting the
        # quadgram that lies entirely inside the previous tail.
        token = context + words
        for gram in ngrams(context[-3:] + words, 4):
            quad_dict[' '.join(gram)] += 1
        for gram in ngrams(token, 5):
            penta_dict[' '.join(gram)] += 1

        # Slicing (instead of indexing token[n - 4] .. token[n - 1]) stays
        # correct when fewer than four words are available: no IndexError and
        # no negative-index wrap-around.
        context = token[-4:]

# Legacy names for the carried-over words ('' where no word was available),
# kept so that any cell still reading w1..w4 continues to work.
w1, w2, w3, w4 = ([''] * 4 + context)[-4:]
In [12]:
def findprobability(s,w):
    """Estimate how likely word ``w`` is to follow the four-word context ``s``.

    The estimate is count(s + ' ' + w) in ``penta_dict`` divided by count(s)
    in ``quad_dict``. Returns 0 when ``s`` has no recorded occurrences.
    """
    # .get() is used so that probing a missing key never inserts it.
    context_count = quad_dict.get(s, 0)
    if not context_count:
        return 0
    continuation = s + ' ' + w
    return penta_dict.get(continuation, 0) / context_count
In [13]:
#%%timeit
# The corpus cell leaves its last token list in `token`; empty it in place.
token.clear()
def doPrediction(sen, fallback='apple'):
    """Print the most probable next word after the context in ``sen``.

    ``sen`` is normalised the same way as the corpus (punctuation replaced by
    spaces, lower-cased, split on whitespace). Only the last four words are
    used as context, because ``findprobability`` looks the context up among
    four-word sequences; longer input would otherwise never match.

    Every word in ``vocab_dict`` is scored with ``findprobability`` and the
    first word with the highest non-zero score is printed. If no word scores
    above zero, ``fallback`` is printed instead.

    Parameters
    ----------
    sen : str
        Text typed by the user; ideally four words.
    fallback : str, optional
        Word printed when no candidate has a non-zero probability.
    """
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    words = sen.translate(to_space).lower().split()
    context = ' '.join(words[-4:])

    right_word = fallback
    max_prob = 0
    for word in vocab_dict:
        prob = findprobability(context, word)
        # Strict comparison keeps the earliest word on ties.
        if prob > max_prob:
            max_prob = prob
            right_word = word
    print('Word Prediction is :',right_word)
In [14]:
# Read a context from the user and print the predicted next word.
sen = input('Enter four words\n')
doPrediction(sen)