In [1]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
"""Global definitons"""
_start = 'S_START'
_end = 'S_END'
In [22]:
class WordItem:
    def __init__(self, word, count=0):
        self.word = word
        self.count = count
In [55]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
file_in = open(_fi)
#file_out = open(_fo,'wb')
words = [] #stores unique words encountered in the document as WordItem objects
_dict = {} #temporary dictionary to maintain count of each word
_dict['UNK'] = 0
for l in file_in:
#file_out.write(l+'\n')
l = _start+' '+l+' '+_end
split = word_tokenize(l.decode('utf-8'))
for w in split:
if len(w)==0:
continue
elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
_dict['UNK'] += 1
continue
if w not in _dict:
_dict[w] = 1
_dict[w] += 1
_vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
_vocab['UNK'] = len(words)
words.append(WordItem('UNK',_dict['UNK']))
for k,v in _dict.iteritems():
#if v > 9 and k != 'UNK':
if k != 'UNK':
_vocab[k] = len(words)
words.append(WordItem(k,v))
else:
words[0].count += 1
#cleaning up unnecessary memory
del _dict
file_in.close()
#file_out.close()
return _vocab, words
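# Illustration of what dataset() returns (added note, shapes only): _vocab maps each
# token to its row index, e.g. _vocab['UNK'] == 0, and words[i].count holds the raw
# frequency that UnigramTable() below turns into a sampling probability.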
def UnigramTable(_vocab, words):
    """ Calculates each word's sampling probability from its count raised to the 3/4 power """
    power = 0.75
    counts = np.array([words[i].count**power for i in range(len(_vocab))])
    totalFreqPow = np.sum(counts)
    unigramTable = counts / totalFreqPow  # entry i is the sampling probability of word i
    return unigramTable
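# Worked example (illustration only, not from the original run): for raw counts [9, 1]
# the 0.75 power gives 9**0.75 ≈ 5.196 and 1**0.75 = 1, so the sampling probabilities
# become ≈ [0.84, 0.16] instead of the raw [0.9, 0.1] -- frequent words are damped and
# rare words are slightly boosted, as in word2vec's negative-sampling distribution.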
def hotVector(wordIndex, vocabSize):
    """ Returns the one-hot vector representation of a word (indices are 0-based) """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex] = 1
    return hVector
def softmax(net):
    """ Softmax score: each raw output score normalized into a probability over the output layer """
    _exp = np.exp(net)
    return _exp / np.sum(_exp)
def sigmoid(net):
    """ Applies the sigmoid logistic function to net """
    return 1.0/(1 + np.exp(-net))
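# Quick numeric check (illustrative values, not from the original notebook):
# softmax(np.array([1.0, 2.0, 3.0])) ≈ [0.09, 0.245, 0.665] and sums to 1,
# while sigmoid(0) = 0.5 and sigmoid saturates towards 1 / 0 for large +/- inputs.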
def randomIdx(k, vocabSize, current):
    """ Draws k indices from the unigram table at random, weighted by each word's probability """
    global _unigramTable
    idxs = list(np.random.choice(vocabSize, k+1, False, p=_unigramTable))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
def softmaxCostGradient(net, target):
    """ Full-softmax alternative (unused stub): only prints the probability distribution """
    prob = softmax(net)
    print(prob)
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k=10):
    errorHidden = np.zeros(shape=(emb.size, 1))
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [-out[sample] for sample in negSamples]
    # error (loss) for this context word and its negative samples
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.array(_negSamples))))
    """ calculating gradients of the output vectors for both the context word and the negative
    samples, and accumulating the hidden-layer error for this context word """
    # Updating the output weight vector for the context word
    delta = actOut - 1
    errorHidden += delta * W_Output[:, context:context+1]
    W_Output[:, context:context+1] -= learningRate * np.reshape(delta * emb, (emb.size, 1))
    # Updating the output weight vectors for the negative samples
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:, sample:sample+1]
        W_Output[:, sample:sample+1] -= learningRate * np.reshape(delta * emb, (emb.size, 1))
    return errorHidden, e
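# Gradient notes (standard skip-gram negative-sampling derivation, added for clarity):
#   E = -log sigmoid(v'_context . h) - sum_{j in negSamples} log sigmoid(-v'_j . h)
#   dE/dv'_context = (sigmoid(v'_context . h) - 1) * h   -> delta = actOut - 1 above
#   dE/dv'_j       =  sigmoid(v'_j . h) * h              -> delta = sigmoid(out[sample])
#   dE/dh          = sum of delta * v' over the context word and negative samples
#                    -> accumulated in errorHidden and applied to W_Embedding in skipgram()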
def skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    """
    Called on each window with
    target: target word index
    contextWords: array of integers representing the context word indices
    """
    loss = 0
    k = 10  # number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb, W_Output)  # [1 x EmbSize].[EmbSize x VocabSize]
    #print out.shape
    EH = np.zeros(shape=(emb.size, 1))
    for context in contextWords:
        #predicted = hotVector(context, vocabSize)
        #softmaxCostGradient(out, context)
        _EH, _e = negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)
        EH += _EH
        loss += _e
    # updating the target word's input (embedding) vector with the accumulated hidden-layer error
    W_Embedding[target] -= learningRate * EH.T[0]
    return loss
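# Usage sketch (hypothetical vocabulary entries, assumes _unigramTable has already been
# built as in the next cell):
#   loss = skipgram(target=_vocab['the'], contextWords=[_vocab['S_START'], _vocab['cat']],
#                   vocabSize=len(words), learningRate=0.1,
#                   W_Embedding=W_Embedding, W_Output=W_Output)
# Each call updates W_Output in place for the context and sampled words, then W_Embedding[target].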
In [56]:
""" Creates word embeddings in vector space representation """
""" Feedforward Neural Net Language model """
#Input layer
#Projection layer
#Hidden layer
#Output layer
#Initialization
fin='/home/jazzycrazzy/MTData/English/English-small.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fout = 'testfile.txt'
_vocab, words = dataset(fin, fout)
_unigramTable = UnigramTable(_vocab, words)
learningRate = 0.1
vocabSize = len(words)
emb_size = 10
win_size = 2
target = None
epoch = 20
print _vocab
# No need of hidden layer since when the embedding matrix is multiplied with hot vector
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab
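# Sanity check (added illustration): a one-hot vector times the embedding matrix is
# just a row lookup, which is why no explicit projection layer is materialised above.
assert np.allclose(np.matmul(hotVector(_vocab['UNK'], vocabSize), W_Embedding),
                   W_Embedding[_vocab['UNK']])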
for _ in np.arange(epoch):
    totalLoss = 0
    fileIn = open(fin)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for trgtIdx, token in enumerate(tokens):
            if token not in _vocab:
                continue
            target = _vocab[token]
            contextWords = []
            cntxtIdxs = range(trgtIdx-win_size, trgtIdx+win_size+1)
            cntxtIdxs.remove(trgtIdx)
            for idx in cntxtIdxs:
                # near the first and last words of a line, window positions that fall
                # outside the sentence (or hold out-of-vocabulary words) use UNK as context
                if idx > -1 and idx < len(tokens) and tokens[idx] in _vocab:
                    contextWords.append(_vocab[tokens[idx]])
                else:
                    contextWords.append(_vocab['UNK'])
            totalLoss += skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)
    fileIn.close()
    print 'Total Loss:', totalLoss
print(W_Embedding)
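# Rough inspection of the learned vectors (added sketch, not part of the original training
# code): cosine similarity between embedding rows gives a word's nearest neighbours.
inv_vocab = dict((v, k) for k, v in _vocab.items())
def nearest(word, topn=5):
    v = W_Embedding[_vocab[word]]
    sims = np.dot(W_Embedding, v) / (np.linalg.norm(W_Embedding, axis=1) * np.linalg.norm(v) + 1e-8)
    order = [i for i in np.argsort(-sims) if i != _vocab[word]]
    return [(inv_vocab[i], sims[i]) for i in order[:topn]]
print nearest('UNK')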
In [38]:
"""print _unigramTable
print words[0].word,words[0].count
print _vocab.values()[:10]
print _vocab.keys()[:10]
print words[_vocab.get('UNK')].count
print _vocab
#print W_Embedding
fig = plt.figure()
plt.scatter(W_Embedding[:,0:1], W_Embedding[:,1:2], W_Embedding[:,2:3])
plt.show()"""
# French test sentence; in English: "The terrorist approach is now an aberration that appeals to hardly more than a minority."
y = 'La démarche terroriste est désormais une aberration qui ne séduit guère plus qu’une minorité.'
x = word_tokenize(_start+' '+y+' ')
net=np.array([-5, 2, 4, 3])
print 1/(1+np.exp(-net))
print sigmoid(net)
print np.negative(net)
print np.sum(np.log(sigmoid(np.negative(net))))