In [1]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'

In [22]:
class WordItem:
    def __init__(self,word,count=0):
        self.word = word
        self.count = count

In [55]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
    file_in = open(_fi)
    #file_out = open(_fo,'wb')

    words = [] #stores unique words encountered in the document as WordItem objects
    _dict = {} #temporary dictionary to maintain count of each word
    
    _dict['UNK'] = 0

    for l in file_in:
        #file_out.write(l+'\n')
        l = _start+' '+l+' '+_end
        split = word_tokenize(l.decode('utf-8'))
        for w in split:
            if len(w)==0:
                continue
            elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
                _dict['UNK'] += 1
                continue
            if w not in _dict:
                _dict[w] = 1
            _dict[w] += 1
            
    _vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
    _vocab['UNK'] = len(words)
    words.append(WordItem('UNK',_dict['UNK']))
    for k,v in _dict.iteritems():
        #if v > 9 and k != 'UNK':
        if k != 'UNK':
            _vocab[k] = len(words)
            words.append(WordItem(k,v))
        else:
            words[0].count += 1
    
    #cleaning up unnecessary memory
    del _dict
    file_in.close()
    #file_out.close()
    
    return _vocab, words

def UnigramTable(_vocab, words):
    """ Calculates probabilities based on count of each word present"""
    pow = 0.75
    totalFreqPow = 0.0
    unigramTable = {}
    
    l = [words[i].count**pow for i in range(len(_vocab))]
    totalFreqPow = np.sum(l)
    
    for i in range(len(_vocab)):
        unigramTable[i] = (words[i].count**pow)/totalFreqPow
    
    del l
    return unigramTable

def hotVector(wordIndex,vocabSize):
    """ Returns hot vector representation of a word """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex-1] = 1
    return hVector

def softmax(net):
    """ calculates softmax score - target score normalized with noise scores and calculated as probability"""
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def sigmoid(net):
    """ Applies sigmoid logistic function on net """
    return 1.0/(1+np.exp(-net))

def randomIdx(k, vocabSize, current):
    """ Returns k indices from with unigram table randomly with respect to each word's probablity """
    global _unigramTable
    idxs = list(np.random.choice(vocabSize, k+1, False, p = _unigramTable.values()))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
    
def softmaxCostGradient(net, target):
    prob = softmax(net)
    print(prob)
    
    
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k = 10):
    
    errorHidden = np.zeros(shape=(emb.size,1))
    
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [-out[sample] for sample in negSamples]
    
    # error for context word
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.array(_negSamples))))
    
    """ calculating gradients for output vectors for both target and negative samples
    calculating hidden layer error for each context word """
    # Updating output weight vector for context word
    delta = actOut - 1
    errorHidden += delta * W_Output[:,context:context+1]
    W_Output[:,context:context+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    # Updating output weight vectors for negative sampling
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:,sample:sample+1]
        W_Output[:,sample:sample+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    return errorHidden,e    
    
def skipgram(target,contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    
    """
    will be called on each window with
    target: Target word index
    contextWords: Arrray of integers representing context words
    """
    loss = 0
    k = 10 #Number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb,W_Output) # [1 x EmbSize].[EmbSize x VocabSize]
    #print out.shape
    _predicted = []
    EH = np.zeros(shape=(emb.size,1))
    for context in contextWords:
        #predicted = hotVector(context, vocabSize)
        #softmaxCostGradient(out,context)
        _EH,_e = negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)
        EH += _EH
        loss += _e
        #EH += sof
        
    #updating hidden layer input vector embedding
    W_Embedding[target] -= learningRate * EH.T[0]
    return loss

In [56]:
""" Creates word embeddings in vector space representation """

""" Feedforward Neural Net Language model """
#Input layer

#Projection layer

#Hidden layer

#Output layer

#Initialization
fin='/home/jazzycrazzy/MTData/English/English-small.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fout = 'testfile.txt'
_vocab, words = dataset(fin, fout)
_unigramTable = UnigramTable(_vocab, words)

learningRate = 0.1
vocabSize = len(words)
emb_size = 10
win_size = 2
target = None
epoch = 20

print _vocab


# No need of hidden layer since when the embedding matrix is multiplied with hot vector 
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab

for _ in np.arange(epoch):
    
    totalLoss = 0
    loss = 0
    
    fileIn = open(fin)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for token in tokens:
            
            loss = 0
            contextWords = []
            
            if token in _vocab:
                target = _vocab[token]
                trgtIdx = tokens.index(token)
                cntxtIdxs = range(trgtIdx-win_size, trgtIdx+win_size+1)
                cntxtIdxs.remove(trgtIdx)
                for idx in cntxtIdxs:
                    #check for first word and last word and use UNK for context words for window where words not available
                    if idx >-1 and idx < len(tokens) and tokens[idx] in _vocab:
                        contextWords = np.append(contextWords, _vocab[tokens[idx]])
                    else:
                        contextWords = np.append(contextWords, _vocab['UNK'])
                loss += skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)
        totalLoss += loss
    print 'Total Loss:',totalLoss
                

print(W_Embedding)


{u'enjoy': 1, u'S_END': 30, u'want': 3, u'tired': 4, u'ran': 5, u'is': 6, u'am': 7, u'see': 9, u'at': 10, u'have': 11, u'go': 12, u'tomorrow': 20, u'birth': 40, u'speak': 15, u'what': 17, u'how': 22, u'sun': 19, u'friends': 42, u'day': 43, u'graduate': 13, u'write': 16, u'to': 18, u'of': 52, u'enjoys': 24, u'has': 26, u'beach': 27, u'today': 28, u'?': 51, u'dad': 2, u'be': 31, u'we': 32, u'good': 33, u'read': 8, u'student': 37, u'sunny': 36, u'here': 38, u'every': 39, u'mom': 21, u'date': 41, 'UNK': 0, u'come': 23, u'you': 25, u'I': 34, u'a': 44, u'boy': 45, u'store': 46, u'your': 14, u'name': 47, u'did': 48, u'S_START': 49, u'work': 50, u'can': 29, u'night': 35, u'the': 53, u'nice': 54, u'where': 55, u'are': 56}
/home/jazzycrazzy/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:92: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/home/jazzycrazzy/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:103: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/home/jazzycrazzy/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:104: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
Total Loss: 346.691826545
Total Loss: 178.768376317
Total Loss: 183.290073389
Total Loss: 166.625463328
Total Loss: 174.908476869
Total Loss: 169.296636938
Total Loss: 170.168831662
Total Loss: 177.859704209
Total Loss: 172.712286226
Total Loss: 176.027312374
Total Loss: 174.975126312
Total Loss: 176.584253707
Total Loss: 179.330062731
Total Loss: 178.340631869
Total Loss: 168.612763771
Total Loss: 169.049138441
Total Loss: 179.039235471
Total Loss: 172.019309351
Total Loss: 180.076219617
Total Loss: 169.766303609
[[ -3.13317690e-02  -6.98069634e-01   1.58624384e-01   6.50130627e-01
   -5.34281703e-01  -3.68388641e-01   1.12118685e+00   2.34374903e-01
    1.28035319e+00  -1.69041001e-02]
 [  2.21583361e+00  -2.60910986e-01   1.84376686e+00   2.14441012e+00
   -5.56048870e-01   6.13942590e-02  -1.30338472e-01  -7.99465715e-01
    5.05656544e-01  -8.99497493e-01]
 [  6.14369935e-01   1.32862548e+00   9.84956556e-01  -4.00023679e-01
   -1.12321765e+00   1.07646048e+00  -1.63702154e+00  -9.45208321e-01
    5.91340474e-01  -2.95509120e-01]
 [  8.57937733e-01  -8.67638562e-01   2.16675248e+00  -4.82708163e-01
    8.43360609e-02   3.87687237e-01  -4.61329545e-02  -7.91125645e-01
    1.11983665e+00  -3.13326162e-03]
 [  1.35608011e+00  -7.19867407e-02   1.00392225e+00  -5.40245272e-02
   -1.12150017e+00  -2.34585857e-01  -7.87177507e-01  -1.73352308e-01
    1.34558612e+00   8.20288872e-01]
 [  1.64421541e+00  -1.71426065e+00   2.53618993e+00   5.31584679e-01
   -1.04772610e+00   2.47931140e-01  -1.21313581e+00  -9.78324976e-02
    6.08189493e-02  -1.12105154e+00]
 [  2.95170258e-01   1.42275657e+00   8.75572795e-01  -9.79468101e-01
   -5.63111827e-01  -1.01338992e-01  -9.76649060e-01  -1.03250232e+00
   -1.38543005e-01   4.92595786e-01]
 [  1.01631730e+00   1.47123218e-01   7.25129898e-01  -8.19873170e-01
    2.11666085e-02   9.41765685e-01  -9.86197162e-01  -1.27017623e+00
    2.38649475e+00  -4.52705502e-01]
 [  8.48775520e-01   5.46973220e-01   1.39512736e+00   9.39634525e-02
   -1.45275934e+00  -4.54148425e-01  -3.82829568e-01  -1.50646520e+00
    2.13622601e+00   3.64396817e-01]
 [  1.03664925e-01   1.91996120e+00   2.33702706e+00   1.52632304e+00
    4.79664726e-01   3.02114325e-02  -2.40159457e-01  -5.39953660e-01
   -9.24532262e-01  -7.04904896e-01]
 [  1.48238274e+00  -4.06731530e-01   4.81779008e-01   5.69054941e-01
    1.00030936e+00  -1.26430819e+00  -2.29322902e+00  -6.71126148e-01
    6.34093326e-01   3.98023997e-01]
 [  4.65358958e-01   1.86074393e+00   1.95450837e+00   8.67627685e-01
    1.38113955e+00   4.11727206e-01  -1.04628018e+00   2.21389519e-01
    6.57573190e-01   1.02072640e+00]
 [  1.83114813e+00   8.88726422e-01   1.32242793e+00   2.41885531e-01
    1.38906943e+00  -1.31078233e+00  -1.31423953e+00   6.12868734e-01
    8.41829432e-01  -1.07280751e+00]
 [  2.21325991e+00   6.40129127e-01   1.49640009e+00   1.86174210e+00
    1.04869985e-01   2.34355526e+00  -7.96257289e-01  -3.91391012e-01
    5.40068946e-02  -2.57325730e-01]
 [ -6.43384193e-01   1.46686838e+00   7.24040454e-01  -1.16721022e+00
   -6.83493032e-01   2.86953092e-01  -2.25477486e+00   6.01513208e-01
    1.43091629e+00  -1.15178130e+00]
 [  9.38257921e-01   6.98855460e-01   1.40709423e+00   4.19914269e-03
   -1.55170887e+00  -6.67761460e-02  -3.48848577e-01  -1.69394267e-01
    2.42486708e+00   3.80384550e-01]
 [  7.69507349e-01   3.72155629e-01   6.52274753e-01  -3.84596260e-02
   -1.36905500e+00  -3.63419041e-02  -1.34897245e+00  -3.72623387e-01
    2.20648563e+00   1.00915175e+00]
 [  3.70243397e-01   2.31201197e+00   1.42501230e+00  -4.34113838e-01
    2.04039720e-01  -1.48133754e-02  -2.06240786e+00  -1.72930906e+00
   -2.12143696e-01   2.38859381e-01]
 [  7.89522302e-01  -1.71737348e+00   1.86512991e+00   1.41429239e-01
    2.89792618e-01  -1.07609124e+00  -7.94699763e-01  -1.37567191e+00
    1.24439079e+00  -4.03218198e-01]
 [  2.26193360e+00  -1.53415090e-01   1.54339908e+00  -2.55620618e-01
   -1.47040275e+00  -8.62791423e-01  -1.80387299e+00  -7.62856311e-01
   -1.31895146e+00   3.79147431e-01]
 [  1.50697735e+00   2.90361839e-01   1.77721513e+00   1.74209049e+00
   -4.77733472e-01   8.38934114e-01  -4.09636405e-01  -6.38736008e-01
    6.29340612e-03   1.31811306e+00]
 [  4.77325162e-01   1.81245227e+00   9.01386851e-01  -4.22071294e-01
   -1.79041692e+00   1.47480554e+00  -1.32306364e+00  -6.74113508e-01
    1.08054675e+00  -6.69523411e-01]
 [ -5.84029648e-01   8.39854839e-01   2.70135241e+00   7.48099646e-01
    8.15440584e-02   1.83620743e-01  -7.27835219e-01  -1.06272693e+00
   -3.11962922e-01   9.06672384e-01]
 [  1.07950517e+00   5.43672687e-01   1.03858553e+00  -9.92885994e-01
   -2.30154157e+00  -9.11052413e-03  -4.32045719e-01  -9.56591258e-01
   -1.34807936e-01  -5.52180062e-01]
 [  8.83741604e-01  -8.65111057e-01   1.69737521e+00   1.37043559e+00
   -1.05045784e+00   5.75551668e-01  -1.91612607e+00  -5.57841740e-01
   -1.54915161e+00  -9.69261138e-02]
 [  1.42383967e+00   8.06049568e-01   1.90829461e+00   2.67194944e-01
    1.99086772e-01  -3.60306372e-01   4.48667862e-01   9.11375487e-02
   -6.57450959e-02   1.82283189e-01]
 [  1.09617311e+00   6.29143821e-01   1.04381125e+00  -6.87312773e-01
   -1.10497761e+00   4.45951270e-01  -5.73583880e-01  -1.24720833e+00
    1.91140564e-01  -1.96025178e+00]
 [  1.76496163e+00  -7.47286437e-01   1.65557783e+00   4.98887805e-01
   -3.77601741e-01  -6.28996069e-01  -6.22943231e-01  -1.01875779e+00
    4.45744109e-01  -6.22651848e-02]
 [  2.51654054e-03   1.61259380e+00   1.21453569e+00   6.43712706e-01
    3.59151933e-01   8.99167985e-01  -1.20373881e+00  -7.80379508e-01
    4.55475673e-01   6.32431209e-01]
 [  1.09879125e+00   9.28703078e-02   1.84395321e+00  -1.64290864e+00
   -3.13587949e-01   6.62214317e-01  -6.82963800e-01  -6.48964904e-01
    5.59019562e-01   8.83039737e-01]
 [ -4.94190792e-02   5.09641600e-01   1.72456173e+00   3.30836213e-01
   -1.05450390e+00   4.68676559e-02  -1.27552754e+00   8.74762464e-01
    3.51713441e-01  -6.37806611e-01]
 [  7.00781139e-01   8.16355705e-01   7.05779044e-01  -3.94687169e-01
   -9.75312702e-01   4.28711169e-01  -1.08256601e+00   1.52610587e+00
    1.43768395e+00   1.09695424e+00]
 [  4.44325215e-01   1.34724471e+00   1.62320563e+00  -1.04433580e+00
    1.92417169e-01  -1.62823017e+00  -1.57847946e-01  -2.32588035e-02
    1.82398689e+00   1.16603703e+00]
 [  8.23063652e-01   5.13376996e-01   9.16294233e-01   1.79935879e+00
    4.59511014e-01  -8.30482881e-01  -2.15418924e+00  -1.32400779e-01
    7.24643512e-01   4.65581853e-01]
 [  1.48349061e-01   5.02419124e-01   2.13684866e+00  -6.63500817e-01
   -2.40585799e-01   8.56287757e-02  -3.68036554e-01   5.69122365e-01
    1.34599010e+00   1.42144556e-01]
 [  1.68446367e+00   1.43229514e+00   9.38697104e-01   2.06556503e+00
   -2.58631826e-02   5.60543237e-01  -7.63737665e-01   1.49637916e-01
    2.04604396e+00   1.18995501e-01]
 [  1.45064205e+00   5.67377482e-01   3.77689857e-01   1.42555991e+00
    1.40058174e-01   1.09575086e-01  -2.17854705e+00  -7.21878275e-01
    4.95833480e-01   7.77985036e-02]
 [  2.02951127e+00   9.33966713e-01   1.60932692e+00   1.03414495e+00
   -1.27749740e-01  -2.58151463e-01  -9.73579226e-02   6.84249270e-01
    8.22335259e-01   1.31313058e-01]
 [  1.20225854e+00   1.20369146e+00   1.47179261e+00   6.34121023e-01
   -1.54271160e+00   1.09283928e+00   1.70227910e-01  -8.68726558e-01
    1.18435645e+00  -1.52822448e+00]
 [  9.14966379e-01  -1.98915711e-01   2.17211977e+00   9.45611740e-02
   -1.09332337e+00   4.03728066e-01  -5.83382818e-01  -1.83243539e+00
   -1.09198690e+00  -6.13255504e-01]
 [  1.50305611e+00   1.27594060e+00   6.51573599e-01  -9.96817151e-01
   -1.16584508e+00  -1.87653967e-01  -8.91979378e-01   8.51489975e-01
    1.58966538e+00  -1.11055798e+00]
 [  5.91621679e-01   7.60741891e-01   9.08868986e-02  -1.45597436e+00
   -5.14947536e-01  -1.04163923e+00  -2.84784758e+00  -5.58224399e-01
    8.55079585e-01  -9.53812874e-01]
 [  2.62386339e+00   3.61639182e-01   5.44850468e-01  -2.15158354e+00
   -3.50203942e-01   5.70491673e-01  -1.31202434e+00   3.59822535e-01
   -6.66236095e-01   1.03083301e+00]
 [  1.81118763e+00   1.68438496e-01   7.11418576e-01   2.78113341e-01
    4.87577649e-01  -9.59597147e-01  -2.32936452e+00   5.86037151e-02
   -4.09133045e-01   1.36091062e+00]
 [ -7.53776562e-01   2.53149312e-01   1.84210834e+00   1.24504703e+00
    1.58978547e+00   1.87896062e-01  -1.43818295e+00  -1.76136905e-01
    6.65501888e-01  -4.52032037e-03]
 [  1.93424647e+00  -1.83248153e-01   2.45602923e+00   2.24095521e-01
    2.76886443e-01  -5.03121316e-02  -8.98729900e-01  -8.59075279e-01
   -1.52599387e+00   8.70232784e-02]
 [  2.69126813e+00  -6.24063297e-01   1.34001453e+00   1.88072517e-01
   -9.37483167e-01  -4.32813921e-01  -1.40178573e+00  -4.38433831e-02
   -6.74666800e-01   1.60246670e-01]
 [  1.38359913e+00   1.53999744e+00   1.56274618e-01  -3.40556348e-01
    2.04164725e-01   1.06703141e+00  -2.62002010e+00  -6.16821175e-01
   -3.42491590e-01  -5.09385748e-01]
 [ -1.06223559e+00   1.45654114e+00   2.56779462e+00   1.28852361e+00
   -1.67101241e-01  -8.31174199e-01  -8.55974656e-01  -6.64422411e-01
    1.51429799e-01  -2.87600246e-01]
 [ -1.48809165e-01   3.75044756e-01   1.15093689e+00   2.87350969e-01
   -1.48705597e+00   1.28070077e-01  -1.32187523e+00  -3.35665352e-01
    7.06811562e-01   7.59461155e-01]
 [  1.45875340e+00   5.61273986e-01   4.31874325e-01   1.21913503e+00
   -6.18197521e-04  -1.47506415e+00  -1.99044104e+00  -4.91675004e-01
   -2.23361359e-02  -9.30545908e-01]
 [  1.10950841e+00   8.77741137e-01   7.12885645e-01  -5.99643556e-01
   -8.01054991e-01  -1.50812749e+00  -6.49558141e-01  -3.93122008e-01
    3.33226938e-01   4.93419670e-01]
 [  1.28101682e+00   3.23957223e+00   5.30268331e-01  -1.33988132e+00
   -5.53862833e-01  -4.32408361e-01  -1.63372980e+00   7.51588844e-01
    1.07803516e+00  -1.77290857e+00]
 [  6.12855501e-01   3.04680410e-01   1.35415998e+00   8.59789446e-01
   -6.85960358e-01  -3.78297910e-01  -5.86975699e-01  -1.91949913e+00
   -7.43823619e-01  -4.80829570e-01]
 [  1.44224712e+00   2.28144506e-01   9.24546135e-01   1.84697283e+00
    6.36589024e-01  -2.04918785e-01  -1.08745308e+00  -5.99651387e-01
    2.24337166e+00   4.89331123e-01]
 [ -7.60810846e-01   1.04468263e+00   1.47352501e+00  -5.39804408e-01
   -8.27729341e-01   2.89334332e-01  -1.73239440e+00  -1.87550166e+00
   -1.98825141e-01  -6.31878547e-01]
 [  1.15135204e+00   1.79081035e+00   2.31402247e+00   4.49961164e-01
   -5.37240288e-01   1.22412280e+00  -5.94985742e-02   8.31216501e-01
   -2.42326858e-01   5.57578847e-01]]

In [38]:
"""print _unigramTable
print words[0].word,words[0].count
print _vocab.values()[:10]
print _vocab.keys()[:10]
print words[_vocab.get('UNK')].count

print _vocab
#print W_Embedding
fig = plt.figure()
plt.scatter(W_Embedding[:,0:1], W_Embedding[:,1:2], W_Embedding[:,2:3])
plt.show()"""
y='La démarche terroriste est désormais une aberration qui ne séduit guère plus qu’une minorité.'
x = word_tokenize(_start+' '+y+' ')
net=np.array([-5, 2, 4, 3])
print 1/(1+np.exp(-net))
print sigmoid(net)
print np.negative(net)
print np.sum(np.log(sigmoid(np.negative(net))))


[ 0.00669285  0.88079708  0.98201379  0.95257413]
[ 0.00669285  0.88079708  0.98201379  0.95257413]
[ 5 -2 -4 -3]
-9.20038063902