In [11]:
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
from sklearn.linear_model import Ridge, LinearRegression, Lars, Lasso
import gc
import io
import re
import json
import scipy.spatial.distance as distance
import gensim.utils as utils
from six import iteritems
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD


Using Theano backend.

In [2]:
def valid_and_separate(line):
    vals = line.split(';')
    if len(vals) != 2:
        return None
    for i in range(2):
        v = re.sub(r'[\.\,\%«"\)\(]', '', vals[i]).strip()
        if not v:
            return None
        vals[i] = v
    if vals[0] == vals[1]:
        return None
    return vals


def readvocub(filename):
    tr = {}
    with io.open(filename, 'r', encoding='utf8') as fin:
        for line in fin:
            vals = valid_and_separate(line)
            if vals is None:
                continue
            tr[vals[0]] = [vals[1]]
    fin.close()
    return tr

In [3]:
folder = 'data'

source_corpora = folder + '\\eng_corpora.txt'
source_model_save = folder + '\\eng_model'

target_corpora = folder + '\\rus_corpora.txt'
target_model_save = folder + '\\rus_model'

final_file= folder +'\\bilingual.bin'

vector_size = 100
load_from_file = True

In [4]:
if load_from_file:
    target_model = Word2Vec.load(target_model_save)
    # target_model = Word2Vec.load_word2vec_format('E:\\NLp\\Data\\kaz_dataset_size(200).bin', binary=True)
else:
    print 'Train W2V for target'
    sentences = LineSentence(target_corpora)
    target_model = Word2Vec(sentences, size=vector_size, window=6, min_count=4, sg=1)
    target_model.save(target_model_save)

if load_from_file:
    source_model = Word2Vec.load(source_model_save)
    # source_model = Word2Vec.load_word2vec_format('E:\\NLp\\Data\\rus_dataset2_size(200).bin', binary=True)
else:
    print 'Train W2V for source'
    sentences = LineSentence(source_corpora)
    source_model = Word2Vec(sentences, size=vector_size, window=9, min_count=10, sg=1)
    source_model.save(source_model_save)

In [5]:
source2target = {}
target2source = {}
#load word pairs
source2target = readvocub('data\\eng_rus_vocab.txt')
# target2source = readvocub('E:\\Nlp\\Data\\kazakh_news_vocub_translations.txt')

In [6]:
#rus-kaz
# with io.open('E:\\Nlp\\Data\\en_ru_kaz-dictionary.json','r',encoding='utf8') as file:
#     data = json.load(file)
#     for t in data["Translations"]:
#         index_source = str(t["Rus"])
#         index_target = str(t["Kaz"])
#         sourceword = data["Words"][index_source]["Text"].lower();
#         targetword = data["Words"][index_target]["Text"].lower();
#         if sourceword not in source2target:
#             source2target[sourceword] = []
#         elif targetword not in source2target[sourceword]:
#             source2target[sourceword].append(targetword)
#         if targetword not in target2source:
#             target2source[targetword] = []
#         elif sourceword not in target2source[targetword]:
#             target2source[targetword].append(sourceword)

In [7]:
sourcematrix=[]
targetmatrix=[]
cur = 0
for w_source in source_model.vocab:
    cur += 1
    if w_source in source2target:
        trans = source2target[w_source]
        for w_target in trans:
            if w_target in target_model.vocab:
                w_source_index = source_model.vocab[w_source].index
                w_target_index = target_model.vocab[w_target].index
                sourcematrix.append(source_model.syn0[w_source_index])
                targetmatrix.append(target_model.syn0[w_target_index])

source2target = None
target2source = None
gc.collect()
sourcematrix = np.array(sourcematrix)
targetmatrix = np.array(targetmatrix)
print('len of matricies', len(sourcematrix), len(targetmatrix))


('len of matricies', 94135, 94135)

In [10]:
r = Ridge(alpha=0.0001,random_state=241)
print 'Start ', type(r).__name__ 
r.fit(sourcematrix, targetmatrix)
l = len(sourcematrix)
#test
distances = np.zeros(l)
losses = np.zeros(l-1)
x_new = r.predict(sourcematrix)
avg = 0
for i in xrange(len(x_new)):
    dist = 1 - distance.cosine(targetmatrix[i],x_new[i])
    distances[i] = dist    
for i in xrange(l -1):
    dist1 = distance.cosine(sourcematrix[i],sourcematrix[i+1])
    dist2 = distance.cosine(x_new[i],x_new[i+1])
    losses[i] = dist1 - dist2
print 'avg:', distances.mean()
print 'std:', distances.std()
print 'best:', distances.max()
print 'worst:', distances.min()
print 'loss:', losses.mean()


Start  Ridge
avg: 0.61528204812
std: 0.135780626879
best: 0.960418843684
worst: 0.0356565778291
loss: 0.462244874494

In [16]:
model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(Dense(200, input_dim=sourcematrix.shape[1], init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(targetmatrix.shape[1], init='uniform'))
model.add(Activation('relu'))
sgd = SGD(lr=0.1, decay=1e-6)
model.compile(loss='mean_squared_error',
              optimizer=sgd)

model.fit(sourcematrix, targetmatrix,
          nb_epoch=5,
          batch_size=16, verbose=1)


Epoch 1/5
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-16-55d541b503c2> in <module>()
     14 model.fit(sourcematrix, targetmatrix,
     15           nb_epoch=5,
---> 16           batch_size=16, verbose=1)

C:\Users\aelam_000\Anaconda2\lib\site-packages\keras\models.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, **kwargs)
    400                               shuffle=shuffle,
    401                               class_weight=class_weight,
--> 402                               sample_weight=sample_weight)
    403 
    404     def evaluate(self, x, y, batch_size=32, verbose=1,

C:\Users\aelam_000\Anaconda2\lib\site-packages\keras\engine\training.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight)
   1034                               verbose=verbose, callbacks=callbacks,
   1035                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
-> 1036                               callback_metrics=callback_metrics)
   1037 
   1038     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

C:\Users\aelam_000\Anaconda2\lib\site-packages\keras\engine\training.pyc in _fit_loop(self, f, ins, out_labels, batch_size, nb_epoch, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics)
    778                     batch_logs[l] = o
    779 
--> 780                 callbacks.on_batch_end(batch_index, batch_logs)
    781 
    782                 epoch_logs = {}

C:\Users\aelam_000\Anaconda2\lib\site-packages\keras\callbacks.pyc in on_batch_end(self, batch, logs)
     58         t_before_callbacks = time.time()
     59         for callback in self.callbacks:
---> 60             callback.on_batch_end(batch, logs)
     61         self._delta_ts_batch_end.append(time.time() - t_before_callbacks)
     62         delta_t_median = np.median(self._delta_ts_batch_end)

C:\Users\aelam_000\Anaconda2\lib\site-packages\keras\callbacks.pyc in on_batch_end(self, batch, logs)
    186         # will be handled by on_epoch_end
    187         if self.verbose and self.seen < self.params['nb_sample']:
--> 188             self.progbar.update(self.seen, self.log_values)
    189 
    190     def on_epoch_end(self, epoch, logs={}):

C:\Users\aelam_000\Anaconda2\lib\site-packages\keras\utils\generic_utils.pyc in update(self, current, values)
     82             bar += ('.' * (self.width - prog_width))
     83             bar += ']'
---> 84             sys.stdout.write(bar)
     85             self.total_width = len(bar)
     86 

C:\Users\aelam_000\Anaconda2\lib\site-packages\ipykernel\iostream.pyc in write(self, string)
    315 
    316             is_child = (not self._is_master_process())
--> 317             self._buffer.write(string)
    318             if is_child:
    319                 # newlines imply flush in subprocesses

ValueError: I/O operation on closed file

In [16]:
final_source_matrix = r.predict(source_model.syn0)

#test
l = len(final_source_matrix)
distances = np.zeros(l)
losses = np.zeros(l-1)
for i in xrange(l -1):
    dist1 = distance.cosine(source_model.syn0[i],source_model.syn0[i+1])
    dist2 = distance.cosine(final_source_matrix[i],final_source_matrix[i+1])
    losses[i] = dist1 - dist2
print 'loss:', losses.mean()


loss: 0.475587342

In [17]:
#save combined vectors
with utils.smart_open(final_file, 'wb') as fout:
    fout.write(utils.to_utf8("%s %s\n" % (source_model.syn0.shape[0]+target_model.syn0.shape[0],vector_size)))
    for word, vocab in sorted(iteritems(source_model.vocab), key=lambda item: -item[1].count):
        row = source_model.syn0[vocab.index]
        fout.write(utils.to_utf8(word) + b" " + row.tostring())
    print('end source')
    for word, vocab in sorted(iteritems(target_model.vocab), key=lambda item: -item[1].count):
        row = target_model.syn0[vocab.index]
        fout.write(utils.to_utf8(word) + b" " + row.tostring())
    print('end target')


end source
end target