In [11]:
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
from sklearn.linear_model import Ridge, LinearRegression, Lars, Lasso
import gc
import io
import re
import json
import scipy.spatial.distance as distance
import gensim.utils as utils
from six import iteritems
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
In [2]:
def valid_and_separate(line):
    # expect one "source;target" pair per line; drop malformed or identical pairs
    vals = line.split(';')
    if len(vals) != 2:
        return None
    for i in range(2):
        v = re.sub(r'[\.\,\%«"\)\(]', '', vals[i]).strip()
        if not v:
            return None
        vals[i] = v
    if vals[0] == vals[1]:
        return None
    return vals

def readvocub(filename):
    # read the dictionary file into {source_word: [target_word]}
    # (only the last translation is kept if a source word occurs several times)
    tr = {}
    with io.open(filename, 'r', encoding='utf8') as fin:
        for line in fin:
            vals = valid_and_separate(line)
            if vals is None:
                continue
            tr[vals[0]] = [vals[1]]
    return tr
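The dictionary file is expected to hold one `source;target` pair per line; punctuation is stripped and identical pairs are discarded. A quick sanity check of the parser on a few hypothetical lines (the words below are illustrative only, not taken from the actual vocabulary file):
In [ ]:
# sanity check of the parser on hypothetical dictionary lines
print valid_and_separate(u'cat;кот')        # -> [u'cat', u'кот']
print valid_and_separate(u'no separator')   # -> None: no ';' in the line
print valid_and_separate(u'same;same')      # -> None: identical pair is dropped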
In [3]:
folder = 'data'
source_corpora = folder + '\\eng_corpora.txt'
source_model_save = folder + '\\eng_model'
target_corpora = folder + '\\rus_corpora.txt'
target_model_save = folder + '\\rus_model'
final_file = folder + '\\bilingual.bin'
vector_size = 100
load_from_file = True
In [4]:
# load pre-trained monolingual skip-gram models, or train them from the corpora
if load_from_file:
    target_model = Word2Vec.load(target_model_save)
    # target_model = Word2Vec.load_word2vec_format('E:\\NLp\\Data\\kaz_dataset_size(200).bin', binary=True)
else:
    print 'Train W2V for target'
    sentences = LineSentence(target_corpora)
    target_model = Word2Vec(sentences, size=vector_size, window=6, min_count=4, sg=1)
    target_model.save(target_model_save)

if load_from_file:
    source_model = Word2Vec.load(source_model_save)
    # source_model = Word2Vec.load_word2vec_format('E:\\NLp\\Data\\rus_dataset2_size(200).bin', binary=True)
else:
    print 'Train W2V for source'
    sentences = LineSentence(source_corpora)
    source_model = Word2Vec(sentences, size=vector_size, window=9, min_count=10, sg=1)
    source_model.save(source_model_save)
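Before learning the mapping, it is worth sanity-checking the monolingual models with a nearest-neighbour query in each space. A minimal sketch; the query words are arbitrary examples and are assumed to be present in the respective vocabularies:
In [ ]:
# nearest neighbours inside each monolingual space (query words are just examples)
for w, sim in source_model.most_similar(u'house', topn=5):
    print w, sim
for w, sim in target_model.most_similar(u'дом', topn=5):
    print w, sim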
In [5]:
source2target = {}
target2source = {}
#load word pairs
source2target = readvocub('data\\eng_rus_vocab.txt')
# target2source = readvocub('E:\\Nlp\\Data\\kazakh_news_vocub_translations.txt')
In [6]:
#rus-kaz
# with io.open('E:\\Nlp\\Data\\en_ru_kaz-dictionary.json', 'r', encoding='utf8') as file:
#     data = json.load(file)
#     for t in data["Translations"]:
#         index_source = str(t["Rus"])
#         index_target = str(t["Kaz"])
#         sourceword = data["Words"][index_source]["Text"].lower()
#         targetword = data["Words"][index_target]["Text"].lower()
#         if sourceword not in source2target:
#             source2target[sourceword] = []
#         if targetword not in source2target[sourceword]:
#             source2target[sourceword].append(targetword)
#         if targetword not in target2source:
#             target2source[targetword] = []
#         if sourceword not in target2source[targetword]:
#             target2source[targetword].append(sourceword)
In [7]:
# build aligned training matrices: one row per (source vector, target vector) dictionary pair
sourcematrix = []
targetmatrix = []
for w_source in source_model.vocab:
    if w_source in source2target:
        trans = source2target[w_source]
        for w_target in trans:
            if w_target in target_model.vocab:
                w_source_index = source_model.vocab[w_source].index
                w_target_index = target_model.vocab[w_target].index
                sourcematrix.append(source_model.syn0[w_source_index])
                targetmatrix.append(target_model.syn0[w_target_index])

# the dictionaries are no longer needed
source2target = None
target2source = None
gc.collect()

sourcematrix = np.array(sourcematrix)
targetmatrix = np.array(targetmatrix)
print 'len of matrices:', len(sourcematrix), len(targetmatrix)
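Note that the next cell fits and scores the regression on the same pairs, so the reported similarities are in-sample. For an out-of-sample estimate one could first hold out part of the dictionary, e.g. (a sketch; `train_test_split` lives in `sklearn.model_selection` in newer scikit-learn versions):
In [ ]:
# optional: hold out 10% of the dictionary pairs for out-of-sample evaluation
from sklearn.cross_validation import train_test_split
src_train, src_test, tgt_train, tgt_test = train_test_split(
    sourcematrix, targetmatrix, test_size=0.1, random_state=241)
print 'train pairs:', len(src_train), 'test pairs:', len(src_test)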
In [10]:
# fit a linear map from the source embedding space to the target embedding space
r = Ridge(alpha=0.0001, random_state=241)
print 'Start', type(r).__name__
r.fit(sourcematrix, targetmatrix)

# in-sample check: cosine similarity between predicted and true target vectors,
# plus how well distances between consecutive source vectors are preserved
l = len(sourcematrix)
distances = np.zeros(l)
losses = np.zeros(l - 1)
x_new = r.predict(sourcematrix)
for i in xrange(len(x_new)):
    distances[i] = 1 - distance.cosine(targetmatrix[i], x_new[i])
for i in xrange(l - 1):
    dist1 = distance.cosine(sourcematrix[i], sourcematrix[i + 1])
    dist2 = distance.cosine(x_new[i], x_new[i + 1])
    losses[i] = dist1 - dist2
print 'avg:', distances.mean()
print 'std:', distances.std()
print 'best:', distances.max()
print 'worst:', distances.min()
print 'loss:', losses.mean()
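The learned map can also be checked qualitatively: project a source word's vector into the target space and look at its nearest neighbours there, which should include its translations. A minimal sketch; the query word is an arbitrary example assumed to be in the source vocabulary:
In [ ]:
# translate a single source word by mapping its vector and searching the target space
def translate(word, topn=5):
    vec = source_model.syn0[source_model.vocab[word].index]
    mapped = r.predict(vec.reshape(1, -1))[0]
    return target_model.most_similar(positive=[mapped], topn=topn)

for w, sim in translate(u'house'):
    print w, sim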
In [16]:
# a non-linear alternative to the linear map: a one-hidden-layer MLP
model = Sequential()
# Dense(200) is a fully-connected layer with 200 hidden units;
# the first layer must be told the input dimensionality (the source embedding size).
model.add(Dense(200, input_dim=sourcematrix.shape[1], init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(targetmatrix.shape[1], init='uniform'))
# note: a relu output clips negative components of the predicted target vectors
model.add(Activation('relu'))

sgd = SGD(lr=0.1, decay=1e-6)
model.compile(loss='mean_squared_error', optimizer=sgd)
model.fit(sourcematrix, targetmatrix,
          nb_epoch=5,
          batch_size=16, verbose=1)
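The neural mapping can be scored the same way as the linear one, by comparing its predictions with the true target vectors via cosine similarity. A sketch mirroring the in-sample evaluation above (nanmean guards against all-zero predictions, which the relu output can produce):
In [ ]:
# in-sample cosine similarity for the neural mapping
nn_pred = model.predict(sourcematrix, batch_size=128)
nn_distances = np.zeros(len(nn_pred))
for i in xrange(len(nn_pred)):
    nn_distances[i] = 1 - distance.cosine(targetmatrix[i], nn_pred[i])
print 'avg:', np.nanmean(nn_distances)
print 'std:', np.nanstd(nn_distances)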
In [16]:
# map the whole source vocabulary into the target space with the linear model
final_source_matrix = r.predict(source_model.syn0)

# check how well distances between consecutive source vectors survive the mapping
l = len(final_source_matrix)
losses = np.zeros(l - 1)
for i in xrange(l - 1):
    dist1 = distance.cosine(source_model.syn0[i], source_model.syn0[i + 1])
    dist2 = distance.cosine(final_source_matrix[i], final_source_matrix[i + 1])
    losses[i] = dist1 - dist2
print 'loss:', losses.mean()
In [17]:
# save the combined bilingual vectors in word2vec binary format
with utils.smart_open(final_file, 'wb') as fout:
    fout.write(utils.to_utf8("%s %s\n" % (source_model.syn0.shape[0] + target_model.syn0.shape[0], vector_size)))
    # source words: write the *mapped* vectors so that both languages share the target space
    for word, vocab in sorted(iteritems(source_model.vocab), key=lambda item: -item[1].count):
        row = final_source_matrix[vocab.index].astype(np.float32)  # predict() returns float64
        fout.write(utils.to_utf8(word) + b" " + row.tostring())
    print('end source')
    # target words: write their original vectors
    for word, vocab in sorted(iteritems(target_model.vocab), key=lambda item: -item[1].count):
        row = target_model.syn0[vocab.index]
        fout.write(utils.to_utf8(word) + b" " + row.tostring())
    print('end target')
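The resulting file follows the word2vec binary layout gensim writes (a header line, then each word, a space, and the raw float32 bytes of its vector), so it can be loaded back and queried across languages. A sketch, assuming the mapped source vectors were written as above and no token occurs in both vocabularies:
In [ ]:
# load the combined embeddings and look for cross-lingual neighbours
bilingual = Word2Vec.load_word2vec_format(final_file, binary=True)
for w, sim in bilingual.most_similar(u'house', topn=10):
    print w, sim  # with a good mapping, Russian neighbours should appear here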