# In [20]:
# -*- coding: utf-8 -*-
import numpy as np
from gensim.models import word2vec
import io, re
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Punctuation removed from both fields of a vocabulary entry.  The pattern is
# a unicode literal so the guillemets are matched as characters rather than
# as their raw UTF-8 bytes on Python 2, and the closing guillemet is listed
# next to the opening one so quoted entries are stripped on both sides.
_VOCAB_PUNCTUATION = re.compile(u'[.,%«»"()]')


def valid_and_separate(line):
    """Split a ``source;target`` vocabulary line into a cleaned word pair.

    The line must contain exactly one ``;``.  Punctuation listed in
    ``_VOCAB_PUNCTUATION`` is removed from each side and surrounding
    whitespace (including the trailing newline) is stripped.

    Args:
        line: one text line of the vocabulary file.

    Returns:
        A two-item list ``[source, target]``, or ``None`` when the line does
        not have exactly two fields, when either field is empty after
        cleaning, or when both cleaned fields are identical.
    """
    fields = line.split(';')
    if len(fields) != 2:
        return None

    cleaned = [_VOCAB_PUNCTUATION.sub('', field).strip() for field in fields]

    # An empty side carries no translation, and an identical pair would
    # "translate" a word into itself.
    if not all(cleaned) or cleaned[0] == cleaned[1]:
        return None
    return cleaned
def readvocub(filename):
    """Load a ``source;target`` vocabulary file into a dictionary.

    The file is read as UTF-8, one entry per line.  Each line is parsed with
    ``valid_and_separate``; lines it rejects are skipped.  When a source word
    occurs more than once, the last entry in the file wins.

    Args:
        filename: path of the vocabulary file.

    Returns:
        A dict mapping each cleaned source word to its cleaned target word.
    """
    translations = {}
    # The ``with`` block closes the file, so no explicit close() is needed.
    with io.open(filename, 'r', encoding='utf8') as fin:
        for line in fin:
            pair = valid_and_separate(line)
            if pair is not None:
                source_word, target_word = pair
                translations[source_word] = target_word
    return translations
def mix_text(target_file, source_file, dictionary, translate_limit, window_limit, verbose=1):
    """Copy a corpus to ``target_file``, replacing some words by translations.

    Lines are processed in order.  While the number of words read so far is
    below ``translate_limit``, a word is replaced by the lower-cased
    ``dictionary[word]`` when it is a key of ``dictionary`` and more than
    ``window_limit`` words have been emitted untranslated since the previous
    replacement (this counter carries over from line to line).  Once the
    limit is reached, the remaining lines are copied through unchanged.

    Every output line is terminated by a single ``'\\n'``; a text-mode file
    applies the platform newline translation itself.

    Args:
        target_file: writable text stream that receives the mixed corpus.
        source_file: iterable of text lines.
        dictionary: mapping from a source word to its translation.
        translate_limit: words to read before translation is switched off;
            checked once per line, before that line is counted.
        window_limit: minimum number of untranslated words required between
            two replacements (a replacement needs strictly more than this).
        verbose: when truthy, progress is written to standard output.

    Returns:
        None.
    """
    import sys  # local import keeps the block self-contained

    progress_step = 1000
    words_count = 0
    translated_count = 0
    order_count = 0  # words emitted untranslated since the last replacement
    next_report = 0

    for line in source_file:
        # The counter advances by a whole line at a time, so report when it
        # crosses a multiple of progress_step instead of requiring it to land
        # exactly on one.
        if verbose and words_count >= next_report:
            sys.stdout.write(
                "\r processed words: {0}K, translated words: {1}K".format(
                    words_count // progress_step,
                    translated_count // progress_step))
            sys.stdout.flush()
            next_report = (words_count // progress_step + 1) * progress_step

        words = line.split()
        if words_count < translate_limit:
            mixed = []
            for word in words:
                if order_count > window_limit and word in dictionary:
                    mixed.append(dictionary[word].lower())
                    order_count = 0
                    translated_count += 1
                else:
                    mixed.append(word)
                    order_count += 1
            target_file.write(u' '.join(mixed) + u'\n')
        else:
            # The line already carries its own terminator; normalise it so
            # the output does not gain an extra blank line.
            target_file.write(line.rstrip(u'\r\n') + u'\n')
        words_count += len(words)

    if verbose:
        # Finish the in-place progress line.
        sys.stdout.write(' \n')
# In [16]:
# When False the mixing step is skipped and only the path below is defined.
mix = True
file_corpora = 'data\\rus_en_mixed_file.txt'
if mix:
    # Word pairs for each translation direction.
    en_ru = readvocub('data\\eng_rus_vocab.txt')
    ru_en = readvocub('data\\rus_eng_vocab.txt')

    # Both corpora are written into one output file, English first.
    with io.open(file_corpora, 'w', encoding='utf8') as out_file:
        with io.open('data\\eng_corpora.txt', 'r', encoding='utf8') as eng_f:
            print('Start eng')
            mix_text(out_file, eng_f, en_ru,
                     translate_limit=11e+7, window_limit=6)

        with io.open('data\\rus_corpora.txt', 'r', encoding='utf8') as rus_f:
            print('Start rus')
            mix_text(out_file, rus_f, ru_en,
                     translate_limit=8e+7, window_limit=6)
# In [22]:
# Set to True to reuse the model stored at ``model_save`` instead of training.
load_from_file = False
model_save = 'data\\bilingual_mixed'
if not load_from_file:
    print('Train W2V for target')
    # The mixed corpus file is read one sentence per line.
    sentences = LineSentence(file_corpora)
    model = Word2Vec(sentences,
                     size=100,
                     window=6,
                     min_count=4,
                     sg=1,
                     workers=6)
    # Persist the freshly trained model.
    model.save(model_save)
else:
    model = Word2Vec.load(model_save)