notebook.community

Edit and run



In [20]:

    
# -*- coding: utf-8 -*-
import numpy as np
from gensim.models import word2vec
import io, re
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging
 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def valid_and_separate(line):
    """Split one vocabulary line into a cleaned two-element word pair.

    The line must contain exactly one ';'. Each side has the characters
    . , % « " ) ( removed and surrounding whitespace stripped.

    Returns:
        A list ``[left, right]`` of the cleaned strings, or ``None`` when the
        line does not have exactly two fields, when either field is empty
        after cleaning, or when both cleaned fields are equal.
    """
    parts = line.split(';')
    if len(parts) != 2:
        return None

    cleaned = []
    for part in parts:
        token = re.sub(r'[\.\,\%«"\)\(]', '', part).strip()
        if not token:
            # Nothing left once punctuation and whitespace are removed.
            return None
        cleaned.append(token)

    # A pair mapping a word onto itself is rejected.
    if cleaned[0] == cleaned[1]:
        return None
    return cleaned

def readvocub(filename):
    """Load a ';'-separated word-pair file into a dictionary.

    Every line of ``filename`` (read as UTF-8) is passed through
    ``valid_and_separate``; lines it rejects are skipped.

    Args:
        filename: path of the vocabulary file to read.

    Returns:
        dict mapping the first cleaned field of each accepted line to the
        second. If the same first field occurs more than once, the last
        occurrence wins.
    """
    translations = {}
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with io.open(filename, 'r', encoding='utf8') as fin:
        for line in fin:
            pair = valid_and_separate(line)
            if pair is None:
                continue
            source_word, target_word = pair
            translations[source_word] = target_word
    return translations

def mix_text(target_file, source_file, dictionary, translate_limit, window_limit, verbose=1):
    """Copy a corpus to ``target_file``, replacing some words via ``dictionary``.

    Lines are processed in order. While the running word count is below
    ``translate_limit`` a line is tokenised on whitespace and each word found
    in ``dictionary`` is replaced by its lower-cased translation, provided
    more than ``window_limit`` words have been emitted unchanged since the
    previous replacement. Once the limit is reached, remaining lines are
    copied through unchanged.

    Args:
        target_file: text stream opened for writing (receives unicode text).
        source_file: iterable of text lines.
        dictionary: mapping from a source word to its replacement.
        translate_limit: word count after which lines are no longer translated.
        window_limit: minimum gap, in untranslated words, between two
            replacements (the gap counter carries over between lines).
        verbose: when truthy, progress is written to standard output.

    Every output line ends with a single line feed; the stream's own newline
    handling decides the on-disk terminator.
    """
    # Local import so this block does not depend on the notebook's import cell.
    import sys

    report_every = 1000
    next_report = 0
    words_count = 0
    translated_count = 0
    order_count = 0
    for line in source_file:
        # Report whenever a 1000-word boundary has been crossed; an exact
        # "count % 1000 == 0" test fires only by coincidence because the
        # counter grows by a whole line at a time.
        if verbose and words_count >= next_report:
            sys.stdout.write("\r processed words: {0}K, translated words: {1}K ".format(
                words_count // 1000, translated_count // 1000))
            sys.stdout.flush()
            next_report = (words_count // report_every + 1) * report_every

        words = line.split()
        if words_count < translate_limit:
            words_count += len(words)
            mixed = []
            for word in words:
                if word in dictionary and order_count > window_limit:
                    mixed.append(dictionary[word].lower())
                    order_count = 0
                    translated_count += 1
                else:
                    order_count += 1
                    mixed.append(word)
            # Write u'\n', not u'\r\n': a text-mode stream translates '\n' itself,
            # so an explicit '\r\n' ends up as '\r\r\n' on Windows.
            target_file.write(u' '.join(mixed) + u'\n')
        else:
            words_count += len(words)
            # The line still carries its own terminator; strip it so the copy
            # is not followed by an extra blank line.
            target_file.write(line.rstrip(u'\r\n') + u'\n')
    if verbose:
        # Finish the carriage-return progress line.
        sys.stdout.write(' \n')



In [16]:

    
# When False the mixing step is skipped; file_corpora is still defined for later cells.
mix = True
file_corpora = 'data\\rus_en_mixed_file.txt'
if mix:
    # Word-pair dictionaries, one per translation direction.
    en_ru = readvocub('data\\eng_rus_vocab.txt')
    ru_en = readvocub('data\\rus_eng_vocab.txt')
    # Both corpora are written one after the other into the same output file.
    with io.open(file_corpora, 'w', encoding='utf8') as out_file:
        with io.open('data\\eng_corpora.txt', 'r', encoding='utf8') as eng_f:
            print('Start eng')
            mix_text(out_file, eng_f, en_ru, translate_limit=11e+7, window_limit=6)
        with io.open('data\\rus_corpora.txt', 'r', encoding='utf8') as rus_f:
            print('Start rus')
            mix_text(out_file, rus_f, ru_en, translate_limit=8e+7, window_limit=6)









    



Start eng
 processed words: 267186K, translated words: 13010K  
Start rus
 processed words: 197433K, translated words: 9405K



In [22]:

    
# Set to True to load the model previously saved at model_save instead of training.
load_from_file = False
model_save = 'data\\bilingual_mixed'
if not load_from_file:
    print('Train W2V for target')
    # Stream the mixed corpus from disk, one line per sentence.
    sentences = LineSentence(file_corpora)
    model = Word2Vec(
        sentences,
        size=100,
        window=6,
        min_count=4,
        sg=1,
        workers=6,
    )
    model.save(model_save)
else:
    model = Word2Vec.load(model_save)









    



Train W2V for target