In [1]:
import pickle
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
import pandas as pd
import time
import gzip
import logging
import operator
import gensim
from gensim.models.wrappers import FastText
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [6]:
%%time
import operator
import codecs
class MyDocuments(object):
    """Streaming iterable over a gzipped corpus.

    Each line of the file is expected to be '<doc_id>\t<text>'; iterating
    yields the text column split into a list of whitespace-separated tokens.
    """

    def __init__(self, dirname):
        # Path to the gzip-compressed corpus file.
        self.dirname = dirname

    def __iter__(self):
        # Stream line by line so the whole corpus never sits in memory.
        with gzip.open(self.dirname, 'rb') as handle:
            for raw_line in handle:
                # Drop the id column (before the tab) and tokenize the text.
                text = raw_line.decode('utf-8', 'ignore').split('\t')[1]
                yield text.split()
# Convert the gzipped tab-separated corpus into a plain-text file with one
# space-separated sentence per line — the format the fastText CLI trainer reads.
x = MyDocuments('data/eos/ngram/bigram_transformed_docs_%s.gz' % 'all')
# Builtin open() with an explicit encoding replaces the legacy codecs.open().
with open('data/eos/ngram/bigram_transformed_docs_all.txt', 'w', encoding='utf-8') as targetFile:
    for sentence_no, sentence in enumerate(x):
        # Trailing space before '\n' kept to match the original output format.
        targetFile.write(u' '.join(sentence) + u' \n')
In [7]:
%%time
def generate_fastText(corpus_path, model_file,
                      ft_path='/home/sonic/sonic/fastText/fasttext'):
    """Train a FastText model on a plain-text corpus via the external binary.

    Parameters
    ----------
    corpus_path : str
        Path to the corpus file (one space-separated sentence per line).
    model_file : str
        Destination path for the saved gensim model.
    ft_path : str
        Path to the compiled fastText executable. Default preserves the
        previously hardcoded location for backward compatibility.
    """
    print("starting epoche " + time.strftime("%H:%M:%S"))
    # gensim's wrapper shells out to the fastText binary to do the training.
    model = gensim.models.wrappers.fasttext.FastText.train(ft_path=ft_path,
                                                           corpus_file=corpus_path)
    model.save(model_file)
    print("Finished epoche " + time.strftime("%H:%M:%S"))
    # NOTE(review): assumes the wrapper-trained model exposes train_count and
    # wv.vocab as regular gensim models do — verify if upgrading gensim.
    print("{} training epochs so far".format(model.train_count))
    print("{:,} terms in the FastText EOS vocabulary.".format(len(model.wv.vocab)))
def load_fastText(model_file):
    """Load a previously saved model from disk and return it.

    init_sims(replace=True) would L2-normalize the vectors in place (saving
    memory) but discards the raw vectors, so it is left disabled here.
    """
    loaded_model = Word2Vec.load(model_file)
    return loaded_model
In [8]:
%%time
# Paths for the EOS processed corpus and the resulting model file,
# then kick off FastText training.
corpus_path = 'data/eos/ngram/bigram_transformed_docs_all.txt'
model_file = 'data/eos/fastText_model_all.model'
print(corpus_path)
generate_fastText(corpus_path, model_file)
In [9]:
# Reload the trained model from disk so it can be queried below.
model = load_fastText(model_file)
In [10]:
# print(model['trump'])
# Query the 10 nearest neighbours of 'obama' in the embedding space;
# restrict_vocab=None searches the full vocabulary. The returned
# (word, similarity) list is the cell's displayed output.
model.similar_by_word('obama', topn=10, restrict_vocab=None)
Out[10]:
In [ ]: