In [1]:
import pickle
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
import pandas as pd
import time
import gzip
import logging
import operator
import gensim
from gensim.models.wrappers import FastText
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
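
The `gensim.models.wrappers` import above only exists in pre-4.0 gensim (the external-binary wrappers were removed in gensim 4.x), so it is worth pinning down the version this notebook ran against. A quick check, not part of the original run:

In [ ]:
# Sanity check (sketch): the wrapper API used below assumes a 2.x-era gensim.
print(gensim.__version__)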

In [6]:
%%time 


import codecs

class MyDocuments(object):
    """Stream documents from a gzipped, tab-separated corpus file.

    Each line is expected to look like '<doc_id>\t<space-separated tokens>'.
    """
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with gzip.open(self.path, 'rb') as f:
            for line in f:
                # keep only the token field and split it into a word list
                yield line.decode('utf-8', 'ignore').split('\t')[1].split()
                
                
x = MyDocuments('data/eos/ngram/bigram_transformed_docs_%s.gz' % 'all')

# Flatten the streamed corpus to a plain-text file (one document per line),
# the whitespace-delimited format the fastText binary expects.
with codecs.open('data/eos/ngram/bigram_transformed_docs_all.txt', "w", "utf-8") as targetFile:
    for sentence in x:
        targetFile.write(u' '.join(sentence) + u'\n')


CPU times: user 21.1 s, sys: 1.21 s, total: 22.4 s
Wall time: 24.1 s
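
To confirm the flattened file looks right (a quick sketch, not part of the original run), read back the first line:

In [ ]:
# Hedged sanity check: the first line should be one space-joined document.
with codecs.open('data/eos/ngram/bigram_transformed_docs_all.txt', 'r', 'utf-8') as f:
    print(f.readline()[:200])  # first 200 characters of the first document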

In [7]:
%%time


def generate_fastText(corpus_path, model_file):

    print("starting epoch " + time.strftime("%H:%M:%S"))
    # train via the external fastText binary: the gensim wrapper shells out,
    # then loads the produced vectors back into a FastText object

    model = gensim.models.wrappers.fasttext.FastText.train(ft_path='/home/sonic/sonic/fastText/fasttext',
                                                           corpus_file=corpus_path)

    model.save(model_file)
    print("Finished epoch " + time.strftime("%H:%M:%S"))

    # train_count stays 0 because training happens in the external binary,
    # not through gensim's own train() loop
    print("{} training epochs so far".format(model.train_count))
    print("{:,} terms in the FastText EOS vocabulary.".format(len(model.wv.vocab)))

def load_fastText(model_file):
    # load the finished model from disk; Word2Vec.load unpickles the saved
    # wrapper object, though FastText.load(model_file) would be more explicit
    model = Word2Vec.load(model_file)
    # model.init_sims(replace=True)
    return model


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.3 µs
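
The call above relies on the wrapper's defaults (CBOW, 100-dimensional vectors, which matches the "(671085, 100)" matrix in the log below). If tuning is needed, the gensim 2.x wrapper exposes the usual fastText hyperparameters; the values here are illustrative, not the ones used in this run:

In [ ]:
# Sketch only (gensim 2.x wrapper API; illustrative values, not this run's):
model = gensim.models.wrappers.fasttext.FastText.train(
    ft_path='/home/sonic/sonic/fastText/fasttext',
    corpus_file=corpus_path,
    model='skipgram',   # or 'cbow', the wrapper default
    size=100,           # vector dimensionality
    window=5,           # context window
    min_count=5,        # ignore rarer words
    min_n=3, max_n=6,   # character n-gram lengths
    iter=5)             # training epochs in the external binary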

In [8]:
%%time

# Path to the flattened EOS corpus produced above
corpus_path = 'data/eos/ngram/bigram_transformed_docs_all.txt'
print(corpus_path)

model_file = 'data/eos/fastText_model_all.model'

generate_fastText(corpus_path, model_file)


data/eos/ngram/bigram_transformed_docs_all.txt
starting epoch 00:35:52
2017-07-14 01:00:10,907 : INFO : loading projection weights from /tmp/ft_model.vec
2017-07-14 01:00:51,840 : INFO : loaded (671085, 100) matrix from /tmp/ft_model.vec
2017-07-14 01:01:45,548 : INFO : saving FastText object under data/eos/fastText_model_all.model, separately None
2017-07-14 01:01:45,548 : INFO : storing np array 'syn0' to data/eos/fastText_model_all.model.wv.syn0.npy
2017-07-14 01:01:45,932 : INFO : not storing attribute syn0_all_norm
2017-07-14 01:01:45,933 : INFO : storing np array 'syn0_all' to data/eos/fastText_model_all.model.wv.syn0_all.npy
2017-07-14 01:01:48,781 : INFO : not storing attribute syn0norm
2017-07-14 01:01:51,379 : INFO : saved data/eos/fastText_model_all.model
Finished epoch 01:01:51
0 training epochs so far
671,085 terms in the FastText EOS vocabulary.
CPU times: user 1min 37s, sys: 2.03 s, total: 1min 39s
Wall time: 26min

In [9]:
model = load_fastText(model_file)


2017-07-14 01:01:52,660 : INFO : loading Word2Vec object from data/eos/fastText_model_all.model
2017-07-14 01:01:55,214 : INFO : loading wv recursively from data/eos/fastText_model_all.model.wv.* with mmap=None
2017-07-14 01:01:55,217 : INFO : loading syn0 from data/eos/fastText_model_all.model.wv.syn0.npy with mmap=None
2017-07-14 01:01:55,281 : INFO : loading syn0_all from data/eos/fastText_model_all.model.wv.syn0_all.npy with mmap=None
2017-07-14 01:01:55,539 : INFO : setting ignored attribute syn0_all_norm to None
2017-07-14 01:01:55,540 : INFO : setting ignored attribute syn0norm to None
2017-07-14 01:01:55,540 : INFO : loaded data/eos/fastText_model_all.model
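
A quick look at FastText's distinguishing feature (a sketch, not from the original run): because word vectors are composed from character n-grams, even a token absent from the vocabulary, such as the hypothetical misspelling below, still gets a vector:

In [ ]:
# Sketch: out-of-vocabulary lookup via character n-grams.
oov = 'obamma'  # hypothetical misspelling, assumed absent from the corpus
print(oov in model.wv.vocab)           # False: not a vocabulary entry
print(model[oov][:5])                  # a vector is still composed from n-grams
print(model.similarity('obama', oov))  # and similarity still works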

In [10]:
model.similar_by_word('obama', topn=10, restrict_vocab=None)


2017-07-14 01:01:57,965 : INFO : precomputing L2-norms of word weight vectors
2017-07-14 01:01:58,138 : INFO : precomputing L2-norms of ngram weight vectors
Out[10]:
[('\x97obama', 0.9707701802253723),
 ('obama`s', 0.9619391560554504),
 ('obama\x92s', 0.9568393230438232),
 ('obamas', 0.8678238987922668),
 ('obama_habló', 0.8510994911193848),
 ('sot_obama', 0.8376156687736511),
 ('obama’s', 0.8316937685012817),
 ('obamacare', 0.8098371028900146),
 ('sopel_obama', 0.8093130588531494),
 ('barack_obama', 0.7969828248023987)]
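
The near-duplicates above ('\x97obama', 'obama\x92s') look like Windows-1252 punctuation (em dash, curly apostrophe) that survived the 'utf-8', 'ignore' decode in MyDocuments as C1 control characters. A hedged cleanup pass, assuming cp1252 is the offending encoding, would merge these variants before training:

In [ ]:
# Sketch: normalize cp1252-style punctuation that leaked into the tokens.
def clean_token(token):
    # \x92 is a cp1252 right single quote; \x96/\x97 are en/em dashes
    return (token.replace(u'\x92', u"'")
                 .replace(u'\x96', u'-')
                 .replace(u'\x97', u'-'))

print(clean_token(u'obama\x92s'))  # -> obama's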
