In [ ]:
import nltk
import os
import codecs
import argparse
import numpy as np
import gensim
import itertools
import regex
from six.moves import cPickle as pickle
from os.path import isfile
from collections import Counter
from tqdm import tqdm

In [17]:
repl_dict={    
    '.': '||period||',
    ',': '||comma||',
    '"': '||quotation_mark||',
    ';': '||semicolon||',
    '!': '||exclamation_mark||',
    '?': '||question_mark||',
    '(': '||left_parenthesis||',
    ')': '||right_parenthesis||',
#     '--': '||dash||',
    '?': '||question_mark||',
#     '\n': '||return||'
}


def preprocess(text):
    """
    Perform a simple multiple replace.
    repl_dict has to be a dictionary i replace patterns.
    The key cannot contain patterns.
    """
    # build regexp
    reg_exp = regex.compile("|".join(map(regex.escape, repl_dict.keys())))

    # replace :)
    return reg_exp.sub(lambda match: repl_dict[match.group(0)], text)

In [35]:
#%%time


class TextCorpus(object):
    """
    Iterate over sentences from the Text file.
    """
    
    def __init__(self, filename, line_count=None):
        self.filename = filename
        self.line_count= line_count

    def __iter__(self):
        with codecs.open(self.filename, 'r', 'utf-8') as fin:
            for line in tqdm(fin, total=self.line_count):
                words = line.split()
                if not words:
                    continue
                yield words
            

def get_line_count(filename):
    """
    Calculate numer of lines in the file
    """
    def blocks(files, size=65536):
        while True:
            b = files.read(size)
            if not b: break
            yield b

    with codecs.open(filename, 'r', 'utf-8') as f:
        return sum(bl.count("\n") for bl in blocks(f))

# Parsing 4.2GB in Wall time: 15min 9s
def get_word_count(txt_file, overwrite=False):
    """
    Calculate number of words in text file using Counter (rather than nltk.FreqDist).
    It can take several minutes for large files so it picles the results for faster retrieval.
    """
    basename=regex.sub("-pages-articles-multistream","",txt_file[:-4])
    pickle_file="{}-wf.pickle".format(basename)

    if not isfile(pickle_file) or overwrite:
        line_count=get_line_count(txt_file)
        with codecs.open(txt_file, 'r', 'utf-8') as fin:
            # memory efficient; count line by line
            wordcounts = Counter()
            for line in tqdm(fin, total=line_count):
                wordcounts.update(line.split())
#             wordcounts = Counter(itertools.chain.from_iterable([(line.split()) for line in fin]))
        with open(pickle_file, 'wb') as f:
            pickle.dump(wordcounts, f, pickle.HIGHEST_PROTOCOL)
    else:
        with open(pickle_file, 'rb') as f:
            wordcounts=pickle.load(f)
    return wordcounts


def get_text_file(txt_file, line_count=None):
    """
    Read text file up into memory and return it as list of lines.
    Each line is a list of words.
    The python implementation for list of lists is super inefficient. 
    For large text files it can take over 10 times more memory than the source file.
    In such case use TextCorpus() class that returns iterator that you can plug in to Word2Vec
    """
    #with codecs.open(txt_file, 'r', 'utf-8') as fin:
    with open(txt_file, "r", encoding='utf_8') as fin:
        sents = [str.split(line) for line in tqdm(fin, total=line_count)]
    return sents

def make_wordvectors(txt_file, vector_size=300, window_size=5, vocab_size=50000, num_negative=5,
                     skip_gram=1, save_tsv=False, workers=4):
    """
    Build Word2Vec from provided corpus.
    
    """
    # read word counters
    wordcouns=get_word_count(txt_file)
    # determine the lowest frequency to match vocabulary size requirement
    min_count = wordcouns.most_common(vocab_size)[-1][1] # the count of the the top-kth word
    
    # need it for progress bar only
    line_count=get_line_count(txt_file)
    
    # Use generator for larger files
    if line_count > 50000000:
        sentences = TextCorpus(txt_file, line_count)
    else:
        # for smaller files read the file up to memory
        sentences = get_text_file(txt_file,line_count)
    
    print ("Building word2vec")
    model = gensim.models.Word2Vec(sentences, size=vector_size, min_count=min_count,
                                   negative=num_negative, 
                                   window=window_size,
                                   sg=skip_gram,
                                   workers=workers
                                  )
    # construct filename
    word_count=sum(wordcouns.values())
    basename=regex.sub("-pages-articles-multistream","",txt_file[:-4])
    basename=regex.sub("\/","/w2v-{}-{}-{}-{}-{}-".format(word_count, vocab_size, vector_size, window_size, num_negative), basename)
    model_file="{}.bin".format(basename)
    model.save(model_file)
    
    if save_tsv:
        # Save to tsv file
        with codecs.open("{}.tsv".format(basename), 'w', 'utf-8') as fout:
            for i, word in enumerate(model.wv.index2word):
                fout.write(u"{}\t{}\t{}\n".format(str(i), word.encode('utf8').decode('utf8'),
                                                  np.array_str(model[word])
                                                  ))



# fd=get_word_count("data/OpenSubtitles2016.txt")
# fd=get_word_count("data/plwikibooks-20170820-pages-articles-multistream.txt", overwrite=True)
# fd=get_word_count("data/plwiktionary-20170820-pages-articles-multistream.txt")
# fd=get_word_count("data/plwiki-20170820-pages-articles-multistream.txt")
# print(fd.most_common(1000000)[-1])


make_wordvectors("data/OpenSubtitles2016.txt", vocab_size=1000000, workers=12)


print ("Done")


100%|██████████| 142788125/142788125 [15:25<00:00, 154364.77it/s]
  0%|          | 20255/142788125 [00:00<11:44, 202536.67it/s]
Building word2vec
100%|██████████| 142788125/142788125 [13:50<00:00, 171843.53it/s]
100%|██████████| 142788125/142788125 [33:24<00:00, 71238.18it/s] 
100%|██████████| 142788125/142788125 [33:15<00:00, 71543.32it/s] 
100%|██████████| 142788125/142788125 [33:24<00:00, 71250.01it/s] 
100%|██████████| 142788125/142788125 [33:25<00:00, 71202.84it/s] 
100%|██████████| 142788125/142788125 [33:20<00:00, 71365.81it/s] 
Done

In [23]:
%%time
make_wordvectors("data/plwiktionary-20170820-pages-articles-multistream.txt")


100%|██████████| 136078/136078 [00:00<00:00, 261478.91it/s]
CPU times: user 6min 31s, sys: 352 ms, total: 6min 31s
Wall time: 42.1 s

In [25]:
%%time
make_wordvectors("data/plwikibooks-20170820-pages-articles-multistream.txt")


100%|██████████| 115922/115922 [00:00<00:00, 239687.63it/s]
CPU times: user 6min 32s, sys: 284 ms, total: 6min 33s
Wall time: 42 s

In [26]:
%%time
make_wordvectors("data/plwiki-20170820-pages-articles-multistream.txt")


100%|██████████| 10327370/10327370 [00:44<00:00, 233709.68it/s]
CPU times: user 7h 59min 1s, sys: 13.6 s, total: 7h 59min 14s
Wall time: 50min 2s

In [ ]: