In [ ]:
import nltk
import os
import codecs
import argparse
import numpy as np
import gensim
import itertools
import regex
from six.moves import cPickle as pickle
from os.path import isfile
from collections import Counter
from tqdm import tqdm
In [17]:
repl_dict={
'.': '||period||',
',': '||comma||',
'"': '||quotation_mark||',
';': '||semicolon||',
'!': '||exclamation_mark||',
'?': '||question_mark||',
'(': '||left_parenthesis||',
')': '||right_parenthesis||',
# '--': '||dash||',
# '\n': '||return||'
}
def preprocess(text):
"""
    Perform a simple multi-pattern replace.
    repl_dict has to be a dictionary of {text: replacement} pairs.
    Keys are treated as literal strings, not regular expressions.
"""
# build regexp
reg_exp = regex.compile("|".join(map(regex.escape, repl_dict.keys())))
# replace :)
return reg_exp.sub(lambda match: repl_dict[match.group(0)], text)
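# Quick sanity check of preprocess() on a made-up sentence (illustration only).
print(preprocess("Hello, world!"))
# -> Hello||comma|| world||exclamation_mark||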
In [35]:
#%%time
class TextCorpus(object):
"""
    Iterate over sentences from a text file, one line at a time.
"""
def __init__(self, filename, line_count=None):
self.filename = filename
self.line_count= line_count
def __iter__(self):
with codecs.open(self.filename, 'r', 'utf-8') as fin:
for line in tqdm(fin, total=self.line_count):
words = line.split()
if not words:
continue
yield words
def get_line_count(filename):
"""
    Calculate the number of lines in the file.
"""
def blocks(files, size=65536):
while True:
b = files.read(size)
if not b: break
yield b
with codecs.open(filename, 'r', 'utf-8') as f:
return sum(bl.count("\n") for bl in blocks(f))
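# Small usage example of the streaming corpus: each yielded item is a list of
# whitespace-separated tokens from one non-empty line, so the whole file never
# has to fit in memory. It assumes the plwiktionary dump used later in this
# notebook has already been converted to plain text, so it is left commented out.
# corpus = TextCorpus("data/plwiktionary-20170820-pages-articles-multistream.txt")
# for words in itertools.islice(corpus, 3):
#     print(words[:10])  # first ten tokens of each of the first three non-empty lines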
# Timing reference: parsing a 4.2 GB file took a wall time of 15 min 9 s.
def get_word_count(txt_file, overwrite=False):
"""
    Calculate word frequencies in the text file using Counter (rather than nltk.FreqDist).
    This can take several minutes for large files, so the result is pickled for faster retrieval.
"""
basename=regex.sub("-pages-articles-multistream","",txt_file[:-4])
pickle_file="{}-wf.pickle".format(basename)
if not isfile(pickle_file) or overwrite:
line_count=get_line_count(txt_file)
with codecs.open(txt_file, 'r', 'utf-8') as fin:
# memory efficient; count line by line
wordcounts = Counter()
for line in tqdm(fin, total=line_count):
wordcounts.update(line.split())
# wordcounts = Counter(itertools.chain.from_iterable([(line.split()) for line in fin]))
with open(pickle_file, 'wb') as f:
pickle.dump(wordcounts, f, pickle.HIGHEST_PROTOCOL)
else:
with open(pickle_file, 'rb') as f:
wordcounts=pickle.load(f)
return wordcounts
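# Toy illustration (synthetic counts) of the vocab_size -> min_count mapping used
# in make_wordvectors() below: keeping the k most frequent words means every kept
# word occurs at least as often as the k-th one.
_toy_counts = Counter({"a": 10, "b": 7, "c": 5, "d": 2, "e": 1})
print(_toy_counts.most_common(3)[-1])     # ('c', 5)
print(_toy_counts.most_common(3)[-1][1])  # 5 -> this value would be passed as min_count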
def get_text_file(txt_file, line_count=None):
"""
    Read the text file into memory and return it as a list of lines.
    Each line is a list of words.
    Python's list-of-lists representation is memory-inefficient:
    for large text files it can take over 10 times more memory than the source file.
    In that case, use the TextCorpus() class, which returns an iterator that you can plug into Word2Vec.
"""
#with codecs.open(txt_file, 'r', 'utf-8') as fin:
with open(txt_file, "r", encoding='utf_8') as fin:
sents = [str.split(line) for line in tqdm(fin, total=line_count)]
return sents
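# get_text_file() is only worth using for corpora that fit comfortably in memory;
# the path below is one of the smaller dumps processed later in this notebook and
# the call is left commented out so nothing is read at definition time.
# sents = get_text_file("data/plwikibooks-20170820-pages-articles-multistream.txt")
# print(len(sents), sents[0][:10])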
def make_wordvectors(txt_file, vector_size=300, window_size=5, vocab_size=50000, num_negative=5,
skip_gram=1, save_tsv=False, workers=4):
"""
Build Word2Vec from provided corpus.
"""
    # read word counts
    wordcounts = get_word_count(txt_file)
    # determine the lowest frequency that matches the requested vocabulary size
    min_count = wordcounts.most_common(vocab_size)[-1][1]  # the count of the top-kth word
# need it for progress bar only
line_count=get_line_count(txt_file)
# Use generator for larger files
if line_count > 50000000:
sentences = TextCorpus(txt_file, line_count)
else:
# for smaller files read the file up to memory
sentences = get_text_file(txt_file,line_count)
print ("Building word2vec")
model = gensim.models.Word2Vec(sentences, size=vector_size, min_count=min_count,
negative=num_negative,
window=window_size,
sg=skip_gram,
workers=workers
)
# construct filename
    word_count = sum(wordcounts.values())
basename=regex.sub("-pages-articles-multistream","",txt_file[:-4])
    basename = regex.sub("/", "/w2v-{}-{}-{}-{}-{}-".format(word_count, vocab_size, vector_size, window_size, num_negative), basename)
model_file="{}.bin".format(basename)
model.save(model_file)
if save_tsv:
# Save to tsv file
with codecs.open("{}.tsv".format(basename), 'w', 'utf-8') as fout:
            for i, word in enumerate(model.wv.index2word):
                fout.write(u"{}\t{}\t{}\n".format(i, word, np.array_str(model.wv[word])))
# fd=get_word_count("data/OpenSubtitles2016.txt")
# fd=get_word_count("data/plwikibooks-20170820-pages-articles-multistream.txt", overwrite=True)
# fd=get_word_count("data/plwiktionary-20170820-pages-articles-multistream.txt")
# fd=get_word_count("data/plwiki-20170820-pages-articles-multistream.txt")
# print(fd.most_common(1000000)[-1])
make_wordvectors("data/OpenSubtitles2016.txt", vocab_size=1000000, workers=12)
print ("Done")
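# Sketch of reloading a saved model and querying it later (assumes the OpenSubtitles
# run above has finished). The exact filename follows the
# "w2v-<word_count>-<vocab_size>-<vector_size>-<window_size>-<num_negative>-<corpus>.bin"
# pattern built in make_wordvectors(), so substitute your corpus's word count below.
# model = gensim.models.Word2Vec.load("data/w2v-<word_count>-1000000-300-5-5-OpenSubtitles2016.bin")
# print(model.wv.most_similar("film", topn=5))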
In [23]:
%%time
make_wordvectors("data/plwiktionary-20170820-pages-articles-multistream.txt")
In [25]:
%%time
make_wordvectors("data/plwikibooks-20170820-pages-articles-multistream.txt")
In [26]:
%%time
make_wordvectors("data/plwiki-20170820-pages-articles-multistream.txt")