In [19]:
from pymongo import MongoClient
import numpy as np
import gensim, cython, codecs, os, logging, re, json
from gensim import models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [20]:
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "ns"

DATA_DIR = '/home/nipg1/Documents/summer_project/data/'+ collection_name
LOG_DIR = '/home/nipg1/Documents/summer_project/data/'+ collection_name + '/embedding_logs'

out_filename = collection_name + "_model"

stopwords = ['the', 'be', 'and', 'of', 'a', 'an', 'in', 'to', 'or', 'have', 'has',
            'it', 'i', 'that', 'for', 'you', 'he', 'with', 'on', 'do','say',
            'this', 'they', 'at', 'but', 'we', 'rt', '']

stopword_patterns = ['www\.[^ ]+\.[^ ]+', 'https?:\/\/[^ ]+', '([@#])([a-z\d_]+)', '^\d$'] #links, mentions, hashtags, numbers
word_endings = u'.,!?:…"\'/-\n'

cores = 4
word_dimension = 200

In [21]:
stopword_pattern = re.compile("|".join(["^" + stopword + "$" for stopword in stopwords] + stopword_patterns))
stemmer = SnowballStemmer("english")

In [22]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]

In [23]:
def word_filter(word):
    word = word.rstrip(word_endings).lower()
    word = re.sub(stopword_pattern, '', word)
    word = word.rstrip(word_endings)
    word = stemmer.stem(word)
    return word

In [28]:
class MySentences(object):
    def __init__(self, key, separator):
        # self.search_filter = json.loads(search_filter)
        self.separator = separator
        self.key = key
 
    def __iter__(self):
        for tweet in collection.find({"text": {"$exists": "true"}}, {'text': 1}).limit(150000):
            yield filter(None, [ word_filter(word) for word in tweet[self.key].split(self.separator)])
 
sentences = MySentences('text', separator) # a memory-friendly iterator
model = models.Word2Vec(sentences, workers=cores, size=word_dimension)


2017-07-27 14:50:28,374 : INFO : collecting all words and their counts
2017-07-27 14:50:28,870 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-07-27 14:50:30,725 : INFO : PROGRESS: at sentence #10000, processed 96661 words, keeping 6679 word types
2017-07-27 14:50:32,525 : INFO : PROGRESS: at sentence #20000, processed 192361 words, keeping 9966 word types
2017-07-27 14:50:34,886 : INFO : PROGRESS: at sentence #30000, processed 295571 words, keeping 12648 word types
2017-07-27 14:50:36,821 : INFO : PROGRESS: at sentence #40000, processed 397984 words, keeping 14916 word types
2017-07-27 14:50:38,743 : INFO : PROGRESS: at sentence #50000, processed 499634 words, keeping 16793 word types
2017-07-27 14:50:41,487 : INFO : PROGRESS: at sentence #60000, processed 600317 words, keeping 18794 word types
2017-07-27 14:50:43,359 : INFO : PROGRESS: at sentence #70000, processed 700450 words, keeping 20757 word types
2017-07-27 14:50:45,252 : INFO : PROGRESS: at sentence #80000, processed 797954 words, keeping 22751 word types
2017-07-27 14:50:47,701 : INFO : PROGRESS: at sentence #90000, processed 892941 words, keeping 24882 word types
2017-07-27 14:50:49,554 : INFO : PROGRESS: at sentence #100000, processed 988386 words, keeping 26839 word types
2017-07-27 14:50:51,235 : INFO : PROGRESS: at sentence #110000, processed 1079936 words, keeping 28640 word types
2017-07-27 14:50:53,664 : INFO : PROGRESS: at sentence #120000, processed 1168807 words, keeping 30415 word types
2017-07-27 14:50:55,388 : INFO : PROGRESS: at sentence #130000, processed 1262564 words, keeping 31871 word types
2017-07-27 14:50:57,119 : INFO : PROGRESS: at sentence #140000, processed 1356267 words, keeping 33469 word types
2017-07-27 14:50:58,876 : INFO : collected 34954 word types from a corpus of 1450160 raw words and 150000 sentences
2017-07-27 14:50:58,876 : INFO : Loading a fresh vocabulary
2017-07-27 14:50:58,901 : INFO : min_count=5 retains 7001 unique words (20% of original 34954, drops 27953)
2017-07-27 14:50:58,902 : INFO : min_count=5 leaves 1412866 word corpus (97% of original 1450160, drops 37294)
2017-07-27 14:50:58,919 : INFO : deleting the raw counts dictionary of 34954 items
2017-07-27 14:50:58,925 : INFO : sample=0.001 downsamples 51 most-common words
2017-07-27 14:50:58,926 : INFO : downsampling leaves estimated 1299733 word corpus (92.0% of prior 1412866)
2017-07-27 14:50:58,927 : INFO : estimated required memory for 7001 words and 200 dimensions: 14702100 bytes
2017-07-27 14:50:58,953 : INFO : resetting layer weights
2017-07-27 14:50:59,020 : INFO : training model with 4 workers on 7001 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-07-27 14:51:00,080 : INFO : PROGRESS: at 0.26% examples, 17226 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:01,263 : INFO : PROGRESS: at 0.96% examples, 29708 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:02,436 : INFO : PROGRESS: at 1.67% examples, 32279 words/s, in_qsize 1, out_qsize 0
2017-07-27 14:51:03,440 : INFO : PROGRESS: at 2.37% examples, 34768 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:04,653 : INFO : PROGRESS: at 3.15% examples, 36450 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:05,736 : INFO : PROGRESS: at 3.80% examples, 37159 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:06,810 : INFO : PROGRESS: at 4.06% examples, 34351 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:07,828 : INFO : PROGRESS: at 4.70% examples, 35514 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:09,148 : INFO : PROGRESS: at 5.36% examples, 35417 words/s, in_qsize 0, out_qsize 1
2017-07-27 14:51:10,147 : INFO : PROGRESS: at 6.01% examples, 36172 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:11,695 : INFO : PROGRESS: at 6.80% examples, 35957 words/s, in_qsize 0, out_qsize 1
2017-07-27 14:51:13,003 : INFO : PROGRESS: at 7.72% examples, 37036 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:14,308 : INFO : PROGRESS: at 8.25% examples, 36355 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:15,365 : INFO : PROGRESS: at 9.05% examples, 37104 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:16,680 : INFO : PROGRESS: at 9.72% examples, 36877 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:17,812 : INFO : PROGRESS: at 10.55% examples, 37550 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:19,027 : INFO : PROGRESS: at 11.25% examples, 37593 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:20,584 : INFO : PROGRESS: at 11.95% examples, 36991 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:21,688 : INFO : PROGRESS: at 12.65% examples, 37061 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:22,852 : INFO : PROGRESS: at 13.49% examples, 37500 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:23,988 : INFO : PROGRESS: at 14.21% examples, 37587 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:25,096 : INFO : PROGRESS: at 14.96% examples, 37710 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:26,208 : INFO : PROGRESS: at 15.72% examples, 37816 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:27,228 : INFO : PROGRESS: at 16.01% examples, 37087 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:28,540 : INFO : PROGRESS: at 16.87% examples, 37332 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:29,733 : INFO : PROGRESS: at 17.72% examples, 37593 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:30,884 : INFO : PROGRESS: at 18.28% examples, 37361 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:31,956 : INFO : PROGRESS: at 19.15% examples, 37843 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:32,925 : INFO : PROGRESS: at 19.84% examples, 38043 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:33,941 : INFO : PROGRESS: at 20.12% examples, 37456 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:34,956 : INFO : PROGRESS: at 20.81% examples, 37670 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:36,074 : INFO : PROGRESS: at 21.52% examples, 37758 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:37,184 : INFO : PROGRESS: at 22.08% examples, 37607 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:38,304 : INFO : PROGRESS: at 22.89% examples, 37850 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:39,316 : INFO : PROGRESS: at 23.53% examples, 37979 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:40,748 : INFO : PROGRESS: at 24.05% examples, 37537 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:41,822 : INFO : PROGRESS: at 24.69% examples, 37647 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:42,826 : INFO : PROGRESS: at 25.22% examples, 37604 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:44,080 : INFO : PROGRESS: at 26.00% examples, 37746 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:45,154 : INFO : PROGRESS: at 26.66% examples, 37834 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:46,232 : INFO : PROGRESS: at 27.32% examples, 37914 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:47,877 : INFO : PROGRESS: at 27.98% examples, 37536 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:48,984 : INFO : PROGRESS: at 28.77% examples, 37777 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:50,136 : INFO : PROGRESS: at 29.44% examples, 37796 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:51,200 : INFO : PROGRESS: at 30.11% examples, 37933 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:52,297 : INFO : PROGRESS: at 30.68% examples, 37793 words/s, in_qsize 0, out_qsize 1
2017-07-27 14:51:53,524 : INFO : PROGRESS: at 31.52% examples, 37927 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:54,665 : INFO : PROGRESS: at 31.94% examples, 37630 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:55,756 : INFO : PROGRESS: at 32.64% examples, 37703 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:56,804 : INFO : PROGRESS: at 33.34% examples, 37813 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:57,948 : INFO : PROGRESS: at 34.05% examples, 37815 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:51:58,960 : INFO : PROGRESS: at 34.80% examples, 37926 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:52:00,241 : INFO : PROGRESS: at 35.56% examples, 37865 words/s, in_qsize 0, out_qsize 0
2017-07-27 14:52:01,480 : INFO : PROGRESS: at 36.00% examples, 37545 words/s, in_qsize 0, out_qsize 0
----------------------------------------------------------
KeyboardInterrupt        Traceback (most recent call last)
<ipython-input-28-fe6dc319a367> in <module>()
     10 
     11 sentences = MySentences('text', separator) # a memory-friendly iterator
---> 12 model = models.Word2Vec(sentences, workers=cores, size=word_dimension)

/home/nipg1/.local/lib/python2.7/site-packages/gensim/models/word2vec.pyc in __init__(self, sentences, size, alpha, window, min_count, max_vocab_size, sample, seed, workers, min_alpha, sg, hs, negative, cbow_mean, hashfxn, iter, null_word, trim_rule, sorted_vocab, batch_words)
    477             self.build_vocab(sentences, trim_rule=trim_rule)
    478             self.train(sentences, total_examples=self.corpus_count, epochs=self.iter,
--> 479                        start_alpha=self.alpha, end_alpha=self.min_alpha)
    480         else :
    481             if trim_rule is not None :

/home/nipg1/.local/lib/python2.7/site-packages/gensim/models/word2vec.pyc in train(self, sentences, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay)
    920 
    921         while unfinished_worker_count > 0:
--> 922             report = progress_queue.get()  # blocks if workers too slow
    923             if report is None:  # a thread reporting that it finished
    924                 unfinished_worker_count -= 1

/usr/lib/python2.7/Queue.pyc in get(self, block, timeout)
    166             elif timeout is None:
    167                 while not self._qsize():
--> 168                     self.not_empty.wait()
    169             elif timeout < 0:
    170                 raise ValueError("'timeout' must be a non-negative number")

/usr/lib/python2.7/threading.pyc in wait(self, timeout)
    333             raise RuntimeError("cannot wait on un-acquired lock")
    334         waiter = _allocate_lock()
--> 335         waiter.acquire()
    336         self.__waiters.append(waiter)
    337         saved_state = self._release_save()

KeyboardInterrupt: 

In [29]:
model.save(os.path.join(DATA_DIR, out_filename))


2017-07-25 17:38:00,712 : INFO : saving Word2Vec object under /home/nipg1/Documents/summer_project/data/models/ns_model, separately None
2017-07-25 17:38:00,713 : INFO : not storing attribute syn0norm
2017-07-25 17:38:00,713 : INFO : storing np array 'syn0' to /home/nipg1/Documents/summer_project/data/models/ns_model.wv.syn0.npy
2017-07-25 17:38:00,740 : INFO : storing np array 'syn1neg' to /home/nipg1/Documents/summer_project/data/models/ns_model.syn1neg.npy
2017-07-25 17:38:00,768 : INFO : not storing attribute cum_table
2017-07-25 17:38:01,150 : INFO : saved /home/nipg1/Documents/summer_project/data/models/ns_model

In [119]:
model = models.Word2Vec.load(os.path.join(DATA_DIR, out_filename))


2017-07-26 13:51:11,109 : INFO : loading Word2Vec object from /home/nipg1/Documents/summer_project/data/models/ns_model
2017-07-26 13:51:11,262 : INFO : loading wv recursively from /home/nipg1/Documents/summer_project/data/models/ns_model.wv.* with mmap=None
2017-07-26 13:51:11,263 : INFO : loading syn0 from /home/nipg1/Documents/summer_project/data/models/ns_model.wv.syn0.npy with mmap=None
2017-07-26 13:51:11,285 : INFO : setting ignored attribute syn0norm to None
2017-07-26 13:51:11,286 : INFO : loading syn1neg from /home/nipg1/Documents/summer_project/data/models/ns_model.syn1neg.npy with mmap=None
2017-07-26 13:51:11,311 : INFO : setting ignored attribute cum_table to None
2017-07-26 13:51:11,312 : INFO : loaded /home/nipg1/Documents/summer_project/data/models/ns_model

In [ ]: