We'll be following the tutorial "Topic Modeling for Fun and Profit" by Radim Řehůřek, the author of the Gensim library, adapting its Wikipedia workflow to the TLG corpus of ancient Greek plaintext files.
In [3]:
# import and setup modules we'll be using in this notebook
import logging
import itertools
import os
import pickle
import numpy as np
import gensim
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO # ipython sometimes messes up the logging setup; restore
def head(stream, n=10):
    """Convenience function: return the first `n` elements of the stream, as a plain list."""
    return list(itertools.islice(stream, n))
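As a quick illustration (not part of the original notebook), `head` lets us peek at the first few items of any generator without exhausting the whole stream:

# purely illustrative: a throwaway generator stands in for a token stream
print(head((x * x for x in range(1000)), 3))  # [0, 1, 4]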
In [4]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
# from gensim.parsing.preprocessing import STOPWORDS
from cltk.stop.greek.stops import STOPS_LIST
# normalize the CLTK stopwords the same way we will tokenize the texts
# (lowercased, diacritics stripped), so membership checks in `tokenize()` actually match
STOPS_LIST = [simple_preprocess(stop, deacc=True)[0]
              for stop in STOPS_LIST
              if len(simple_preprocess(stop, deacc=True)) > 0]
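A small sanity check (illustrative only; the exact list contents depend on the installed CLTK version): `simple_preprocess` with `deacc=True` lowercases, tokenizes, and strips diacritics, so an accented form like καὶ collapses to και.

print(simple_preprocess("καὶ", deacc=True))  # should print ['και']
print(STOPS_LIST[:10])                        # inspect a few normalized stopwords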
In [6]:
def tokenize(text):
    # https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
    tokens = simple_preprocess(text, deacc=True)
    return [token for token in tokens if token not in STOPS_LIST]
def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens
def iter_tlg(tlg_dir):
    """Yield each plaintext TLG file in `tlg_dir`, as a `(file_name, tokens)` 2-tuple."""
    file_names = os.listdir(tlg_dir)
    for file_name in file_names:
        file_path = os.path.join(tlg_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read)
        # ignore short docs
        if len(tokens) < 50:
            continue
        yield file_name, tokens
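As a quick, purely illustrative check, `tokenize` on a short phrase (the opening of the sample sentence used further down) should lowercase, strip diacritics, and drop whatever forms appear in `STOPS_LIST`:

sample = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς"
print(tokenize(sample))  # e.g. ['ποιητικης', 'ειδων'], if the particles and pronouns are stopwords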
In [7]:
#stream = iter_wiki('./data/simplewiki-20140623-pages-articles.xml.bz2')
tlg_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
stream = iter_tlg(tlg_preprocessed)
In [8]:
for title, tokens in itertools.islice(iter_tlg(tlg_preprocessed), 8):
    print(title, tokens[:10])  # print the file name and its first ten tokens
In [9]:
doc_stream = (tokens for _, tokens in iter_tlg(tlg_preprocessed))
In [10]:
%time id2word_tlg = gensim.corpora.Dictionary(doc_stream)
print(id2word_tlg)
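To peek at what actually went into the vocabulary (illustrative only; the tokens and ids depend on the texts on disk), we can inspect a few entries of the dictionary's token-to-id mapping:

print(head(id2word_tlg.token2id.items(), 5))  # a few (token, id) pairs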
In [11]:
# this cutoff might lose too much info, we'll see
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_tlg.filter_extremes(no_below=20, no_above=0.1)
print(id2word_tlg)
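Note that, if I recall Gensim's defaults correctly, `filter_extremes` also caps the vocabulary at the `keep_n=100000` most frequent tokens and compactifies the id mapping, so bag-of-words vectors must be built only after this step. A quick check of the surviving vocabulary size:

print(len(id2word_tlg))  # number of token ids remaining after filtering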
In [12]:
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_tlg.doc2bow(tokenize(doc))
print(bow)
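The raw ids are hard to read; mapping them back through the dictionary (a small convenience sketch, not in the original) shows which surviving tokens the sample sentence contributed:

print([(id2word_tlg[word_id], count) for word_id, count in bow])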
In [13]:
print(id2word_tlg[6880], id2word_tlg[12323])
In [14]:
# Save for reuse
# can also use `id2word_tlg.save(os.path.expanduser('~/cltk_data/user_data/tlg_bow_id2word.dict'))`
with open(os.path.expanduser('~/cltk_data/user_data/tlg_bow_id2word.dict'), 'wb') as file_open:
    pickle.dump(id2word_tlg, file_open)
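To reload the dictionary in a later session, the mirror image of the cell above:

with open(os.path.expanduser('~/cltk_data/user_data/tlg_bow_id2word.dict'), 'rb') as file_open:
    id2word_tlg = pickle.load(file_open)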
In [31]:
# kept from the original Wikipedia tutorial for reference; not used below
class WikiCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` Wikipedia documents from file `dump_file`.
        Yield each document in turn, as a bag-of-words vector built with `dictionary`.
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs

    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        return self.clip_docs
class TLGCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """Stream the TLG plaintext directory `dump_file`, yielding each document
        as a bag-of-words vector built with `dictionary`; `clip_docs` limits how many."""
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs

    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_tlg(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        # note: this is None unless `clip_docs` was given explicitly
        return self.clip_docs
In [32]:
# create a stream of bag-of-words vectors
tlg_corpus = TLGCorpus(tlg_preprocessed, id2word_tlg)
vector = next(iter(tlg_corpus))
print(vector) # print the first vector in the stream
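Since `__iter__` records titles as it goes, after pulling that first vector the corresponding file name is already available (assuming at least one document passed the length filter):

print(tlg_corpus.titles[0])  # file name of the document behind `vector`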
In [30]:
# get titles: iterate once through the corpus so `tlg_corpus.titles` is fully populated
# (they could then be pickled to disk alongside the dictionary, as above)
tlg_corpus = TLGCorpus(tlg_preprocessed, id2word_tlg)
for item in tlg_corpus:
    pass  # each `item` is a bag-of-words vector; we only need the side effect on `titles`
In [30]:
# what is the most common word in that first document?
most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
print(id2word_tlg[most_index], most_count)
In [40]:
from gensim.corpora.mmcorpus import MmCorpus
In [43]:
# Save BoW
user_dir = os.path.expanduser('~/cltk_data/user_data/')
try:
    os.makedirs(user_dir)
except FileExistsError:
    pass
bow_path = os.path.join(user_dir, 'bow_lda_gensim.mm')
%time gensim.corpora.MmCorpus.serialize(bow_path, tlg_corpus)
In [44]:
mm_corpus = gensim.corpora.MmCorpus(bow_path)
print(mm_corpus)
In [45]:
print(next(iter(mm_corpus)))
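The serialized Matrix Market file can now be streamed from disk in any later session without re-tokenizing the TLG texts; a quick check of its dimensions, using the attributes Gensim's `MmCorpus` exposes:

print(len(mm_corpus), 'documents,', mm_corpus.num_terms, 'distinct terms')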