In [1]:
%matplotlib inline
In [45]:
import gensim
import os
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [46]:
def iter_documents(top_directory):
    '''
    Generator: iterate over all relevant documents, yielding one
    document (= stream of utf8 tokens) at a time.

    Parameters
    ----------
    top_directory : str
        Root directory; every file underneath it, however deep, is
        treated as one document.  No '.txt' filtering is done because
        our files (Enron maildir messages) have no extension.

    Yields
    ------
    generator of str
        Lower-cased tokens for one document, as produced by
        gensim.utils.tokenize.
    '''
    # Find all documents, no matter how deep under top_directory.
    for root, dirs, files in os.walk(top_directory):
        for fname in files:
            # Read each document as one big string.  The context
            # manager closes the handle; the original open().read()
            # leaked one file descriptor per document.
            with open(os.path.join(root, fname)) as fh:
                document = fh.read()
            # Break the document into utf8 tokens, lower-cased;
            # undecodable bytes are dropped (errors='ignore').
            yield gensim.utils.tokenize(document, lower=True, errors='ignore')
class TxtSubdirsCorpus(object):
    '''
    Streamed corpus: iterating over an instance yields one sparse
    bag-of-words vector per document.  Documents are processed one at
    a time via generators, so the whole corpus is never held in RAM.
    '''

    def __init__(self, top_dir):
        self.top_dir = top_dir
        # First pass over the documents: build the token -> id mapping.
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))

    def __iter__(self):
        '''
        __iter__ is itself a generator, which makes TxtSubdirsCorpus a
        re-iterable stream: each pass walks the documents again and
        converts them into sparse vectors one at a time.
        '''
        for token_stream in iter_documents(self.top_dir):
            # Map the token strings to a sparse (id, count) vector.
            yield self.dictionary.doc2bow(token_stream)
In [47]:
# that's it! the streamed corpus of sparse vectors is ready
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
search_dir = r'C:\Users\fch80_000\Temp2\Intro to Machine Learning\ud120-projects\enron_mail_20110402\maildir\allen-p\_sent_mail'
# Building the corpus makes one full pass over the files (to construct
# the dictionary in __init__); later iteration streams them again.
corpus = TxtSubdirsCorpus(search_dir)
In [41]:
#print the corpus vectors
#for vector in corpus:
# print vector
In [42]:
#from gensim.models.lsimodel import stochastic_svd as svd
#u, s = svd(corpus, rank=200, num_terms=len(corpus.dictionary), chunksize=5000)
In [49]:
# Fit a TF-IDF weighting model on the streamed bag-of-words corpus
# (this makes one pass over it to collect document frequencies).
from gensim.models import TfidfModel
tfidf = TfidfModel(corpus)
In [50]:
# Toy query in bag-of-words format: token id 0 once, token id 4 once.
vec = [(0, 1), (4, 1)]
# Transform the BoW vector into TF-IDF space and show the re-weighted vector.
print(tfidf[vec])
In [53]:
from gensim import similarities
# Build a sparse similarity index over the TF-IDF-transformed corpus.
# BUGFIX: num_features must equal the vocabulary size. The hardcoded 12
# was copied from the tiny 9-document gensim tutorial; with this Enron
# corpus it would silently drop every term with id >= 12 from the index.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(corpus.dictionary))
In [ ]:
# Cosine similarities of the query (in TF-IDF space) against every
# document in the index, as a dense array of scores.
sims = index[tfidf[vec]]
In [ ]:
# (document_number, similarity_score) pairs for the query.
print(list(enumerate(sims)))