In [1]:
%matplotlib inline
In [45]:
import gensim
import os
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [46]:
def iter_documents(top_directory):
    '''
    Generator: iterate over all relevant documents, yielding one
    document (= stream of utf8 tokens) at a time.

    Parameters
    ----------
    top_directory : str
        Root directory; every file underneath it, however deep, is
        treated as one document.  No '.txt' filtering is done because
        our files (Enron maildir messages) have no extension.

    Yields
    ------
    generator of str
        Lower-cased tokens for one document, as produced by
        gensim.utils.tokenize.
    '''
    # Find all documents, no matter how deep under top_directory.
    for root, dirs, files in os.walk(top_directory):
        for fname in files:
            # Read each document as one big string.  The context
            # manager closes the handle; the original open().read()
            # leaked one file descriptor per document.
            with open(os.path.join(root, fname)) as fh:
                document = fh.read()
            # Break the document into utf8 tokens, lower-cased;
            # undecodable bytes are dropped (errors='ignore').
            yield gensim.utils.tokenize(document, lower=True, errors='ignore')
class TxtSubdirsCorpus(object):
    '''
    Streamed corpus: iterating over an instance yields one sparse
    bag-of-words vector per document.  Documents are processed one at
    a time via generators, so the whole corpus is never held in RAM.
    '''

    def __init__(self, top_dir):
        self.top_dir = top_dir
        # First pass over the documents: build the token -> id mapping.
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))

    def __iter__(self):
        '''
        __iter__ is itself a generator, which makes TxtSubdirsCorpus a
        re-iterable stream: each pass walks the documents again and
        converts them into sparse vectors one at a time.
        '''
        for token_stream in iter_documents(self.top_dir):
            # Map the token strings to a sparse (id, count) vector.
            yield self.dictionary.doc2bow(token_stream)
In [47]:
# that's it! the streamed corpus of sparse vectors is ready
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
search_dir = r'C:\Users\fch80_000\Temp2\Intro to Machine Learning\ud120-projects\enron_mail_20110402\maildir\allen-p\_sent_mail'
# Building the corpus makes one full pass over the files (to construct
# the dictionary in __init__); later iteration streams them again.
corpus = TxtSubdirsCorpus(search_dir)
In [41]:
#print the corpus vectors
#for vector in corpus:
# print vector
In [42]:
#from gensim.models.lsimodel import stochastic_svd as svd
#u, s = svd(corpus, rank=200, num_terms=len(corpus.dictionary), chunksize=5000)
In [49]:
# Fit a TF-IDF weighting model on the streamed bag-of-words corpus
# (this makes one pass over it to collect document frequencies).
from gensim.models import TfidfModel
tfidf = TfidfModel(corpus)
In [50]:
# Toy query in bag-of-words format: token id 0 once, token id 4 once.
vec = [(0, 1), (4, 1)]
# Transform the BoW vector into TF-IDF space and show the re-weighted vector.
print(tfidf[vec])
In [53]:
from gensim import similarities
# Build a sparse similarity index over the TF-IDF-transformed corpus.
# BUGFIX: num_features must equal the vocabulary size. The hardcoded 12
# was copied from the tiny 9-document gensim tutorial; with this Enron
# corpus it would silently drop every term with id >= 12 from the index.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(corpus.dictionary))
In [ ]:
# Cosine similarities of the query (in TF-IDF space) against every
# document in the index, as a dense array of scores.
sims = index[tfidf[vec]]
In [ ]:
# (document_number, similarity_score) pairs for the query.
print(list(enumerate(sims)))