In [1]:
import pandas as pd
import os.path
# import textmining  # unused: breaks on float objects (likely the NaN entries in the Abstract column)
from sklearn.feature_extraction.text import CountVectorizer

path = '../NYTimes_Data/'

dataSet = pd.read_csv(os.path.join(path, 'NYTimesBlogTrain.csv'))

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
from gensim import corpora, models, similarities

In [11]:
documents = dataSet['Abstract'].tolist()
documents = [str(d) for d in documents]  # cast every entry to str: missing abstracts arrive as NaN floats
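
Casting with str() turns those missing abstracts into the literal token 'nan'. If that is undesirable, a minimal alternative sketch (assuming empty strings are acceptable placeholders; not what the original run did) is:

In [ ]:
# Hypothetical alternative: blank out missing abstracts before casting,
# so NaN floats never become the token 'nan'.
documents = dataSet['Abstract'].fillna('').astype(str).tolist()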

In [13]:
stoplist = set('for a of the and to in'.split())  # minimal stopword list; lowercase and split on whitespace
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# Count how often each token appears across the whole collection...
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# ...and keep only tokens that occur more than once.
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

#from pprint import pprint   # pretty-printer
#pprint(texts)

In [14]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/NYTimes.dict')

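As an aside, gensim's Dictionary can do a similar pruning itself via filter_extremes; the sketch below is an alternative to the manual once-only filter above, not something applied in this notebook, and note that no_below counts document frequency rather than raw term frequency.

In [ ]:
# Sketch of an equivalent pruning step done on the dictionary instead of the texts.
# no_below=2 drops tokens appearing in fewer than 2 documents; no_above=1.0 keeps
# even very common tokens, roughly mirroring the manual filter's behaviour.
dictionary.filter_extremes(no_below=2, no_above=1.0)
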
In [15]:
print(dictionary)


Dictionary(9269 unique tokens: [u'yellow', u'four', u'hanging', u'woody', u'cyprus']...)

In [16]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/NYTimes.mm', corpus)
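
Each entry of corpus is a sparse bag-of-words vector: a list of (token_id, count) pairs. A quick sanity check, sketched here rather than taken from the original run, is to print one vector and map a few ids back to words:

In [ ]:
# Peek at the first document's sparse vector and translate a few ids back to tokens.
print(corpus[0])
print([(dictionary[token_id], count) for token_id, count in corpus[0][:5]])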

Topic Modelling


In [20]:
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('/tmp/NYTimes.dict')
corpus = corpora.MmCorpus('/tmp/NYTimes.mm')
print(corpus)


MmCorpus(6532 documents, 9269 features, 94539 non-zero entries)

In [23]:
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize (train) the TF-IDF model on the corpus
corpus_tfidf = tfidf[corpus]       # step 2 -- use the model to re-weight the bag-of-words vectors
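
tfidf[corpus] is a lazy wrapper: the TF-IDF weights are computed on the fly as the corpus is iterated. A small sketch (not output from this notebook) to inspect the first re-weighted document:

In [ ]:
# Iterating corpus_tfidf triggers the transformation document by document.
for doc in corpus_tfidf:
    print(doc)
    break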

In [24]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
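
With the LSI model fitted, the two latent topics can be inspected and the corpus folded into the 2-dimensional LSI space. This mirrors the standard gensim tutorial flow and is a sketch rather than output from this notebook:

In [ ]:
lsi.print_topics(2)              # top-weighted words for each of the 2 latent topics
corpus_lsi = lsi[corpus_tfidf]   # bow -> tfidf -> lsi: a doubly wrapped, lazily transformed corpus
for doc in corpus_lsi:           # each document is now a point in the 2-D topic space
    print(doc)
    break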
