In [1]:
import pandas as pd
import os.path
#import textmining - breaks with float objects
from sklearn.feature_extraction.text import CountVectorizer
path = '../NYTimes_Data/'
dataSet = pd.read_csv(os.path.join(path, 'NYTimesBlogTrain.csv'))
In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [4]:
from gensim import corpora, models, similarities
In [11]:
documents = dataSet['Abstract'].tolist()
# cast everything to str so missing abstracts (NaN floats) do not break tokenization
documents = [str(doc) for doc in documents]
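In [ ]:
# Sanity check (not in the original notebook): missing abstracts become the
# literal string 'nan' after the str() conversion above, so it is worth knowing
# how many there are before they enter the topic model.
print(dataSet['Abstract'].isnull().sum(), 'of', len(dataSet), 'abstracts are empty')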
In [13]:
# remove common stop words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# count token frequencies across the whole corpus
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# keep only tokens that appear more than once
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
#from pprint import pprint # pretty-printer
#pprint(texts)
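In [ ]:
# Quick peek (illustrative, not part of the original run): show the first few
# preprocessed token lists to confirm that stop words and singleton tokens are gone.
for text in texts[:3]:
    print(text)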
In [14]:
dictionary = corpora.Dictionary(texts)  # map each token to an integer id
dictionary.save('/tmp/NYTimes.dict')    # persist the dictionary for later reuse
In [15]:
print(dictionary)
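In [ ]:
# Illustrative check: token2id maps each surviving token to its integer id.
# Which tokens appear first depends on the corpus; this just shows the mapping shape.
print(list(dictionary.token2id.items())[:10])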
In [16]:
corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words vectors
corpora.MmCorpus.serialize('/tmp/NYTimes.mm', corpus)  # store to disk in Matrix Market format
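In [ ]:
# Sketch of converting an unseen headline into the same bag-of-words space.
# The example sentence is made up; tokens not present in the dictionary are simply ignored.
new_doc = "new york city schools"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # list of (token_id, count) pairs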
In [20]:
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('/tmp/NYTimes.dict')
corpus = corpora.MmCorpus('/tmp/NYTimes.mm')
print(corpus)
In [23]:
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize (train) a tf-idf model on the corpus
corpus_tfidf = tfidf[corpus]       # step 2 -- wrap the corpus so documents are transformed on the fly
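In [ ]:
# Illustrative: inspect the tf-idf weights of the first few documents.
# corpus_tfidf is a streamed transformation, so iterating computes weights lazily.
for i, doc in enumerate(corpus_tfidf):
    print(doc)  # (token_id, tf-idf weight) pairs for this document
    if i >= 2:
        break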
In [24]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
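In [ ]:
# Illustrative follow-up: print the two latent topics and fold the tf-idf corpus
# into the 2-dimensional LSI space (again a lazy wrapper, computed per document).
lsi.print_topics(2)
corpus_lsi = lsi[corpus_tfidf]
for i, doc in enumerate(corpus_lsi):
    print(doc)  # (topic_id, weight) pairs
    if i >= 2:
        break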
In [ ]: