# In [ ]:
import os
import numpy as np
import lda
import textmining
import matplotlib.pyplot as plt
# In [ ]:
# Load the destemming table. Each usable line of the file has the form
# "key: value"; both sides are stored with surrounding whitespace removed.
destem = {}
with open("stopwords/destemming.txt") as f:
    for line in f:
        parts = line.split(':')
        # Lines that do not contain exactly one ':' separator are ignored.
        if len(parts) != 2:
            continue
        key, value = parts
        destem[key.strip()] = value.strip()

# Characters to strip from the documents: every non-alphanumeric character
# with a code point in 33..255, followed by the ten ASCII digits.
non_alnum = [chr(code) for code in range(33, 256) if not chr(code).isalnum()]
esc_chars = ''.join(non_alnum) + '0123456789'

# Load the stop words: the seven numbered list files are read in order and
# their whitespace-separated entries are collected into one flat list.
stopwords = []
for i in range(1, 8):
    stop_file = "stopwords/stop-words_english_%d_en.txt" % i
    with open(stop_file) as myfile:
        stopwords.extend(myfile.read().split())
# In [ ]:
# Create the term-document matrix: one document is added per file found in
# ./text/, after cleaning, stop-word filtering and destemming.
tdm = textmining.TermDocumentMatrix()
path_name = './text/'
pid_list = []

# A set gives constant-time membership tests; the stop-word list would
# otherwise be scanned linearly once for every token of every document.
stopword_set = set(stopwords)

# os.listdir() returns names in arbitrary order. Sorting pins the document
# order so the matrix rows and pid_list are reproducible across runs and
# machines (the model below is seeded, so row order affects its output).
for fname in sorted(os.listdir(path_name)):
    with open(os.path.join(path_name, fname)) as myfile:
        data = myfile.read().replace('\n', ' ')

    # Drop every character listed in esc_chars, then lower-case the text.
    for ch in esc_chars:
        data = data.replace(ch, '')
    data = data.lower()

    # Keep only tokens longer than three characters that are not stop words.
    word_list = data.split()
    data = ' '.join([w for w in word_list
                     if len(w) > 3 and w not in stopword_set])

    # Do destemming.
    # .items() is used because .iteritems() only exists on Python 2.
    # NOTE(review): this is a plain substring replacement over the whole
    # document, so a key can also match inside a longer word and the result
    # of one replacement can be matched again by a later key -- confirm this
    # is intended rather than a per-token lookup.
    for k, v in destem.items():
        data = data.replace(k, v)

    tdm.add_doc(data)
    # The first four characters of the file name are recorded as the id of
    # the document, in the same order as the rows added to tdm.
    pid_list.append(fname[0:4])

temp = list(tdm.rows(cutoff=1))
# Extract words (first row returned by tdm.rows)
vocab = tuple(temp[0])
# Extract word freqs (remaining rows, one per document)
X = np.array(temp[1:])
# In [ ]:
# Fit lda model - used to select area chairs
model = lda.LDA(n_topics=9, n_iter=25000, alpha=0.1, eta=0.01, random_state=1)
model.fit(X)
topic_word = model.topic_word_
doc_topic = model.doc_topic_

# Show topics
n_top_words = 8
# Built once instead of on every loop iteration.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so reverse it and take the first n_top_words
    # indices. (A slice of the form [:-n_top_words:-1] stops one element
    # early and would list only n_top_words - 1 words.)
    top_indices = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[top_indices]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

# Save the fitted distributions together with the vocabulary and the
# document ids, so rows and columns of the CSV files can be labelled later.
np.savetxt("topic_word.csv", topic_word, delimiter=",")
np.savetxt("doc_topic.csv", doc_topic, delimiter=",")
np.savetxt("vocab.csv", vocab_array, delimiter=",", fmt="%s")
np.savetxt("filelist.csv", np.array(pid_list), delimiter=",", fmt="%s")