In [ ]:
import os
import numpy as np
import lda
import textmining
import matplotlib.pyplot as plt

In [ ]:
# Load destemming data: maps stemmed forms back to readable words
destem = {}
with open("stopwords/destemming.txt") as f:
    for entry in f:
        entry = entry.split(':')
        if len(entry) == 2:
            destem[entry[0].strip()] = entry[1].strip()

# Characters to strip from documents: non-alphanumeric symbols plus digits
esc_chars = ''.join(c for c in map(chr, range(33, 256)) if not c.isalnum()) + '0123456789'

# Build the stop word list from the seven stop-words files
stopwords = ''
for i in range(1, 8):
    stop_file = "stopwords/stop-words_english_%d_en.txt" % i
    with open(stop_file) as myfile:
        stopwords += ' ' + myfile.read()

# Use a set for fast membership tests during filtering
stopwords = set(stopwords.split())

Requires a folder ./text/ containing .txt versions of your papers, with four-character paper IDs at the start of the filenames (hint: using multiple years of proceedings helps give more stable topics, with less overfitting).
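
If your source papers are PDFs, one way to produce those .txt files is the pdftotext command line tool (part of poppler-utils); a minimal sketch, assuming the PDFs live in a ./pdf/ folder with the same four-character IDs at the start of their filenames:

In [ ]:
import subprocess

# Hedged sketch: convert each ./pdf/*.pdf into ./text/XXXX.txt,
# where XXXX is the first four characters of the PDF's filename.
# Assumes the pdftotext binary (poppler-utils) is on the PATH.
if not os.path.exists('./text/'):
    os.makedirs('./text/')
for fname in os.listdir('./pdf/'):
    if fname.endswith('.pdf'):
        out_name = os.path.join('./text/', fname[0:4] + '.txt')
        subprocess.call(['pdftotext', os.path.join('./pdf/', fname), out_name])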


In [ ]:
# Create termdocument matrix
tdm = textmining.TermDocumentMatrix()
# Load, clean, and add each document

path_name = './text/'
pid_list = []
for fname in os.listdir(path_name):
    # print('Processing ' + fname)
    
    with open(os.path.join(path_name,fname)) as myfile:

        data = myfile.read().replace('\n', ' ')

        # Strip punctuation, symbols, and digits
        for ch in esc_chars:
            data = data.replace(ch, '')

        data = data.lower()

        # Keep words longer than three characters that are not stop words
        word_list = data.split()
        data = ' '.join(w for w in word_list if len(w) > 3 and w not in stopwords)

        # Do destemming (substring-level replacement over the whole document)
        for k, v in destem.items():
            data = data.replace(k, v)


        tdm.add_doc(data)
        pid_list.append(fname[0:4])

temp = list(tdm.rows(cutoff=1))
# First row is the vocabulary
vocab = tuple(temp[0])
# Remaining rows are per-document word counts
X = np.array(temp[1:])
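
A quick sanity check that everything lines up: one row per paper, one column per vocabulary term.

In [ ]:
# Sanity check: X should be (n_docs, n_vocab), aligned with pid_list and vocab
print(X.shape)
assert X.shape[0] == len(pid_list)
assert X.shape[1] == len(vocab)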

There's no easy way to choose n_topics for a topic model; you're looking for a balance between capturing enough paper-specific detail and overfitting to spurious features.
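
One rough guide is to sweep candidate values of n_topics and compare log likelihoods; a minimal sketch using the lda package's loglikelihood() method (the same quantity it logs during fitting), with a shorter n_iter to keep the sweep cheap:

In [ ]:
# Rough sketch: compare log likelihoods across candidate topic counts.
# Higher is better, but the curve flattens out; picking near the knee
# helps avoid overfit, paper-specific topics.
for k in [5, 7, 9, 12, 15]:
    m = lda.LDA(n_topics=k, n_iter=2000, alpha=0.1, eta=0.01, random_state=1)
    m.fit(X)
    print('n_topics=%d  loglikelihood=%.1f' % (k, m.loglikelihood()))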


In [ ]:
# Fit lda model - used to select area chairs
model = lda.LDA(n_topics=9, n_iter=25000, alpha=0.1, eta=0.01, random_state=1)
model.fit(X) 

topic_word = model.topic_word_
doc_topic = model.doc_topic_

# Show topics
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending; take the top n_top_words in descending order
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

np.savetxt("topic_word.csv", topic_word, delimiter=",")
np.savetxt("doc_topic.csv", doc_topic, delimiter=",")
np.savetxt("vocab.csv",np.array(vocab),delimiter=",", fmt="%s")
np.savetxt("filelist.csv",np.array(pid_list),delimiter=",", fmt="%s")