In [31]:
from gensim import corpora, models, utils, parsing
from collections import defaultdict, Counter
from pprint import pprint
import os
import re
import string
In [32]:
# base_dir = "accessible-test/"
# note: process_docs appends to this file, so remove any old copy before a fresh run
metadata_file = "aaccp-docMetadata.csv"
In [33]:
# writes a metadata line for each article and minimally cleans its text; then tokenizes it
def process_docs(base_dir):
    articleCount = 0
    papers = os.listdir(base_dir)
    for paper in papers:
        if not paper.startswith('.'):
            print("Opening: ", paper)
            issues = os.listdir(base_dir + "/" + paper)
            for issue in issues:
                if not issue.startswith('.'):
                    # print("Opening issue: ", issue)
                    # get some metadata from the issue directory name (YYYYMMDD)
                    issueYear = issue[0:4]
                    issueMonth = issue[4:6]
                    if len(issue) < 8:
                        issueDay = "01"
                    else:
                        issueDay = issue[6:8]
                    articles = os.listdir(base_dir + "/" + paper + "/" + issue)
                    for article in articles:
                        if not article.startswith('.'):
                            # print("Reading article: ", article)
                            articleFile = open(base_dir + "/" + paper + "/" + issue + "/" + article, "r")
                            articleText = articleFile.read()
                            articleFile.close()
                            # first write the metadata line
                            # in format: doc #, path to file, PAPERID, YYYY, MM, DD, TITLE, URL
                            # WILL NEED TO FIGURE OUT PAPERID, TITLE, AND URL LATER; FOR NOW JUST HOLD SPACE
                            articleMetadata = str(articleCount) + "," + paper + "/" + issue + "/" + article + ",PAPERID," + issueYear + "," + issueMonth + "," + issueDay + ",HEADLINE,URL\n"
                            # print(articleMetadata)
                            with open(metadata_file, "a") as myfile:
                                myfile.write(articleMetadata)
                            # now create the BoW for the article;
                            # ignore single-char words and words with digits in them
                            articleWords = []
                            for word in re.split(r'\W+', articleText):
                                if len(word) > 1 and not any(char.isdigit() for char in word):
                                    articleWords.append(word)
                            wordString = ' '.join(articleWords)
                            # increment the article count
                            articleCount += 1
                            yield utils.tokenize(wordString, lowercase=True)
In [34]:
class MyCorpus(object):
    def __init__(self, base_dir):
        self.base_dir = base_dir
        self.dictionary = corpora.Dictionary(process_docs(base_dir))
        self.dictionary.filter_n_most_frequent(50)  # filter the 50 most frequent words instead of using a stopword list
    def __iter__(self):
        for tokens in process_docs(self.base_dir):
            yield self.dictionary.doc2bow(tokens)
In [35]:
# corpus = MyCorpus('accessible-v4.0-small')
corpus = MyCorpus('accessible-ccp')
print("Created corpus.")
id2word = corpus.dictionary
print(id2word)
# Starting LDA
print("Starting LDA....")
# this call trains the Mallet LDA model; we then save it to disk
lda = models.wrappers.LdaMallet("/Applications/mallet-2.0.8/bin/mallet", corpus, id2word=id2word, num_topics=100, workers=6)
lda.save('aaccp-ldamodelmallet.lda')
# iterator over per-document topic distributions (also written out below)
doc_topics = lda.load_document_topics()
result = lda.show_topics(100, 100, formatted=False)
# write topics to a file
fout = open("aaccp-all_newspapers_topics.txt", "w")
for each in result:
    fout.write(str(each) + "\n")
fout.close()
# write doc topics to a file
gen = lda.read_doctopics(lda.fdoctopics())
fout = open("aaccp-all_newspapers_doc_topics.txt", "w")
for i in gen:
    fout.write(str(i) + "\n")
fout.close()
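In [ ]:
# a minimal sketch (not in the original notebook): both the dictionary and the
# trained model inherit gensim's SaveLoad, so they can be persisted now and
# reloaded in a later session to resume analysis without re-training
id2word.save('aaccp-dictionary.dict')
# id2word = corpora.Dictionary.load('aaccp-dictionary.dict')
# lda = models.wrappers.LdaMallet.load('aaccp-ldamodelmallet.lda')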
In [ ]:
# now try to pair topics to docs
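# a minimal sketch, assuming metadata row order matches Mallet's document order
# (both come from iterating process_docs the same way); pairs each document's
# strongest topic with its metadata row
import csv

topic_vecs = list(lda.read_doctopics(lda.fdoctopics()))

# process_docs appends to the metadata file on every pass over the corpus,
# so keep only the first row seen for each doc number
metadata, seen = [], set()
with open(metadata_file, "r") as f:
    for row in csv.reader(f):
        if row and row[0] not in seen:
            seen.add(row[0])
            metadata.append(row)

for row, topics in zip(metadata, topic_vecs):
    # topics is a list of (topic_id, weight) pairs; take the strongest one
    top_topic, weight = max(topics, key=lambda t: t[1])
    print(row[1], "-> topic", top_topic, "(%.3f)" % weight)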