In [31]:
from gensim import corpora, models, utils, parsing
from collections import defaultdict, Counter
from pprint import pprint
import os
import re
import string

In [32]:
# base_dir = "accessible-test/" 
metadata_file = "aaccp-docMetadata.csv"

In [33]:
# writes one metadata line per article (when write_metadata=True), minimally
# cleans each article file, and yields its tokens; the flag keeps repeated
# passes over the corpus from appending duplicate metadata rows
def process_docs(base_dir, write_metadata=False):

    articleCount = 0
    papers = os.listdir(base_dir)

    for paper in papers:
        if not paper.startswith('.'):
            print("Opening: ", paper)
        
            issues = os.listdir(os.path.join(base_dir, paper))
        
            for issue in issues:
                if not issue.startswith('.'):
                    # print("Opening issue: ", issue)

                    # get date metadata from the issue's YYYYMMDD directory name
                    issueYear = issue[0:4]
                    issueMonth = issue[4:6]
                    if len(issue) < 8:
                        issueDay = "01"
                    else:
                        issueDay = issue[6:8]

                    articles = os.listdir(os.path.join(base_dir, paper, issue))

                    for article in articles:
                        if not article.startswith('.'):
                            # print("Reading article: ", article)
                            with open(os.path.join(base_dir, paper, issue, article), "r") as articleFile:
                                articleText = articleFile.read()
                        
                            # first write the metadata line
                            # in format: doc #, path to file, PAPERID, YYYY, MM, DD, TITLE, URL  
                            # WILL NEED TO FIGURE OUT PAPERID, TITLE, AND URL LATER; FOR NOW JUST HOLD SPACE
                            if write_metadata:
                                articleMetadata = str(articleCount) + "," + paper + "/" + issue + "/" + article + ",PAPERID," + issueYear + "," + issueMonth + "," + issueDay + ",HEADLINE,URL\n"

                                # print(articleMetadata)
                                with open(metadata_file, "a") as myfile:
                                    myfile.write(articleMetadata)
                        
                            # now create the BoW for the article
                            articleWords = []
               
                            # ignore single-char words and words with numbers in them                        
                            for word in re.split(r'\W+', articleText):
                                if len(word) > 1 and not any(char.isdigit() for char in word):
                                    articleWords.append(word)
                        
                            wordString = ' '.join(articleWords)
                        
                            # increment the article count
                            articleCount += 1
                            yield utils.tokenize(wordString, lowercase=True)
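
A quick sanity check of the cleaning rules on a made-up sample string (illustrative, not from the corpus):

In [ ]:
# same filtering as process_docs: drop single-char words and words containing digits
sample = "The 3rd Annual Meeting, held in 1847, drew a large crowd."
kept = [w for w in re.split(r'\W+', sample)
        if len(w) > 1 and not any(ch.isdigit() for ch in w)]
print(list(utils.tokenize(' '.join(kept), lowercase=True)))
# -> ['the', 'annual', 'meeting', 'held', 'in', 'drew', 'large', 'crowd']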

In [34]:
class MyCorpus(object):
    def __init__(self, base_dir):
        self.base_dir = base_dir
        # the dictionary-building pass also writes the metadata file, once
        self.dictionary = corpora.Dictionary(process_docs(base_dir, write_metadata=True))
        self.dictionary.filter_n_most_frequent(50) # filter the 50 most frequent tokens instead of using a stopword list
                                             
    def __iter__(self):
        for tokens in process_docs(self.base_dir):
            yield self.dictionary.doc2bow(tokens)
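
Each pass over MyCorpus streams one sparse bag-of-words vector per article, so the full corpus never has to sit in memory. A minimal way to eyeball the stream, assuming a small hypothetical test directory (building the dictionary re-reads every file, so avoid doing this on the full collection):

In [ ]:
test_corpus = MyCorpus("accessible-test")   # hypothetical small test directory
first_bow = next(iter(test_corpus))         # first document as (token_id, count) pairs
print(first_bow[:10])
print([(test_corpus.dictionary[tid], cnt) for tid, cnt in first_bow[:10]])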

In [35]:
# corpus = MyCorpus('accessible-v4.0-small')
corpus = MyCorpus('accessible-ccp')
print("Created corpus.")

id2word = corpus.dictionary
print(id2word)

# Starting LDA
print("Starting LDA....")

# this function creates model and saves it
lda = models.wrappers.LdaMallet("/Applications/mallet-2.0.8/bin/mallet", corpus, id2word = id2word, num_topics = 100, workers = 10)

lda.save('aaccp-ldamodelmallet.lda')

x = lda.load_document_topics()   # stream of per-document topic distributions

result = lda.show_topics(num_topics=100, num_words=100, formatted=False)

# write topics to file
with open("aaccp-all_newspapers_topics.txt", "w") as fout:
    for each in result:
        fout.write(str(each) + "\n")

# write doc topics to a file
gen = lda.read_doctopics(lda.fdoctopics())

with open("aaccp-all_newspapers_doc_topics.txt", "w") as fout:
    for i in gen:
        fout.write(str(i) + "\n")


Opening:  FreedomsJournal
Opening:  FrederickDouglassPaper
Opening:  NationalAntiSlaveryStandard
Opening:  TheColoredAmerican
Opening:  TheNorthStar
Opening:  TheChristianRecorder
Opening:  TheNationalEra
Opening:  ProvincialFreeman
Opening:  GodeysLadysBook
Opening:  TheLiberator
Opening:  WeeklyAdvocate
Opening:  TheLily
Opening:  DouglassMonthly
Opening:  ColoredConventions
Opening:  FrankLesliesWeekly
Created corpus.
Dictionary(1171215 unique tokens: ['about', 'administered', 'aforesaid', 'after', 'ager']...)
Starting LDA....
Opening:  FreedomsJournal
Opening:  FrederickDouglassPaper
Opening:  NationalAntiSlaveryStandard
Opening:  TheColoredAmerican
Opening:  TheNorthStar
Opening:  TheChristianRecorder
Opening:  TheNationalEra
Opening:  ProvincialFreeman
Opening:  GodeysLadysBook
Opening:  TheLiberator
Opening:  WeeklyAdvocate
Opening:  TheLily
Opening:  DouglassMonthly
Opening:  ColoredConventions
Opening:  FrankLesliesWeekly
---------------------------------------------------------------------------
CalledProcessError                        Traceback (most recent call last)
<ipython-input-35-df87bdaf882f> in <module>()
     10 
     11 # this function creates model and saves it
---> 12 lda = models.wrappers.LdaMallet("/Applications/mallet-2.0.8/bin/mallet", corpus, id2word = id2word, num_topics = 100, workers = 10)
     13 
     14 lda.save('aaccp-ldamodelmallet.lda')

~/anaconda3/lib/python3.7/site-packages/gensim/models/wrappers/ldamallet.py in __init__(self, mallet_path, corpus, num_topics, alpha, id2word, workers, prefix, optimize_interval, iterations, topic_threshold)
    124         self.iterations = iterations
    125         if corpus is not None:
--> 126             self.train(corpus)
    127 
    128     def finferencer(self):

~/anaconda3/lib/python3.7/site-packages/gensim/models/wrappers/ldamallet.py in train(self, corpus)
    276         # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    277         logger.info("training MALLET LDA with %s", cmd)
--> 278         check_output(args=cmd, shell=True)
    279         self.word_topics = self.load_word_topics()
    280         # NOTE - we are still keeping the wordtopics variable to not break backward compatibility.

~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in check_output(stdout, *popenargs, **kwargs)
   1804             error = subprocess.CalledProcessError(retcode, cmd)
   1805             error.output = output
-> 1806             raise error
   1807         return output
   1808     except KeyboardInterrupt:

CalledProcessError: Command '/Applications/mallet-2.0.8/bin/mallet train-topics --input /var/folders/mz/491r9g5s2gjfdd83trwyp4y80000gp/T/fa9b8c_corpus.mallet --num-topics 100  --alpha 50 --optimize-interval 0 --num-threads 10 --output-state /var/folders/mz/491r9g5s2gjfdd83trwyp4y80000gp/T/fa9b8c_state.mallet.gz --output-doc-topics /var/folders/mz/491r9g5s2gjfdd83trwyp4y80000gp/T/fa9b8c_doctopics.txt --output-topic-keys /var/folders/mz/491r9g5s2gjfdd83trwyp4y80000gp/T/fa9b8c_topickeys.txt --num-iterations 1000 --inferencer-filename /var/folders/mz/491r9g5s2gjfdd83trwyp4y80000gp/T/fa9b8c_inferencer.mallet --doc-topics-threshold 0.0' returned non-zero exit status 1.
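
The wrapper's check_output call discards MALLET's stderr, so the exit status 1 above carries no detail. One way to diagnose it is to re-run the exact command quoted in the CalledProcessError and capture the output (a sketch; the temp-file paths are regenerated per run, so paste in the command from a fresh failure). With 1,171,215 unique tokens, Java heap exhaustion is one plausible cause; MALLET's heap size is set by the MEMORY variable in the bin/mallet script.

In [ ]:
import subprocess

# paste the full command string from the CalledProcessError message here
cmd = "/Applications/mallet-2.0.8/bin/mallet train-topics --input ..."

try:
    out = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    print(out.decode())
except subprocess.CalledProcessError as err:
    print(err.output.decode())   # MALLET's actual error message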

In [ ]:
# now try to pair topics to docs
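# a minimal sketch: join each doc-topic distribution with its metadata row,
# assuming fdoctopics() lists documents in the same order that process_docs
# wrote the metadata lines (true only if both passes saw the same file order)
import csv

with open(metadata_file) as meta:
    rows = list(csv.reader(meta))

for row, doc_topics in zip(rows, lda.read_doctopics(lda.fdoctopics())):
    top_topic, weight = max(doc_topics, key=lambda t: t[1])
    print(row[1], "-> topic", top_topic, "weight:", round(weight, 3))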