In [1]:
from gensim import corpora, models, utils, parsing
from gensim.test.utils import datapath
from collections import defaultdict, Counter
from pprint import pprint
import os
import re
import string
import pandas as pd
from itertools import islice
import csv
import pandas as pd
import numpy as np

In [2]:
# Restore the previously trained and saved LDA model from disk.
# NOTE: the recorded run of this cell logged a warning that the companion
# 'lk-ldamodelmallet-new.lda.state' file could not be found; the load itself
# still returned a model object that the later cells use.
lda = models.LdaModel.load('lk-ldamodelmallet-new.lda')


WARNING:root:random_state not set so using default value
WARNING:root:failed to load state from lk-ldamodelmallet-new.lda.state: [Errno 2] No such file or directory: 'lk-ldamodelmallet-new.lda.state'

In [13]:
# Pull the word lists for up to 100 topics, requesting up to 10000 words per topic.
# formatted=False is used so each entry can be unpacked as
# (topic number, [(word, relevance), ...]), which is how the CSV export reads it.
topics = lda.show_topics(num_topics=100, num_words=10000, formatted=False)

In [14]:
# Input text file with the per-document topic scores; it is read line by line
# when building docTopics-new.csv.
doc_topics_file = "lk-all_newspapers_doc_topics-new.txt"

In [15]:
# File name for the topics listing.
# NOTE(review): no cell visible in this notebook reads this variable —
# presumably it is kept for reference or used elsewhere; confirm before removing.
topics_file = "lk-all_newspapers_topics-new.txt"

In [16]:
# now, create AntiSlaverytopics-new.csv in format
# topic #, word, relevance of that word,..........
# (one topic per row; every row ends with a trailing comma before the newline)

# Open the output once, in write mode: the file is (re)created on every run,
# so re-running this cell no longer appends a second copy of every topic row,
# and the file is not reopened once per topic.
with open("AntiSlaverytopics-new.csv", "w") as myfile:
    for topic in topics:
        # each entry is (topic number, [(word, relevance), ...])
        topic_number, wordpairs = topic[0], topic[1]

        # start the row with the topic number, then alternate word / relevance
        fields = [str(topic_number)]
        for wordpair in wordpairs:
            fields.append(str(wordpair[0]))
            fields.append(str(wordpair[1]))

        # joining once avoids rebuilding an ever-growing string for each of the
        # (up to 10000) word pairs; the trailing "," keeps the original row format
        myfile.write(",".join(fields) + ",\n")

In [20]:
# next up, do docTopics.csv in format
# topic #, relevance, topic #, relevance,............. (for the 1st document)
# One output row is written per input line (i.e. per document), and every
# row ends with a trailing comma before the newline.

# Open input and output once each. The output uses write mode so re-running
# this cell recreates docTopics-new.csv instead of appending duplicate rows,
# and the file is no longer reopened for every document.
with open(doc_topics_file) as myfile, open('docTopics-new.csv', "w") as writefile:
    for line in myfile:
        # get the topic scores: every second comma-separated item, starting
        # with the second one
        items = line.split(",")
        topic_scores = items[1::2]

        # NOTE(review): the topic number written out is the position of the
        # score within the line, not a value parsed from the input — this
        # assumes every line lists all topics in order; confirm against the
        # input file.
        pieces = []
        for j, score in enumerate(topic_scores):
            # drop surrounding spaces / brackets / parentheses / newline and
            # make sure what is left really is a number
            score = float(score.strip(' ()[]\n'))
            pieces.append(str(j) + "," + str(score) + ",")

        writefile.write("".join(pieces) + "\n")

In [21]:
# print the first 2 lines to see if it worked
# islice simply stops at end-of-file, so a file with fewer than 2 lines prints
# whatever is there instead of raising StopIteration from a bare next() call.
with open('docTopics-new.csv') as myfile:
    head = list(islice(myfile, 2))
print(head)


['0,0.00506756756756758,1,0.00506756756756758,2,0.021959459459459513,3,0.011824324324324353,4,0.0016891891891891932,5,0.00506756756756758,6,0.0016891891891891932,7,0.0016891891891891932,8,0.0016891891891891932,9,0.008445945945945965,10,0.011824324324324353,11,0.015202702702702738,12,0.011824324324324353,13,0.0016891891891891932,14,0.021959459459459513,15,0.0016891891891891932,16,0.00506756756756758,17,0.25844594594594655,18,0.035472972972973055,19,0.0016891891891891932,20,0.042229729729729826,21,0.011824324324324353,22,0.00506756756756758,23,0.0016891891891891932,24,0.008445945945945965,25,0.0016891891891891932,26,0.0016891891891891932,27,0.0016891891891891932,28,0.0016891891891891932,29,0.0016891891891891932,30,0.0016891891891891932,31,0.011824324324324353,32,0.008445945945945965,33,0.0016891891891891932,34,0.0016891891891891932,35,0.00506756756756758,36,0.0016891891891891932,37,0.00506756756756758,38,0.0016891891891891932,39,0.011824324324324353,40,0.0016891891891891932,41,0.0016891891891891932,42,0.0016891891891891932,43,0.0016891891891891932,44,0.0016891891891891932,45,0.0016891891891891932,46,0.0016891891891891932,47,0.018581081081081124,48,0.0016891891891891932,49,0.00506756756756758,50,0.0016891891891891932,51,0.00506756756756758,52,0.0016891891891891932,53,0.0016891891891891932,54,0.0016891891891891932,55,0.0016891891891891932,56,0.008445945945945965,57,0.0016891891891891932,58,0.025337837837837898,59,0.0016891891891891932,60,0.0016891891891891932,61,0.0489864864864866,62,0.0016891891891891932,63,0.008445945945945965,64,0.0016891891891891932,65,0.0016891891891891932,66,0.0016891891891891932,67,0.07939189189189207,68,0.0016891891891891932,69,0.0016891891891891932,70,0.0016891891891891932,71,0.018581081081081124,72,0.0016891891891891932,73,0.0489864864864866,74,0.00506756756756758,75,0.0016891891891891932,76,0.0016891891891891932,77,0.008445945945945965,78,0.0016891891891891932,79,0.0016891891891891932,80,0.011824324324324353,81,0.0016891891891891932,82,0.0118
24324324324353,83,0.008445945945945965,84,0.015202702702702738,85,0.00506756756756758,86,0.021959459459459513,87,0.008445945945945965,88,0.0016891891891891932,89,0.011824324324324353,90,0.0016891891891891932,91,0.0016891891891891932,92,0.0016891891891891932,93,0.0016891891891891932,94,0.0016891891891891932,95,0.0016891891891891932,96,0.0016891891891891932,97,0.0016891891891891932,98,0.0016891891891891932,99,0.0016891891891891932,\n', '0,0.0034482758620689633,1,0.0034482758620689633,2,0.0034482758620689633,3,0.0034482758620689633,4,0.0034482758620689633,5,0.0034482758620689633,6,0.0034482758620689633,7,0.0034482758620689633,8,0.0034482758620689633,9,0.0034482758620689633,10,0.0034482758620689633,11,0.0034482758620689633,12,0.0034482758620689633,13,0.03103448275862067,14,0.01034482758620689,15,0.0034482758620689633,16,0.0034482758620689633,17,0.0034482758620689633,18,0.0034482758620689633,19,0.017241379310344817,20,0.0034482758620689633,21,0.0034482758620689633,22,0.0034482758620689633,23,0.0034482758620689633,24,0.0034482758620689633,25,0.0034482758620689633,26,0.0034482758620689633,27,0.0034482758620689633,28,0.0034482758620689633,29,0.0034482758620689633,30,0.05172413793103445,31,0.0034482758620689633,32,0.0034482758620689633,33,0.0034482758620689633,34,0.0034482758620689633,35,0.0034482758620689633,36,0.0034482758620689633,37,0.0034482758620689633,38,0.0034482758620689633,39,0.0034482758620689633,40,0.0034482758620689633,41,0.0034482758620689633,42,0.0034482758620689633,43,0.0034482758620689633,44,0.0034482758620689633,45,0.0034482758620689633,46,0.0034482758620689633,47,0.0034482758620689633,48,0.0034482758620689633,49,0.02413793103448274,50,0.0034482758620689633,51,0.01034482758620689,52,0.0034482758620689633,53,0.0034482758620689633,54,0.0034482758620689633,55,0.0034482758620689633,56,0.0034482758620689633,57,0.0034482758620689633,58,0.0034482758620689633,59,0.0034482758620689633,60,0.0034482758620689633,61,0.0034482758620689633,62,0.0034482758620689633,63,0.00
34482758620689633,64,0.0034482758620689633,65,0.0034482758620689633,66,0.0034482758620689633,67,0.01034482758620689,68,0.0034482758620689633,69,0.0034482758620689633,70,0.0034482758620689633,71,0.01034482758620689,72,0.01034482758620689,73,0.01034482758620689,74,0.0034482758620689633,75,0.04482758620689652,76,0.0034482758620689633,77,0.0034482758620689633,78,0.0034482758620689633,79,0.4241379310344825,80,0.0034482758620689633,81,0.0034482758620689633,82,0.0034482758620689633,83,0.017241379310344817,84,0.0034482758620689633,85,0.017241379310344817,86,0.0034482758620689633,87,0.0034482758620689633,88,0.0034482758620689633,89,0.0034482758620689633,90,0.0034482758620689633,91,0.0034482758620689633,92,0.0034482758620689633,93,0.0034482758620689633,94,0.01034482758620689,95,0.0034482758620689633,96,0.0034482758620689633,97,0.0034482758620689633,98,0.0034482758620689633,99,0.01034482758620689,\n']

In [ ]: