In [185]:
import numpy as np
from collections import Counter  # used below for the per-topic word counts

In [186]:
utomat = 'data/enron_lda_15.csv'   # word-by-topic probability matrix (15 topics)
dictionary = 'data/enron_dic.csv'  # word id -> word mapping
delim = ','
maxword = 10                       # number of top words to keep per topic

In [187]:
# load dictionary: one "<word id>\t<word>" entry per line
with open(dictionary, 'r') as fin:
    lines = fin.readlines()
tmpLines = [ line.strip().split('\t') for line in lines ]
dic = { int(l[0]): l[1].strip('"') for l in tmpLines }
# load word-by-topic matrix (rows: words, columns: topics)
uto = np.loadtxt(utomat, delimiter=delim)
# the columns already sum to 1 (checked in the next cell), so this extra
# row/column normalization stays disabled:
# RS1 = uto.sum(axis=1)
# RS2 = uto.sum(axis=0)
# utonorm = uto / RS1[:,None]
# utonorm = utonorm / RS2[None,:]
utonorm = uto
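A minimal sketch of the dictionary format the parsing above expects: one tab-separated id/word pair per line, with the word in double quotes (the two sample words here are made up).

In [ ]:
# hypothetical dictionary lines, parsed exactly as in the cell above
sample = ['0\t"energy"\n', '1\t"meeting"\n']
pairs = [ line.strip().split('\t') for line in sample ]
print({ int(l[0]): l[1].strip('"') for l in pairs })
# -> {0: 'energy', 1: 'meeting'}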

In [188]:
sum(uto[:,1])


Out[188]:
1.0000000000004512
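The column sum is 1 up to floating-point error, so each topic column is already a probability distribution over words. A quick check across all columns (plain NumPy, nothing assumed beyond the matrix loaded above):

In [ ]:
# every topic column should sum to ~1 if the matrix is column-normalized
np.allclose(uto.sum(axis=0), 1.0)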

In [155]:
# winning (most probable) topic per word
winninguto = utonorm.argmax(axis=1)
# frequency table: number of words whose winning topic is each topic
Nwordsto = Counter(winninguto)
# order of word indices within each topic, most probable first
idx = utonorm.argsort(axis=0)  # ascending per column
utoorder = idx[::-1]           # reverse rows -> descending

In [154]:
# look up the ordered words within each topic (output words)
allWords = []
for j in range(utoorder.shape[1]):
    topic = utoorder[:,j]
    tmp = [ dic[wordIdx] for wordIdx in topic ]
    allWords.append(tmp)
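To see what the column-wise argsort plus row reversal does, here is a toy 3-word, 2-topic matrix (values invented for illustration):

In [ ]:
# toy word-by-topic matrix: rows = words, columns = topics
toy = np.array([[0.2, 0.7],
                [0.5, 0.1],
                [0.3, 0.2]])
print(toy.argsort(axis=0)[::-1])  # most probable word index first, per column
# [[1 0]
#  [2 2]
#  [0 1]]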

In [159]:
# cumulative probability within each topic: after the top k words,
# allProbs[j][k-1] is the probability mass still unaccounted for in topic j
allProbs = []
for j in range(utoorder.shape[1]):
    topic = utoorder[:,j]
    tmp = [ utonorm[wordIdx,j] for wordIdx in topic ]
    tmp = np.cumsum(tmp)
    tmp = 1 - tmp
    tmp = np.around(tmp, decimals=3)
    allProbs.append(tmp)

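On a toy sorted probability column, 1 - cumsum gives the mass still unaccounted for after each rank (illustrative numbers):

In [ ]:
p = np.array([0.5, 0.3, 0.15, 0.05])  # sorted probabilities of a 4-word topic
print(np.around(1 - np.cumsum(p), decimals=3))
# [0.5  0.2  0.05 0.  ]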

In [141]:
# merge probabilities and words into one list of (prob, word) tuples,
# flattened in topic-major order (nw entries per topic)
allProbs_flat = [y for x in allProbs for y in x]
allWords_flat = [y for x in allWords for y in x]
output = list(zip(allProbs_flat, allWords_flat))  # list() so it can be sliced
# build dictionary of topics, words and probabilities
wpdict = {}
ntopics = utonorm.shape[1]
nw = utonorm.shape[0]
if maxword > nw:
    maxword = nw
for i in range(ntopics):
    i0 = i*nw
    wpdict["Topic %i" %(i+1) ] = output[i0:i0+maxword]  # top maxword words
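A quick way to inspect the result (a sketch; the "Topic %i" keys follow the naming used above):

In [ ]:
# top words of the first topic with their remaining-mass values
for prob, word in wpdict["Topic 1"]:
    print(word, prob)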

In [162]:
len(dic)


Out[162]:
175884
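Since the word ids in dic index rows of uto, the two should agree in size; a sanity check, assuming one dictionary entry per matrix row and 15 topics per the file name:

In [ ]:
print(uto.shape)  # expected: (175884, 15)
assert uto.shape[0] == len(dic)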