In [2]:
import pickle

from corpora.corpus import *
from corpora.scikit import *
In [3]:
# Load the corpus from the enron_mail_clean2 pickle / dictionary file pair.
# Slow step: expect roughly 3-4 minutes.
corpus = Corpus.load(
    "../data/enron_mail_clean2.pkl",
    "../data/enron_mail_clean2.dic",
)
In [17]:
# Build the corpus' sparse matrix (slow: about 3 minutes).
# NOTE(review): c_matrix is not referenced by any other visible cell — confirm
# it is still needed before paying for this step on every run.
c_matrix = corpus.sparse_matrix() # takes a while ~3min
In [19]:
# Report the corpus dimensions: number of samples (documents) and number of
# features (words), as exposed by corpus.num_samples / corpus.num_features.
# (The first line previously had an unbalanced ")" and did not parse.)
print("nSamples (docs) : {0}".format(corpus.num_samples))
print("nFeatures(words): {0}".format(corpus.num_features))
In [72]:
# Restore a previously pickled ScikitLda model, passing the loaded corpus along.
lda = ScikitLda.load(
    "../data/full/lda_pickle_736.pkl",
    corpus=corpus,
)
In [73]:
# Group the LDA topics by the 'user' column of the corpus metadata.
# Yields two values: the matrix plotted as a heat map further down and the
# sequence later used for its y-axis labels. Slow step: about 2 minutes.
topicsByOrg, orgs = topics_by_discrete_property(
    lda,
    corpus,
    corpus.metadata_frame['user'],
)
In [74]:
# Load the pickled mapping `dd`; later cells sort its items by value and plot
# them (presumably a per-user document count, judging by the file name —
# TODO confirm).
# NOTE(review): this path starts with "./data" while every other cell reads
# from "../data" — confirm which location is intended.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
documents_filename = "./data/full/docs_by_user_10.pkl"
# Pickle data is binary: open with 'rb' so the read works on Python 3 and on
# platforms that translate line endings in text mode.
with open(documents_filename, 'rb') as f:
    dd = pickle.load(f)
In [250]:
# (key, value) pairs of dd ordered by ascending value.
dd_sorted = list(dd.items())
dd_sorted.sort(key=lambda pair: pair[1])
In [80]:
%pylab inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15,10))
plt_bar = plt.bar(range(len(dd_sorted)),[el[1] for el in dd_sorted], align='center')
plt_xticks = plt.xticks(range(len(dd_sorted)), [el[0] for el in dd_sorted], rotation=75, size=4)
In [103]:
# Ask the model for 10 words per topic (num_words = 10). Nothing is printed
# here; the call returns two values, and the second one (topicWeightedWords)
# is what the word-cloud cell iterates over as (weight, word) pairs.
topicWords, topicWeightedWords = topic_words(lda, corpus, num_words = 10)
In [82]:
# Build one word cloud per topic. Each topic is a sequence of (weight, word)
# pairs, which is flipped to (word, weight) before being handed to
# WordCloud.generate_from_frequencies. A fresh white-background WordCloud is
# created for every topic.
img = [
    WordCloud(background_color="white").generate_from_frequencies(
        [(word, weight) for weight, word in topic]
    )
    for topic in topicWeightedWords
]
In [83]:
%%javascript
// Set the notebook's output auto-scroll threshold to 9999 (presumably so the
// many figures rendered by the next cell are not collapsed into a scrolling
// output box — confirm against the notebook front-end in use).
IPython.OutputArea.auto_scroll_threshold = 9999;
In [84]:
# Draw every word cloud in its own 10x5 figure, titled with its topic index.
for topic_idx in range(len(img)):
    figure(figsize=(10, 5))
    title('Topic #%2d' % topic_idx)
    imshow(img[topic_idx])
In [85]:
# Put dd_sorted into descending order of value.
# The list is rebuilt from dd rather than reversed in place: an in-place
# .reverse() flips the order again every time the cell is re-run, which would
# silently turn the "top 20" slice taken below into the bottom 20. Sorting
# ascending and then reversing yields exactly what a single .reverse() of the
# ascending list produced, including the order of ties.
dd_sorted = sorted(dd.items(), key=lambda x: x[1])[::-1]
In [86]:
# Second name for the same list object as dd_sorted (an alias, not a copy):
# any in-place change to one is visible through the other.
dd_sorted_reverse = dd_sorted
In [91]:
# Keep the first 20 entries of dd_sorted_reverse (fewer if the list is shorter).
dd_sorted_reverse_top20 = dd_sorted_reverse[:20]
In [92]:
%pylab inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,7))
plt_bar = plt.bar(range(len(dd_sorted_reverse_top20)),[el[1] for el in dd_sorted_reverse_top20], align='center')
plt_xticks = plt.xticks(range(len(dd_sorted_reverse_top20)), [el[0] for el in dd_sorted_reverse_top20], rotation=75, size=15)
In [301]:
# Heat map of the complete topicsByOrg matrix on a 10x10 inch figure, using
# the Reds colour map. No tick labels are set here.
fig = plt.figure(figsize=(10,10))
plt.pcolor(topicsByOrg, cmap = plt.cm.Reds)
plt.show()
In [204]:
# Keep columns 0-6 and 8-9 of topicsByOrg, i.e. drop column 7 (per the
# variable name, the highest-valued topic — TODO confirm).
# range objects cannot be concatenated with "+" on Python 3, so each one is
# materialised as a list first; the result is unchanged on Python 2.
topics_exclude_highest = topicsByOrg[:, list(range(0, 7)) + list(range(8, 10))]
In [288]:
# Tick labels for the heat map of topics_exclude_highest.
# row_labels: the ids 0-9 with 7 left out, matching the columns that were kept.
# column_labels: the entries of orgs as a plain list.
row_labels = [topic_id for topic_id in range(10) if topic_id != 7]
column_labels = [org for org in orgs]
In [298]:
# Labelled heat map of topics_exclude_highest on a 15x35 inch figure.
# The x axis runs over the matrix columns (labelled with row_labels) and the
# y axis over the matrix rows (labelled with column_labels).
fig, ax = plt.subplots()
fig.set_size_inches(15, 35)
heatmap = ax.pcolor(topics_exclude_highest, cmap=plt.cm.Reds)

# pcolor draws cell (i, j) over x in [j, j+1] and y in [i, i+1], so the cell
# centres are at +0.5. The previous ticks at -0.5 with a ''-padded label list
# produced one more label than ticks: the last row and column were never
# labelled, and recent matplotlib versions reject the count mismatch.
ax.set_xticks(np.arange(topics_exclude_highest.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(topics_exclude_highest.shape[0]) + 0.5, minor=False)
ax.set_xticklabels(row_labels, minor=False)
ax.set_yticklabels(column_labels, minor=False)
plt.show()
In [287]:
# Display column_labels (the list built from orgs) to inspect the labels
# used on the heat map's y axis.
column_labels
Out[287]:
In [ ]: