In [2]:
from corpora.corpus import *
from corpora.scikit import *

In [3]:
# Load the corpus from its .pkl / .dic file pair.
# Slow step: takes a while (~3-4 min).
corpus = Corpus.load(
    "../data/enron_mail_clean2.pkl",
    "../data/enron_mail_clean2.dic",
)

In [17]:
# Build the corpus' sparse matrix once and keep it for later cells.
# NOTE(review): presumably a documents x words matrix -- confirm against Corpus.sparse_matrix.
c_matrix = corpus.sparse_matrix() # slow step: takes a while (~3 min)

In [19]:
# Report the corpus dimensions: number of samples (documents) and number of
# features (words), as exposed by the corpus object.
# The first line previously carried an extra closing parenthesis (a SyntaxError).
print("nSamples (docs) : {0}".format(corpus.num_samples))
print("nFeatures(words): {0}".format(corpus.num_features))


nSamples (docs) : 517401
nFeatures(words): 169913

In [72]:
# Restore a previously pickled LDA model, handing it the in-memory corpus.
lda = ScikitLda.load(
    "../data/full/lda_pickle_736.pkl",
    corpus=corpus,
)

In [73]:
# Break the topics down by the 'user' column of the corpus metadata.
# Slow step: takes ~2 min.
# NOTE(review): topicsByOrg is later indexed as [:, topic] and orgs is used as the
# per-row label list -- presumably one row per distinct user; confirm against
# topics_by_discrete_property.
user_column = corpus.metadata_frame['user']
topicsByOrg, orgs = topics_by_discrete_property(lda, corpus, user_column)

In [74]:
import pickle

# Load the pickled mapping used for the bar charts below.
# NOTE(review): judging by the file name this is presumably user -> document count;
# the code below only relies on dd.items() yielding (label, value) pairs.
# NOTE(review): this path starts at "./data" while the other cells read from
# "../data" -- confirm which directory is intended.
documents_filename = "./data/full/docs_by_user_10.pkl"

# Pickle streams are binary: open with 'rb'. Text mode ('r') fails on Python 3
# and can corrupt the data on platforms that translate line endings.
with open(documents_filename, 'rb') as f:
    # SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
    dd = pickle.load(f)

In [250]:
# List of (label, value) pairs from dd, ordered by ascending value.
dd_sorted = sorted(
    dd.items(),
    key=lambda item: item[1],
)

In [80]:
%pylab inline
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15,10))
plt_bar = plt.bar(range(len(dd_sorted)),[el[1] for el in dd_sorted], align='center')
plt_xticks = plt.xticks(range(len(dd_sorted)), [el[0] for el in dd_sorted], rotation=75, size=4)


Populating the interactive namespace from numpy and matplotlib

In [103]:
# List the words of every topic; num_words=10 limits the listing to 10 words
# per topic (it is not the number of topics). The call prints the topics as a
# side effect and returns the plain and the weighted word lists.
topicWords, topicWeightedWords = topic_words(
    lda,
    corpus,
    num_words=10,
)


Topic #0:
image free click new online e-mail email news service information
Topic #1:
enron please information business group new services contact houston management
Topic #2:
final schedule vince date start file operation hourahead log variances
Topic #3:
get know like would one week time good going don't
Topic #4:
would may agreement also issues need time credit one issue
Topic #5:
cc subject pm forwarded sara mark kate re john david
Topic #6:
energy power enron said company california million state new electricity
Topic #7:
gas deal trading power price market energy prices deals new
Topic #8:
pm scheduled database thru sat outages dbcaps97data london ct travel
Topic #9:
please thanks know attached subject let pm call cc enron

In [82]:
# Build one word-cloud image per topic and collect them in img.
img = []
for idx, topic in enumerate(topicWeightedWords):
    wc = WordCloud(background_color="white")
    # Each topic entry unpacks as (weight, word); the cloud is fed (word, weight).
    word_frequencies = [(word, weight) for weight, word in topic]
    cloud = wc.generate_from_frequencies(word_frequencies)
    img.append(cloud)

In [83]:
%%javascript
// Raise the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps the many figures produced by the next
// cell from being collapsed into a scrolling output box -- confirm against the
// IPython/Jupyter front-end documentation.
IPython.OutputArea.auto_scroll_threshold = 9999;



In [84]:
# Render every word cloud in its own figure, titled with the topic index.
for idx, im in enumerate(img):
    plt.figure(figsize=(10, 5))
    plt.title('Topic #%2d' % idx)
    plt.imshow(im)



In [85]:
# Put dd_sorted into descending order of value (largest first), in place.
# An explicit descending sort is used instead of list.reverse(): reverse() flips
# the list on every execution, so re-running the cell would silently restore
# ascending order and the "top 20" taken further down would become the bottom 20.
# Sorting is idempotent. Entries with equal values keep their current relative order.
dd_sorted.sort(key=lambda item: item[1], reverse=True)

In [86]:
# Second name for the same list object (an alias, not a copy): any later in-place
# change to dd_sorted is visible through dd_sorted_reverse as well.
dd_sorted_reverse = dd_sorted

In [91]:
# First 20 entries of dd_sorted_reverse. Slicing creates a new list; if there are
# fewer than 20 entries, all of them are kept.
dd_sorted_reverse_top20 = dd_sorted_reverse[:20]

In [92]:
%pylab inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,7))
plt_bar = plt.bar(range(len(dd_sorted_reverse_top20)),[el[1] for el in dd_sorted_reverse_top20], align='center')
plt_xticks = plt.xticks(range(len(dd_sorted_reverse_top20)), [el[0] for el in dd_sorted_reverse_top20], rotation=75, size=15)


Populating the interactive namespace from numpy and matplotlib

In [301]:
# Unlabelled heat map of the complete topicsByOrg matrix, drawn with the
# "Reds" colour map on a 10x10 inch figure.
fig = plt.figure(figsize=(10, 10))
plt.pcolor(topicsByOrg, cmap="Reds")
plt.show()



In [204]:
# Keep topic columns 0-6 and 8-9 of topicsByOrg, i.e. leave out column 7.
# NOTE(review): going by the variable name, topic 7 is presumably the dominant
# topic that would otherwise wash out the colour scale -- confirm.
# The two ranges are converted to lists before concatenation: on Python 3
# range objects do not support "+", so range(0,7)+range(8,10) raises TypeError.
topics_exclude_highest = topicsByOrg[:, list(range(0, 7)) + list(range(8, 10))]

In [288]:
# Topic indices that remain once topic 7 is left out (matches the column
# selection used for topics_exclude_highest).
# NOTE(review): despite the name, the heat-map cell passes row_labels to
# set_xticklabels, i.e. these label the matrix *columns*.
row_labels = [0,1,2,3,4,5,6,8,9]
# The values returned as `orgs`, materialised as a list.
# NOTE(review): despite the name, the heat-map cell passes column_labels to
# set_yticklabels, i.e. these label the matrix *rows*.
column_labels = list(orgs)

In [298]:
# Labelled heat map of topics_exclude_highest: topic indices along the x axis
# (row_labels), one entry of column_labels per matrix row along the y axis.
fig, ax = plt.subplots()
fig.set_size_inches(15, 35)
heatmap = ax.pcolor(topics_exclude_highest, cmap=plt.cm.Reds)

# pcolor draws cell i over the interval [i, i+1], so its centre sits at i + 0.5.
# Put exactly one tick at every cell centre and hand over exactly one label per
# tick. The previous version used N ticks at i - 0.5 with N + 1 labels (a leading
# '' as padding): older matplotlib silently dropped the last label, so the final
# row and column were unlabelled, and current matplotlib raises ValueError
# because the tick and label counts differ.
n_rows, n_cols = topics_exclude_highest.shape
ax.set_xticks(np.arange(n_cols) + 0.5, minor=False)
ax.set_yticks(np.arange(n_rows) + 0.5, minor=False)
ax.set_xticklabels(row_labels, minor=False)
ax.set_yticklabels(column_labels, minor=False)

plt.show()



In [287]:
# Display the full list of y-axis labels used by the heat map (one per matrix row).
column_labels


Out[287]:
['allen-p',
 'arnold-j',
 'arora-h',
 'badeer-r',
 'bailey-s',
 'bass-e',
 'baughman-d',
 'beck-s',
 'benson-r',
 'blair-l',
 'brawner-s',
 'buy-r',
 'campbell-l',
 'carson-m',
 'cash-m',
 'causholli-m',
 'corman-s',
 'crandell-s',
 'cuilla-m',
 'dasovich-j',
 'davis-d',
 'dean-c',
 'delainey-d',
 'derrick-j',
 'dickson-s',
 'donoho-l',
 'donohoe-t',
 'dorland-c',
 'ermis-f',
 'farmer-d',
 'fischer-m',
 'forney-j',
 'fossum-d',
 'gang-l',
 'gay-r',
 'geaccone-t',
 'germany-c',
 'gilbertsmith-d',
 'giron-d',
 'griffith-j',
 'grigsby-m',
 'guzman-m',
 'haedicke-m',
 'hain-m',
 'harris-s',
 'hayslett-r',
 'heard-m',
 'hendrickson-s',
 'hernandez-j',
 'hodge-j',
 'holst-k',
 'horton-s',
 'hyatt-k',
 'hyvl-d',
 'jones-t',
 'kaminski-v',
 'kean-s',
 'keavey-p',
 'keiser-k',
 'king-j',
 'kitchen-l',
 'kuykendall-t',
 'lavorato-j',
 'lay-k',
 'lenhart-m',
 'lewis-a',
 'linder-e',
 'lokay-m',
 'lokey-t',
 'love-p',
 'lucci-p',
 'maggi-m',
 'mann-k',
 'martin-t',
 'may-l',
 'mccarty-d',
 'mcconnell-m',
 'mckay-b',
 'mckay-j',
 'mclaughlin-e',
 'merriss-s',
 'meyers-a',
 'mims-thurston-p',
 'motley-m',
 'neal-s',
 'nemec-g',
 'panus-s',
 'parks-j',
 'pereira-s',
 'perlingiere-d',
 'phanis-s',
 'pimenov-v',
 'platter-p',
 'presto-k',
 'quenet-j',
 'quigley-d',
 'rapp-b',
 'reitmeyer-j',
 'richey-c',
 'ring-a',
 'ring-r',
 'rodrique-r',
 'rogers-b',
 'ruscitti-k',
 'sager-e',
 'saibi-e',
 'salisbury-h',
 'sanchez-m',
 'sanders-r',
 'scholtes-d',
 'schoolcraft-d',
 'schwieger-j',
 'scott-s',
 'semperger-c',
 'shackleton-s',
 'shankman-j',
 'shapiro-r',
 'shively-h',
 'skilling-j',
 'slinger-r',
 'smith-m',
 'solberg-g',
 'south-s',
 'staab-t',
 'stclair-c',
 'steffes-j',
 'stepenovitch-j',
 'stokley-c',
 'storey-g',
 'sturm-f',
 'swerzbin-m',
 'symes-k',
 'taylor-m',
 'tholt-j',
 'thomas-p',
 'townsend-j',
 'tycholiz-b',
 'ward-k',
 'watson-k',
 'weldon-c',
 'whalley-g',
 'whalley-l',
 'white-s',
 'whitt-m',
 'williams-j',
 'williams-w3',
 'wolfe-j',
 'ybarbo-p',
 'zipper-a',
 'zufferli-j']

In [ ]: