In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '..')


Populating the interactive namespace from numpy and matplotlib

In [2]:
from corpora.corpus import *
from corpora.scikit import *

In [3]:
lda = ScikitLda.load('../data/lda_pickle_736.pkl')

In [5]:
import gensim
dic = gensim.corpora.Dictionary.load('../data/corpus_dic.dat')

In [7]:
from glob2 import glob
basedir = '../../enron_mail_clean/'
docs = glob(basedir + '/**/*.')

In [10]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

In [63]:
from scipy.sparse import csr_matrix

def docVector(doc, dic):
    with open(doc, 'r') as fin:
        mail = fin.readlines()
        mail = ''.join(mail)
        tokens = tokenizer.tokenize(mail)
        tokens = [ t.lower() for t in tokens ]

        wcTuples = dic.doc2bow(tokens)
        data = []
        row  = []
        col  = []

        for w,c in wcTuples:
            col.append(0)
            row.append(w)
            data.append(c)

        nSamples = 1
        nFeatures = len(dic)
        oneDoc = csr_matrix((data, (col,row)), shape=(nSamples, nFeatures))
        docWeights = lda.lda.transform(oneDoc)[0]
        print len(tokens)
        #docWeights /= docWeights.sum()
        return docWeights

In [32]:
from scipy.spatial.distance import pdist,squareform

In [64]:
docTopics = np.array([ docVector(doc, dic) for doc in docs[:500] ])


369
195
59
4
95
690
59
150
157
70
119
26
84
64
10
3
34
36
58
19
2
289
162
252
2158
68
2
92
128
257
83
153
4
428
6
647
261
16
98
222
198
12
298
20
441
685
25
118
142
37
368
146
775
1375
96
461
438
15
193
52
160
2
77
50
131
621
246
4
186
67
108
97
477
66
456
66
57
25
18
410
67
786
114
130
389
244
82
127
136
88
30
149
124
385
67
2
416
20
105
85
225
124
1
403
132
159
620
135
139
214
497
25
54
699
81
202
1131
39
216
40
1
55
661
59
99
153
279
8
438
168
115
17
258
4
5
1235
0
141
19
328
38
110
808
380
124
48
249
2122
647
3
197
5
4
1551
111
124
150
184
9
395
11
126
29
127
142
136
491
2
184
137
87
124
135
288
44
135
646
24
25
2
96
80
171
136
63
259
2
479
83
156
1146
1093
423
184
135
130
169
1464
166
62
498
2
32
201
52
820
208
2
103
136
134
274
392
385
862
185
2
162
463
153
196
130
162
135
11
1388
129
5
78
16
25
245
33
82
239
2
352
381
87
102
505
55
602
1903
237
326
607
106
92
80
161
782
2
1445
189
309
1
13
16
124
252
29
16
139
1660
92
103
13
135
40
456
101
124
295
562
227
368
1375
62
209
386
900
44
34
117
134
95
135
508
141
290
23
1224
124
71
136
148
82
4
381
4
18
158
30
127
688
29
25
155
262
216
307
53
301
171
40
114
59
500
231
136
69
73
147
322
2278
347
62
150
71
118
329
198
124
357
124
19
3
34
48
234
67
117
43
333
182
48
4
228
915
135
665
650
176
915
263
87
61
250
88
40
104
4
482
115
294
1035
70
215
2113
7
52
137
32
138
278
610
282
2997
42
332
697
0
118
87
145
6
66
191
28
430
1200
285
125
381
314
231
365
117
61
312
136
245
69
187
1755
141
224
137
272
161
5
1612
730
293
117
134
46
130
196
344
25
45
185
13
584
206
13
134
875
117
395
4
718
251
391
718
64
35
135
135
472
1359
329
124
39
14
154
278
117
15
330
293
229
2
53
111
1024
107
69
520
134
66
116
20
15
205
51
135
229
1551
126
4
107
140
58
85
135
124
18
9
111
5
98
3
10
475
124
176
44
72
721
77
204
257
121
354
93
135
135

In [66]:
docTopics.sum(axis=1)


Out[66]:
array([  1.84000000e+02,   9.60000000e+01,   3.70000000e+01,
         1.00000000e+00,   4.50000000e+01,   2.34000000e+02,
         4.00000000e+01,   7.80000000e+01,   7.20000000e+01,
         3.70000000e+01,   5.40000000e+01,   1.50000000e+01,
         3.90000000e+01,   2.50000000e+01,   6.00000000e+00,
         1.00000000e+00,   1.50000000e+01,   1.90000000e+01,
         2.70000000e+01,   6.00000000e+00,   1.00000000e+00,
         1.33000000e+02,   8.10000000e+01,   9.50000000e+01,
         1.05900000e+03,   2.80000000e+01,   3.00000000e+00,
         2.70000000e+01,   7.30000000e+01,   1.15000000e+02,
         3.70000000e+01,   8.30000000e+01,   2.00000000e+00,
         2.18000000e+02,   3.00000000e+00,   2.23000000e+02,
         1.20000000e+02,   1.00000000e+01,   5.30000000e+01,
         1.18000000e+02,   3.49999999e+01,   5.00000000e+00,
         1.19000000e+02,   1.10000000e+01,   1.73000000e+02,
         3.01000000e+02,   1.70000000e+01,   4.90000000e+01,
         8.20000000e+01,   1.70000000e+01,   2.00000000e+02,
         6.80000000e+01,   3.42000000e+02,   6.49000000e+02,
         4.10000000e+01,   2.27000000e+02,   1.70000000e+02,
         1.00000000e+01,   1.08000000e+02,   2.40000000e+01,
         7.69999999e+01,   2.00000000e+00,   3.00000000e+01,
         2.40000000e+01,   3.70000000e+01,   2.94000000e+02,
         1.27000000e+02,   2.00000000e+00,   8.70000000e+01,
         2.70000000e+01,   3.90000000e+01,   4.10000000e+01,
         2.27000000e+02,   3.20000000e+01,   2.06000000e+02,
         2.70000000e+01,   3.10000000e+01,   8.00000000e+00,
         1.30000000e+01,   1.80000000e+02,   2.70000000e+01,
         3.72000000e+02,   3.70000000e+01,   7.70000000e+01,
         2.00000000e+02,   1.09000000e+02,   4.60000000e+01,
         5.49999999e+01,   6.00000000e+01,   3.00000000e+01,
         1.50000000e+01,   4.20000000e+01,   4.60000000e+01,
         1.78000000e+02,   3.10000000e+01,   3.00000000e+00,
         1.94000000e+02,   1.10000000e+01,   4.80000000e+01,
         2.70000000e+01,   1.04000000e+02,   6.70000000e+01,
         1.00000000e+00,   1.85000000e+02,   3.70000000e+01,
         7.80000000e+01,   3.32000000e+02,   6.10000000e+01,
         7.40000000e+01,   1.01000000e+02,   2.30000000e+02,
         1.40000000e+01,   2.50000000e+01,   2.82000000e+02,
         3.40000000e+01,   1.02000000e+02,   3.81000000e+02,
         2.10000000e+01,   1.13000000e+02,   2.10000000e+01,
         1.00000000e+00,   2.40000000e+01,   2.60000000e+02,
         2.40000000e+01,   2.70000000e+01,   7.20000000e+01,
         1.25000000e+02,   4.00000000e+00,   2.01000000e+02,
         7.60000000e+01,   6.90000000e+01,   9.00000000e+00,
         1.32000000e+02,   1.00000000e+00,   3.00000000e+00,
         6.00000000e+02,   1.00000000e+00,   6.50000000e+01,
         1.00000000e+01,   1.64000000e+02,   2.50000000e+01,
         3.70000000e+01,   4.03000000e+02,   1.78000000e+02,
         6.70000000e+01,   2.20000000e+01,   1.32000000e+02,
         1.04900000e+03,   2.92000000e+02,   1.00000000e+00,
         9.90000000e+01,   3.00000000e+00,   1.00000000e+00,
         6.85000000e+02,   4.90000000e+01,   6.70000000e+01,
         7.80000000e+01,   7.80000000e+01,   6.00000000e+00,
         1.87000000e+02,   6.00000000e+00,   3.70000000e+01,
         1.60000000e+01,   6.10000000e+01,   8.20000000e+01,
         6.10000000e+01,   1.57000000e+02,   2.00000000e+00,
         5.60000000e+01,   6.20000000e+01,   3.80000000e+01,
         6.90000000e+01,   6.00000000e+01,   1.25000000e+02,
         1.80000000e+01,   6.00000000e+01,   3.15000000e+02,
         1.30000000e+01,   1.30000000e+01,   2.00000000e+00,
         4.10000000e+01,   3.40000000e+01,   7.70000000e+01,
         6.80000000e+01,   2.50000000e+01,   1.07000000e+02,
         3.00000000e+00,   2.21000000e+02,   3.50000000e+01,
         6.30000000e+01,   4.87000000e+02,   4.57000000e+02,
         1.84000000e+02,   9.10000000e+01,   6.10000000e+01,
         5.30000000e+01,   6.80000000e+01,   6.19000000e+02,
         7.50000000e+01,   2.30000000e+01,   1.26000000e+02,
         2.00000000e+00,   1.80000000e+01,   8.50000000e+01,
         1.40000000e+01,   3.72000000e+02,   1.02000000e+02,
         2.00000000e+00,   3.70000000e+01,   6.00000000e+01,
         6.70000000e+01,   1.21000000e+02,   2.18000000e+02,
         1.86000000e+02,   4.31000000e+02,   7.70000000e+01,
         2.00000000e+00,   6.50000000e+01,   2.03000000e+02,
         4.60000000e+01,   1.01000000e+02,   4.20000000e+01,
         8.80000000e+01,   6.10000000e+01,   4.00000000e+00,
         6.93000000e+02,   6.80000000e+01,   1.00000000e+00,
         3.50000000e+01,   5.00000000e+00,   1.50000000e+01,
         1.22000000e+02,   1.70000000e+01,   4.40000000e+01,
         1.20000000e+02,   2.00000000e+00,   9.40000000e+01,
         1.76000000e+02,   4.00000000e+01,   3.10000000e+01,
         2.63000000e+02,   2.50000000e+01,   2.56000000e+02,
         8.84000000e+02,   9.40000000e+01,   1.48000000e+02,
         1.56000000e+02,   4.70000000e+01,   4.10000000e+01,
         3.90000000e+01,   4.50000000e+01,   4.05000000e+02,
         2.00000000e+00,   7.16000000e+02,   1.05000000e+02,
         1.52000000e+02,   2.00000000e+00,   7.00000000e+00,
         6.00000000e+00,   6.70000000e+01,   1.21000000e+02,
         1.60000000e+01,   1.00000000e+01,   6.80000000e+01,
         3.13000000e+02,   4.20000000e+01,   4.90000000e+01,
         6.00000000e+00,   6.00000000e+01,   1.70000000e+01,
         2.17000000e+02,   4.00000000e+01,   6.50000000e+01,
         1.31000000e+02,   2.90000000e+02,   1.08000000e+02,
         1.88000000e+02,   5.50000000e+02,   4.00000000e+01,
         9.20000000e+01,   1.83000000e+02,   3.95000000e+02,
         2.30000000e+01,   1.70000000e+01,   5.30000000e+01,
         6.00000000e+01,   3.00000000e+01,   6.10000000e+01,
         2.44000000e+02,   4.10000000e+01,   1.35000000e+02,
         1.10000000e+01,   5.43000000e+02,   6.70000000e+01,
         2.80000000e+01,   6.10000000e+01,   7.20000000e+01,
         3.70000000e+01,   1.00000000e+00,   1.80000000e+02,
         1.00000000e+00,   1.20000000e+01,   6.50000000e+01,
         2.00000000e+01,   3.20000000e+01,   2.99000000e+02,
         1.60000000e+01,   1.20000000e+01,   7.10000000e+01,
         1.21000000e+02,   1.05000000e+02,   2.00000000e+02,
         2.20000000e+01,   1.35000000e+02,   9.30000000e+01,
         2.10000000e+01,   4.90000000e+01,   3.00000000e+01,
         2.52000000e+02,   1.12000000e+02,   6.10000000e+01,
         2.70000000e+01,   3.10000000e+01,   6.60000000e+01,
         1.35000000e+02,   8.85000000e+02,   1.72000000e+02,
         3.00000000e+01,   6.10000000e+01,   3.20000000e+01,
         5.90000000e+01,   1.60000000e+02,   8.90000000e+01,
         6.70000000e+01,   1.85000000e+02,   6.70000000e+01,
         1.20000000e+01,   2.00000000e+00,   1.70000000e+01,
         2.80000000e+01,   9.80000000e+01,   3.50000000e+01,
         5.40000000e+01,   1.60000000e+01,   1.55000000e+02,
         9.10000000e+01,   2.30000000e+01,   1.00000000e+00,
         1.11000000e+02,   3.71000000e+02,   7.00000000e+01,
         3.36000000e+02,   3.81000000e+02,   8.10000000e+01,
         4.09000000e+02,   1.32000000e+02,   4.40000000e+01,
         3.00000000e+01,   1.16000000e+02,   3.70000000e+01,
         1.60000000e+01,   5.20000000e+01,   1.00000000e+00,
         2.14000000e+02,   4.90000000e+01,   1.39000000e+02,
         4.97000000e+02,   3.50000000e+01,   9.30000000e+01,
         1.04400000e+03,   3.00000000e+00,   1.90000000e+01,
         6.10000000e+01,   1.40000000e+01,   4.60000000e+01,
         1.20000000e+02,   3.08000000e+02,   1.26000000e+02,
         1.65200000e+03,   1.80000000e+01,   1.49000000e+02,
         3.34000000e+02,   1.00000000e+00,   3.70000000e+01,
         2.70000000e+01,   6.50000000e+01,   5.00000000e+00,
         3.00000000e+01,   8.90000000e+01,   1.80000000e+01,
         1.58000000e+02,   5.77000000e+02,   1.35000000e+02,
         2.70000000e+01,   1.73000000e+02,   1.35000000e+02,
         1.11000000e+02,   1.66000000e+02,   6.30000000e+01,
         3.20000000e+01,   1.41000000e+02,   6.10000000e+01,
         1.11000000e+02,   3.10000000e+01,   8.60000000e+01,
         8.15000000e+02,   7.50000000e+01,   9.60000000e+01,
         6.20000000e+01,   1.18000000e+02,   7.50000000e+01,
         3.00000000e+00,   8.04000000e+02,   3.04000000e+02,
         1.27000000e+02,   5.20000000e+01,   6.00000000e+01,
         2.10000000e+01,   5.90000000e+01,   8.70000000e+01,
         1.43000000e+02,   1.10000000e+01,   1.50000000e+01,
         9.00000000e+01,   5.00000000e+00,   2.54000000e+02,
         4.99999999e+01,   7.00000000e+00,   6.00000000e+01,
         3.97000000e+02,   5.40000000e+01,   1.33000000e+02,
         1.00000000e+00,   3.13000000e+02,   1.18000000e+02,
         1.60000000e+02,   3.09000000e+02,   3.20000000e+01,
         2.20000000e+01,   6.00000000e+01,   6.00000000e+01,
         2.08000000e+02,   6.09000000e+02,   1.49000000e+02,
         6.70000000e+01,   1.80000000e+01,   8.00000000e+00,
         6.60000000e+01,   1.37000000e+02,   5.00000000e+01,
         1.00000000e+01,   1.55000000e+02,   1.32000000e+02,
         1.15000000e+02,   2.00000000e+00,   2.20000000e+01,
         4.40000000e+01,   4.90000000e+02,   5.10000000e+01,
         3.70000000e+01,   1.68000000e+02,   6.50000000e+01,
         3.50000000e+01,   3.70000000e+01,   8.00000000e+00,
         5.00000000e+00,   9.40000000e+01,   2.50000000e+01,
         6.10000000e+01,   9.40000000e+01,   3.43000000e+02,
         7.00000000e+01,   1.00000000e+00,   5.30000000e+01,
         6.50000000e+01,   3.00000000e+01,   3.60000000e+01,
         6.00000000e+01,   6.70000000e+01,   1.10000000e+01,
         4.00000000e+00,   5.40000000e+01,   1.00000000e+00,
         3.20000000e+01,   1.00000000e+00,   4.00000000e+00,
         2.02000000e+02,   5.40000000e+01,   7.80000000e+01,
         2.20000000e+01,   3.60000000e+01,   3.23000000e+02,
         3.10000000e+01,   9.10000000e+01,   1.19000000e+02,
         5.80000000e+01,   1.86000000e+02,   4.10000000e+01,
         6.10000000e+01,   5.90000000e+01])

In [49]:
docTopics[docTopics < 0.2] = 0
docTopics


Out[49]:
array([[ 0.34705483,  0.22756193,  0.        , ...,  0.29115519,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.40504574,  0.        , ...,  0.        ,
         0.        ,  0.20902303],
       ..., 
       [ 0.        ,  0.21878985,  0.        , ...,  0.        ,
         0.20627538,  0.35743528],
       [ 0.46758474,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.27178024],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [79]:
distDocs = pdist(docTopics, 'cosine')

In [90]:
import pylab
# Compute and plot dendrogram.
fig = pylab.figure(figsize=(20,20))
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
Y = sch.linkage(distDocs, method='complete')
Z = sch.dendrogram(Y, orientation='right')
axdendro.set_xticks([])
axdendro.set_yticks([])

# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
index = Z['leaves']
D = squareform(distDocs)
D = D[index,:]
D = D[:,index]
im = axmatrix.matshow(D, aspect='auto', origin='lower')
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Plot colorbar.
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
pylab.colorbar(im, cax=axcolor)

# Display and save figure.
fig.show()



In [70]:
figure(figsize=(12,12))
imshow(squareform(distDocs))
#colormaps('gray')


Out[70]:
<matplotlib.image.AxesImage at 0x7fde31e9f5d0>