In [1]:
import pyLDAvis
import pandas as pd
import json
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 250)
# np.set_printoptions(suppress=True)
In [2]:
phi_topic_term_dists_file = "/tmp/scalaLDAvis/phi/part-00000"
theta_doc_topics_dists_file = "/tmp/scalaLDAvis/theta/part-00000"
# doc_length_file = "/tmp/lda-vis/doc-length/part-00000"
vocab_file = "/tmp/scalaLDAvis/vocab/part-00000"
term_freq_file = "/tmp/scalaLDAvis/termFreq/part-00000"
In [3]:
import glob
path = '/tmp/scalaLDAvis/theta'
thetaFiles = glob.glob(path + "/part*")
theta_with_size = pd.DataFrame()
list_ = []
for file_ in (thetaFiles):
df = pd.read_csv(file_, index_col=None, header=None)
list_.append(df)
theta_with_size = pd.concat(list_)
In [4]:
theta_with_size
Out[4]:
0
1
2
3
4
5
6
7
8
9
10
0
32.0
0.002919
0.002915
0.002914
0.973755
0.002917
0.002919
0.002919
0.002916
0.002913
0.002914
1
47.0
0.002006
0.002004
0.002003
0.981962
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
2
96.0
0.000993
0.000991
0.000991
0.991076
0.000992
0.000992
0.000992
0.000991
0.000991
0.000991
3
90.0
0.001058
0.001057
0.001056
0.990488
0.001057
0.001058
0.001058
0.001057
0.001056
0.001056
4
44.0
0.002140
0.002137
0.002136
0.980759
0.002138
0.002140
0.002140
0.002138
0.002136
0.002136
5
53.0
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
6
41.0
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
7
36.0
0.002603
0.002600
0.002599
0.976595
0.002601
0.002603
0.002603
0.002600
0.002598
0.002599
8
48.0
0.001965
0.001963
0.001962
0.982330
0.001964
0.001965
0.001965
0.001963
0.001961
0.001962
9
19.0
0.004820
0.004813
0.004811
0.956668
0.004816
0.004819
0.004819
0.004814
0.004810
0.004811
10
89.0
0.001070
0.001068
0.001068
0.990382
0.001069
0.001070
0.001070
0.001068
0.001068
0.001068
11
62.0
0.001528
0.001526
0.001526
0.986259
0.001527
0.001528
0.001528
0.001527
0.001525
0.001526
12
143.0
0.000668
0.000668
0.000667
0.993990
0.000668
0.000668
0.000668
0.000668
0.000667
0.000667
13
87.0
0.001094
0.001093
0.001092
0.990164
0.001093
0.001094
0.001094
0.001093
0.001092
0.001092
14
137.0
0.000698
0.000697
0.000696
0.993729
0.000697
0.000697
0.000697
0.000697
0.000696
0.000696
15
92.0
0.001035
0.001034
0.001033
0.990692
0.001034
0.001035
0.001035
0.001034
0.001033
0.001034
16
41.0
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
17
230.0
0.000417
0.000416
0.000416
0.996254
0.000416
0.000417
0.000417
0.000416
0.000416
0.000416
18
83.0
0.001146
0.001145
0.001144
0.989695
0.001145
0.001146
0.001146
0.001145
0.001144
0.001144
19
416.0
0.000231
0.000231
0.000230
0.997925
0.000231
0.000231
0.000231
0.000231
0.000230
0.000230
20
23.0
0.004015
0.004010
0.004008
0.963901
0.004012
0.004015
0.004014
0.004010
0.004007
0.004008
21
77.0
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
22
59.0
0.001605
0.001603
0.001602
0.985572
0.001603
0.001605
0.001605
0.001603
0.001602
0.001602
23
73.0
0.001301
0.001299
0.001299
0.988302
0.001300
0.001301
0.001301
0.001300
0.001298
0.001299
24
394.0
0.000244
0.000243
0.000243
0.997809
0.000243
0.000244
0.000244
0.000243
0.000243
0.000243
25
57.0
0.001660
0.001658
0.001657
0.985073
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
26
367.0
0.000262
0.000261
0.000261
0.997648
0.000261
0.000262
0.000262
0.000261
0.000261
0.000261
27
54.0
0.001751
0.001748
0.001748
0.984258
0.001749
0.001751
0.001751
0.001749
0.001747
0.001748
28
43.0
0.002189
0.002186
0.002185
0.980321
0.002187
0.002189
0.002188
0.002186
0.002184
0.002185
29
145.0
0.000659
0.000658
0.000658
0.994072
0.000659
0.000659
0.000659
0.000659
0.000658
0.000658
30
47.0
0.002006
0.002004
0.002003
0.981961
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
31
141.0
0.000678
0.000677
0.000677
0.993905
0.000677
0.000678
0.000678
0.000677
0.000677
0.000677
32
39.0
0.002408
0.002405
0.002404
0.978352
0.002406
0.002407
0.002407
0.002405
0.002403
0.002404
33
28.0
0.003322
0.003318
0.003316
0.970132
0.003319
0.003322
0.003321
0.003318
0.003315
0.003316
34
38.0
0.002470
0.002466
0.002465
0.977796
0.002467
0.002469
0.002469
0.002467
0.002465
0.002466
35
57.0
0.001660
0.001658
0.001657
0.985074
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
36
63.0
0.001504
0.001502
0.001502
0.986473
0.001503
0.001504
0.001504
0.001503
0.001501
0.001502
37
89.0
0.001070
0.001068
0.001068
0.990383
0.001069
0.001070
0.001070
0.001068
0.001068
0.001068
38
54.0
0.001751
0.001748
0.001748
0.984259
0.001749
0.001751
0.001751
0.001749
0.001747
0.001748
39
36.0
0.002603
0.002600
0.002599
0.976596
0.002601
0.002603
0.002603
0.002600
0.002598
0.002599
40
74.0
0.001284
0.001282
0.001281
0.988458
0.001283
0.001284
0.001284
0.001282
0.001281
0.001282
41
868.0
0.000111
0.000111
0.000111
0.999004
0.000111
0.000111
0.000111
0.000111
0.000111
0.000111
42
114.0
0.000837
0.000836
0.000836
0.992474
0.000836
0.000837
0.000837
0.000836
0.000835
0.000836
43
16.0
0.005671
0.005663
0.005661
0.949013
0.005666
0.005670
0.005670
0.005664
0.005659
0.005661
44
33.0
0.002833
0.002829
0.002828
0.974528
0.002831
0.002833
0.002833
0.002830
0.002827
0.002828
45
120.0
0.000796
0.000795
0.000794
0.992847
0.000795
0.000796
0.000795
0.000795
0.000794
0.000794
46
209.0
0.000458
0.000458
0.000458
0.995879
0.000458
0.000458
0.000458
0.000458
0.000457
0.000458
47
92.0
0.001035
0.001034
0.001033
0.990693
0.001034
0.001035
0.001035
0.001034
0.001033
0.001033
48
47.0
0.002006
0.002004
0.002003
0.981961
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
49
66.0
0.001437
0.001435
0.001435
0.987079
0.001436
0.001437
0.001437
0.001435
0.001434
0.001435
50
204.0
0.000470
0.000469
0.000469
0.995778
0.000469
0.000469
0.000469
0.000469
0.000469
0.000469
51
51.0
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
52
32.0
0.002919
0.002915
0.002914
0.973755
0.002917
0.002919
0.002919
0.002916
0.002913
0.002914
53
134.0
0.000713
0.000712
0.000712
0.993589
0.000712
0.000713
0.000713
0.000712
0.000712
0.000712
54
210.0
0.000456
0.000456
0.000455
0.995899
0.000456
0.000456
0.000456
0.000456
0.000455
0.000455
55
23.0
0.004015
0.004010
0.004008
0.963900
0.004012
0.004015
0.004014
0.004010
0.004007
0.004008
56
55.0
0.001720
0.001717
0.001717
0.984540
0.001718
0.001719
0.001719
0.001718
0.001716
0.001717
57
50.0
0.001888
0.001886
0.001885
0.983024
0.001887
0.001888
0.001888
0.001886
0.001884
0.001885
58
123.0
0.000776
0.000775
0.000775
0.993020
0.000776
0.000776
0.000776
0.000775
0.000775
0.000775
59
14.0
0.006429
0.006421
0.006418
0.942195
0.006424
0.006429
0.006428
0.006422
0.006416
0.006419
60
40.0
0.002349
0.002346
0.002345
0.978880
0.002347
0.002349
0.002349
0.002346
0.002344
0.002345
61
75.0
0.001267
0.001265
0.001265
0.988610
0.001266
0.001267
0.001267
0.001265
0.001264
0.001265
62
71.0
0.001337
0.001335
0.001335
0.987977
0.001336
0.001337
0.001337
0.001336
0.001335
0.001335
63
33.0
0.002833
0.002829
0.002828
0.974528
0.002831
0.002833
0.002833
0.002830
0.002827
0.002828
64
45.0
0.002094
0.002091
0.002090
0.981177
0.002092
0.002093
0.002093
0.002091
0.002089
0.002090
65
72.0
0.001319
0.001317
0.001317
0.988142
0.001318
0.001319
0.001319
0.001317
0.001316
0.001317
66
127.0
0.000752
0.000751
0.000751
0.993238
0.000751
0.000752
0.000752
0.000751
0.000751
0.000751
67
233.0
0.000411
0.000411
0.000411
0.996302
0.000411
0.000411
0.000411
0.000411
0.000411
0.000411
68
129.0
0.000740
0.000739
0.000739
0.993343
0.000740
0.000740
0.000740
0.000740
0.000739
0.000739
69
140.0
0.000683
0.000682
0.000681
0.993862
0.000682
0.000683
0.000683
0.000682
0.000681
0.000682
70
122.0
0.000783
0.000782
0.000781
0.992964
0.000782
0.000783
0.000782
0.000782
0.000781
0.000781
71
299.0
0.000321
0.000320
0.000320
0.997115
0.000321
0.000321
0.000321
0.000320
0.000320
0.000320
72
139.0
0.000688
0.000687
0.000686
0.993818
0.000687
0.000688
0.000687
0.000687
0.000686
0.000686
73
35.0
0.002676
0.002672
0.002671
0.975944
0.002673
0.002675
0.002675
0.002673
0.002670
0.002671
74
66.0
0.001437
0.001435
0.001435
0.987080
0.001436
0.001437
0.001437
0.001435
0.001434
0.001435
75
44.0
0.002140
0.002137
0.002136
0.980758
0.002138
0.002140
0.002140
0.002138
0.002136
0.002137
76
208.0
0.000461
0.000460
0.000460
0.995859
0.000460
0.000461
0.000460
0.000460
0.000460
0.000460
77
28.0
0.003322
0.003318
0.003316
0.970133
0.003319
0.003322
0.003321
0.003318
0.003315
0.003316
78
171.0
0.000560
0.000559
0.000559
0.994968
0.000559
0.000560
0.000560
0.000559
0.000558
0.000559
79
102.0
0.000935
0.000933
0.000933
0.991597
0.000934
0.000935
0.000934
0.000934
0.000933
0.000933
80
38.0
0.002469
0.002466
0.002465
0.977797
0.002467
0.002469
0.002469
0.002467
0.002465
0.002465
81
27.0
0.003441
0.003436
0.003435
0.969064
0.003438
0.003440
0.003440
0.003437
0.003434
0.003435
82
58.0
0.001632
0.001630
0.001629
0.985325
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
83
3149.0
0.000031
0.000031
0.000030
0.999725
0.000031
0.000031
0.000031
0.000031
0.000030
0.000031
84
63.0
0.001504
0.001502
0.001502
0.986473
0.001503
0.001504
0.001504
0.001503
0.001501
0.001502
85
58.0
0.001632
0.001630
0.001629
0.985326
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
86
38.0
0.002470
0.002466
0.002465
0.977795
0.002468
0.002469
0.002469
0.002467
0.002465
0.002466
87
96.0
0.000992
0.000991
0.000991
0.991077
0.000992
0.000992
0.000992
0.000991
0.000990
0.000991
88
156.0
0.000613
0.000612
0.000612
0.994488
0.000613
0.000613
0.000613
0.000612
0.000612
0.000612
89
25.0
0.003706
0.003701
0.003699
0.966681
0.003703
0.003705
0.003705
0.003702
0.003698
0.003700
90
70.0
0.001356
0.001354
0.001354
0.987807
0.001355
0.001356
0.001356
0.001355
0.001353
0.001354
91
38.0
0.002470
0.002466
0.002465
0.977796
0.002468
0.002469
0.002469
0.002467
0.002465
0.002465
92
34.0
0.002752
0.002748
0.002747
0.975256
0.002750
0.002752
0.002752
0.002749
0.002747
0.002747
93
103.0
0.000926
0.000924
0.000924
0.991678
0.000925
0.000926
0.000925
0.000925
0.000924
0.000924
94
29.0
0.003211
0.003207
0.003205
0.971129
0.003208
0.003211
0.003211
0.003207
0.003205
0.003206
95
637.0
0.000151
0.000151
0.000151
0.998644
0.000151
0.000151
0.000151
0.000151
0.000151
0.000151
96
142.0
0.000673
0.000672
0.000672
0.993948
0.000673
0.000673
0.000673
0.000672
0.000672
0.000672
97
40.0
0.002349
0.002346
0.002345
0.978880
0.002347
0.002349
0.002349
0.002346
0.002344
0.002345
98
151.0
0.000633
0.000632
0.000632
0.994306
0.000633
0.000633
0.000633
0.000633
0.000632
0.000632
99
49.0
0.001926
0.001923
0.001923
0.982684
0.001924
0.001926
0.001926
0.001924
0.001922
0.001923
100
80.0
0.001189
0.001187
0.001187
0.989313
0.001188
0.001188
0.001188
0.001187
0.001186
0.001187
101
245.0
0.000391
0.000391
0.000391
0.996482
0.000391
0.000391
0.000391
0.000391
0.000390
0.000391
102
120.0
0.000796
0.000795
0.000794
0.992847
0.000795
0.000795
0.000795
0.000795
0.000794
0.000794
103
47.0
0.002006
0.002004
0.002003
0.981962
0.002004
0.002006
0.002006
0.002004
0.002002
0.002003
104
96.0
0.000992
0.000991
0.000991
0.991077
0.000992
0.000992
0.000992
0.000991
0.000990
0.000991
105
58.0
0.001632
0.001630
0.001629
0.985327
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
106
77.0
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
107
124.0
0.000770
0.000769
0.000769
0.993076
0.000769
0.000770
0.000770
0.000769
0.000769
0.000769
108
70.0
0.001356
0.001354
0.001354
0.987808
0.001355
0.001356
0.001356
0.001354
0.001353
0.001354
109
244.0
0.000393
0.000392
0.000392
0.996468
0.000393
0.000393
0.000393
0.000392
0.000392
0.000392
110
92.0
0.001035
0.001034
0.001033
0.990693
0.001034
0.001035
0.001035
0.001034
0.001033
0.001033
111
72.0
0.001319
0.001317
0.001317
0.988142
0.001318
0.001319
0.001319
0.001317
0.001316
0.001317
112
71.0
0.001337
0.001335
0.001335
0.987977
0.001336
0.001337
0.001337
0.001336
0.001335
0.001335
113
80.0
0.001189
0.001187
0.001187
0.989313
0.001188
0.001189
0.001188
0.001187
0.001186
0.001187
114
44.0
0.002140
0.002137
0.002136
0.980758
0.002138
0.002140
0.002140
0.002138
0.002136
0.002137
115
69.0
0.001375
0.001374
0.001373
0.987633
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
116
53.0
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
117
76.0
0.001250
0.001249
0.001248
0.988758
0.001249
0.001250
0.001250
0.001249
0.001248
0.001248
118
57.0
0.001660
0.001658
0.001657
0.985073
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
119
28.0
0.003322
0.003317
0.003316
0.970133
0.003319
0.003322
0.003321
0.003318
0.003315
0.003316
120
132.0
0.000724
0.000723
0.000723
0.993493
0.000723
0.000724
0.000724
0.000723
0.000722
0.000723
121
75.0
0.001267
0.001265
0.001265
0.988610
0.001266
0.001267
0.001267
0.001265
0.001264
0.001265
122
52.0
0.001817
0.001814
0.001814
0.983665
0.001815
0.001817
0.001817
0.001815
0.001813
0.001814
123
56.0
0.001689
0.001687
0.001686
0.984812
0.001688
0.001689
0.001689
0.001687
0.001686
0.001686
124
21.0
0.004381
0.004375
0.004373
0.960615
0.004377
0.004380
0.004380
0.004375
0.004372
0.004373
...
...
...
...
...
...
...
...
...
...
...
...
6375
42.0
0.002240
0.002237
0.002236
0.979862
0.002238
0.002240
0.002239
0.002237
0.002235
0.002236
6376
123.0
0.000776
0.000775
0.000775
0.993020
0.000776
0.000776
0.000776
0.000775
0.000775
0.000775
6377
77.0
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
6378
98.0
0.000972
0.000971
0.000971
0.991257
0.000972
0.000972
0.000972
0.000971
0.000970
0.000971
6379
41.0
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002291
0.002288
0.002289
6380
58.0
0.001632
0.001630
0.001629
0.985327
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
6381
123.0
0.000776
0.000775
0.000775
0.993020
0.000776
0.000776
0.000776
0.000775
0.000775
0.000775
6382
51.0
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
6383
192.0
0.000499
0.000498
0.000498
0.995516
0.000498
0.000499
0.000499
0.000498
0.000498
0.000498
6384
65.0
0.001459
0.001457
0.001456
0.986883
0.001458
0.001459
0.001459
0.001457
0.001456
0.001456
6385
49.0
0.001926
0.001923
0.001923
0.982683
0.001924
0.001926
0.001926
0.001924
0.001922
0.001923
6386
111.0
0.000860
0.000858
0.000858
0.992272
0.000859
0.000859
0.000859
0.000859
0.000858
0.000858
6387
73.0
0.001301
0.001299
0.001299
0.988302
0.001300
0.001301
0.001301
0.001300
0.001298
0.001299
6388
35.0
0.002676
0.002672
0.002671
0.975943
0.002673
0.002675
0.002675
0.002673
0.002670
0.002671
6389
66.0
0.001437
0.001435
0.001435
0.987079
0.001436
0.001437
0.001437
0.001435
0.001434
0.001435
6390
50.0
0.001888
0.001886
0.001885
0.983024
0.001887
0.001888
0.001888
0.001886
0.001884
0.001885
6391
68.0
0.001395
0.001394
0.001393
0.987454
0.001394
0.001395
0.001395
0.001394
0.001393
0.001393
6392
99.0
0.000963
0.000961
0.000961
0.991344
0.000962
0.000963
0.000963
0.000962
0.000961
0.000961
6393
143.0
0.000668
0.000668
0.000667
0.993990
0.000668
0.000668
0.000668
0.000668
0.000667
0.000667
6394
54.0
0.001751
0.001749
0.001748
0.984259
0.001749
0.001751
0.001750
0.001749
0.001747
0.001748
6395
314.0
0.000306
0.000305
0.000305
0.997253
0.000305
0.000306
0.000305
0.000305
0.000305
0.000305
6396
52.0
0.001817
0.001814
0.001814
0.983664
0.001815
0.001817
0.001817
0.001815
0.001813
0.001814
6397
29.0
0.003211
0.003207
0.003206
0.971128
0.003209
0.003211
0.003211
0.003207
0.003205
0.003206
6398
28.0
0.003322
0.003318
0.003316
0.970132
0.003319
0.003322
0.003322
0.003318
0.003315
0.003317
6399
24.0
0.003854
0.003849
0.003847
0.965347
0.003851
0.003854
0.003854
0.003850
0.003846
0.003848
6400
62.0
0.001528
0.001526
0.001526
0.986259
0.001527
0.001528
0.001528
0.001527
0.001525
0.001526
6401
41.0
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
6402
51.0
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
6403
26.0
0.003568
0.003564
0.003562
0.967917
0.003565
0.003568
0.003568
0.003564
0.003561
0.003562
6404
241.0
0.000398
0.000397
0.000397
0.996424
0.000397
0.000398
0.000398
0.000397
0.000397
0.000397
6405
69.0
0.001375
0.001374
0.001373
0.987633
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
6406
54.0
0.001751
0.001748
0.001748
0.984259
0.001749
0.001751
0.001750
0.001749
0.001747
0.001748
6407
20.0
0.004589
0.004583
0.004581
0.958738
0.004585
0.004589
0.004588
0.004584
0.004580
0.004582
6408
316.0
0.000304
0.000303
0.000303
0.997270
0.000303
0.000304
0.000304
0.000303
0.000303
0.000303
6409
27.0
0.003441
0.003436
0.003435
0.969063
0.003438
0.003441
0.003440
0.003437
0.003434
0.003435
6410
19.0
0.004819
0.004813
0.004811
0.956672
0.004815
0.004819
0.004818
0.004814
0.004809
0.004811
6411
32.0
0.002919
0.002915
0.002914
0.973754
0.002917
0.002919
0.002919
0.002916
0.002913
0.002914
6412
81.0
0.001174
0.001172
0.001172
0.989444
0.001173
0.001174
0.001174
0.001173
0.001172
0.001172
6413
159.0
0.000602
0.000601
0.000601
0.994591
0.000601
0.000602
0.000602
0.000601
0.000600
0.000601
6414
44.0
0.002140
0.002137
0.002136
0.980759
0.002138
0.002140
0.002140
0.002138
0.002136
0.002136
6415
112.0
0.000852
0.000851
0.000850
0.992340
0.000851
0.000852
0.000852
0.000851
0.000850
0.000850
6416
251.0
0.000382
0.000381
0.000381
0.996566
0.000382
0.000382
0.000382
0.000381
0.000381
0.000381
6417
21.0
0.004380
0.004375
0.004373
0.960616
0.004377
0.004380
0.004380
0.004375
0.004372
0.004373
6418
243.0
0.000394
0.000394
0.000394
0.996453
0.000394
0.000394
0.000394
0.000394
0.000394
0.000394
6419
57.0
0.001660
0.001658
0.001657
0.985074
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
6420
12.0
0.007420
0.007410
0.007407
0.933285
0.007414
0.007420
0.007419
0.007412
0.007405
0.007408
6421
335.0
0.000286
0.000286
0.000286
0.997425
0.000286
0.000286
0.000286
0.000286
0.000286
0.000286
6422
63.0
0.001505
0.001502
0.001502
0.986473
0.001503
0.001504
0.001504
0.001503
0.001501
0.001502
6423
31.0
0.003010
0.003006
0.003005
0.972935
0.003008
0.003010
0.003010
0.003007
0.003004
0.003005
6424
37.0
0.002535
0.002531
0.002530
0.977212
0.002532
0.002534
0.002534
0.002532
0.002529
0.002530
6425
47.0
0.002006
0.002004
0.002003
0.981962
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
6426
53.0
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
6427
33.0
0.002833
0.002830
0.002828
0.974526
0.002831
0.002833
0.002833
0.002830
0.002828
0.002829
6428
115.0
0.000830
0.000829
0.000828
0.992539
0.000829
0.000830
0.000830
0.000829
0.000828
0.000828
6429
92.0
0.001035
0.001034
0.001033
0.990693
0.001034
0.001035
0.001035
0.001034
0.001033
0.001033
6430
88.0
0.001082
0.001080
0.001080
0.990274
0.001081
0.001082
0.001082
0.001080
0.001080
0.001080
6431
67.0
0.001416
0.001414
0.001413
0.987269
0.001415
0.001416
0.001416
0.001414
0.001413
0.001414
6432
77.0
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
6433
99.0
0.000963
0.000961
0.000961
0.991344
0.000962
0.000963
0.000963
0.000962
0.000961
0.000961
6434
87.0
0.001094
0.001093
0.001092
0.990164
0.001093
0.001094
0.001094
0.001093
0.001092
0.001092
6435
100.0
0.000953
0.000952
0.000951
0.991431
0.000952
0.000953
0.000953
0.000952
0.000951
0.000952
6436
27.0
0.003441
0.003436
0.003435
0.969065
0.003438
0.003440
0.003440
0.003437
0.003434
0.003435
6437
54.0
0.001751
0.001748
0.001748
0.984259
0.001749
0.001751
0.001750
0.001749
0.001747
0.001748
6438
69.0
0.001375
0.001374
0.001373
0.987634
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
6439
97.0
0.000982
0.000981
0.000981
0.991168
0.000982
0.000982
0.000982
0.000981
0.000980
0.000981
6440
65.0
0.001459
0.001457
0.001456
0.986883
0.001458
0.001459
0.001459
0.001457
0.001456
0.001456
6441
50.0
0.001888
0.001886
0.001885
0.983024
0.001887
0.001888
0.001888
0.001886
0.001884
0.001885
6442
66.0
0.001437
0.001435
0.001435
0.987079
0.001436
0.001437
0.001437
0.001436
0.001434
0.001435
6443
46.0
0.002049
0.002046
0.002045
0.981577
0.002047
0.002049
0.002049
0.002047
0.002045
0.002046
6444
43.0
0.002189
0.002186
0.002185
0.980321
0.002187
0.002189
0.002188
0.002186
0.002184
0.002185
6445
101.0
0.000944
0.000943
0.000942
0.991514
0.000943
0.000944
0.000944
0.000943
0.000942
0.000942
6446
46.0
0.002049
0.002046
0.002045
0.981577
0.002047
0.002049
0.002049
0.002047
0.002045
0.002046
6447
143.0
0.000668
0.000668
0.000667
0.993990
0.000668
0.000668
0.000668
0.000668
0.000667
0.000667
6448
45.0
0.002094
0.002091
0.002090
0.981177
0.002092
0.002093
0.002093
0.002091
0.002089
0.002090
6449
102.0
0.000935
0.000933
0.000933
0.991596
0.000934
0.000935
0.000935
0.000934
0.000933
0.000933
6450
303.0
0.000317
0.000316
0.000316
0.997153
0.000316
0.000317
0.000317
0.000316
0.000316
0.000316
6451
86.0
0.001107
0.001105
0.001105
0.990051
0.001106
0.001106
0.001106
0.001105
0.001104
0.001105
6452
124.0
0.000770
0.000769
0.000769
0.993076
0.000769
0.000770
0.000770
0.000769
0.000769
0.000769
6453
702.0
0.000137
0.000137
0.000137
0.998769
0.000137
0.000137
0.000137
0.000137
0.000137
0.000137
6454
49.0
0.001926
0.001924
0.001923
0.982682
0.001924
0.001926
0.001926
0.001924
0.001922
0.001923
6455
65.0
0.001459
0.001457
0.001456
0.986884
0.001458
0.001459
0.001459
0.001457
0.001456
0.001456
6456
41.0
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
6457
46.0
0.002049
0.002046
0.002045
0.981578
0.002047
0.002049
0.002049
0.002047
0.002045
0.002046
6458
117.0
0.000816
0.000815
0.000814
0.992665
0.000815
0.000816
0.000816
0.000815
0.000814
0.000814
6459
882.0
0.000109
0.000109
0.000109
0.999020
0.000109
0.000109
0.000109
0.000109
0.000109
0.000109
6460
169.0
0.000566
0.000565
0.000565
0.994909
0.000566
0.000566
0.000566
0.000566
0.000565
0.000565
6461
716.0
0.000134
0.000134
0.000134
0.998793
0.000134
0.000134
0.000134
0.000134
0.000134
0.000134
6462
62.0
0.001528
0.001526
0.001526
0.986259
0.001527
0.001528
0.001528
0.001527
0.001525
0.001526
6463
70.0
0.001356
0.001354
0.001354
0.987808
0.001355
0.001356
0.001356
0.001355
0.001353
0.001354
6464
30.0
0.003107
0.003103
0.003102
0.972061
0.003105
0.003107
0.003107
0.003104
0.003101
0.003102
6465
89.0
0.001070
0.001068
0.001068
0.990382
0.001069
0.001070
0.001070
0.001068
0.001068
0.001068
6466
98.0
0.000972
0.000971
0.000971
0.991257
0.000972
0.000972
0.000972
0.000971
0.000970
0.000971
6467
25.0
0.003706
0.003701
0.003699
0.966681
0.003703
0.003705
0.003705
0.003702
0.003698
0.003700
6468
128.0
0.000746
0.000745
0.000745
0.993291
0.000746
0.000746
0.000746
0.000745
0.000745
0.000745
6469
59.0
0.001605
0.001603
0.001602
0.985572
0.001603
0.001605
0.001605
0.001603
0.001602
0.001602
6470
268.0
0.000358
0.000357
0.000357
0.996783
0.000358
0.000358
0.000358
0.000357
0.000357
0.000357
6471
44.0
0.002140
0.002137
0.002136
0.980758
0.002138
0.002140
0.002140
0.002138
0.002136
0.002137
6472
30.0
0.003108
0.003103
0.003102
0.972060
0.003105
0.003107
0.003107
0.003104
0.003101
0.003102
6473
60.0
0.001578
0.001576
0.001576
0.985808
0.001577
0.001578
0.001578
0.001577
0.001575
0.001576
6474
52.0
0.001817
0.001815
0.001814
0.983664
0.001815
0.001817
0.001817
0.001815
0.001813
0.001814
6475
82.0
0.001160
0.001158
0.001158
0.989571
0.001159
0.001160
0.001160
0.001159
0.001158
0.001158
6476
69.0
0.001375
0.001374
0.001373
0.987634
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
6477
42.0
0.002240
0.002237
0.002236
0.979863
0.002238
0.002239
0.002239
0.002237
0.002235
0.002236
6478
40.0
0.002349
0.002346
0.002345
0.978880
0.002347
0.002349
0.002349
0.002346
0.002344
0.002345
6479
38.0
0.002470
0.002466
0.002465
0.977796
0.002468
0.002469
0.002469
0.002467
0.002465
0.002465
6480
18.0
0.005073
0.005067
0.005065
0.954385
0.005069
0.005073
0.005073
0.005068
0.005063
0.005065
6481
53.0
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
6482
61.0
0.001553
0.001551
0.001550
0.986037
0.001552
0.001553
0.001553
0.001551
0.001550
0.001550
6483
122.0
0.000783
0.000782
0.000781
0.992963
0.000782
0.000783
0.000783
0.000782
0.000781
0.000781
6484
207.0
0.000463
0.000462
0.000462
0.995839
0.000462
0.000463
0.000463
0.000462
0.000462
0.000462
6485
19.0
0.004821
0.004813
0.004811
0.956666
0.004815
0.004819
0.004819
0.004814
0.004810
0.004811
6486
68.0
0.001395
0.001393
0.001393
0.987454
0.001394
0.001395
0.001395
0.001394
0.001393
0.001393
6487
87.0
0.001094
0.001093
0.001092
0.990164
0.001093
0.001094
0.001094
0.001093
0.001092
0.001092
6488
38.0
0.002470
0.002466
0.002465
0.977796
0.002467
0.002469
0.002469
0.002467
0.002465
0.002465
6489
39.0
0.002408
0.002405
0.002404
0.978351
0.002406
0.002408
0.002407
0.002405
0.002403
0.002404
6490
114.0
0.000837
0.000836
0.000836
0.992474
0.000836
0.000837
0.000837
0.000836
0.000835
0.000836
6491
305.0
0.000315
0.000314
0.000314
0.997172
0.000314
0.000315
0.000314
0.000314
0.000314
0.000314
6492
73.0
0.001301
0.001299
0.001299
0.988302
0.001300
0.001301
0.001301
0.001300
0.001298
0.001299
6493
41.0
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
6494
58.0
0.001632
0.001630
0.001629
0.985327
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
6495
51.0
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
6496
64.0
0.001482
0.001479
0.001479
0.986680
0.001480
0.001481
0.001481
0.001480
0.001478
0.001479
6497
778.0
0.000124
0.000123
0.000123
0.998889
0.000123
0.000124
0.000124
0.000123
0.000123
0.000123
6498
48.0
0.001965
0.001963
0.001962
0.982330
0.001964
0.001965
0.001965
0.001963
0.001961
0.001962
6499
992.0
0.000097
0.000097
0.000097
0.999129
0.000097
0.000097
0.000097
0.000097
0.000097
0.000097
11314 rows × 11 columns
In [5]:
theta_with_size = theta_with_size[theta_with_size[0] > 0]
phi = pd.read_csv(phi_topic_term_dists_file, header=None)
# theta_with_size = pd.read_csv(theta_doc_topics_dists_file, header=None)
theta = theta_with_size.ix[:,1:]
doc_length = theta_with_size.ix[:,0]
vocab = pd.read_csv(vocab_file, header=None)
term_freq = pd.read_csv(term_freq_file, header=None)
In [6]:
flatten = lambda l: [item for sublist in l for item in sublist]
In [7]:
data = {'topic_term_dists': phi.values.tolist(),
'doc_topic_dists': theta.values.tolist(),
'doc_lengths': doc_length.astype("int64").values.tolist(),
'vocab': flatten(vocab.values.tolist()),
'term_frequency': flatten(term_freq.astype("int64").values.tolist())}
print('Topic-Term shape: %s' % str(np.array(data['topic_term_dists']).shape))
print('Doc-Topic shape: %s' % str(np.array(data['doc_topic_dists']).shape))
Topic-Term shape: (10, 50000)
Doc-Topic shape: (11314, 10)
In [44]:
lda_vis_data = pyLDAvis.prepare(**data)
In [45]:
pyLDAvis.enable_notebook()
In [46]:
# pyLDAvis.prepare(mds='mmds', **data)
In [57]:
pyLDAvis.save_json(lda_vis_data, "/tmp/scalaLDAvis/pyLDAvis/lda.json")
In [48]:
pyLDAvis.save_html(lda_vis_data, "/tmp/ldavis.html")
In [49]:
pyLDAvis.urls
Out[49]:
<module 'pyLDAvis.urls' from '/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/pyLDAvis/urls.py'>
In [16]:
# !cat /home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/pyLDAvis/urls.py
K(integer) = number of topics (e.g. 50)
V(integer) = number of words in the vocabulary (e.g. 50,000 or 1,000,000)
M(integer) = number of documents
$N_{d=1\dots M} $(integer) = number of words in document d
N(integer) = total number of words in all documents; sum of all $N_{d}$ values, i.e. $N=\sum _{d=1}^{M}N_{d}$
Z = N-dimension vector of integers between 1 and K = identity of topic of all words in all documents
W = N-dimension vector of integers between 1 and V = identity of all words in all documents
In [8]:
from pyLDAvis import *
from pyLDAvis._prepare import *
In [9]:
topic_term_dists = pyLDAvis._prepare._df_with_names(data['topic_term_dists'], 'topic', 'term')
#[K x V]
doc_topic_dists = pyLDAvis._prepare._df_with_names(data['doc_topic_dists'], 'doc', 'topic')
#[M x K]
term_frequency = pyLDAvis._prepare._series_with_name(data['term_frequency'], 'term_frequency')
#[V]
doc_lengths = pyLDAvis._prepare._series_with_name(data['doc_lengths'], 'doc_length')
#[M]
vocab = pyLDAvis._prepare._series_with_name(data['vocab'], 'vocab')
#[V]
In [10]:
topic_term_dists.shape#[K x V]
topic_term_dists
Out[10]:
term
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
...
49925
49926
49927
49928
49929
49930
49931
49932
49933
49934
49935
49936
49937
49938
49939
49940
49941
49942
49943
49944
49945
49946
49947
49948
49949
49950
49951
49952
49953
49954
49955
49956
49957
49958
49959
49960
49961
49962
49963
49964
49965
49966
49967
49968
49969
49970
49971
49972
49973
49974
49975
49976
49977
49978
49979
49980
49981
49982
49983
49984
49985
49986
49987
49988
49989
49990
49991
49992
49993
49994
49995
49996
49997
49998
49999
topic
0
24.870688
21.559374
24.848545
10.189182
9.700961
16.208485
7.853970
22.979569
7.406882
16.984633
15.510591
12.958005
9.055710
8.363979
4.247700
11.506454
7.047064
9.709759
10.177908
8.035401
6.826531
6.829253
3.707348
9.759646
3.332268
3.800230
6.738490
9.448646
6.062157
7.730785
6.032167
6.699437
7.445487
5.886514
8.183782
6.208295
2.992643
6.148651
8.222009
6.928704
6.434229
5.013328
4.641739
6.692826
8.928966
2.785740
7.005564
8.044817
6.570035
3.074756
7.252581
6.722737
6.668937
2.810734
4.977989
8.117550
5.247739
3.430537
4.290412
2.861926
3.671912
1.527140
1.548591
2.991851
4.643409
5.084082
4.279109
4.411219
2.840253
7.871005
4.478428
3.846753
1.984382
4.714036
7.401284
...
0.643118
1.071917
1.058301
0.967577
0.956299
0.788641
0.776995
0.906096
0.975965
0.780293
0.855194
0.952308
0.795817
1.062600
0.953764
1.081162
0.929956
1.040463
1.133978
1.064275
0.881111
0.934912
0.972580
1.060295
0.789618
0.931094
0.864498
1.073642
0.850832
0.961806
0.955679
0.752066
0.803353
0.812691
0.795662
0.911058
1.101044
1.002116
0.946687
0.893284
0.830108
0.737415
0.730675
0.925176
0.809287
0.930815
0.835031
0.920183
0.797343
1.467583
1.006026
0.965949
1.198470
0.809871
0.826039
0.729040
0.984685
0.996069
0.925957
0.937184
0.798756
1.131209
0.855144
0.845376
0.895367
0.863819
0.699012
0.819847
0.951682
0.911374
0.964902
0.966253
0.964197
0.882192
0.755758
1
33.401918
17.078428
20.044636
6.946050
13.719605
7.947532
8.534269
8.663387
6.697656
9.465463
11.057000
11.050145
9.298605
4.480227
1.766929
5.384216
10.126207
5.649822
5.147637
5.386281
9.070167
8.860614
5.994165
3.408542
7.349286
2.880106
2.133808
3.527771
6.021526
2.488975
5.013918
11.006257
4.978995
4.110157
15.355072
6.907951
4.829160
5.646619
1.776806
5.751680
3.555192
3.809261
3.240087
4.008485
5.033435
3.569024
5.101061
7.123383
6.814409
4.810447
7.196640
3.133040
3.777292
5.216637
5.975628
4.905718
8.065505
6.542123
2.360237
5.742254
2.893492
5.474456
3.386709
1.662291
2.726652
1.593486
4.276451
4.739083
1.664434
5.010978
3.763495
8.493277
13.527814
3.685822
5.048271
...
0.869655
0.954529
0.849056
0.798737
0.913945
1.064351
1.018427
0.906554
0.731343
1.007573
0.900765
0.922811
0.790644
0.974767
0.859631
0.994104
0.893179
0.832139
0.972841
0.842145
0.845484
0.761239
0.843164
0.890486
1.012105
1.067445
0.837786
0.964018
1.163420
0.864480
0.905981
0.771364
0.907022
1.037604
0.981154
1.014030
1.055378
0.875974
1.030438
0.844262
0.854144
0.846996
0.976750
0.874581
0.902618
1.004214
0.955245
1.038586
0.922442
0.963343
0.945768
0.924197
1.107366
0.843171
0.961767
0.946991
1.030004
1.029144
0.897846
0.846511
0.933582
0.938480
0.923647
0.995120
0.981613
0.943782
1.054313
0.941558
1.005561
0.729361
0.840127
1.073069
0.989798
1.068669
0.896674
2
18.231773
9.661907
10.402560
16.730183
8.751762
8.003756
11.083495
3.170324
6.382108
6.132467
4.400303
5.538933
4.512643
2.754489
5.699249
3.347013
6.372030
7.221675
4.804678
5.397067
3.476188
5.910220
4.351529
3.417071
3.968878
3.870700
4.542757
3.764698
3.675050
6.404810
3.475772
2.275368
4.140666
3.962001
2.143531
3.064304
5.035950
2.100947
2.359482
2.834560
4.076997
2.855027
6.964688
1.772505
5.158982
2.742678
3.713746
2.034397
6.793594
4.881100
3.207033
1.493543
2.174001
3.664113
4.354543
9.127847
2.796105
3.788937
3.941233
3.511743
4.206168
2.838678
1.864739
2.929351
2.236427
3.369810
1.964124
2.790292
2.835056
2.379437
2.505491
4.057797
2.514965
1.177314
3.077369
...
0.823767
0.943717
0.962192
0.886888
0.981371
0.930903
0.894427
1.117578
0.762742
0.947954
0.909953
0.825686
0.803668
0.832389
1.011635
0.817818
0.922711
1.086938
0.879801
0.851625
1.000546
0.971243
0.991476
1.171215
0.907269
0.718800
1.145152
0.962559
0.916640
0.809148
0.965341
0.825719
1.049798
0.778760
0.849985
0.896121
0.923278
0.909323
0.825253
0.997673
0.951994
0.942180
0.807907
0.810904
0.952346
0.931959
0.901531
0.960111
1.120766
0.886204
0.700634
1.126493
1.072580
1.009778
0.873241
1.000100
0.851812
0.855179
1.002405
0.874315
0.943735
1.021325
1.013449
1.022970
0.848507
0.897087
1.041026
0.882190
0.932870
1.029495
0.944008
0.823620
0.739987
0.744847
0.991434
3
569.117645
375.217869
359.171596
286.057745
299.573831
298.655176
266.731011
242.320803
203.752604
187.585252
180.052954
192.567248
180.345152
175.337857
168.259279
171.321794
140.198561
157.606527
155.221883
131.927407
126.147464
111.004187
130.982765
118.817467
141.501060
118.176844
114.398427
119.691441
124.304120
125.368234
135.422730
132.428406
111.376933
109.878513
106.410511
112.684575
117.621705
94.691534
102.574050
85.991384
121.324123
91.121158
96.505274
108.863981
101.613146
107.388160
115.977445
104.107373
87.699947
95.420218
96.954820
93.261686
114.247771
77.059637
86.443770
87.937790
86.472469
77.396072
75.848390
93.047488
79.360157
143.593021
72.537939
86.218436
102.574748
109.118329
74.664771
78.188411
121.873950
75.017177
79.930384
104.522230
93.543186
74.658469
73.756726
...
0.807071
0.941728
0.941791
0.878157
0.842317
0.885137
0.976697
0.959625
0.882182
0.818094
0.767410
0.848311
0.963916
0.896594
0.920266
0.883813
0.851175
0.967345
0.917014
0.966880
0.910213
0.967913
1.057982
1.101173
0.902564
0.953344
0.887983
0.942766
1.757310
0.905800
0.897104
0.983382
0.972594
1.376353
1.012468
0.985313
0.994441
0.916414
0.784087
0.941866
1.033972
0.906757
0.911047
0.826938
1.134547
0.996208
0.932967
0.905391
1.026659
0.792984
1.003325
0.746581
0.880525
0.910088
0.762417
0.929775
1.382520
1.163741
0.982870
0.905718
0.904870
0.910846
0.904576
0.951618
0.986382
1.469229
0.838511
0.964647
1.045563
0.953336
0.889411
1.073751
1.329903
0.896628
0.961557
4
28.168512
18.001880
14.049882
18.118982
14.684943
11.487112
17.209956
14.978091
10.016852
7.006124
16.626152
10.132525
15.285729
9.976849
7.320657
13.225463
9.245069
11.524126
5.657953
12.989374
2.838482
6.256801
7.372415
13.777652
9.305257
5.642161
4.872286
10.996571
5.892630
5.041900
5.525730
9.357960
8.137254
6.448612
11.438766
13.566489
10.498060
7.304194
5.456101
6.202625
8.094154
11.487553
9.361542
5.384001
4.852198
8.024801
5.977529
14.755373
5.878170
5.140622
6.708097
5.161157
6.211360
4.531957
6.605535
3.573619
5.881520
5.844686
5.628327
4.769399
6.631656
9.480587
4.170229
7.026337
5.140732
6.053367
6.620693
2.291043
6.170354
11.029797
3.753499
10.989146
3.760564
2.685307
4.606793
...
0.994326
0.943130
0.825697
0.891160
0.953169
0.957776
1.007873
0.817004
0.937912
0.999474
0.918251
0.836084
0.894022
0.909006
1.015652
0.783042
0.818891
0.871219
0.902542
0.741162
0.884142
0.887271
0.816523
0.803974
0.994473
0.925472
0.886477
0.954433
0.911138
0.945027
0.938987
1.010463
0.942430
0.831493
1.016364
0.732702
0.981596
1.006043
1.034364
0.957948
0.992289
0.822596
0.810961
0.914214
0.888355
0.890246
1.126745
1.059239
1.140937
0.837260
0.767217
1.050317
0.852157
1.030762
0.785417
0.852482
0.813651
0.879658
0.948659
0.948739
0.957059
0.871627
0.856278
0.855609
0.907313
0.959966
0.947130
0.792086
0.855527
1.011561
0.995477
0.980982
0.996789
0.830738
1.051075
5
20.085940
18.782918
18.437114
14.810532
16.922222
10.600998
14.438103
15.651380
9.283464
10.828416
10.221725
9.534839
4.656888
4.964353
7.431559
8.293801
9.081893
9.911928
5.277907
8.009465
12.908637
2.431235
4.826399
6.119886
5.675221
6.783313
4.046307
10.515613
4.326315
7.796709
6.177531
11.555872
4.078763
5.675803
9.122149
2.686504
5.349288
7.163367
8.374668
3.665486
4.167949
5.828164
8.361452
4.642414
5.231697
5.936526
13.024344
3.860787
2.779279
2.907834
6.570319
5.854992
7.878489
3.040769
5.298797
13.500870
2.776805
4.567612
4.520014
4.528191
5.152433
7.850788
4.860277
3.993265
34.411319
6.792786
5.598086
6.286662
13.367284
6.676110
5.152330
14.805826
4.875010
3.493082
5.216388
...
1.007492
0.858330
0.951287
0.876014
0.892081
0.904842
0.936091
0.913348
0.928388
0.978204
0.887043
0.855481
0.977239
0.731268
0.843736
0.949797
0.831076
0.991784
0.926417
0.891494
0.995620
0.926379
0.930229
0.944218
0.859060
1.033383
0.829044
0.802670
1.050398
0.850775
0.912827
0.909261
1.074922
0.852820
1.031265
1.043779
0.921746
0.884010
0.979139
1.079427
0.907433
0.895706
1.096699
0.990433
0.945077
0.859812
0.979233
0.917115
0.799169
0.863756
0.778890
0.910880
0.933602
1.021602
0.780722
0.921392
1.194571
1.022633
0.878377
0.822903
0.849760
0.815329
0.949032
1.094720
0.843519
0.979618
0.985386
0.905442
0.890735
0.791482
1.034383
0.935892
0.982111
1.003776
0.815313
6
23.814377
23.461418
20.563423
17.702458
14.146988
11.111478
21.754094
7.530462
15.487624
12.824252
9.074189
14.607102
10.351985
15.131521
10.180591
5.990964
12.038223
4.422081
7.935372
6.242144
5.939413
8.658125
6.158674
6.260695
7.193228
5.213818
8.404002
3.195739
6.501966
5.384146
6.360871
5.678477
5.012931
3.193813
3.932260
5.526677
6.253347
7.197191
9.574219
7.658033
4.783062
1.916924
4.707444
8.283744
4.429112
4.198576
2.202736
5.101160
7.364892
6.810985
3.852678
2.802099
2.643665
3.327897
3.358981
3.581209
3.278696
0.789038
5.543420
2.476756
5.024458
1.803234
3.974550
4.402888
5.291549
5.563874
7.529887
3.730552
6.579101
3.094909
3.130031
1.822069
3.827073
3.673637
10.663008
...
0.950265
0.997868
1.008335
0.917512
0.769314
0.748954
0.929362
0.936600
1.034560
0.984661
1.031994
0.876667
1.006912
1.068562
1.086639
1.003642
0.912222
0.940488
1.187170
0.922950
0.895714
0.958835
1.075324
0.857558
1.096831
0.883471
0.903648
0.783936
0.906079
0.828415
0.841786
0.867306
0.876089
1.018295
0.866050
1.539705
0.884931
1.042334
0.772274
0.810701
0.853887
0.898905
0.821602
0.851620
0.985879
0.981696
0.912827
0.837240
0.767152
0.808981
0.837221
0.941193
0.897686
0.924896
0.941055
0.859361
0.873936
0.889068
0.803840
1.036668
0.886466
0.865702
0.920205
1.101380
0.949649
0.917311
0.730633
0.915578
0.862100
1.031913
0.849849
1.046778
1.030484
0.979081
0.939272
7
28.998025
18.348402
12.806133
10.510819
15.866236
11.642008
7.072523
4.277107
7.633124
19.417741
11.361022
6.856354
7.870019
6.736113
4.305183
5.348589
10.982667
8.071871
4.144797
2.965678
6.352451
5.894479
4.032645
6.158643
6.295068
8.488952
4.351226
5.538527
3.588807
9.509300
5.759756
7.925500
2.863522
9.322388
7.174795
6.053113
3.503799
1.664304
2.688063
9.153034
3.573980
5.614140
3.786577
6.996487
5.524702
2.015780
5.374750
1.077708
4.769593
2.182569
5.659881
6.247556
3.840174
5.336527
6.478365
1.581460
8.072494
5.352936
4.699397
3.674596
2.983758
0.819125
3.206432
1.891753
2.558912
4.227376
1.755160
1.621110
1.828282
2.115969
5.116059
4.545278
8.640689
7.874085
2.313972
...
0.977020
0.896425
0.925416
1.231845
0.970198
0.807132
1.042266
0.790933
0.901928
0.851463
0.880628
0.930276
0.909298
0.920000
0.956882
0.864396
0.941989
1.136597
0.906135
0.929640
0.986775
0.814518
0.846101
1.082463
0.856743
0.718842
0.920428
1.040002
0.998349
1.113480
0.876585
0.948560
1.099499
0.945214
0.953516
0.987729
1.118536
0.897066
0.802324
1.108349
0.991733
0.759398
0.925303
0.942180
1.245246
1.008849
0.873720
0.981506
0.873077
0.903017
0.917284
0.683957
1.008881
0.680528
0.965168
0.971502
0.830506
0.745906
0.966481
0.824490
0.937966
0.968246
0.839436
0.916967
0.937466
1.053778
0.880030
1.038145
0.922845
0.921222
1.036661
0.938227
1.137922
0.896809
0.991070
8
19.259384
20.873298
10.891218
10.311046
7.713831
8.593748
3.293756
7.721109
9.013065
21.559586
12.487105
7.442171
6.613116
6.367484
7.782099
4.449667
3.471020
6.860835
7.729980
6.328083
4.768591
6.640138
5.486274
4.923742
5.717308
5.272943
8.755634
3.934738
4.404620
4.288394
4.235263
8.114158
5.498302
8.437864
2.246361
5.285536
4.165034
4.064173
5.589624
6.825313
5.093451
2.079045
2.537884
4.591293
2.167996
4.802563
4.114012
3.709397
3.240790
3.942546
4.047200
4.366170
3.055401
4.888170
3.769129
2.542534
2.516658
6.114882
4.036541
2.926307
2.085734
2.003129
1.812869
3.422786
3.613925
4.278499
4.640119
4.313159
4.784777
0.978137
1.929597
4.073013
7.757378
3.696281
2.832603
...
0.969071
0.894436
1.098357
0.784662
0.865316
0.903930
1.031903
1.024392
0.981408
0.878394
0.823070
1.231886
0.946206
0.962795
0.865557
0.888973
1.038034
1.137670
0.958427
0.884272
0.866525
1.070468
0.900355
0.844232
0.987338
0.831718
0.854035
1.019832
0.826614
0.984053
0.848300
0.885708
0.869844
0.918068
0.911561
0.929538
0.921212
0.891753
0.883489
1.062236
1.122840
0.882382
1.024350
0.738958
1.014955
0.971324
0.760354
0.942021
0.920169
0.686426
0.902196
0.872940
0.897958
0.893313
0.832789
0.940049
0.958171
0.927880
0.871042
0.959768
1.060895
0.879743
0.846332
0.981110
0.947517
0.872612
0.933273
0.831101
0.866549
0.955474
0.967027
1.076746
0.787289
1.138567
0.796959
9
32.634540
10.883548
13.055581
11.074116
10.338895
16.881450
6.250045
11.090178
6.194652
17.234312
9.316032
7.329001
5.078105
7.411794
10.511014
5.955080
4.275210
9.976554
4.211980
3.589240
3.443493
5.320616
2.866002
5.976349
5.427763
3.197896
7.964595
8.466660
4.404711
3.337090
3.958040
1.700826
8.324566
1.930532
5.605261
6.413916
6.992754
8.305001
4.153992
2.642825
6.580708
3.087669
3.761210
4.501239
3.769644
5.457825
2.971710
4.271171
2.352333
3.325819
4.227077
5.774772
4.374444
2.394271
1.697065
4.473220
4.658411
7.724148
3.487050
4.701079
4.457878
1.388721
5.960054
5.048299
3.891404
3.932685
4.488664
5.529596
2.417878
2.806081
4.949107
3.563333
0.800457
2.986846
1.188924
...
0.852224
0.814218
0.901743
0.934365
0.872058
0.878613
0.804536
0.889890
0.888234
0.868320
0.879550
0.922788
0.998778
1.027588
0.993610
0.935129
0.693160
0.966021
0.805784
0.829274
0.830103
1.107117
0.856729
0.954380
1.104449
1.092253
0.853025
0.899481
0.934719
0.974917
0.943390
0.965719
0.866990
0.845655
1.005885
0.839315
0.993450
1.157226
1.088443
1.042639
1.019365
0.952525
1.072376
0.930397
1.105347
0.921322
0.857586
0.965983
0.748458
0.872294
0.931053
0.936660
0.808634
0.944952
0.923698
0.846288
1.069774
0.863637
0.973426
1.162857
0.910061
0.847051
0.962939
0.873847
0.802336
0.843733
0.830402
0.861237
0.979756
0.992285
0.759286
1.084377
0.860706
0.931612
0.749618
10 rows × 50000 columns
In [11]:
doc_topic_dists.shape#[M x K]
doc_topic_dists
Out[11]:
topic
0
1
2
3
4
5
6
7
8
9
doc
0
0.002919
0.002915
0.002914
0.973755
0.002917
0.002919
0.002919
0.002916
0.002913
0.002914
1
0.002006
0.002004
0.002003
0.981962
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
2
0.000993
0.000991
0.000991
0.991076
0.000992
0.000992
0.000992
0.000991
0.000991
0.000991
3
0.001058
0.001057
0.001056
0.990488
0.001057
0.001058
0.001058
0.001057
0.001056
0.001056
4
0.002140
0.002137
0.002136
0.980759
0.002138
0.002140
0.002140
0.002138
0.002136
0.002136
5
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
6
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
7
0.002603
0.002600
0.002599
0.976595
0.002601
0.002603
0.002603
0.002600
0.002598
0.002599
8
0.001965
0.001963
0.001962
0.982330
0.001964
0.001965
0.001965
0.001963
0.001961
0.001962
9
0.004820
0.004813
0.004811
0.956668
0.004816
0.004819
0.004819
0.004814
0.004810
0.004811
10
0.001070
0.001068
0.001068
0.990382
0.001069
0.001070
0.001070
0.001068
0.001068
0.001068
11
0.001528
0.001526
0.001526
0.986259
0.001527
0.001528
0.001528
0.001527
0.001525
0.001526
12
0.000668
0.000668
0.000667
0.993990
0.000668
0.000668
0.000668
0.000668
0.000667
0.000667
13
0.001094
0.001093
0.001092
0.990164
0.001093
0.001094
0.001094
0.001093
0.001092
0.001092
14
0.000698
0.000697
0.000696
0.993729
0.000697
0.000697
0.000697
0.000697
0.000696
0.000696
15
0.001035
0.001034
0.001033
0.990692
0.001034
0.001035
0.001035
0.001034
0.001033
0.001034
16
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
17
0.000417
0.000416
0.000416
0.996254
0.000416
0.000417
0.000417
0.000416
0.000416
0.000416
18
0.001146
0.001145
0.001144
0.989695
0.001145
0.001146
0.001146
0.001145
0.001144
0.001144
19
0.000231
0.000231
0.000230
0.997925
0.000231
0.000231
0.000231
0.000231
0.000230
0.000230
20
0.004015
0.004010
0.004008
0.963901
0.004012
0.004015
0.004014
0.004010
0.004007
0.004008
21
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
22
0.001605
0.001603
0.001602
0.985572
0.001603
0.001605
0.001605
0.001603
0.001602
0.001602
23
0.001301
0.001299
0.001299
0.988302
0.001300
0.001301
0.001301
0.001300
0.001298
0.001299
24
0.000244
0.000243
0.000243
0.997809
0.000243
0.000244
0.000244
0.000243
0.000243
0.000243
25
0.001660
0.001658
0.001657
0.985073
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
26
0.000262
0.000261
0.000261
0.997648
0.000261
0.000262
0.000262
0.000261
0.000261
0.000261
27
0.001751
0.001748
0.001748
0.984258
0.001749
0.001751
0.001751
0.001749
0.001747
0.001748
28
0.002189
0.002186
0.002185
0.980321
0.002187
0.002189
0.002188
0.002186
0.002184
0.002185
29
0.000659
0.000658
0.000658
0.994072
0.000659
0.000659
0.000659
0.000659
0.000658
0.000658
30
0.002006
0.002004
0.002003
0.981961
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
31
0.000678
0.000677
0.000677
0.993905
0.000677
0.000678
0.000678
0.000677
0.000677
0.000677
32
0.002408
0.002405
0.002404
0.978352
0.002406
0.002407
0.002407
0.002405
0.002403
0.002404
33
0.003322
0.003318
0.003316
0.970132
0.003319
0.003322
0.003321
0.003318
0.003315
0.003316
34
0.002470
0.002466
0.002465
0.977796
0.002467
0.002469
0.002469
0.002467
0.002465
0.002466
35
0.001660
0.001658
0.001657
0.985074
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
36
0.001504
0.001502
0.001502
0.986473
0.001503
0.001504
0.001504
0.001503
0.001501
0.001502
37
0.001070
0.001068
0.001068
0.990383
0.001069
0.001070
0.001070
0.001068
0.001068
0.001068
38
0.001751
0.001748
0.001748
0.984259
0.001749
0.001751
0.001751
0.001749
0.001747
0.001748
39
0.002603
0.002600
0.002599
0.976596
0.002601
0.002603
0.002603
0.002600
0.002598
0.002599
40
0.001284
0.001282
0.001281
0.988458
0.001283
0.001284
0.001284
0.001282
0.001281
0.001282
41
0.000111
0.000111
0.000111
0.999004
0.000111
0.000111
0.000111
0.000111
0.000111
0.000111
42
0.000837
0.000836
0.000836
0.992474
0.000836
0.000837
0.000837
0.000836
0.000835
0.000836
43
0.005671
0.005663
0.005661
0.949013
0.005666
0.005670
0.005670
0.005664
0.005659
0.005661
44
0.002833
0.002829
0.002828
0.974528
0.002831
0.002833
0.002833
0.002830
0.002827
0.002828
45
0.000796
0.000795
0.000794
0.992847
0.000795
0.000796
0.000795
0.000795
0.000794
0.000794
46
0.000458
0.000458
0.000458
0.995879
0.000458
0.000458
0.000458
0.000458
0.000457
0.000458
47
0.001035
0.001034
0.001033
0.990693
0.001034
0.001035
0.001035
0.001034
0.001033
0.001033
48
0.002006
0.002004
0.002003
0.981961
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
49
0.001437
0.001435
0.001435
0.987079
0.001436
0.001437
0.001437
0.001435
0.001434
0.001435
50
0.000470
0.000469
0.000469
0.995778
0.000469
0.000469
0.000469
0.000469
0.000469
0.000469
51
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
52
0.002919
0.002915
0.002914
0.973755
0.002917
0.002919
0.002919
0.002916
0.002913
0.002914
53
0.000713
0.000712
0.000712
0.993589
0.000712
0.000713
0.000713
0.000712
0.000712
0.000712
54
0.000456
0.000456
0.000455
0.995899
0.000456
0.000456
0.000456
0.000456
0.000455
0.000455
55
0.004015
0.004010
0.004008
0.963900
0.004012
0.004015
0.004014
0.004010
0.004007
0.004008
56
0.001720
0.001717
0.001717
0.984540
0.001718
0.001719
0.001719
0.001718
0.001716
0.001717
57
0.001888
0.001886
0.001885
0.983024
0.001887
0.001888
0.001888
0.001886
0.001884
0.001885
58
0.000776
0.000775
0.000775
0.993020
0.000776
0.000776
0.000776
0.000775
0.000775
0.000775
59
0.006429
0.006421
0.006418
0.942195
0.006424
0.006429
0.006428
0.006422
0.006416
0.006419
60
0.002349
0.002346
0.002345
0.978880
0.002347
0.002349
0.002349
0.002346
0.002344
0.002345
61
0.001267
0.001265
0.001265
0.988610
0.001266
0.001267
0.001267
0.001265
0.001264
0.001265
62
0.001337
0.001335
0.001335
0.987977
0.001336
0.001337
0.001337
0.001336
0.001335
0.001335
63
0.002833
0.002829
0.002828
0.974528
0.002831
0.002833
0.002833
0.002830
0.002827
0.002828
64
0.002094
0.002091
0.002090
0.981177
0.002092
0.002093
0.002093
0.002091
0.002089
0.002090
65
0.001319
0.001317
0.001317
0.988142
0.001318
0.001319
0.001319
0.001317
0.001316
0.001317
66
0.000752
0.000751
0.000751
0.993238
0.000751
0.000752
0.000752
0.000751
0.000751
0.000751
67
0.000411
0.000411
0.000411
0.996302
0.000411
0.000411
0.000411
0.000411
0.000411
0.000411
68
0.000740
0.000739
0.000739
0.993343
0.000740
0.000740
0.000740
0.000740
0.000739
0.000739
69
0.000683
0.000682
0.000681
0.993862
0.000682
0.000683
0.000683
0.000682
0.000681
0.000682
70
0.000783
0.000782
0.000781
0.992964
0.000782
0.000783
0.000782
0.000782
0.000781
0.000781
71
0.000321
0.000320
0.000320
0.997115
0.000321
0.000321
0.000321
0.000320
0.000320
0.000320
72
0.000688
0.000687
0.000686
0.993818
0.000687
0.000688
0.000687
0.000687
0.000686
0.000686
73
0.002676
0.002672
0.002671
0.975944
0.002673
0.002675
0.002675
0.002673
0.002670
0.002671
74
0.001437
0.001435
0.001435
0.987080
0.001436
0.001437
0.001437
0.001435
0.001434
0.001435
75
0.002140
0.002137
0.002136
0.980758
0.002138
0.002140
0.002140
0.002138
0.002136
0.002137
76
0.000461
0.000460
0.000460
0.995859
0.000460
0.000461
0.000460
0.000460
0.000460
0.000460
77
0.003322
0.003318
0.003316
0.970133
0.003319
0.003322
0.003321
0.003318
0.003315
0.003316
78
0.000560
0.000559
0.000559
0.994968
0.000559
0.000560
0.000560
0.000559
0.000558
0.000559
79
0.000935
0.000933
0.000933
0.991597
0.000934
0.000935
0.000934
0.000934
0.000933
0.000933
80
0.002469
0.002466
0.002465
0.977797
0.002467
0.002469
0.002469
0.002467
0.002465
0.002465
81
0.003441
0.003436
0.003435
0.969064
0.003438
0.003440
0.003440
0.003437
0.003434
0.003435
82
0.001632
0.001630
0.001629
0.985325
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
83
0.000031
0.000031
0.000030
0.999725
0.000031
0.000031
0.000031
0.000031
0.000030
0.000031
84
0.001504
0.001502
0.001502
0.986473
0.001503
0.001504
0.001504
0.001503
0.001501
0.001502
85
0.001632
0.001630
0.001629
0.985326
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
86
0.002470
0.002466
0.002465
0.977795
0.002468
0.002469
0.002469
0.002467
0.002465
0.002466
87
0.000992
0.000991
0.000991
0.991077
0.000992
0.000992
0.000992
0.000991
0.000990
0.000991
88
0.000613
0.000612
0.000612
0.994488
0.000613
0.000613
0.000613
0.000612
0.000612
0.000612
89
0.003706
0.003701
0.003699
0.966681
0.003703
0.003705
0.003705
0.003702
0.003698
0.003700
90
0.001356
0.001354
0.001354
0.987807
0.001355
0.001356
0.001356
0.001355
0.001353
0.001354
91
0.002470
0.002466
0.002465
0.977796
0.002468
0.002469
0.002469
0.002467
0.002465
0.002465
92
0.002752
0.002748
0.002747
0.975256
0.002750
0.002752
0.002752
0.002749
0.002747
0.002747
93
0.000926
0.000924
0.000924
0.991678
0.000925
0.000926
0.000925
0.000925
0.000924
0.000924
94
0.003211
0.003207
0.003205
0.971129
0.003208
0.003211
0.003211
0.003207
0.003205
0.003206
95
0.000151
0.000151
0.000151
0.998644
0.000151
0.000151
0.000151
0.000151
0.000151
0.000151
96
0.000673
0.000672
0.000672
0.993948
0.000673
0.000673
0.000673
0.000672
0.000672
0.000672
97
0.002349
0.002346
0.002345
0.978880
0.002347
0.002349
0.002349
0.002346
0.002344
0.002345
98
0.000633
0.000632
0.000632
0.994306
0.000633
0.000633
0.000633
0.000633
0.000632
0.000632
99
0.001926
0.001923
0.001923
0.982684
0.001924
0.001926
0.001926
0.001924
0.001922
0.001923
100
0.001189
0.001187
0.001187
0.989313
0.001188
0.001188
0.001188
0.001187
0.001186
0.001187
101
0.000391
0.000391
0.000391
0.996482
0.000391
0.000391
0.000391
0.000391
0.000390
0.000391
102
0.000796
0.000795
0.000794
0.992847
0.000795
0.000795
0.000795
0.000795
0.000794
0.000794
103
0.002006
0.002004
0.002003
0.981962
0.002004
0.002006
0.002006
0.002004
0.002002
0.002003
104
0.000992
0.000991
0.000991
0.991077
0.000992
0.000992
0.000992
0.000991
0.000990
0.000991
105
0.001632
0.001630
0.001629
0.985327
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
106
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
107
0.000770
0.000769
0.000769
0.993076
0.000769
0.000770
0.000770
0.000769
0.000769
0.000769
108
0.001356
0.001354
0.001354
0.987808
0.001355
0.001356
0.001356
0.001354
0.001353
0.001354
109
0.000393
0.000392
0.000392
0.996468
0.000393
0.000393
0.000393
0.000392
0.000392
0.000392
110
0.001035
0.001034
0.001033
0.990693
0.001034
0.001035
0.001035
0.001034
0.001033
0.001033
111
0.001319
0.001317
0.001317
0.988142
0.001318
0.001319
0.001319
0.001317
0.001316
0.001317
112
0.001337
0.001335
0.001335
0.987977
0.001336
0.001337
0.001337
0.001336
0.001335
0.001335
113
0.001189
0.001187
0.001187
0.989313
0.001188
0.001189
0.001188
0.001187
0.001186
0.001187
114
0.002140
0.002137
0.002136
0.980758
0.002138
0.002140
0.002140
0.002138
0.002136
0.002137
115
0.001375
0.001374
0.001373
0.987633
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
116
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
117
0.001250
0.001249
0.001248
0.988758
0.001249
0.001250
0.001250
0.001249
0.001248
0.001248
118
0.001660
0.001658
0.001657
0.985073
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
119
0.003322
0.003317
0.003316
0.970133
0.003319
0.003322
0.003321
0.003318
0.003315
0.003316
120
0.000724
0.000723
0.000723
0.993493
0.000723
0.000724
0.000724
0.000723
0.000722
0.000723
121
0.001267
0.001265
0.001265
0.988610
0.001266
0.001267
0.001267
0.001265
0.001264
0.001265
122
0.001817
0.001814
0.001814
0.983665
0.001815
0.001817
0.001817
0.001815
0.001813
0.001814
123
0.001689
0.001687
0.001686
0.984812
0.001688
0.001689
0.001689
0.001687
0.001686
0.001686
124
0.004381
0.004375
0.004373
0.960615
0.004377
0.004380
0.004380
0.004375
0.004372
0.004373
...
...
...
...
...
...
...
...
...
...
...
11189
0.002240
0.002237
0.002236
0.979862
0.002238
0.002240
0.002239
0.002237
0.002235
0.002236
11190
0.000776
0.000775
0.000775
0.993020
0.000776
0.000776
0.000776
0.000775
0.000775
0.000775
11191
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
11192
0.000972
0.000971
0.000971
0.991257
0.000972
0.000972
0.000972
0.000971
0.000970
0.000971
11193
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002291
0.002288
0.002289
11194
0.001632
0.001630
0.001629
0.985327
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
11195
0.000776
0.000775
0.000775
0.993020
0.000776
0.000776
0.000776
0.000775
0.000775
0.000775
11196
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
11197
0.000499
0.000498
0.000498
0.995516
0.000498
0.000499
0.000499
0.000498
0.000498
0.000498
11198
0.001459
0.001457
0.001456
0.986883
0.001458
0.001459
0.001459
0.001457
0.001456
0.001456
11199
0.001926
0.001923
0.001923
0.982683
0.001924
0.001926
0.001926
0.001924
0.001922
0.001923
11200
0.000860
0.000858
0.000858
0.992272
0.000859
0.000859
0.000859
0.000859
0.000858
0.000858
11201
0.001301
0.001299
0.001299
0.988302
0.001300
0.001301
0.001301
0.001300
0.001298
0.001299
11202
0.002676
0.002672
0.002671
0.975943
0.002673
0.002675
0.002675
0.002673
0.002670
0.002671
11203
0.001437
0.001435
0.001435
0.987079
0.001436
0.001437
0.001437
0.001435
0.001434
0.001435
11204
0.001888
0.001886
0.001885
0.983024
0.001887
0.001888
0.001888
0.001886
0.001884
0.001885
11205
0.001395
0.001394
0.001393
0.987454
0.001394
0.001395
0.001395
0.001394
0.001393
0.001393
11206
0.000963
0.000961
0.000961
0.991344
0.000962
0.000963
0.000963
0.000962
0.000961
0.000961
11207
0.000668
0.000668
0.000667
0.993990
0.000668
0.000668
0.000668
0.000668
0.000667
0.000667
11208
0.001751
0.001749
0.001748
0.984259
0.001749
0.001751
0.001750
0.001749
0.001747
0.001748
11209
0.000306
0.000305
0.000305
0.997253
0.000305
0.000306
0.000305
0.000305
0.000305
0.000305
11210
0.001817
0.001814
0.001814
0.983664
0.001815
0.001817
0.001817
0.001815
0.001813
0.001814
11211
0.003211
0.003207
0.003206
0.971128
0.003209
0.003211
0.003211
0.003207
0.003205
0.003206
11212
0.003322
0.003318
0.003316
0.970132
0.003319
0.003322
0.003322
0.003318
0.003315
0.003317
11213
0.003854
0.003849
0.003847
0.965347
0.003851
0.003854
0.003854
0.003850
0.003846
0.003848
11214
0.001528
0.001526
0.001526
0.986259
0.001527
0.001528
0.001528
0.001527
0.001525
0.001526
11215
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
11216
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
11217
0.003568
0.003564
0.003562
0.967917
0.003565
0.003568
0.003568
0.003564
0.003561
0.003562
11218
0.000398
0.000397
0.000397
0.996424
0.000397
0.000398
0.000398
0.000397
0.000397
0.000397
11219
0.001375
0.001374
0.001373
0.987633
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
11220
0.001751
0.001748
0.001748
0.984259
0.001749
0.001751
0.001750
0.001749
0.001747
0.001748
11221
0.004589
0.004583
0.004581
0.958738
0.004585
0.004589
0.004588
0.004584
0.004580
0.004582
11222
0.000304
0.000303
0.000303
0.997270
0.000303
0.000304
0.000304
0.000303
0.000303
0.000303
11223
0.003441
0.003436
0.003435
0.969063
0.003438
0.003441
0.003440
0.003437
0.003434
0.003435
11224
0.004819
0.004813
0.004811
0.956672
0.004815
0.004819
0.004818
0.004814
0.004809
0.004811
11225
0.002919
0.002915
0.002914
0.973754
0.002917
0.002919
0.002919
0.002916
0.002913
0.002914
11226
0.001174
0.001172
0.001172
0.989444
0.001173
0.001174
0.001174
0.001173
0.001172
0.001172
11227
0.000602
0.000601
0.000601
0.994591
0.000601
0.000602
0.000602
0.000601
0.000600
0.000601
11228
0.002140
0.002137
0.002136
0.980759
0.002138
0.002140
0.002140
0.002138
0.002136
0.002136
11229
0.000852
0.000851
0.000850
0.992340
0.000851
0.000852
0.000852
0.000851
0.000850
0.000850
11230
0.000382
0.000381
0.000381
0.996566
0.000382
0.000382
0.000382
0.000381
0.000381
0.000381
11231
0.004380
0.004375
0.004373
0.960616
0.004377
0.004380
0.004380
0.004375
0.004372
0.004373
11232
0.000394
0.000394
0.000394
0.996453
0.000394
0.000394
0.000394
0.000394
0.000394
0.000394
11233
0.001660
0.001658
0.001657
0.985074
0.001659
0.001660
0.001660
0.001658
0.001657
0.001657
11234
0.007420
0.007410
0.007407
0.933285
0.007414
0.007420
0.007419
0.007412
0.007405
0.007408
11235
0.000286
0.000286
0.000286
0.997425
0.000286
0.000286
0.000286
0.000286
0.000286
0.000286
11236
0.001505
0.001502
0.001502
0.986473
0.001503
0.001504
0.001504
0.001503
0.001501
0.001502
11237
0.003010
0.003006
0.003005
0.972935
0.003008
0.003010
0.003010
0.003007
0.003004
0.003005
11238
0.002535
0.002531
0.002530
0.977212
0.002532
0.002534
0.002534
0.002532
0.002529
0.002530
11239
0.002006
0.002004
0.002003
0.981962
0.002005
0.002006
0.002006
0.002004
0.002002
0.002003
11240
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
11241
0.002833
0.002830
0.002828
0.974526
0.002831
0.002833
0.002833
0.002830
0.002828
0.002829
11242
0.000830
0.000829
0.000828
0.992539
0.000829
0.000830
0.000830
0.000829
0.000828
0.000828
11243
0.001035
0.001034
0.001033
0.990693
0.001034
0.001035
0.001035
0.001034
0.001033
0.001033
11244
0.001082
0.001080
0.001080
0.990274
0.001081
0.001082
0.001082
0.001080
0.001080
0.001080
11245
0.001416
0.001414
0.001413
0.987269
0.001415
0.001416
0.001416
0.001414
0.001413
0.001414
11246
0.001234
0.001233
0.001232
0.988902
0.001233
0.001234
0.001234
0.001233
0.001232
0.001232
11247
0.000963
0.000961
0.000961
0.991344
0.000962
0.000963
0.000963
0.000962
0.000961
0.000961
11248
0.001094
0.001093
0.001092
0.990164
0.001093
0.001094
0.001094
0.001093
0.001092
0.001092
11249
0.000953
0.000952
0.000951
0.991431
0.000952
0.000953
0.000953
0.000952
0.000951
0.000952
11250
0.003441
0.003436
0.003435
0.969065
0.003438
0.003440
0.003440
0.003437
0.003434
0.003435
11251
0.001751
0.001748
0.001748
0.984259
0.001749
0.001751
0.001750
0.001749
0.001747
0.001748
11252
0.001375
0.001374
0.001373
0.987634
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
11253
0.000982
0.000981
0.000981
0.991168
0.000982
0.000982
0.000982
0.000981
0.000980
0.000981
11254
0.001459
0.001457
0.001456
0.986883
0.001458
0.001459
0.001459
0.001457
0.001456
0.001456
11255
0.001888
0.001886
0.001885
0.983024
0.001887
0.001888
0.001888
0.001886
0.001884
0.001885
11256
0.001437
0.001435
0.001435
0.987079
0.001436
0.001437
0.001437
0.001436
0.001434
0.001435
11257
0.002049
0.002046
0.002045
0.981577
0.002047
0.002049
0.002049
0.002047
0.002045
0.002046
11258
0.002189
0.002186
0.002185
0.980321
0.002187
0.002189
0.002188
0.002186
0.002184
0.002185
11259
0.000944
0.000943
0.000942
0.991514
0.000943
0.000944
0.000944
0.000943
0.000942
0.000942
11260
0.002049
0.002046
0.002045
0.981577
0.002047
0.002049
0.002049
0.002047
0.002045
0.002046
11261
0.000668
0.000668
0.000667
0.993990
0.000668
0.000668
0.000668
0.000668
0.000667
0.000667
11262
0.002094
0.002091
0.002090
0.981177
0.002092
0.002093
0.002093
0.002091
0.002089
0.002090
11263
0.000935
0.000933
0.000933
0.991596
0.000934
0.000935
0.000935
0.000934
0.000933
0.000933
11264
0.000317
0.000316
0.000316
0.997153
0.000316
0.000317
0.000317
0.000316
0.000316
0.000316
11265
0.001107
0.001105
0.001105
0.990051
0.001106
0.001106
0.001106
0.001105
0.001104
0.001105
11266
0.000770
0.000769
0.000769
0.993076
0.000769
0.000770
0.000770
0.000769
0.000769
0.000769
11267
0.000137
0.000137
0.000137
0.998769
0.000137
0.000137
0.000137
0.000137
0.000137
0.000137
11268
0.001926
0.001924
0.001923
0.982682
0.001924
0.001926
0.001926
0.001924
0.001922
0.001923
11269
0.001459
0.001457
0.001456
0.986884
0.001458
0.001459
0.001459
0.001457
0.001456
0.001456
11270
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
11271
0.002049
0.002046
0.002045
0.981578
0.002047
0.002049
0.002049
0.002047
0.002045
0.002046
11272
0.000816
0.000815
0.000814
0.992665
0.000815
0.000816
0.000816
0.000815
0.000814
0.000814
11273
0.000109
0.000109
0.000109
0.999020
0.000109
0.000109
0.000109
0.000109
0.000109
0.000109
11274
0.000566
0.000565
0.000565
0.994909
0.000566
0.000566
0.000566
0.000566
0.000565
0.000565
11275
0.000134
0.000134
0.000134
0.998793
0.000134
0.000134
0.000134
0.000134
0.000134
0.000134
11276
0.001528
0.001526
0.001526
0.986259
0.001527
0.001528
0.001528
0.001527
0.001525
0.001526
11277
0.001356
0.001354
0.001354
0.987808
0.001355
0.001356
0.001356
0.001355
0.001353
0.001354
11278
0.003107
0.003103
0.003102
0.972061
0.003105
0.003107
0.003107
0.003104
0.003101
0.003102
11279
0.001070
0.001068
0.001068
0.990382
0.001069
0.001070
0.001070
0.001068
0.001068
0.001068
11280
0.000972
0.000971
0.000971
0.991257
0.000972
0.000972
0.000972
0.000971
0.000970
0.000971
11281
0.003706
0.003701
0.003699
0.966681
0.003703
0.003705
0.003705
0.003702
0.003698
0.003700
11282
0.000746
0.000745
0.000745
0.993291
0.000746
0.000746
0.000746
0.000745
0.000745
0.000745
11283
0.001605
0.001603
0.001602
0.985572
0.001603
0.001605
0.001605
0.001603
0.001602
0.001602
11284
0.000358
0.000357
0.000357
0.996783
0.000358
0.000358
0.000358
0.000357
0.000357
0.000357
11285
0.002140
0.002137
0.002136
0.980758
0.002138
0.002140
0.002140
0.002138
0.002136
0.002137
11286
0.003108
0.003103
0.003102
0.972060
0.003105
0.003107
0.003107
0.003104
0.003101
0.003102
11287
0.001578
0.001576
0.001576
0.985808
0.001577
0.001578
0.001578
0.001577
0.001575
0.001576
11288
0.001817
0.001815
0.001814
0.983664
0.001815
0.001817
0.001817
0.001815
0.001813
0.001814
11289
0.001160
0.001158
0.001158
0.989571
0.001159
0.001160
0.001160
0.001159
0.001158
0.001158
11290
0.001375
0.001374
0.001373
0.987634
0.001374
0.001375
0.001375
0.001374
0.001373
0.001373
11291
0.002240
0.002237
0.002236
0.979863
0.002238
0.002239
0.002239
0.002237
0.002235
0.002236
11292
0.002349
0.002346
0.002345
0.978880
0.002347
0.002349
0.002349
0.002346
0.002344
0.002345
11293
0.002470
0.002466
0.002465
0.977796
0.002468
0.002469
0.002469
0.002467
0.002465
0.002465
11294
0.005073
0.005067
0.005065
0.954385
0.005069
0.005073
0.005073
0.005068
0.005063
0.005065
11295
0.001783
0.001781
0.001780
0.983967
0.001782
0.001783
0.001783
0.001781
0.001780
0.001780
11296
0.001553
0.001551
0.001550
0.986037
0.001552
0.001553
0.001553
0.001551
0.001550
0.001550
11297
0.000783
0.000782
0.000781
0.992963
0.000782
0.000783
0.000783
0.000782
0.000781
0.000781
11298
0.000463
0.000462
0.000462
0.995839
0.000462
0.000463
0.000463
0.000462
0.000462
0.000462
11299
0.004821
0.004813
0.004811
0.956666
0.004815
0.004819
0.004819
0.004814
0.004810
0.004811
11300
0.001395
0.001393
0.001393
0.987454
0.001394
0.001395
0.001395
0.001394
0.001393
0.001393
11301
0.001094
0.001093
0.001092
0.990164
0.001093
0.001094
0.001094
0.001093
0.001092
0.001092
11302
0.002470
0.002466
0.002465
0.977796
0.002467
0.002469
0.002469
0.002467
0.002465
0.002465
11303
0.002408
0.002405
0.002404
0.978351
0.002406
0.002408
0.002407
0.002405
0.002403
0.002404
11304
0.000837
0.000836
0.000836
0.992474
0.000836
0.000837
0.000837
0.000836
0.000835
0.000836
11305
0.000315
0.000314
0.000314
0.997172
0.000314
0.000315
0.000314
0.000314
0.000314
0.000314
11306
0.001301
0.001299
0.001299
0.988302
0.001300
0.001301
0.001301
0.001300
0.001298
0.001299
11307
0.002293
0.002290
0.002289
0.979383
0.002291
0.002293
0.002293
0.002290
0.002288
0.002289
11308
0.001632
0.001630
0.001629
0.985327
0.001631
0.001632
0.001632
0.001630
0.001629
0.001629
11309
0.001852
0.001849
0.001849
0.983350
0.001850
0.001852
0.001852
0.001850
0.001848
0.001849
11310
0.001482
0.001479
0.001479
0.986680
0.001480
0.001481
0.001481
0.001480
0.001478
0.001479
11311
0.000124
0.000123
0.000123
0.998889
0.000123
0.000124
0.000124
0.000123
0.000123
0.000123
11312
0.001965
0.001963
0.001962
0.982330
0.001964
0.001965
0.001965
0.001963
0.001961
0.001962
11313
0.000097
0.000097
0.000097
0.999129
0.000097
0.000097
0.000097
0.000097
0.000097
0.000097
11314 rows × 10 columns
In [12]:
term_frequency.shape#[V]
term_frequency
Out[12]:
0 8654
1 6466
2 5345
3 4859
4 4597
5 4295
6 4075
7 3491
8 3370
9 3141
10 3091
11 2792
12 2763
13 2538
14 2518
15 2444
16 2327
17 2296
18 2252
19 2149
20 1975
21 1972
22 1930
23 1918
24 1905
25 1902
26 1866
27 1823
28 1820
29 1817
30 1804
31 1804
32 1799
33 1780
34 1730
35 1702
36 1693
37 1680
38 1627
39 1618
40 1606
41 1581
42 1565
43 1501
44 1482
45 1469
46 1466
47 1448
48 1433
49 1422
50 1401
51 1400
52 1388
53 1387
54 1361
55 1360
56 1359
57 1340
58 1332
59 1329
60 1325
61 1324
62 1315
63 1309
64 1298
65 1288
66 1254
67 1246
68 1246
69 1240
70 1227
71 1222
72 1216
73 1210
74 1204
75 1177
76 1170
77 1164
78 1157
79 1154
80 1148
81 1145
82 1083
83 1075
84 1065
85 1060
86 1054
87 1022
88 1017
89 1010
90 1005
91 1000
92 1000
93 995
94 991
95 988
96 981
97 971
98 964
99 964
100 964
101 959
102 956
103 952
104 950
105 946
106 943
107 931
108 928
109 914
110 897
111 895
112 893
113 887
114 885
115 875
116 870
117 868
118 867
119 867
120 865
121 861
122 855
123 855
124 853
...
49875 1
49876 1
49877 1
49878 1
49879 1
49880 1
49881 1
49882 1
49883 1
49884 1
49885 1
49886 1
49887 1
49888 1
49889 1
49890 1
49891 1
49892 1
49893 1
49894 1
49895 1
49896 1
49897 1
49898 1
49899 1
49900 1
49901 1
49902 1
49903 1
49904 1
49905 1
49906 1
49907 1
49908 1
49909 1
49910 1
49911 1
49912 1
49913 1
49914 1
49915 1
49916 1
49917 1
49918 1
49919 1
49920 1
49921 1
49922 1
49923 1
49924 1
49925 1
49926 1
49927 1
49928 1
49929 1
49930 1
49931 1
49932 1
49933 1
49934 1
49935 1
49936 1
49937 1
49938 1
49939 1
49940 1
49941 1
49942 1
49943 1
49944 1
49945 1
49946 1
49947 1
49948 1
49949 1
49950 1
49951 1
49952 1
49953 1
49954 1
49955 1
49956 1
49957 1
49958 1
49959 1
49960 1
49961 1
49962 1
49963 1
49964 1
49965 1
49966 1
49967 1
49968 1
49969 1
49970 1
49971 1
49972 1
49973 1
49974 1
49975 1
49976 1
49977 1
49978 1
49979 1
49980 1
49981 1
49982 1
49983 1
49984 1
49985 1
49986 1
49987 1
49988 1
49989 1
49990 1
49991 1
49992 1
49993 1
49994 1
49995 1
49996 1
49997 1
49998 1
49999 1
Name: term_frequency, dtype: int64
In [13]:
doc_lengths.shape#[M]
doc_lengths
Out[13]:
0 32
1 47
2 96
3 90
4 44
5 53
6 41
7 36
8 48
9 19
10 89
11 62
12 143
13 87
14 137
15 92
16 41
17 230
18 83
19 416
20 23
21 77
22 59
23 73
24 394
25 57
26 367
27 54
28 43
29 145
30 47
31 141
32 39
33 28
34 38
35 57
36 63
37 89
38 54
39 36
40 74
41 868
42 114
43 16
44 33
45 120
46 209
47 92
48 47
49 66
50 204
51 51
52 32
53 134
54 210
55 23
56 55
57 50
58 123
59 14
60 40
61 75
62 71
63 33
64 45
65 72
66 127
67 233
68 129
69 140
70 122
71 299
72 139
73 35
74 66
75 44
76 208
77 28
78 171
79 102
80 38
81 27
82 58
83 3149
84 63
85 58
86 38
87 96
88 156
89 25
90 70
91 38
92 34
93 103
94 29
95 637
96 142
97 40
98 151
99 49
100 80
101 245
102 120
103 47
104 96
105 58
106 77
107 124
108 70
109 244
110 92
111 72
112 71
113 80
114 44
115 69
116 53
117 76
118 57
119 28
120 132
121 75
122 52
123 56
124 21
...
11189 42
11190 123
11191 77
11192 98
11193 41
11194 58
11195 123
11196 51
11197 192
11198 65
11199 49
11200 111
11201 73
11202 35
11203 66
11204 50
11205 68
11206 99
11207 143
11208 54
11209 314
11210 52
11211 29
11212 28
11213 24
11214 62
11215 41
11216 51
11217 26
11218 241
11219 69
11220 54
11221 20
11222 316
11223 27
11224 19
11225 32
11226 81
11227 159
11228 44
11229 112
11230 251
11231 21
11232 243
11233 57
11234 12
11235 335
11236 63
11237 31
11238 37
11239 47
11240 53
11241 33
11242 115
11243 92
11244 88
11245 67
11246 77
11247 99
11248 87
11249 100
11250 27
11251 54
11252 69
11253 97
11254 65
11255 50
11256 66
11257 46
11258 43
11259 101
11260 46
11261 143
11262 45
11263 102
11264 303
11265 86
11266 124
11267 702
11268 49
11269 65
11270 41
11271 46
11272 117
11273 882
11274 169
11275 716
11276 62
11277 70
11278 30
11279 89
11280 98
11281 25
11282 128
11283 59
11284 268
11285 44
11286 30
11287 60
11288 52
11289 82
11290 69
11291 42
11292 40
11293 38
11294 18
11295 53
11296 61
11297 122
11298 207
11299 19
11300 68
11301 87
11302 38
11303 39
11304 114
11305 305
11306 73
11307 41
11308 58
11309 51
11310 64
11311 778
11312 48
11313 992
Name: doc_length, dtype: int64
In [22]:
print(vocab.shape)#[V]
print(doc_topic_dists.shape)
print(doc_lengths.shape)
(50000,)
(11314, 10)
(11314,)
In [15]:
topic_freq = (doc_topic_dists.T * doc_lengths).T.sum(axis=0) # elementwise multiplication and sum all the rows
print(topic_freq.shape) #[K x M] * [M] = [K x M] = [M x K] = [K,]
# doc_topic_dists.T
topic_freq
# (doc_topic_dists.T * doc_lengths).T
(10,)
Out[15]:
topic
0 2350.276861
1 1332.903437
2 2522.722597
3 989058.050533
4 1593.339492
5 1850.267334
6 2001.928812
7 2223.548656
8 1361.881698
9 2757.080580
dtype: float64
In [16]:
topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)
print(topic_proportion.shape)
(10,)
In [17]:
topic_order = topic_proportion.index
# reorder all data based on new ordering of topics
topic_freq = topic_freq[topic_order]
topic_term_dists = topic_term_dists.ix[topic_order]
doc_topic_dists = doc_topic_dists[topic_order]
In [18]:
topic_order
Out[18]:
Int64Index([3, 9, 2, 0, 7, 6, 5, 4, 8, 1], dtype='int64', name='topic')
In [20]:
topic_term_dists.T.shape
Out[20]:
(50000, 10)
In [24]:
# token counts for each term-topic combination (widths of red bars)
term_topic_freq = (topic_term_dists.T * topic_freq).T
#([K x V].T * [K x 1]).T
#([V x K] * [K x 1]).T
# [V x K].T
# [K x V]
term_topic_freq.shape
Out[24]:
(10, 50000)
In [ ]:
term_frequency = np.sum(term_topic_freq, axis=0)
In [ ]:
def _find_relevance(log_ttd, log_lift, R, lambda_):
relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift #TODO log added here
return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)
def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
return pd.concat([_find_relevance(log_ttd, log_lift, R, l) for l in lambda_seq])
Let $\phi_{wk}$ (topic_term_dists) denote the probability of term $w \in {1, ..., V }$ for topic $k \in {1, ..., K}$
Let $p_w$ (term_proportion) denote the marginal probability of term $w$ in the corpus.
Relevance of term $w$ to topic $k$ given a weight parameter $\lambda$ (where $0 \leq \lambda \leq 1$) as:
$$ r(w,k\ |\ \lambda) = \lambda \log(\phi_{wk}) + \log(1-\lambda)\log(\frac{\phi_{wk}}{p_w}) $$
In [ ]:
print(topic_term_dists.shape)
# topic_term_dists
In [ ]:
print(topic_proportion.shape)
# topic_proportion
In [ ]:
print(term_proportion.shape)
# term_proportion
In [ ]:
R=30
lambda_step = 0.01
n_jobs = -1
# marginal distribution over terms (width of blue bars)
term_proportion = term_frequency / term_frequency.sum()
# compute the distinctiveness and saliency of the terms:
# this determines the R terms that are displayed when no topic is selected
topic_given_term = topic_term_dists / topic_term_dists.sum()
kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
distinctiveness = kernel.sum()
saliency = term_proportion * distinctiveness
# Order the terms for the "default" view by decreasing saliency:
default_term_info = pd.DataFrame({'saliency': saliency, 'Term': vocab, \
'Freq': term_frequency, 'Total': term_frequency, \
'Category': 'Default'}). \
sort_values(by='saliency', ascending=False)\
.head(R).drop('saliency', 1)
# Rounding Freq and Total to integer values to match LDAvis code:
default_term_info['Freq'] = np.floor(default_term_info['Freq'])
default_term_info['Total'] = np.floor(default_term_info['Total'])
ranks = np.arange(R, 0, -1)
default_term_info['logprob'] = default_term_info['loglift'] = ranks
## compute relevance and top terms for each topic
log_lift = np.log(topic_term_dists / term_proportion)
log_ttd = np.log(topic_term_dists)
lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)
def topic_top_term_df(tup):
new_topic_id, (original_topic_id, topic_terms) = tup
term_ix = topic_terms.unique()
# print('===========')
# print('new_topic_id: ', new_topic_id)
# print('--------')
# print('original_topic_id:' , original_topic_id)
# print('--------')
# print('term_ix: ', term_ix)
# print('-========')
return pd.DataFrame({'Term': vocab[term_ix], \
'Freq': term_topic_freq.loc[original_topic_id, term_ix], \
'Total': term_frequency[term_ix], \
'logprob': log_ttd.loc[original_topic_id, term_ix].round(4), \
'loglift': log_lift.loc[original_topic_id, term_ix].round(4), \
'Category': 'Topic%d' % new_topic_id})
top_terms = _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq)
topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))
topic_info = pd.concat([default_term_info] + list(topic_dfs))
topic_info
In [ ]:
# def _token_table(topic_info, term_topic_freq, vocab, term_frequency):
# last, to compute the areas of the circles when a term is highlighted
# we must gather all unique terms that could show up (for every combination
# of topic and value of lambda) and compute its distribution over topics.
# term-topic frequency table of unique terms across all topics and all values of lambda
term_ix = topic_info.index.unique()
term_ix = np.sort(term_ix)
top_topic_terms_freq = term_topic_freq[term_ix]
# use the new ordering for the topics
K = len(term_topic_freq)
top_topic_terms_freq.index = range(1, K + 1)
top_topic_terms_freq.index.name = 'Topic'
# we filter to Freq >= 0.5 to avoid sending too much data to the browser
token_table = pd.DataFrame({'Freq': top_topic_terms_freq.unstack()}). \
reset_index().set_index('term'). \
query('Freq >= 0.5')
token_table['Freq'] = token_table['Freq'].round()
token_table['Term'] = vocab[token_table.index.values].values
# Normalize token frequencies:
token_table['Freq'] = token_table.Freq / term_frequency[token_table.index]
token_table = token_table.sort_values(by=['Term', 'Topic'])
print()
token_table
In [26]:
topic_term_dists
Out[26]:
term
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
...
49925
49926
49927
49928
49929
49930
49931
49932
49933
49934
49935
49936
49937
49938
49939
49940
49941
49942
49943
49944
49945
49946
49947
49948
49949
49950
49951
49952
49953
49954
49955
49956
49957
49958
49959
49960
49961
49962
49963
49964
49965
49966
49967
49968
49969
49970
49971
49972
49973
49974
49975
49976
49977
49978
49979
49980
49981
49982
49983
49984
49985
49986
49987
49988
49989
49990
49991
49992
49993
49994
49995
49996
49997
49998
49999
topic
0
34.560076
23.052394
35.011782
15.229357
18.862407
21.696336
12.041139
19.608960
10.776458
23.010937
15.508145
14.207191
10.866341
10.992659
10.535417
9.556278
14.158643
18.374985
9.974904
6.319570
7.204872
7.815420
6.280495
13.116165
7.838096
6.460555
10.659491
8.282834
4.431570
15.375777
6.455284
7.225659
9.755187
14.810625
7.427099
3.704050
5.676146
10.384648
8.780114
6.520003
8.981168
7.829013
9.590536
8.193141
5.235972
5.624796
9.294603
7.756465
10.863049
5.229376
12.857805
5.914746
9.216167
7.581182
4.980683
8.968172
5.774038
6.506058
5.813927
7.624951
5.747068
1.924129
2.085067
7.341583
5.151361
8.472655
7.226199
3.960461
8.701531
10.669252
4.907496
6.523535
3.105377
6.238830
5.844484
...
0.625724
1.029929
1.017088
0.931565
0.920934
0.762889
0.751911
0.873610
0.939472
0.755020
0.825627
0.917172
0.769667
1.021140
0.918545
1.126019
0.896102
1.000272
1.088472
1.022719
0.850058
0.900774
1.258427
1.018967
0.763810
0.897174
0.834409
1.031664
0.821515
0.926126
0.920350
0.728411
0.776758
0.785560
0.769508
1.014240
1.057380
0.964175
0.911873
0.861532
0.801979
0.714600
0.708247
0.891596
0.782352
0.896941
0.806620
0.886933
0.771092
0.976889
1.369195
0.930031
1.149220
0.782902
0.798179
0.706706
0.947690
0.958424
0.892332
0.902915
0.772424
1.085815
0.825636
0.816371
1.380099
0.833757
0.678400
0.792306
0.916583
0.878585
0.929071
0.930318
0.928379
0.851077
0.731892
1
50.337722
40.581697
41.140381
15.087618
33.025241
22.846752
19.277132
18.825574
23.046049
27.702032
22.896842
20.329934
14.287076
18.290987
9.025927
14.427910
15.712707
16.706891
12.434514
10.515107
25.544847
18.070308
11.166432
9.016734
14.800357
7.597889
8.258921
9.401391
10.298447
7.877626
9.106611
21.045833
12.037226
12.942835
16.126885
6.919506
12.485776
12.509401
11.687824
14.292532
10.509668
8.118996
9.468621
12.616041
12.414472
9.431784
12.656220
9.535022
5.577135
7.839322
13.209660
7.978555
10.510647
6.044113
9.088396
7.046644
18.436527
11.436126
6.253316
8.271624
6.530339
13.176651
6.771588
5.434604
19.229041
3.917880
10.120773
16.016697
7.076446
7.626771
6.331805
20.473737
24.325816
6.781279
10.221805
...
0.839282
0.919271
0.819840
0.772406
0.881008
1.022791
0.979500
0.874041
0.708877
0.969268
0.868584
0.889366
0.764791
0.938343
0.829809
0.956575
0.861433
0.803894
1.454012
0.813326
0.816473
0.737059
0.814294
0.858895
0.973541
1.025707
0.809228
0.928308
1.116179
0.834379
0.873502
0.746603
0.874483
0.997578
0.944364
0.975360
1.014332
0.845255
0.990822
0.815321
0.824637
0.817899
0.940212
0.843902
0.870331
0.966135
0.919941
0.998555
0.889019
0.835961
0.911019
0.890673
1.063340
0.814293
0.926136
0.912161
0.825771
0.989603
0.865833
0.817442
0.899520
0.904136
0.890219
0.957529
0.945074
0.909135
1.013328
0.907039
0.967374
0.707008
0.811445
1.031009
0.952512
1.026861
0.864728
2
34.399639
23.519070
13.655633
24.389804
11.009536
12.211154
15.824990
16.277737
11.225549
30.053909
12.697881
11.624102
5.584157
7.127661
9.204468
5.399320
9.017233
11.528700
7.302804
7.692432
7.276529
8.592872
7.313048
7.735513
5.820175
6.861334
6.030154
9.911728
6.975535
12.711965
4.845529
6.531035
6.221902
6.691088
6.915521
7.526189
5.687589
4.648638
3.196965
5.655204
6.899286
5.018733
6.686863
4.691510
6.616092
5.552907
10.099042
3.983911
9.342515
6.577858
3.712910
4.697486
5.061583
5.112267
6.210239
15.941712
2.585294
8.352803
6.295825
6.035192
5.496462
6.631243
3.856289
6.041307
23.030694
6.096826
5.363703
10.739196
5.496608
5.087616
4.222902
12.748787
2.082916
3.353945
9.055495
...
0.796024
0.909079
0.926489
0.855503
0.944568
0.896995
0.862610
1.072966
0.738476
0.913067
0.877246
0.797810
0.777069
0.804129
0.973098
0.790396
0.889272
1.044082
0.848854
0.822262
0.962644
0.935022
1.176447
1.123527
0.874715
0.697053
1.098977
0.926931
0.883549
0.782220
0.929458
0.797841
1.009072
0.753575
0.820716
0.864211
0.889806
0.876695
0.797402
0.959936
0.916876
0.907625
0.781050
0.783876
0.917208
0.898019
0.869306
0.924573
1.075971
0.854858
0.821360
1.081370
1.030548
0.971347
0.842678
0.962225
0.822437
0.825612
0.964396
0.843651
0.909091
0.982232
0.974882
0.983783
0.819541
0.865117
1.000803
0.851075
0.898850
0.989933
0.909373
0.795862
0.717025
0.721607
0.954055
3
1029.528805
751.784326
655.640311
540.066993
566.134127
526.255165
460.171499
423.016148
369.444783
369.459691
347.886831
343.305314
342.713871
304.650466
299.409544
299.081958
280.996848
296.683790
285.677603
263.715589
238.912917
187.595707
223.893573
215.325411
221.997247
229.465106
224.574896
238.883827
202.423430
220.845476
228.819394
244.126373
182.529016
216.217914
198.995771
182.554719
215.859120
184.885569
195.653349
184.349345
218.817100
188.416197
175.793018
193.517135
184.795571
189.197386
189.286282
177.265277
166.930898
166.316745
173.668666
171.954935
183.790617
143.281182
158.223858
158.960054
167.227459
157.083343
154.335008
155.998689
150.034526
197.276212
155.415637
155.133175
164.253435
191.317473
145.015560
184.473258
147.251819
148.245306
137.579956
173.830229
191.475584
129.039037
146.246642
...
1.313064
1.455776
0.907258
0.847273
0.813487
0.853852
0.940162
0.924069
0.851067
0.790654
0.742876
0.819138
1.492818
0.864653
0.886967
1.159601
0.821838
0.931346
0.883935
0.930908
0.877490
0.931882
1.037034
1.057502
0.870280
0.918149
1.405080
1.456379
1.127419
0.873330
0.865133
0.946465
0.936295
0.784012
0.973883
1.329945
0.956890
1.400776
0.758596
0.907328
0.994154
0.874232
0.878277
0.798990
1.088962
1.151723
0.898940
1.421349
0.987259
0.766984
0.987189
0.723241
0.849505
0.877373
1.286612
1.477315
0.954484
1.116482
0.945982
0.873253
0.872454
0.878087
1.436687
0.916522
0.949573
0.855859
0.809900
0.928804
1.487982
0.918141
1.390670
1.031652
0.755490
0.864685
0.925891
4
84.686830
50.569903
29.790239
46.609463
29.066011
32.100742
39.628684
41.644559
18.627329
21.382471
40.316177
26.448719
29.018087
21.111763
16.463535
24.787983
22.299294
16.576938
14.431890
20.670333
9.707518
19.589928
15.344833
29.048718
19.539289
19.608818
15.094792
17.649770
11.077402
8.265068
23.008406
13.799867
21.349845
5.284080
31.803037
33.916721
16.231378
15.762746
17.526287
14.598748
11.415723
22.321454
25.170574
10.124354
12.428174
13.801095
10.034727
23.666243
9.723196
13.367494
13.117179
15.064447
13.802612
11.304332
14.544449
8.693906
10.518363
11.497571
12.575327
9.872245
13.518083
13.345436
8.711541
13.919656
7.661993
14.323676
7.835069
5.923976
10.305760
15.881917
9.607894
13.861560
2.536607
10.120047
9.251958
...
0.956811
0.908525
0.797821
0.859530
0.917984
0.922327
0.969551
0.789626
0.903601
0.961633
0.885067
0.807612
0.862245
0.876353
0.976884
0.880854
0.791405
0.840732
0.870292
0.718133
0.852915
0.855864
0.789180
0.777344
0.956919
0.891875
0.855127
0.919265
0.878362
0.910308
0.904615
0.971993
0.907860
0.803285
0.977555
0.710161
0.944781
0.967877
0.994523
0.922489
0.954861
0.794897
0.783929
0.881262
0.856886
1.198302
1.081608
1.018026
1.094986
0.808721
0.742701
1.009561
0.822764
0.991128
0.759882
0.823071
0.786464
0.848688
0.913733
0.913808
0.921651
0.841117
0.826702
0.826018
0.875004
0.924390
0.912291
0.766137
0.875925
0.973028
0.957894
0.944202
0.959102
0.802573
1.010276
5 rows × 50000 columns
In [27]:
# def _topic_coordinates(mds, topic_term_dists, topic_proportion):
mds = js_PCoA
K = topic_term_dists.shape[0]
mds_res = mds(topic_term_dists)
assert mds_res.shape == (K, 2)
mds_df = pd.DataFrame({'x': mds_res[:,0], 'y': mds_res[:,1], 'topics': range(1, K + 1), \
'cluster': 1, 'Freq': topic_proportion * 100})
# note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
topic_coordinates = mds_df
topic_coordinates
Out[27]:
Freq
cluster
topics
x
y
topic
3
97.957317
1
1
-0.046095
0.003568
4
0.685661
1
2
-0.030449
-0.009181
0
0.662217
1
3
-0.052126
0.000821
2
0.353436
1
4
0.139155
-0.000147
1
0.341370
1
5
-0.010484
0.004940
In [ ]:
In [ ]:
client_topic_order = [x + 1 for x in topic_order]
In [ ]:
plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}
In [ ]:
class PreparedData(namedtuple('PreparedData', ['topic_coordinates', 'topic_info', 'token_table',\
'R', 'lambda_step', 'plot_opts', 'topic_order'])):
def to_dict(self):
return {'mdsDat': self.topic_coordinates.to_dict(orient='list'),
'tinfo': self.topic_info.to_dict(orient='list'),
'token.table': self.token_table.to_dict(orient='list'),
'R': self.R,
'lambda.step': self.lambda_step,
'plot.opts': self.plot_opts,
'topic.order': self.topic_order}
def to_json(self):
return json.dumps(self.to_dict(), cls=NumPyEncoder)
In [ ]:
# topic_coordinates, topic_info, token_table, R, lambda_step, plot_opts, client_topic_order
In [ ]:
pp = PreparedData(topic_coordinates, topic_info, token_table, R, lambda_step, plot_opts, client_topic_order)
pp.to_dict()
In [ ]:
fileobj = open("/tmp/lda1.json", 'w')
json.dump(pp.to_dict(), fileobj, cls=NumPyEncoder)
In [ ]:
# topic_coordinates.to_dict(orient='list')
# topic_info.to_dict(orient='list')
# token_table.to_dict(orient='list')
topic_order
In [ ]:
def _jensen_shannon(_P, _Q):
_M = 0.5 * (_P + _Q)
e1 = entropy(_P, _M)
e2 = entropy(_Q, _M)
res = 0.5 * ( e1 + e2 )
# print('e1 ', e1)
# print('e2 ', e2)
print('res ', res)
return res
In [ ]:
# topic_term_dists
In [ ]:
ss = pdist(topic_term_dists, metric=_jensen_shannon)
ss.shape
In [ ]:
print(ss)
In [ ]:
ss.shape[0]
d = int(np.ceil(np.sqrt(ss.shape[0] * 2)))
d
In [ ]:
d * (d - 1) / 2
In [ ]:
ss[:10]
In [ ]:
pair_dists = pyLDAvis._prepare.squareform(ss)
In [ ]:
pair_dists[:10]
In [ ]:
pyLDAvis._prepare.js_PCoA(topic_term_dists)
In [ ]:
pair_dists = np.asarray(pair_dists, np.float64)
In [ ]:
n = pair_dists.shape[0]
n
In [ ]:
H = np.eye(n) - np.ones((n, n)) / n
H
In [ ]:
B = - H.dot(pair_dists ** 2).dot(H) / 2
B
In [ ]:
eigvals, eigvecs = np.linalg.eig(B)
eigvecs
In [ ]:
ix = eigvals.argsort()[::-1][:2]
ix
In [ ]:
eigvals = eigvals[ix]
In [ ]:
eigvecs = eigvecs[:, ix]
eigvecs
In [ ]:
eigvals[np.isclose(eigvals, 0)] = 0
np.any(eigvals < 0)
In [ ]:
if np.any(eigvals < 0):
ix_neg = eigvals < 0
eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)
In [ ]:
eigvals.shape
type(eigvals)
In [ ]:
eigvecs.shape
In [ ]:
np.sqrt(eigvals) * eigvecs
In [ ]:
eigvecs * np.sqrt(eigvals)
In [ ]:
In [ ]:
Content source: iaja/scalaLDAvis
Similar notebooks: