In [1]:
import pyLDAvis
import pandas as pd
import json
import numpy as np

import warnings
# Notebook-wide display settings: raise the pandas truncation limits to
# 150 columns and 250 rows.  set_option accepts consecutive
# (option, value) pairs, so both limits are set in a single call.
pd.set_option('display.max_columns', 150, 'display.max_rows', 250)
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# np.set_printoptions(suppress=True)

In [2]:
# Root directory that holds the exported LDA part files.  Every input path
# below is derived from it, so pointing the notebook at another export only
# requires changing this one value.
scala_ldavis_dir = "/tmp/scalaLDAvis"

# Topic-term matrix (phi); read into `phi` further down.
phi_topic_term_dists_file = scala_ldavis_dir + "/phi/part-00000"
# First part file of the document-topic matrix (theta).  The notebook loads
# all theta part files via glob instead, so this path is currently only
# referenced from commented-out code.
theta_doc_topics_dists_file = scala_ldavis_dir + "/theta/part-00000"
# doc_length_file = "/tmp/lda-vis/doc-length/part-00000"
# Vocabulary terms.
vocab_file = scala_ldavis_dir + "/vocab/part-00000"
# Term frequencies.
term_freq_file = scala_ldavis_dir + "/termFreq/part-00000"

In [3]:
import glob

# The theta (document-topic) matrix is split across several `part*` files in
# this directory.
path = '/tmp/scalaLDAvis/theta'
# glob returns matches in arbitrary order; sort them so the row order of the
# combined frame is the same on every run.
thetaFiles = sorted(glob.glob(path + "/part*"))
# Read every part file (no header row) and stack them into one frame.
# Row labels are kept exactly as read, so they restart at 0 for each part
# file; downstream cells only use positional/boolean selection.
theta_with_size = pd.concat(
    [pd.read_csv(file_, index_col=None, header=None) for file_ in thetaFiles]
)

In [4]:
# Inspect the combined frame: column 0 is later used as the document length,
# columns 1 onwards as the per-topic proportions.
theta_with_size


Out[4]:
0 1 2 3 4 5 6 7 8 9 10
0 32.0 0.002919 0.002915 0.002914 0.973755 0.002917 0.002919 0.002919 0.002916 0.002913 0.002914
1 47.0 0.002006 0.002004 0.002003 0.981962 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
2 96.0 0.000993 0.000991 0.000991 0.991076 0.000992 0.000992 0.000992 0.000991 0.000991 0.000991
3 90.0 0.001058 0.001057 0.001056 0.990488 0.001057 0.001058 0.001058 0.001057 0.001056 0.001056
4 44.0 0.002140 0.002137 0.002136 0.980759 0.002138 0.002140 0.002140 0.002138 0.002136 0.002136
5 53.0 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
6 41.0 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
7 36.0 0.002603 0.002600 0.002599 0.976595 0.002601 0.002603 0.002603 0.002600 0.002598 0.002599
8 48.0 0.001965 0.001963 0.001962 0.982330 0.001964 0.001965 0.001965 0.001963 0.001961 0.001962
9 19.0 0.004820 0.004813 0.004811 0.956668 0.004816 0.004819 0.004819 0.004814 0.004810 0.004811
10 89.0 0.001070 0.001068 0.001068 0.990382 0.001069 0.001070 0.001070 0.001068 0.001068 0.001068
11 62.0 0.001528 0.001526 0.001526 0.986259 0.001527 0.001528 0.001528 0.001527 0.001525 0.001526
12 143.0 0.000668 0.000668 0.000667 0.993990 0.000668 0.000668 0.000668 0.000668 0.000667 0.000667
13 87.0 0.001094 0.001093 0.001092 0.990164 0.001093 0.001094 0.001094 0.001093 0.001092 0.001092
14 137.0 0.000698 0.000697 0.000696 0.993729 0.000697 0.000697 0.000697 0.000697 0.000696 0.000696
15 92.0 0.001035 0.001034 0.001033 0.990692 0.001034 0.001035 0.001035 0.001034 0.001033 0.001034
16 41.0 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
17 230.0 0.000417 0.000416 0.000416 0.996254 0.000416 0.000417 0.000417 0.000416 0.000416 0.000416
18 83.0 0.001146 0.001145 0.001144 0.989695 0.001145 0.001146 0.001146 0.001145 0.001144 0.001144
19 416.0 0.000231 0.000231 0.000230 0.997925 0.000231 0.000231 0.000231 0.000231 0.000230 0.000230
20 23.0 0.004015 0.004010 0.004008 0.963901 0.004012 0.004015 0.004014 0.004010 0.004007 0.004008
21 77.0 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
22 59.0 0.001605 0.001603 0.001602 0.985572 0.001603 0.001605 0.001605 0.001603 0.001602 0.001602
23 73.0 0.001301 0.001299 0.001299 0.988302 0.001300 0.001301 0.001301 0.001300 0.001298 0.001299
24 394.0 0.000244 0.000243 0.000243 0.997809 0.000243 0.000244 0.000244 0.000243 0.000243 0.000243
25 57.0 0.001660 0.001658 0.001657 0.985073 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
26 367.0 0.000262 0.000261 0.000261 0.997648 0.000261 0.000262 0.000262 0.000261 0.000261 0.000261
27 54.0 0.001751 0.001748 0.001748 0.984258 0.001749 0.001751 0.001751 0.001749 0.001747 0.001748
28 43.0 0.002189 0.002186 0.002185 0.980321 0.002187 0.002189 0.002188 0.002186 0.002184 0.002185
29 145.0 0.000659 0.000658 0.000658 0.994072 0.000659 0.000659 0.000659 0.000659 0.000658 0.000658
30 47.0 0.002006 0.002004 0.002003 0.981961 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
31 141.0 0.000678 0.000677 0.000677 0.993905 0.000677 0.000678 0.000678 0.000677 0.000677 0.000677
32 39.0 0.002408 0.002405 0.002404 0.978352 0.002406 0.002407 0.002407 0.002405 0.002403 0.002404
33 28.0 0.003322 0.003318 0.003316 0.970132 0.003319 0.003322 0.003321 0.003318 0.003315 0.003316
34 38.0 0.002470 0.002466 0.002465 0.977796 0.002467 0.002469 0.002469 0.002467 0.002465 0.002466
35 57.0 0.001660 0.001658 0.001657 0.985074 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
36 63.0 0.001504 0.001502 0.001502 0.986473 0.001503 0.001504 0.001504 0.001503 0.001501 0.001502
37 89.0 0.001070 0.001068 0.001068 0.990383 0.001069 0.001070 0.001070 0.001068 0.001068 0.001068
38 54.0 0.001751 0.001748 0.001748 0.984259 0.001749 0.001751 0.001751 0.001749 0.001747 0.001748
39 36.0 0.002603 0.002600 0.002599 0.976596 0.002601 0.002603 0.002603 0.002600 0.002598 0.002599
40 74.0 0.001284 0.001282 0.001281 0.988458 0.001283 0.001284 0.001284 0.001282 0.001281 0.001282
41 868.0 0.000111 0.000111 0.000111 0.999004 0.000111 0.000111 0.000111 0.000111 0.000111 0.000111
42 114.0 0.000837 0.000836 0.000836 0.992474 0.000836 0.000837 0.000837 0.000836 0.000835 0.000836
43 16.0 0.005671 0.005663 0.005661 0.949013 0.005666 0.005670 0.005670 0.005664 0.005659 0.005661
44 33.0 0.002833 0.002829 0.002828 0.974528 0.002831 0.002833 0.002833 0.002830 0.002827 0.002828
45 120.0 0.000796 0.000795 0.000794 0.992847 0.000795 0.000796 0.000795 0.000795 0.000794 0.000794
46 209.0 0.000458 0.000458 0.000458 0.995879 0.000458 0.000458 0.000458 0.000458 0.000457 0.000458
47 92.0 0.001035 0.001034 0.001033 0.990693 0.001034 0.001035 0.001035 0.001034 0.001033 0.001033
48 47.0 0.002006 0.002004 0.002003 0.981961 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
49 66.0 0.001437 0.001435 0.001435 0.987079 0.001436 0.001437 0.001437 0.001435 0.001434 0.001435
50 204.0 0.000470 0.000469 0.000469 0.995778 0.000469 0.000469 0.000469 0.000469 0.000469 0.000469
51 51.0 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
52 32.0 0.002919 0.002915 0.002914 0.973755 0.002917 0.002919 0.002919 0.002916 0.002913 0.002914
53 134.0 0.000713 0.000712 0.000712 0.993589 0.000712 0.000713 0.000713 0.000712 0.000712 0.000712
54 210.0 0.000456 0.000456 0.000455 0.995899 0.000456 0.000456 0.000456 0.000456 0.000455 0.000455
55 23.0 0.004015 0.004010 0.004008 0.963900 0.004012 0.004015 0.004014 0.004010 0.004007 0.004008
56 55.0 0.001720 0.001717 0.001717 0.984540 0.001718 0.001719 0.001719 0.001718 0.001716 0.001717
57 50.0 0.001888 0.001886 0.001885 0.983024 0.001887 0.001888 0.001888 0.001886 0.001884 0.001885
58 123.0 0.000776 0.000775 0.000775 0.993020 0.000776 0.000776 0.000776 0.000775 0.000775 0.000775
59 14.0 0.006429 0.006421 0.006418 0.942195 0.006424 0.006429 0.006428 0.006422 0.006416 0.006419
60 40.0 0.002349 0.002346 0.002345 0.978880 0.002347 0.002349 0.002349 0.002346 0.002344 0.002345
61 75.0 0.001267 0.001265 0.001265 0.988610 0.001266 0.001267 0.001267 0.001265 0.001264 0.001265
62 71.0 0.001337 0.001335 0.001335 0.987977 0.001336 0.001337 0.001337 0.001336 0.001335 0.001335
63 33.0 0.002833 0.002829 0.002828 0.974528 0.002831 0.002833 0.002833 0.002830 0.002827 0.002828
64 45.0 0.002094 0.002091 0.002090 0.981177 0.002092 0.002093 0.002093 0.002091 0.002089 0.002090
65 72.0 0.001319 0.001317 0.001317 0.988142 0.001318 0.001319 0.001319 0.001317 0.001316 0.001317
66 127.0 0.000752 0.000751 0.000751 0.993238 0.000751 0.000752 0.000752 0.000751 0.000751 0.000751
67 233.0 0.000411 0.000411 0.000411 0.996302 0.000411 0.000411 0.000411 0.000411 0.000411 0.000411
68 129.0 0.000740 0.000739 0.000739 0.993343 0.000740 0.000740 0.000740 0.000740 0.000739 0.000739
69 140.0 0.000683 0.000682 0.000681 0.993862 0.000682 0.000683 0.000683 0.000682 0.000681 0.000682
70 122.0 0.000783 0.000782 0.000781 0.992964 0.000782 0.000783 0.000782 0.000782 0.000781 0.000781
71 299.0 0.000321 0.000320 0.000320 0.997115 0.000321 0.000321 0.000321 0.000320 0.000320 0.000320
72 139.0 0.000688 0.000687 0.000686 0.993818 0.000687 0.000688 0.000687 0.000687 0.000686 0.000686
73 35.0 0.002676 0.002672 0.002671 0.975944 0.002673 0.002675 0.002675 0.002673 0.002670 0.002671
74 66.0 0.001437 0.001435 0.001435 0.987080 0.001436 0.001437 0.001437 0.001435 0.001434 0.001435
75 44.0 0.002140 0.002137 0.002136 0.980758 0.002138 0.002140 0.002140 0.002138 0.002136 0.002137
76 208.0 0.000461 0.000460 0.000460 0.995859 0.000460 0.000461 0.000460 0.000460 0.000460 0.000460
77 28.0 0.003322 0.003318 0.003316 0.970133 0.003319 0.003322 0.003321 0.003318 0.003315 0.003316
78 171.0 0.000560 0.000559 0.000559 0.994968 0.000559 0.000560 0.000560 0.000559 0.000558 0.000559
79 102.0 0.000935 0.000933 0.000933 0.991597 0.000934 0.000935 0.000934 0.000934 0.000933 0.000933
80 38.0 0.002469 0.002466 0.002465 0.977797 0.002467 0.002469 0.002469 0.002467 0.002465 0.002465
81 27.0 0.003441 0.003436 0.003435 0.969064 0.003438 0.003440 0.003440 0.003437 0.003434 0.003435
82 58.0 0.001632 0.001630 0.001629 0.985325 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
83 3149.0 0.000031 0.000031 0.000030 0.999725 0.000031 0.000031 0.000031 0.000031 0.000030 0.000031
84 63.0 0.001504 0.001502 0.001502 0.986473 0.001503 0.001504 0.001504 0.001503 0.001501 0.001502
85 58.0 0.001632 0.001630 0.001629 0.985326 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
86 38.0 0.002470 0.002466 0.002465 0.977795 0.002468 0.002469 0.002469 0.002467 0.002465 0.002466
87 96.0 0.000992 0.000991 0.000991 0.991077 0.000992 0.000992 0.000992 0.000991 0.000990 0.000991
88 156.0 0.000613 0.000612 0.000612 0.994488 0.000613 0.000613 0.000613 0.000612 0.000612 0.000612
89 25.0 0.003706 0.003701 0.003699 0.966681 0.003703 0.003705 0.003705 0.003702 0.003698 0.003700
90 70.0 0.001356 0.001354 0.001354 0.987807 0.001355 0.001356 0.001356 0.001355 0.001353 0.001354
91 38.0 0.002470 0.002466 0.002465 0.977796 0.002468 0.002469 0.002469 0.002467 0.002465 0.002465
92 34.0 0.002752 0.002748 0.002747 0.975256 0.002750 0.002752 0.002752 0.002749 0.002747 0.002747
93 103.0 0.000926 0.000924 0.000924 0.991678 0.000925 0.000926 0.000925 0.000925 0.000924 0.000924
94 29.0 0.003211 0.003207 0.003205 0.971129 0.003208 0.003211 0.003211 0.003207 0.003205 0.003206
95 637.0 0.000151 0.000151 0.000151 0.998644 0.000151 0.000151 0.000151 0.000151 0.000151 0.000151
96 142.0 0.000673 0.000672 0.000672 0.993948 0.000673 0.000673 0.000673 0.000672 0.000672 0.000672
97 40.0 0.002349 0.002346 0.002345 0.978880 0.002347 0.002349 0.002349 0.002346 0.002344 0.002345
98 151.0 0.000633 0.000632 0.000632 0.994306 0.000633 0.000633 0.000633 0.000633 0.000632 0.000632
99 49.0 0.001926 0.001923 0.001923 0.982684 0.001924 0.001926 0.001926 0.001924 0.001922 0.001923
100 80.0 0.001189 0.001187 0.001187 0.989313 0.001188 0.001188 0.001188 0.001187 0.001186 0.001187
101 245.0 0.000391 0.000391 0.000391 0.996482 0.000391 0.000391 0.000391 0.000391 0.000390 0.000391
102 120.0 0.000796 0.000795 0.000794 0.992847 0.000795 0.000795 0.000795 0.000795 0.000794 0.000794
103 47.0 0.002006 0.002004 0.002003 0.981962 0.002004 0.002006 0.002006 0.002004 0.002002 0.002003
104 96.0 0.000992 0.000991 0.000991 0.991077 0.000992 0.000992 0.000992 0.000991 0.000990 0.000991
105 58.0 0.001632 0.001630 0.001629 0.985327 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
106 77.0 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
107 124.0 0.000770 0.000769 0.000769 0.993076 0.000769 0.000770 0.000770 0.000769 0.000769 0.000769
108 70.0 0.001356 0.001354 0.001354 0.987808 0.001355 0.001356 0.001356 0.001354 0.001353 0.001354
109 244.0 0.000393 0.000392 0.000392 0.996468 0.000393 0.000393 0.000393 0.000392 0.000392 0.000392
110 92.0 0.001035 0.001034 0.001033 0.990693 0.001034 0.001035 0.001035 0.001034 0.001033 0.001033
111 72.0 0.001319 0.001317 0.001317 0.988142 0.001318 0.001319 0.001319 0.001317 0.001316 0.001317
112 71.0 0.001337 0.001335 0.001335 0.987977 0.001336 0.001337 0.001337 0.001336 0.001335 0.001335
113 80.0 0.001189 0.001187 0.001187 0.989313 0.001188 0.001189 0.001188 0.001187 0.001186 0.001187
114 44.0 0.002140 0.002137 0.002136 0.980758 0.002138 0.002140 0.002140 0.002138 0.002136 0.002137
115 69.0 0.001375 0.001374 0.001373 0.987633 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
116 53.0 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
117 76.0 0.001250 0.001249 0.001248 0.988758 0.001249 0.001250 0.001250 0.001249 0.001248 0.001248
118 57.0 0.001660 0.001658 0.001657 0.985073 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
119 28.0 0.003322 0.003317 0.003316 0.970133 0.003319 0.003322 0.003321 0.003318 0.003315 0.003316
120 132.0 0.000724 0.000723 0.000723 0.993493 0.000723 0.000724 0.000724 0.000723 0.000722 0.000723
121 75.0 0.001267 0.001265 0.001265 0.988610 0.001266 0.001267 0.001267 0.001265 0.001264 0.001265
122 52.0 0.001817 0.001814 0.001814 0.983665 0.001815 0.001817 0.001817 0.001815 0.001813 0.001814
123 56.0 0.001689 0.001687 0.001686 0.984812 0.001688 0.001689 0.001689 0.001687 0.001686 0.001686
124 21.0 0.004381 0.004375 0.004373 0.960615 0.004377 0.004380 0.004380 0.004375 0.004372 0.004373
... ... ... ... ... ... ... ... ... ... ... ...
6375 42.0 0.002240 0.002237 0.002236 0.979862 0.002238 0.002240 0.002239 0.002237 0.002235 0.002236
6376 123.0 0.000776 0.000775 0.000775 0.993020 0.000776 0.000776 0.000776 0.000775 0.000775 0.000775
6377 77.0 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
6378 98.0 0.000972 0.000971 0.000971 0.991257 0.000972 0.000972 0.000972 0.000971 0.000970 0.000971
6379 41.0 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002291 0.002288 0.002289
6380 58.0 0.001632 0.001630 0.001629 0.985327 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
6381 123.0 0.000776 0.000775 0.000775 0.993020 0.000776 0.000776 0.000776 0.000775 0.000775 0.000775
6382 51.0 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
6383 192.0 0.000499 0.000498 0.000498 0.995516 0.000498 0.000499 0.000499 0.000498 0.000498 0.000498
6384 65.0 0.001459 0.001457 0.001456 0.986883 0.001458 0.001459 0.001459 0.001457 0.001456 0.001456
6385 49.0 0.001926 0.001923 0.001923 0.982683 0.001924 0.001926 0.001926 0.001924 0.001922 0.001923
6386 111.0 0.000860 0.000858 0.000858 0.992272 0.000859 0.000859 0.000859 0.000859 0.000858 0.000858
6387 73.0 0.001301 0.001299 0.001299 0.988302 0.001300 0.001301 0.001301 0.001300 0.001298 0.001299
6388 35.0 0.002676 0.002672 0.002671 0.975943 0.002673 0.002675 0.002675 0.002673 0.002670 0.002671
6389 66.0 0.001437 0.001435 0.001435 0.987079 0.001436 0.001437 0.001437 0.001435 0.001434 0.001435
6390 50.0 0.001888 0.001886 0.001885 0.983024 0.001887 0.001888 0.001888 0.001886 0.001884 0.001885
6391 68.0 0.001395 0.001394 0.001393 0.987454 0.001394 0.001395 0.001395 0.001394 0.001393 0.001393
6392 99.0 0.000963 0.000961 0.000961 0.991344 0.000962 0.000963 0.000963 0.000962 0.000961 0.000961
6393 143.0 0.000668 0.000668 0.000667 0.993990 0.000668 0.000668 0.000668 0.000668 0.000667 0.000667
6394 54.0 0.001751 0.001749 0.001748 0.984259 0.001749 0.001751 0.001750 0.001749 0.001747 0.001748
6395 314.0 0.000306 0.000305 0.000305 0.997253 0.000305 0.000306 0.000305 0.000305 0.000305 0.000305
6396 52.0 0.001817 0.001814 0.001814 0.983664 0.001815 0.001817 0.001817 0.001815 0.001813 0.001814
6397 29.0 0.003211 0.003207 0.003206 0.971128 0.003209 0.003211 0.003211 0.003207 0.003205 0.003206
6398 28.0 0.003322 0.003318 0.003316 0.970132 0.003319 0.003322 0.003322 0.003318 0.003315 0.003317
6399 24.0 0.003854 0.003849 0.003847 0.965347 0.003851 0.003854 0.003854 0.003850 0.003846 0.003848
6400 62.0 0.001528 0.001526 0.001526 0.986259 0.001527 0.001528 0.001528 0.001527 0.001525 0.001526
6401 41.0 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
6402 51.0 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
6403 26.0 0.003568 0.003564 0.003562 0.967917 0.003565 0.003568 0.003568 0.003564 0.003561 0.003562
6404 241.0 0.000398 0.000397 0.000397 0.996424 0.000397 0.000398 0.000398 0.000397 0.000397 0.000397
6405 69.0 0.001375 0.001374 0.001373 0.987633 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
6406 54.0 0.001751 0.001748 0.001748 0.984259 0.001749 0.001751 0.001750 0.001749 0.001747 0.001748
6407 20.0 0.004589 0.004583 0.004581 0.958738 0.004585 0.004589 0.004588 0.004584 0.004580 0.004582
6408 316.0 0.000304 0.000303 0.000303 0.997270 0.000303 0.000304 0.000304 0.000303 0.000303 0.000303
6409 27.0 0.003441 0.003436 0.003435 0.969063 0.003438 0.003441 0.003440 0.003437 0.003434 0.003435
6410 19.0 0.004819 0.004813 0.004811 0.956672 0.004815 0.004819 0.004818 0.004814 0.004809 0.004811
6411 32.0 0.002919 0.002915 0.002914 0.973754 0.002917 0.002919 0.002919 0.002916 0.002913 0.002914
6412 81.0 0.001174 0.001172 0.001172 0.989444 0.001173 0.001174 0.001174 0.001173 0.001172 0.001172
6413 159.0 0.000602 0.000601 0.000601 0.994591 0.000601 0.000602 0.000602 0.000601 0.000600 0.000601
6414 44.0 0.002140 0.002137 0.002136 0.980759 0.002138 0.002140 0.002140 0.002138 0.002136 0.002136
6415 112.0 0.000852 0.000851 0.000850 0.992340 0.000851 0.000852 0.000852 0.000851 0.000850 0.000850
6416 251.0 0.000382 0.000381 0.000381 0.996566 0.000382 0.000382 0.000382 0.000381 0.000381 0.000381
6417 21.0 0.004380 0.004375 0.004373 0.960616 0.004377 0.004380 0.004380 0.004375 0.004372 0.004373
6418 243.0 0.000394 0.000394 0.000394 0.996453 0.000394 0.000394 0.000394 0.000394 0.000394 0.000394
6419 57.0 0.001660 0.001658 0.001657 0.985074 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
6420 12.0 0.007420 0.007410 0.007407 0.933285 0.007414 0.007420 0.007419 0.007412 0.007405 0.007408
6421 335.0 0.000286 0.000286 0.000286 0.997425 0.000286 0.000286 0.000286 0.000286 0.000286 0.000286
6422 63.0 0.001505 0.001502 0.001502 0.986473 0.001503 0.001504 0.001504 0.001503 0.001501 0.001502
6423 31.0 0.003010 0.003006 0.003005 0.972935 0.003008 0.003010 0.003010 0.003007 0.003004 0.003005
6424 37.0 0.002535 0.002531 0.002530 0.977212 0.002532 0.002534 0.002534 0.002532 0.002529 0.002530
6425 47.0 0.002006 0.002004 0.002003 0.981962 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
6426 53.0 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
6427 33.0 0.002833 0.002830 0.002828 0.974526 0.002831 0.002833 0.002833 0.002830 0.002828 0.002829
6428 115.0 0.000830 0.000829 0.000828 0.992539 0.000829 0.000830 0.000830 0.000829 0.000828 0.000828
6429 92.0 0.001035 0.001034 0.001033 0.990693 0.001034 0.001035 0.001035 0.001034 0.001033 0.001033
6430 88.0 0.001082 0.001080 0.001080 0.990274 0.001081 0.001082 0.001082 0.001080 0.001080 0.001080
6431 67.0 0.001416 0.001414 0.001413 0.987269 0.001415 0.001416 0.001416 0.001414 0.001413 0.001414
6432 77.0 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
6433 99.0 0.000963 0.000961 0.000961 0.991344 0.000962 0.000963 0.000963 0.000962 0.000961 0.000961
6434 87.0 0.001094 0.001093 0.001092 0.990164 0.001093 0.001094 0.001094 0.001093 0.001092 0.001092
6435 100.0 0.000953 0.000952 0.000951 0.991431 0.000952 0.000953 0.000953 0.000952 0.000951 0.000952
6436 27.0 0.003441 0.003436 0.003435 0.969065 0.003438 0.003440 0.003440 0.003437 0.003434 0.003435
6437 54.0 0.001751 0.001748 0.001748 0.984259 0.001749 0.001751 0.001750 0.001749 0.001747 0.001748
6438 69.0 0.001375 0.001374 0.001373 0.987634 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
6439 97.0 0.000982 0.000981 0.000981 0.991168 0.000982 0.000982 0.000982 0.000981 0.000980 0.000981
6440 65.0 0.001459 0.001457 0.001456 0.986883 0.001458 0.001459 0.001459 0.001457 0.001456 0.001456
6441 50.0 0.001888 0.001886 0.001885 0.983024 0.001887 0.001888 0.001888 0.001886 0.001884 0.001885
6442 66.0 0.001437 0.001435 0.001435 0.987079 0.001436 0.001437 0.001437 0.001436 0.001434 0.001435
6443 46.0 0.002049 0.002046 0.002045 0.981577 0.002047 0.002049 0.002049 0.002047 0.002045 0.002046
6444 43.0 0.002189 0.002186 0.002185 0.980321 0.002187 0.002189 0.002188 0.002186 0.002184 0.002185
6445 101.0 0.000944 0.000943 0.000942 0.991514 0.000943 0.000944 0.000944 0.000943 0.000942 0.000942
6446 46.0 0.002049 0.002046 0.002045 0.981577 0.002047 0.002049 0.002049 0.002047 0.002045 0.002046
6447 143.0 0.000668 0.000668 0.000667 0.993990 0.000668 0.000668 0.000668 0.000668 0.000667 0.000667
6448 45.0 0.002094 0.002091 0.002090 0.981177 0.002092 0.002093 0.002093 0.002091 0.002089 0.002090
6449 102.0 0.000935 0.000933 0.000933 0.991596 0.000934 0.000935 0.000935 0.000934 0.000933 0.000933
6450 303.0 0.000317 0.000316 0.000316 0.997153 0.000316 0.000317 0.000317 0.000316 0.000316 0.000316
6451 86.0 0.001107 0.001105 0.001105 0.990051 0.001106 0.001106 0.001106 0.001105 0.001104 0.001105
6452 124.0 0.000770 0.000769 0.000769 0.993076 0.000769 0.000770 0.000770 0.000769 0.000769 0.000769
6453 702.0 0.000137 0.000137 0.000137 0.998769 0.000137 0.000137 0.000137 0.000137 0.000137 0.000137
6454 49.0 0.001926 0.001924 0.001923 0.982682 0.001924 0.001926 0.001926 0.001924 0.001922 0.001923
6455 65.0 0.001459 0.001457 0.001456 0.986884 0.001458 0.001459 0.001459 0.001457 0.001456 0.001456
6456 41.0 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
6457 46.0 0.002049 0.002046 0.002045 0.981578 0.002047 0.002049 0.002049 0.002047 0.002045 0.002046
6458 117.0 0.000816 0.000815 0.000814 0.992665 0.000815 0.000816 0.000816 0.000815 0.000814 0.000814
6459 882.0 0.000109 0.000109 0.000109 0.999020 0.000109 0.000109 0.000109 0.000109 0.000109 0.000109
6460 169.0 0.000566 0.000565 0.000565 0.994909 0.000566 0.000566 0.000566 0.000566 0.000565 0.000565
6461 716.0 0.000134 0.000134 0.000134 0.998793 0.000134 0.000134 0.000134 0.000134 0.000134 0.000134
6462 62.0 0.001528 0.001526 0.001526 0.986259 0.001527 0.001528 0.001528 0.001527 0.001525 0.001526
6463 70.0 0.001356 0.001354 0.001354 0.987808 0.001355 0.001356 0.001356 0.001355 0.001353 0.001354
6464 30.0 0.003107 0.003103 0.003102 0.972061 0.003105 0.003107 0.003107 0.003104 0.003101 0.003102
6465 89.0 0.001070 0.001068 0.001068 0.990382 0.001069 0.001070 0.001070 0.001068 0.001068 0.001068
6466 98.0 0.000972 0.000971 0.000971 0.991257 0.000972 0.000972 0.000972 0.000971 0.000970 0.000971
6467 25.0 0.003706 0.003701 0.003699 0.966681 0.003703 0.003705 0.003705 0.003702 0.003698 0.003700
6468 128.0 0.000746 0.000745 0.000745 0.993291 0.000746 0.000746 0.000746 0.000745 0.000745 0.000745
6469 59.0 0.001605 0.001603 0.001602 0.985572 0.001603 0.001605 0.001605 0.001603 0.001602 0.001602
6470 268.0 0.000358 0.000357 0.000357 0.996783 0.000358 0.000358 0.000358 0.000357 0.000357 0.000357
6471 44.0 0.002140 0.002137 0.002136 0.980758 0.002138 0.002140 0.002140 0.002138 0.002136 0.002137
6472 30.0 0.003108 0.003103 0.003102 0.972060 0.003105 0.003107 0.003107 0.003104 0.003101 0.003102
6473 60.0 0.001578 0.001576 0.001576 0.985808 0.001577 0.001578 0.001578 0.001577 0.001575 0.001576
6474 52.0 0.001817 0.001815 0.001814 0.983664 0.001815 0.001817 0.001817 0.001815 0.001813 0.001814
6475 82.0 0.001160 0.001158 0.001158 0.989571 0.001159 0.001160 0.001160 0.001159 0.001158 0.001158
6476 69.0 0.001375 0.001374 0.001373 0.987634 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
6477 42.0 0.002240 0.002237 0.002236 0.979863 0.002238 0.002239 0.002239 0.002237 0.002235 0.002236
6478 40.0 0.002349 0.002346 0.002345 0.978880 0.002347 0.002349 0.002349 0.002346 0.002344 0.002345
6479 38.0 0.002470 0.002466 0.002465 0.977796 0.002468 0.002469 0.002469 0.002467 0.002465 0.002465
6480 18.0 0.005073 0.005067 0.005065 0.954385 0.005069 0.005073 0.005073 0.005068 0.005063 0.005065
6481 53.0 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
6482 61.0 0.001553 0.001551 0.001550 0.986037 0.001552 0.001553 0.001553 0.001551 0.001550 0.001550
6483 122.0 0.000783 0.000782 0.000781 0.992963 0.000782 0.000783 0.000783 0.000782 0.000781 0.000781
6484 207.0 0.000463 0.000462 0.000462 0.995839 0.000462 0.000463 0.000463 0.000462 0.000462 0.000462
6485 19.0 0.004821 0.004813 0.004811 0.956666 0.004815 0.004819 0.004819 0.004814 0.004810 0.004811
6486 68.0 0.001395 0.001393 0.001393 0.987454 0.001394 0.001395 0.001395 0.001394 0.001393 0.001393
6487 87.0 0.001094 0.001093 0.001092 0.990164 0.001093 0.001094 0.001094 0.001093 0.001092 0.001092
6488 38.0 0.002470 0.002466 0.002465 0.977796 0.002467 0.002469 0.002469 0.002467 0.002465 0.002465
6489 39.0 0.002408 0.002405 0.002404 0.978351 0.002406 0.002408 0.002407 0.002405 0.002403 0.002404
6490 114.0 0.000837 0.000836 0.000836 0.992474 0.000836 0.000837 0.000837 0.000836 0.000835 0.000836
6491 305.0 0.000315 0.000314 0.000314 0.997172 0.000314 0.000315 0.000314 0.000314 0.000314 0.000314
6492 73.0 0.001301 0.001299 0.001299 0.988302 0.001300 0.001301 0.001301 0.001300 0.001298 0.001299
6493 41.0 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
6494 58.0 0.001632 0.001630 0.001629 0.985327 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
6495 51.0 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
6496 64.0 0.001482 0.001479 0.001479 0.986680 0.001480 0.001481 0.001481 0.001480 0.001478 0.001479
6497 778.0 0.000124 0.000123 0.000123 0.998889 0.000123 0.000124 0.000124 0.000123 0.000123 0.000123
6498 48.0 0.001965 0.001963 0.001962 0.982330 0.001964 0.001965 0.001965 0.001963 0.001961 0.001962
6499 992.0 0.000097 0.000097 0.000097 0.999129 0.000097 0.000097 0.000097 0.000097 0.000097 0.000097

11314 rows × 11 columns


In [5]:
# Keep only the rows whose first column (used below as the document length)
# is positive.
theta_with_size = theta_with_size[theta_with_size[0] > 0]
# Topic-term matrix, one row per topic (no header row in the file).
phi = pd.read_csv(phi_topic_term_dists_file, header=None)
# theta_with_size = pd.read_csv(theta_doc_topics_dists_file, header=None)
# Split the combined frame positionally: first column -> document length,
# remaining columns -> document-topic proportions.  `.iloc` replaces the
# `.ix` indexer, which no longer exists in current pandas; with the default
# integer column labels produced by header=None the selection is identical.
theta = theta_with_size.iloc[:, 1:]
doc_length = theta_with_size.iloc[:, 0]
# Vocabulary and term frequencies, one entry per row.
vocab = pd.read_csv(vocab_file, header=None)
term_freq = pd.read_csv(term_freq_file, header=None)

In [6]:
def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [7]:
# Keyword arguments for pyLDAvis.prepare, converted from pandas objects to
# plain Python lists.  The single-column vocab / term-frequency frames are
# flattened from a list of one-element rows into a flat list.
data = dict(
    topic_term_dists=phi.values.tolist(),
    doc_topic_dists=theta.values.tolist(),
    doc_lengths=doc_length.astype("int64").values.tolist(),
    vocab=flatten(vocab.values.tolist()),
    term_frequency=flatten(term_freq.astype("int64").values.tolist()),
)

# Sanity check of the two matrix shapes: (topics, terms) and (docs, topics).
print('Topic-Term shape: %s' % str(np.shape(data['topic_term_dists'])))
print('Doc-Topic shape: %s' % str(np.shape(data['doc_topic_dists'])))


Topic-Term shape: (10, 50000)
Doc-Topic shape: (11314, 10)

In [44]:
# Run pyLDAvis's preparation step on the inputs assembled in `data`
# (the dict is unpacked into keyword arguments).
lda_vis_data = pyLDAvis.prepare(**data)

In [45]:
# Switch on pyLDAvis's notebook integration
# (presumably inline rendering of prepared data; see the pyLDAvis docs).
pyLDAvis.enable_notebook()

In [46]:
# pyLDAvis.prepare(mds='mmds', **data)

In [57]:
# Save the prepared visualization data as JSON.
# NOTE(review): assumes the target directory already exists; confirm.
pyLDAvis.save_json(lda_vis_data, "/tmp/scalaLDAvis/pyLDAvis/lda.json")

In [48]:
# Save the prepared visualization as an HTML file.
pyLDAvis.save_html(lda_vis_data, "/tmp/ldavis.html")

In [49]:
# Bare expression: shows the repr of the pyLDAvis.urls module, which reveals
# where the installed package lives on disk.
pyLDAvis.urls


Out[49]:
<module 'pyLDAvis.urls' from '/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/pyLDAvis/urls.py'>

In [16]:
# !cat /home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/pyLDAvis/urls.py

Exploration for Scala Porting

Only for developers

K (integer) = number of topics (e.g. 50)
V (integer) = number of words in the vocabulary (e.g. 50,000 or 1,000,000)
M (integer) = number of documents
$N_{d}$, for $d = 1 \dots M$ (integer) = number of words in document $d$
N (integer) = total number of words in all documents; the sum of all $N_{d}$ values, i.e. $N=\sum_{d=1}^{M} N_{d}$
Z = N-dimensional vector of integers between 1 and K; the topic identity of every word in all documents
W = N-dimensional vector of integers between 1 and V; the vocabulary identity of every word in all documents


In [8]:
from pyLDAvis import *
from pyLDAvis._prepare import *

In [9]:
# Rebuild, step by step, the named pandas containers from the raw `data` lists using
# pyLDAvis' private helpers, so each intermediate can be inspected while porting.
_to_named_frame = pyLDAvis._prepare._df_with_names
_to_named_series = pyLDAvis._prepare._series_with_name

topic_term_dists = _to_named_frame(data['topic_term_dists'], 'topic', 'term')  # [K x V]
doc_topic_dists = _to_named_frame(data['doc_topic_dists'], 'doc', 'topic')     # [M x K]
term_frequency = _to_named_series(data['term_frequency'], 'term_frequency')    # [V]
doc_lengths = _to_named_series(data['doc_lengths'], 'doc_length')              # [M]
vocab = _to_named_series(data['vocab'], 'vocab')                               # [V]

In [10]:
# A notebook cell only echoes its last expression, so the bare `.shape` line was
# evaluated and thrown away; print it so the dimensions are actually shown.
print(topic_term_dists.shape)  # [K x V]
topic_term_dists


Out[10]:
term 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 ... 49925 49926 49927 49928 49929 49930 49931 49932 49933 49934 49935 49936 49937 49938 49939 49940 49941 49942 49943 49944 49945 49946 49947 49948 49949 49950 49951 49952 49953 49954 49955 49956 49957 49958 49959 49960 49961 49962 49963 49964 49965 49966 49967 49968 49969 49970 49971 49972 49973 49974 49975 49976 49977 49978 49979 49980 49981 49982 49983 49984 49985 49986 49987 49988 49989 49990 49991 49992 49993 49994 49995 49996 49997 49998 49999
topic
0 24.870688 21.559374 24.848545 10.189182 9.700961 16.208485 7.853970 22.979569 7.406882 16.984633 15.510591 12.958005 9.055710 8.363979 4.247700 11.506454 7.047064 9.709759 10.177908 8.035401 6.826531 6.829253 3.707348 9.759646 3.332268 3.800230 6.738490 9.448646 6.062157 7.730785 6.032167 6.699437 7.445487 5.886514 8.183782 6.208295 2.992643 6.148651 8.222009 6.928704 6.434229 5.013328 4.641739 6.692826 8.928966 2.785740 7.005564 8.044817 6.570035 3.074756 7.252581 6.722737 6.668937 2.810734 4.977989 8.117550 5.247739 3.430537 4.290412 2.861926 3.671912 1.527140 1.548591 2.991851 4.643409 5.084082 4.279109 4.411219 2.840253 7.871005 4.478428 3.846753 1.984382 4.714036 7.401284 ... 0.643118 1.071917 1.058301 0.967577 0.956299 0.788641 0.776995 0.906096 0.975965 0.780293 0.855194 0.952308 0.795817 1.062600 0.953764 1.081162 0.929956 1.040463 1.133978 1.064275 0.881111 0.934912 0.972580 1.060295 0.789618 0.931094 0.864498 1.073642 0.850832 0.961806 0.955679 0.752066 0.803353 0.812691 0.795662 0.911058 1.101044 1.002116 0.946687 0.893284 0.830108 0.737415 0.730675 0.925176 0.809287 0.930815 0.835031 0.920183 0.797343 1.467583 1.006026 0.965949 1.198470 0.809871 0.826039 0.729040 0.984685 0.996069 0.925957 0.937184 0.798756 1.131209 0.855144 0.845376 0.895367 0.863819 0.699012 0.819847 0.951682 0.911374 0.964902 0.966253 0.964197 0.882192 0.755758
1 33.401918 17.078428 20.044636 6.946050 13.719605 7.947532 8.534269 8.663387 6.697656 9.465463 11.057000 11.050145 9.298605 4.480227 1.766929 5.384216 10.126207 5.649822 5.147637 5.386281 9.070167 8.860614 5.994165 3.408542 7.349286 2.880106 2.133808 3.527771 6.021526 2.488975 5.013918 11.006257 4.978995 4.110157 15.355072 6.907951 4.829160 5.646619 1.776806 5.751680 3.555192 3.809261 3.240087 4.008485 5.033435 3.569024 5.101061 7.123383 6.814409 4.810447 7.196640 3.133040 3.777292 5.216637 5.975628 4.905718 8.065505 6.542123 2.360237 5.742254 2.893492 5.474456 3.386709 1.662291 2.726652 1.593486 4.276451 4.739083 1.664434 5.010978 3.763495 8.493277 13.527814 3.685822 5.048271 ... 0.869655 0.954529 0.849056 0.798737 0.913945 1.064351 1.018427 0.906554 0.731343 1.007573 0.900765 0.922811 0.790644 0.974767 0.859631 0.994104 0.893179 0.832139 0.972841 0.842145 0.845484 0.761239 0.843164 0.890486 1.012105 1.067445 0.837786 0.964018 1.163420 0.864480 0.905981 0.771364 0.907022 1.037604 0.981154 1.014030 1.055378 0.875974 1.030438 0.844262 0.854144 0.846996 0.976750 0.874581 0.902618 1.004214 0.955245 1.038586 0.922442 0.963343 0.945768 0.924197 1.107366 0.843171 0.961767 0.946991 1.030004 1.029144 0.897846 0.846511 0.933582 0.938480 0.923647 0.995120 0.981613 0.943782 1.054313 0.941558 1.005561 0.729361 0.840127 1.073069 0.989798 1.068669 0.896674
2 18.231773 9.661907 10.402560 16.730183 8.751762 8.003756 11.083495 3.170324 6.382108 6.132467 4.400303 5.538933 4.512643 2.754489 5.699249 3.347013 6.372030 7.221675 4.804678 5.397067 3.476188 5.910220 4.351529 3.417071 3.968878 3.870700 4.542757 3.764698 3.675050 6.404810 3.475772 2.275368 4.140666 3.962001 2.143531 3.064304 5.035950 2.100947 2.359482 2.834560 4.076997 2.855027 6.964688 1.772505 5.158982 2.742678 3.713746 2.034397 6.793594 4.881100 3.207033 1.493543 2.174001 3.664113 4.354543 9.127847 2.796105 3.788937 3.941233 3.511743 4.206168 2.838678 1.864739 2.929351 2.236427 3.369810 1.964124 2.790292 2.835056 2.379437 2.505491 4.057797 2.514965 1.177314 3.077369 ... 0.823767 0.943717 0.962192 0.886888 0.981371 0.930903 0.894427 1.117578 0.762742 0.947954 0.909953 0.825686 0.803668 0.832389 1.011635 0.817818 0.922711 1.086938 0.879801 0.851625 1.000546 0.971243 0.991476 1.171215 0.907269 0.718800 1.145152 0.962559 0.916640 0.809148 0.965341 0.825719 1.049798 0.778760 0.849985 0.896121 0.923278 0.909323 0.825253 0.997673 0.951994 0.942180 0.807907 0.810904 0.952346 0.931959 0.901531 0.960111 1.120766 0.886204 0.700634 1.126493 1.072580 1.009778 0.873241 1.000100 0.851812 0.855179 1.002405 0.874315 0.943735 1.021325 1.013449 1.022970 0.848507 0.897087 1.041026 0.882190 0.932870 1.029495 0.944008 0.823620 0.739987 0.744847 0.991434
3 569.117645 375.217869 359.171596 286.057745 299.573831 298.655176 266.731011 242.320803 203.752604 187.585252 180.052954 192.567248 180.345152 175.337857 168.259279 171.321794 140.198561 157.606527 155.221883 131.927407 126.147464 111.004187 130.982765 118.817467 141.501060 118.176844 114.398427 119.691441 124.304120 125.368234 135.422730 132.428406 111.376933 109.878513 106.410511 112.684575 117.621705 94.691534 102.574050 85.991384 121.324123 91.121158 96.505274 108.863981 101.613146 107.388160 115.977445 104.107373 87.699947 95.420218 96.954820 93.261686 114.247771 77.059637 86.443770 87.937790 86.472469 77.396072 75.848390 93.047488 79.360157 143.593021 72.537939 86.218436 102.574748 109.118329 74.664771 78.188411 121.873950 75.017177 79.930384 104.522230 93.543186 74.658469 73.756726 ... 0.807071 0.941728 0.941791 0.878157 0.842317 0.885137 0.976697 0.959625 0.882182 0.818094 0.767410 0.848311 0.963916 0.896594 0.920266 0.883813 0.851175 0.967345 0.917014 0.966880 0.910213 0.967913 1.057982 1.101173 0.902564 0.953344 0.887983 0.942766 1.757310 0.905800 0.897104 0.983382 0.972594 1.376353 1.012468 0.985313 0.994441 0.916414 0.784087 0.941866 1.033972 0.906757 0.911047 0.826938 1.134547 0.996208 0.932967 0.905391 1.026659 0.792984 1.003325 0.746581 0.880525 0.910088 0.762417 0.929775 1.382520 1.163741 0.982870 0.905718 0.904870 0.910846 0.904576 0.951618 0.986382 1.469229 0.838511 0.964647 1.045563 0.953336 0.889411 1.073751 1.329903 0.896628 0.961557
4 28.168512 18.001880 14.049882 18.118982 14.684943 11.487112 17.209956 14.978091 10.016852 7.006124 16.626152 10.132525 15.285729 9.976849 7.320657 13.225463 9.245069 11.524126 5.657953 12.989374 2.838482 6.256801 7.372415 13.777652 9.305257 5.642161 4.872286 10.996571 5.892630 5.041900 5.525730 9.357960 8.137254 6.448612 11.438766 13.566489 10.498060 7.304194 5.456101 6.202625 8.094154 11.487553 9.361542 5.384001 4.852198 8.024801 5.977529 14.755373 5.878170 5.140622 6.708097 5.161157 6.211360 4.531957 6.605535 3.573619 5.881520 5.844686 5.628327 4.769399 6.631656 9.480587 4.170229 7.026337 5.140732 6.053367 6.620693 2.291043 6.170354 11.029797 3.753499 10.989146 3.760564 2.685307 4.606793 ... 0.994326 0.943130 0.825697 0.891160 0.953169 0.957776 1.007873 0.817004 0.937912 0.999474 0.918251 0.836084 0.894022 0.909006 1.015652 0.783042 0.818891 0.871219 0.902542 0.741162 0.884142 0.887271 0.816523 0.803974 0.994473 0.925472 0.886477 0.954433 0.911138 0.945027 0.938987 1.010463 0.942430 0.831493 1.016364 0.732702 0.981596 1.006043 1.034364 0.957948 0.992289 0.822596 0.810961 0.914214 0.888355 0.890246 1.126745 1.059239 1.140937 0.837260 0.767217 1.050317 0.852157 1.030762 0.785417 0.852482 0.813651 0.879658 0.948659 0.948739 0.957059 0.871627 0.856278 0.855609 0.907313 0.959966 0.947130 0.792086 0.855527 1.011561 0.995477 0.980982 0.996789 0.830738 1.051075
5 20.085940 18.782918 18.437114 14.810532 16.922222 10.600998 14.438103 15.651380 9.283464 10.828416 10.221725 9.534839 4.656888 4.964353 7.431559 8.293801 9.081893 9.911928 5.277907 8.009465 12.908637 2.431235 4.826399 6.119886 5.675221 6.783313 4.046307 10.515613 4.326315 7.796709 6.177531 11.555872 4.078763 5.675803 9.122149 2.686504 5.349288 7.163367 8.374668 3.665486 4.167949 5.828164 8.361452 4.642414 5.231697 5.936526 13.024344 3.860787 2.779279 2.907834 6.570319 5.854992 7.878489 3.040769 5.298797 13.500870 2.776805 4.567612 4.520014 4.528191 5.152433 7.850788 4.860277 3.993265 34.411319 6.792786 5.598086 6.286662 13.367284 6.676110 5.152330 14.805826 4.875010 3.493082 5.216388 ... 1.007492 0.858330 0.951287 0.876014 0.892081 0.904842 0.936091 0.913348 0.928388 0.978204 0.887043 0.855481 0.977239 0.731268 0.843736 0.949797 0.831076 0.991784 0.926417 0.891494 0.995620 0.926379 0.930229 0.944218 0.859060 1.033383 0.829044 0.802670 1.050398 0.850775 0.912827 0.909261 1.074922 0.852820 1.031265 1.043779 0.921746 0.884010 0.979139 1.079427 0.907433 0.895706 1.096699 0.990433 0.945077 0.859812 0.979233 0.917115 0.799169 0.863756 0.778890 0.910880 0.933602 1.021602 0.780722 0.921392 1.194571 1.022633 0.878377 0.822903 0.849760 0.815329 0.949032 1.094720 0.843519 0.979618 0.985386 0.905442 0.890735 0.791482 1.034383 0.935892 0.982111 1.003776 0.815313
6 23.814377 23.461418 20.563423 17.702458 14.146988 11.111478 21.754094 7.530462 15.487624 12.824252 9.074189 14.607102 10.351985 15.131521 10.180591 5.990964 12.038223 4.422081 7.935372 6.242144 5.939413 8.658125 6.158674 6.260695 7.193228 5.213818 8.404002 3.195739 6.501966 5.384146 6.360871 5.678477 5.012931 3.193813 3.932260 5.526677 6.253347 7.197191 9.574219 7.658033 4.783062 1.916924 4.707444 8.283744 4.429112 4.198576 2.202736 5.101160 7.364892 6.810985 3.852678 2.802099 2.643665 3.327897 3.358981 3.581209 3.278696 0.789038 5.543420 2.476756 5.024458 1.803234 3.974550 4.402888 5.291549 5.563874 7.529887 3.730552 6.579101 3.094909 3.130031 1.822069 3.827073 3.673637 10.663008 ... 0.950265 0.997868 1.008335 0.917512 0.769314 0.748954 0.929362 0.936600 1.034560 0.984661 1.031994 0.876667 1.006912 1.068562 1.086639 1.003642 0.912222 0.940488 1.187170 0.922950 0.895714 0.958835 1.075324 0.857558 1.096831 0.883471 0.903648 0.783936 0.906079 0.828415 0.841786 0.867306 0.876089 1.018295 0.866050 1.539705 0.884931 1.042334 0.772274 0.810701 0.853887 0.898905 0.821602 0.851620 0.985879 0.981696 0.912827 0.837240 0.767152 0.808981 0.837221 0.941193 0.897686 0.924896 0.941055 0.859361 0.873936 0.889068 0.803840 1.036668 0.886466 0.865702 0.920205 1.101380 0.949649 0.917311 0.730633 0.915578 0.862100 1.031913 0.849849 1.046778 1.030484 0.979081 0.939272
7 28.998025 18.348402 12.806133 10.510819 15.866236 11.642008 7.072523 4.277107 7.633124 19.417741 11.361022 6.856354 7.870019 6.736113 4.305183 5.348589 10.982667 8.071871 4.144797 2.965678 6.352451 5.894479 4.032645 6.158643 6.295068 8.488952 4.351226 5.538527 3.588807 9.509300 5.759756 7.925500 2.863522 9.322388 7.174795 6.053113 3.503799 1.664304 2.688063 9.153034 3.573980 5.614140 3.786577 6.996487 5.524702 2.015780 5.374750 1.077708 4.769593 2.182569 5.659881 6.247556 3.840174 5.336527 6.478365 1.581460 8.072494 5.352936 4.699397 3.674596 2.983758 0.819125 3.206432 1.891753 2.558912 4.227376 1.755160 1.621110 1.828282 2.115969 5.116059 4.545278 8.640689 7.874085 2.313972 ... 0.977020 0.896425 0.925416 1.231845 0.970198 0.807132 1.042266 0.790933 0.901928 0.851463 0.880628 0.930276 0.909298 0.920000 0.956882 0.864396 0.941989 1.136597 0.906135 0.929640 0.986775 0.814518 0.846101 1.082463 0.856743 0.718842 0.920428 1.040002 0.998349 1.113480 0.876585 0.948560 1.099499 0.945214 0.953516 0.987729 1.118536 0.897066 0.802324 1.108349 0.991733 0.759398 0.925303 0.942180 1.245246 1.008849 0.873720 0.981506 0.873077 0.903017 0.917284 0.683957 1.008881 0.680528 0.965168 0.971502 0.830506 0.745906 0.966481 0.824490 0.937966 0.968246 0.839436 0.916967 0.937466 1.053778 0.880030 1.038145 0.922845 0.921222 1.036661 0.938227 1.137922 0.896809 0.991070
8 19.259384 20.873298 10.891218 10.311046 7.713831 8.593748 3.293756 7.721109 9.013065 21.559586 12.487105 7.442171 6.613116 6.367484 7.782099 4.449667 3.471020 6.860835 7.729980 6.328083 4.768591 6.640138 5.486274 4.923742 5.717308 5.272943 8.755634 3.934738 4.404620 4.288394 4.235263 8.114158 5.498302 8.437864 2.246361 5.285536 4.165034 4.064173 5.589624 6.825313 5.093451 2.079045 2.537884 4.591293 2.167996 4.802563 4.114012 3.709397 3.240790 3.942546 4.047200 4.366170 3.055401 4.888170 3.769129 2.542534 2.516658 6.114882 4.036541 2.926307 2.085734 2.003129 1.812869 3.422786 3.613925 4.278499 4.640119 4.313159 4.784777 0.978137 1.929597 4.073013 7.757378 3.696281 2.832603 ... 0.969071 0.894436 1.098357 0.784662 0.865316 0.903930 1.031903 1.024392 0.981408 0.878394 0.823070 1.231886 0.946206 0.962795 0.865557 0.888973 1.038034 1.137670 0.958427 0.884272 0.866525 1.070468 0.900355 0.844232 0.987338 0.831718 0.854035 1.019832 0.826614 0.984053 0.848300 0.885708 0.869844 0.918068 0.911561 0.929538 0.921212 0.891753 0.883489 1.062236 1.122840 0.882382 1.024350 0.738958 1.014955 0.971324 0.760354 0.942021 0.920169 0.686426 0.902196 0.872940 0.897958 0.893313 0.832789 0.940049 0.958171 0.927880 0.871042 0.959768 1.060895 0.879743 0.846332 0.981110 0.947517 0.872612 0.933273 0.831101 0.866549 0.955474 0.967027 1.076746 0.787289 1.138567 0.796959
9 32.634540 10.883548 13.055581 11.074116 10.338895 16.881450 6.250045 11.090178 6.194652 17.234312 9.316032 7.329001 5.078105 7.411794 10.511014 5.955080 4.275210 9.976554 4.211980 3.589240 3.443493 5.320616 2.866002 5.976349 5.427763 3.197896 7.964595 8.466660 4.404711 3.337090 3.958040 1.700826 8.324566 1.930532 5.605261 6.413916 6.992754 8.305001 4.153992 2.642825 6.580708 3.087669 3.761210 4.501239 3.769644 5.457825 2.971710 4.271171 2.352333 3.325819 4.227077 5.774772 4.374444 2.394271 1.697065 4.473220 4.658411 7.724148 3.487050 4.701079 4.457878 1.388721 5.960054 5.048299 3.891404 3.932685 4.488664 5.529596 2.417878 2.806081 4.949107 3.563333 0.800457 2.986846 1.188924 ... 0.852224 0.814218 0.901743 0.934365 0.872058 0.878613 0.804536 0.889890 0.888234 0.868320 0.879550 0.922788 0.998778 1.027588 0.993610 0.935129 0.693160 0.966021 0.805784 0.829274 0.830103 1.107117 0.856729 0.954380 1.104449 1.092253 0.853025 0.899481 0.934719 0.974917 0.943390 0.965719 0.866990 0.845655 1.005885 0.839315 0.993450 1.157226 1.088443 1.042639 1.019365 0.952525 1.072376 0.930397 1.105347 0.921322 0.857586 0.965983 0.748458 0.872294 0.931053 0.936660 0.808634 0.944952 0.923698 0.846288 1.069774 0.863637 0.973426 1.162857 0.910061 0.847051 0.962939 0.873847 0.802336 0.843733 0.830402 0.861237 0.979756 0.992285 0.759286 1.084377 0.860706 0.931612 0.749618

10 rows × 50000 columns


In [11]:
# A notebook cell only echoes its last expression, so the bare `.shape` line was
# evaluated and thrown away; print it so the dimensions are actually shown.
print(doc_topic_dists.shape)  # [M x K]
doc_topic_dists


Out[11]:
topic 0 1 2 3 4 5 6 7 8 9
doc
0 0.002919 0.002915 0.002914 0.973755 0.002917 0.002919 0.002919 0.002916 0.002913 0.002914
1 0.002006 0.002004 0.002003 0.981962 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
2 0.000993 0.000991 0.000991 0.991076 0.000992 0.000992 0.000992 0.000991 0.000991 0.000991
3 0.001058 0.001057 0.001056 0.990488 0.001057 0.001058 0.001058 0.001057 0.001056 0.001056
4 0.002140 0.002137 0.002136 0.980759 0.002138 0.002140 0.002140 0.002138 0.002136 0.002136
5 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
6 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
7 0.002603 0.002600 0.002599 0.976595 0.002601 0.002603 0.002603 0.002600 0.002598 0.002599
8 0.001965 0.001963 0.001962 0.982330 0.001964 0.001965 0.001965 0.001963 0.001961 0.001962
9 0.004820 0.004813 0.004811 0.956668 0.004816 0.004819 0.004819 0.004814 0.004810 0.004811
10 0.001070 0.001068 0.001068 0.990382 0.001069 0.001070 0.001070 0.001068 0.001068 0.001068
11 0.001528 0.001526 0.001526 0.986259 0.001527 0.001528 0.001528 0.001527 0.001525 0.001526
12 0.000668 0.000668 0.000667 0.993990 0.000668 0.000668 0.000668 0.000668 0.000667 0.000667
13 0.001094 0.001093 0.001092 0.990164 0.001093 0.001094 0.001094 0.001093 0.001092 0.001092
14 0.000698 0.000697 0.000696 0.993729 0.000697 0.000697 0.000697 0.000697 0.000696 0.000696
15 0.001035 0.001034 0.001033 0.990692 0.001034 0.001035 0.001035 0.001034 0.001033 0.001034
16 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
17 0.000417 0.000416 0.000416 0.996254 0.000416 0.000417 0.000417 0.000416 0.000416 0.000416
18 0.001146 0.001145 0.001144 0.989695 0.001145 0.001146 0.001146 0.001145 0.001144 0.001144
19 0.000231 0.000231 0.000230 0.997925 0.000231 0.000231 0.000231 0.000231 0.000230 0.000230
20 0.004015 0.004010 0.004008 0.963901 0.004012 0.004015 0.004014 0.004010 0.004007 0.004008
21 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
22 0.001605 0.001603 0.001602 0.985572 0.001603 0.001605 0.001605 0.001603 0.001602 0.001602
23 0.001301 0.001299 0.001299 0.988302 0.001300 0.001301 0.001301 0.001300 0.001298 0.001299
24 0.000244 0.000243 0.000243 0.997809 0.000243 0.000244 0.000244 0.000243 0.000243 0.000243
25 0.001660 0.001658 0.001657 0.985073 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
26 0.000262 0.000261 0.000261 0.997648 0.000261 0.000262 0.000262 0.000261 0.000261 0.000261
27 0.001751 0.001748 0.001748 0.984258 0.001749 0.001751 0.001751 0.001749 0.001747 0.001748
28 0.002189 0.002186 0.002185 0.980321 0.002187 0.002189 0.002188 0.002186 0.002184 0.002185
29 0.000659 0.000658 0.000658 0.994072 0.000659 0.000659 0.000659 0.000659 0.000658 0.000658
30 0.002006 0.002004 0.002003 0.981961 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
31 0.000678 0.000677 0.000677 0.993905 0.000677 0.000678 0.000678 0.000677 0.000677 0.000677
32 0.002408 0.002405 0.002404 0.978352 0.002406 0.002407 0.002407 0.002405 0.002403 0.002404
33 0.003322 0.003318 0.003316 0.970132 0.003319 0.003322 0.003321 0.003318 0.003315 0.003316
34 0.002470 0.002466 0.002465 0.977796 0.002467 0.002469 0.002469 0.002467 0.002465 0.002466
35 0.001660 0.001658 0.001657 0.985074 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
36 0.001504 0.001502 0.001502 0.986473 0.001503 0.001504 0.001504 0.001503 0.001501 0.001502
37 0.001070 0.001068 0.001068 0.990383 0.001069 0.001070 0.001070 0.001068 0.001068 0.001068
38 0.001751 0.001748 0.001748 0.984259 0.001749 0.001751 0.001751 0.001749 0.001747 0.001748
39 0.002603 0.002600 0.002599 0.976596 0.002601 0.002603 0.002603 0.002600 0.002598 0.002599
40 0.001284 0.001282 0.001281 0.988458 0.001283 0.001284 0.001284 0.001282 0.001281 0.001282
41 0.000111 0.000111 0.000111 0.999004 0.000111 0.000111 0.000111 0.000111 0.000111 0.000111
42 0.000837 0.000836 0.000836 0.992474 0.000836 0.000837 0.000837 0.000836 0.000835 0.000836
43 0.005671 0.005663 0.005661 0.949013 0.005666 0.005670 0.005670 0.005664 0.005659 0.005661
44 0.002833 0.002829 0.002828 0.974528 0.002831 0.002833 0.002833 0.002830 0.002827 0.002828
45 0.000796 0.000795 0.000794 0.992847 0.000795 0.000796 0.000795 0.000795 0.000794 0.000794
46 0.000458 0.000458 0.000458 0.995879 0.000458 0.000458 0.000458 0.000458 0.000457 0.000458
47 0.001035 0.001034 0.001033 0.990693 0.001034 0.001035 0.001035 0.001034 0.001033 0.001033
48 0.002006 0.002004 0.002003 0.981961 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
49 0.001437 0.001435 0.001435 0.987079 0.001436 0.001437 0.001437 0.001435 0.001434 0.001435
50 0.000470 0.000469 0.000469 0.995778 0.000469 0.000469 0.000469 0.000469 0.000469 0.000469
51 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
52 0.002919 0.002915 0.002914 0.973755 0.002917 0.002919 0.002919 0.002916 0.002913 0.002914
53 0.000713 0.000712 0.000712 0.993589 0.000712 0.000713 0.000713 0.000712 0.000712 0.000712
54 0.000456 0.000456 0.000455 0.995899 0.000456 0.000456 0.000456 0.000456 0.000455 0.000455
55 0.004015 0.004010 0.004008 0.963900 0.004012 0.004015 0.004014 0.004010 0.004007 0.004008
56 0.001720 0.001717 0.001717 0.984540 0.001718 0.001719 0.001719 0.001718 0.001716 0.001717
57 0.001888 0.001886 0.001885 0.983024 0.001887 0.001888 0.001888 0.001886 0.001884 0.001885
58 0.000776 0.000775 0.000775 0.993020 0.000776 0.000776 0.000776 0.000775 0.000775 0.000775
59 0.006429 0.006421 0.006418 0.942195 0.006424 0.006429 0.006428 0.006422 0.006416 0.006419
60 0.002349 0.002346 0.002345 0.978880 0.002347 0.002349 0.002349 0.002346 0.002344 0.002345
61 0.001267 0.001265 0.001265 0.988610 0.001266 0.001267 0.001267 0.001265 0.001264 0.001265
62 0.001337 0.001335 0.001335 0.987977 0.001336 0.001337 0.001337 0.001336 0.001335 0.001335
63 0.002833 0.002829 0.002828 0.974528 0.002831 0.002833 0.002833 0.002830 0.002827 0.002828
64 0.002094 0.002091 0.002090 0.981177 0.002092 0.002093 0.002093 0.002091 0.002089 0.002090
65 0.001319 0.001317 0.001317 0.988142 0.001318 0.001319 0.001319 0.001317 0.001316 0.001317
66 0.000752 0.000751 0.000751 0.993238 0.000751 0.000752 0.000752 0.000751 0.000751 0.000751
67 0.000411 0.000411 0.000411 0.996302 0.000411 0.000411 0.000411 0.000411 0.000411 0.000411
68 0.000740 0.000739 0.000739 0.993343 0.000740 0.000740 0.000740 0.000740 0.000739 0.000739
69 0.000683 0.000682 0.000681 0.993862 0.000682 0.000683 0.000683 0.000682 0.000681 0.000682
70 0.000783 0.000782 0.000781 0.992964 0.000782 0.000783 0.000782 0.000782 0.000781 0.000781
71 0.000321 0.000320 0.000320 0.997115 0.000321 0.000321 0.000321 0.000320 0.000320 0.000320
72 0.000688 0.000687 0.000686 0.993818 0.000687 0.000688 0.000687 0.000687 0.000686 0.000686
73 0.002676 0.002672 0.002671 0.975944 0.002673 0.002675 0.002675 0.002673 0.002670 0.002671
74 0.001437 0.001435 0.001435 0.987080 0.001436 0.001437 0.001437 0.001435 0.001434 0.001435
75 0.002140 0.002137 0.002136 0.980758 0.002138 0.002140 0.002140 0.002138 0.002136 0.002137
76 0.000461 0.000460 0.000460 0.995859 0.000460 0.000461 0.000460 0.000460 0.000460 0.000460
77 0.003322 0.003318 0.003316 0.970133 0.003319 0.003322 0.003321 0.003318 0.003315 0.003316
78 0.000560 0.000559 0.000559 0.994968 0.000559 0.000560 0.000560 0.000559 0.000558 0.000559
79 0.000935 0.000933 0.000933 0.991597 0.000934 0.000935 0.000934 0.000934 0.000933 0.000933
80 0.002469 0.002466 0.002465 0.977797 0.002467 0.002469 0.002469 0.002467 0.002465 0.002465
81 0.003441 0.003436 0.003435 0.969064 0.003438 0.003440 0.003440 0.003437 0.003434 0.003435
82 0.001632 0.001630 0.001629 0.985325 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
83 0.000031 0.000031 0.000030 0.999725 0.000031 0.000031 0.000031 0.000031 0.000030 0.000031
84 0.001504 0.001502 0.001502 0.986473 0.001503 0.001504 0.001504 0.001503 0.001501 0.001502
85 0.001632 0.001630 0.001629 0.985326 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
86 0.002470 0.002466 0.002465 0.977795 0.002468 0.002469 0.002469 0.002467 0.002465 0.002466
87 0.000992 0.000991 0.000991 0.991077 0.000992 0.000992 0.000992 0.000991 0.000990 0.000991
88 0.000613 0.000612 0.000612 0.994488 0.000613 0.000613 0.000613 0.000612 0.000612 0.000612
89 0.003706 0.003701 0.003699 0.966681 0.003703 0.003705 0.003705 0.003702 0.003698 0.003700
90 0.001356 0.001354 0.001354 0.987807 0.001355 0.001356 0.001356 0.001355 0.001353 0.001354
91 0.002470 0.002466 0.002465 0.977796 0.002468 0.002469 0.002469 0.002467 0.002465 0.002465
92 0.002752 0.002748 0.002747 0.975256 0.002750 0.002752 0.002752 0.002749 0.002747 0.002747
93 0.000926 0.000924 0.000924 0.991678 0.000925 0.000926 0.000925 0.000925 0.000924 0.000924
94 0.003211 0.003207 0.003205 0.971129 0.003208 0.003211 0.003211 0.003207 0.003205 0.003206
95 0.000151 0.000151 0.000151 0.998644 0.000151 0.000151 0.000151 0.000151 0.000151 0.000151
96 0.000673 0.000672 0.000672 0.993948 0.000673 0.000673 0.000673 0.000672 0.000672 0.000672
97 0.002349 0.002346 0.002345 0.978880 0.002347 0.002349 0.002349 0.002346 0.002344 0.002345
98 0.000633 0.000632 0.000632 0.994306 0.000633 0.000633 0.000633 0.000633 0.000632 0.000632
99 0.001926 0.001923 0.001923 0.982684 0.001924 0.001926 0.001926 0.001924 0.001922 0.001923
100 0.001189 0.001187 0.001187 0.989313 0.001188 0.001188 0.001188 0.001187 0.001186 0.001187
101 0.000391 0.000391 0.000391 0.996482 0.000391 0.000391 0.000391 0.000391 0.000390 0.000391
102 0.000796 0.000795 0.000794 0.992847 0.000795 0.000795 0.000795 0.000795 0.000794 0.000794
103 0.002006 0.002004 0.002003 0.981962 0.002004 0.002006 0.002006 0.002004 0.002002 0.002003
104 0.000992 0.000991 0.000991 0.991077 0.000992 0.000992 0.000992 0.000991 0.000990 0.000991
105 0.001632 0.001630 0.001629 0.985327 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
106 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
107 0.000770 0.000769 0.000769 0.993076 0.000769 0.000770 0.000770 0.000769 0.000769 0.000769
108 0.001356 0.001354 0.001354 0.987808 0.001355 0.001356 0.001356 0.001354 0.001353 0.001354
109 0.000393 0.000392 0.000392 0.996468 0.000393 0.000393 0.000393 0.000392 0.000392 0.000392
110 0.001035 0.001034 0.001033 0.990693 0.001034 0.001035 0.001035 0.001034 0.001033 0.001033
111 0.001319 0.001317 0.001317 0.988142 0.001318 0.001319 0.001319 0.001317 0.001316 0.001317
112 0.001337 0.001335 0.001335 0.987977 0.001336 0.001337 0.001337 0.001336 0.001335 0.001335
113 0.001189 0.001187 0.001187 0.989313 0.001188 0.001189 0.001188 0.001187 0.001186 0.001187
114 0.002140 0.002137 0.002136 0.980758 0.002138 0.002140 0.002140 0.002138 0.002136 0.002137
115 0.001375 0.001374 0.001373 0.987633 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
116 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
117 0.001250 0.001249 0.001248 0.988758 0.001249 0.001250 0.001250 0.001249 0.001248 0.001248
118 0.001660 0.001658 0.001657 0.985073 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
119 0.003322 0.003317 0.003316 0.970133 0.003319 0.003322 0.003321 0.003318 0.003315 0.003316
120 0.000724 0.000723 0.000723 0.993493 0.000723 0.000724 0.000724 0.000723 0.000722 0.000723
121 0.001267 0.001265 0.001265 0.988610 0.001266 0.001267 0.001267 0.001265 0.001264 0.001265
122 0.001817 0.001814 0.001814 0.983665 0.001815 0.001817 0.001817 0.001815 0.001813 0.001814
123 0.001689 0.001687 0.001686 0.984812 0.001688 0.001689 0.001689 0.001687 0.001686 0.001686
124 0.004381 0.004375 0.004373 0.960615 0.004377 0.004380 0.004380 0.004375 0.004372 0.004373
... ... ... ... ... ... ... ... ... ... ...
11189 0.002240 0.002237 0.002236 0.979862 0.002238 0.002240 0.002239 0.002237 0.002235 0.002236
11190 0.000776 0.000775 0.000775 0.993020 0.000776 0.000776 0.000776 0.000775 0.000775 0.000775
11191 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
11192 0.000972 0.000971 0.000971 0.991257 0.000972 0.000972 0.000972 0.000971 0.000970 0.000971
11193 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002291 0.002288 0.002289
11194 0.001632 0.001630 0.001629 0.985327 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
11195 0.000776 0.000775 0.000775 0.993020 0.000776 0.000776 0.000776 0.000775 0.000775 0.000775
11196 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
11197 0.000499 0.000498 0.000498 0.995516 0.000498 0.000499 0.000499 0.000498 0.000498 0.000498
11198 0.001459 0.001457 0.001456 0.986883 0.001458 0.001459 0.001459 0.001457 0.001456 0.001456
11199 0.001926 0.001923 0.001923 0.982683 0.001924 0.001926 0.001926 0.001924 0.001922 0.001923
11200 0.000860 0.000858 0.000858 0.992272 0.000859 0.000859 0.000859 0.000859 0.000858 0.000858
11201 0.001301 0.001299 0.001299 0.988302 0.001300 0.001301 0.001301 0.001300 0.001298 0.001299
11202 0.002676 0.002672 0.002671 0.975943 0.002673 0.002675 0.002675 0.002673 0.002670 0.002671
11203 0.001437 0.001435 0.001435 0.987079 0.001436 0.001437 0.001437 0.001435 0.001434 0.001435
11204 0.001888 0.001886 0.001885 0.983024 0.001887 0.001888 0.001888 0.001886 0.001884 0.001885
11205 0.001395 0.001394 0.001393 0.987454 0.001394 0.001395 0.001395 0.001394 0.001393 0.001393
11206 0.000963 0.000961 0.000961 0.991344 0.000962 0.000963 0.000963 0.000962 0.000961 0.000961
11207 0.000668 0.000668 0.000667 0.993990 0.000668 0.000668 0.000668 0.000668 0.000667 0.000667
11208 0.001751 0.001749 0.001748 0.984259 0.001749 0.001751 0.001750 0.001749 0.001747 0.001748
11209 0.000306 0.000305 0.000305 0.997253 0.000305 0.000306 0.000305 0.000305 0.000305 0.000305
11210 0.001817 0.001814 0.001814 0.983664 0.001815 0.001817 0.001817 0.001815 0.001813 0.001814
11211 0.003211 0.003207 0.003206 0.971128 0.003209 0.003211 0.003211 0.003207 0.003205 0.003206
11212 0.003322 0.003318 0.003316 0.970132 0.003319 0.003322 0.003322 0.003318 0.003315 0.003317
11213 0.003854 0.003849 0.003847 0.965347 0.003851 0.003854 0.003854 0.003850 0.003846 0.003848
11214 0.001528 0.001526 0.001526 0.986259 0.001527 0.001528 0.001528 0.001527 0.001525 0.001526
11215 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
11216 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
11217 0.003568 0.003564 0.003562 0.967917 0.003565 0.003568 0.003568 0.003564 0.003561 0.003562
11218 0.000398 0.000397 0.000397 0.996424 0.000397 0.000398 0.000398 0.000397 0.000397 0.000397
11219 0.001375 0.001374 0.001373 0.987633 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
11220 0.001751 0.001748 0.001748 0.984259 0.001749 0.001751 0.001750 0.001749 0.001747 0.001748
11221 0.004589 0.004583 0.004581 0.958738 0.004585 0.004589 0.004588 0.004584 0.004580 0.004582
11222 0.000304 0.000303 0.000303 0.997270 0.000303 0.000304 0.000304 0.000303 0.000303 0.000303
11223 0.003441 0.003436 0.003435 0.969063 0.003438 0.003441 0.003440 0.003437 0.003434 0.003435
11224 0.004819 0.004813 0.004811 0.956672 0.004815 0.004819 0.004818 0.004814 0.004809 0.004811
11225 0.002919 0.002915 0.002914 0.973754 0.002917 0.002919 0.002919 0.002916 0.002913 0.002914
11226 0.001174 0.001172 0.001172 0.989444 0.001173 0.001174 0.001174 0.001173 0.001172 0.001172
11227 0.000602 0.000601 0.000601 0.994591 0.000601 0.000602 0.000602 0.000601 0.000600 0.000601
11228 0.002140 0.002137 0.002136 0.980759 0.002138 0.002140 0.002140 0.002138 0.002136 0.002136
11229 0.000852 0.000851 0.000850 0.992340 0.000851 0.000852 0.000852 0.000851 0.000850 0.000850
11230 0.000382 0.000381 0.000381 0.996566 0.000382 0.000382 0.000382 0.000381 0.000381 0.000381
11231 0.004380 0.004375 0.004373 0.960616 0.004377 0.004380 0.004380 0.004375 0.004372 0.004373
11232 0.000394 0.000394 0.000394 0.996453 0.000394 0.000394 0.000394 0.000394 0.000394 0.000394
11233 0.001660 0.001658 0.001657 0.985074 0.001659 0.001660 0.001660 0.001658 0.001657 0.001657
11234 0.007420 0.007410 0.007407 0.933285 0.007414 0.007420 0.007419 0.007412 0.007405 0.007408
11235 0.000286 0.000286 0.000286 0.997425 0.000286 0.000286 0.000286 0.000286 0.000286 0.000286
11236 0.001505 0.001502 0.001502 0.986473 0.001503 0.001504 0.001504 0.001503 0.001501 0.001502
11237 0.003010 0.003006 0.003005 0.972935 0.003008 0.003010 0.003010 0.003007 0.003004 0.003005
11238 0.002535 0.002531 0.002530 0.977212 0.002532 0.002534 0.002534 0.002532 0.002529 0.002530
11239 0.002006 0.002004 0.002003 0.981962 0.002005 0.002006 0.002006 0.002004 0.002002 0.002003
11240 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
11241 0.002833 0.002830 0.002828 0.974526 0.002831 0.002833 0.002833 0.002830 0.002828 0.002829
11242 0.000830 0.000829 0.000828 0.992539 0.000829 0.000830 0.000830 0.000829 0.000828 0.000828
11243 0.001035 0.001034 0.001033 0.990693 0.001034 0.001035 0.001035 0.001034 0.001033 0.001033
11244 0.001082 0.001080 0.001080 0.990274 0.001081 0.001082 0.001082 0.001080 0.001080 0.001080
11245 0.001416 0.001414 0.001413 0.987269 0.001415 0.001416 0.001416 0.001414 0.001413 0.001414
11246 0.001234 0.001233 0.001232 0.988902 0.001233 0.001234 0.001234 0.001233 0.001232 0.001232
11247 0.000963 0.000961 0.000961 0.991344 0.000962 0.000963 0.000963 0.000962 0.000961 0.000961
11248 0.001094 0.001093 0.001092 0.990164 0.001093 0.001094 0.001094 0.001093 0.001092 0.001092
11249 0.000953 0.000952 0.000951 0.991431 0.000952 0.000953 0.000953 0.000952 0.000951 0.000952
11250 0.003441 0.003436 0.003435 0.969065 0.003438 0.003440 0.003440 0.003437 0.003434 0.003435
11251 0.001751 0.001748 0.001748 0.984259 0.001749 0.001751 0.001750 0.001749 0.001747 0.001748
11252 0.001375 0.001374 0.001373 0.987634 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
11253 0.000982 0.000981 0.000981 0.991168 0.000982 0.000982 0.000982 0.000981 0.000980 0.000981
11254 0.001459 0.001457 0.001456 0.986883 0.001458 0.001459 0.001459 0.001457 0.001456 0.001456
11255 0.001888 0.001886 0.001885 0.983024 0.001887 0.001888 0.001888 0.001886 0.001884 0.001885
11256 0.001437 0.001435 0.001435 0.987079 0.001436 0.001437 0.001437 0.001436 0.001434 0.001435
11257 0.002049 0.002046 0.002045 0.981577 0.002047 0.002049 0.002049 0.002047 0.002045 0.002046
11258 0.002189 0.002186 0.002185 0.980321 0.002187 0.002189 0.002188 0.002186 0.002184 0.002185
11259 0.000944 0.000943 0.000942 0.991514 0.000943 0.000944 0.000944 0.000943 0.000942 0.000942
11260 0.002049 0.002046 0.002045 0.981577 0.002047 0.002049 0.002049 0.002047 0.002045 0.002046
11261 0.000668 0.000668 0.000667 0.993990 0.000668 0.000668 0.000668 0.000668 0.000667 0.000667
11262 0.002094 0.002091 0.002090 0.981177 0.002092 0.002093 0.002093 0.002091 0.002089 0.002090
11263 0.000935 0.000933 0.000933 0.991596 0.000934 0.000935 0.000935 0.000934 0.000933 0.000933
11264 0.000317 0.000316 0.000316 0.997153 0.000316 0.000317 0.000317 0.000316 0.000316 0.000316
11265 0.001107 0.001105 0.001105 0.990051 0.001106 0.001106 0.001106 0.001105 0.001104 0.001105
11266 0.000770 0.000769 0.000769 0.993076 0.000769 0.000770 0.000770 0.000769 0.000769 0.000769
11267 0.000137 0.000137 0.000137 0.998769 0.000137 0.000137 0.000137 0.000137 0.000137 0.000137
11268 0.001926 0.001924 0.001923 0.982682 0.001924 0.001926 0.001926 0.001924 0.001922 0.001923
11269 0.001459 0.001457 0.001456 0.986884 0.001458 0.001459 0.001459 0.001457 0.001456 0.001456
11270 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
11271 0.002049 0.002046 0.002045 0.981578 0.002047 0.002049 0.002049 0.002047 0.002045 0.002046
11272 0.000816 0.000815 0.000814 0.992665 0.000815 0.000816 0.000816 0.000815 0.000814 0.000814
11273 0.000109 0.000109 0.000109 0.999020 0.000109 0.000109 0.000109 0.000109 0.000109 0.000109
11274 0.000566 0.000565 0.000565 0.994909 0.000566 0.000566 0.000566 0.000566 0.000565 0.000565
11275 0.000134 0.000134 0.000134 0.998793 0.000134 0.000134 0.000134 0.000134 0.000134 0.000134
11276 0.001528 0.001526 0.001526 0.986259 0.001527 0.001528 0.001528 0.001527 0.001525 0.001526
11277 0.001356 0.001354 0.001354 0.987808 0.001355 0.001356 0.001356 0.001355 0.001353 0.001354
11278 0.003107 0.003103 0.003102 0.972061 0.003105 0.003107 0.003107 0.003104 0.003101 0.003102
11279 0.001070 0.001068 0.001068 0.990382 0.001069 0.001070 0.001070 0.001068 0.001068 0.001068
11280 0.000972 0.000971 0.000971 0.991257 0.000972 0.000972 0.000972 0.000971 0.000970 0.000971
11281 0.003706 0.003701 0.003699 0.966681 0.003703 0.003705 0.003705 0.003702 0.003698 0.003700
11282 0.000746 0.000745 0.000745 0.993291 0.000746 0.000746 0.000746 0.000745 0.000745 0.000745
11283 0.001605 0.001603 0.001602 0.985572 0.001603 0.001605 0.001605 0.001603 0.001602 0.001602
11284 0.000358 0.000357 0.000357 0.996783 0.000358 0.000358 0.000358 0.000357 0.000357 0.000357
11285 0.002140 0.002137 0.002136 0.980758 0.002138 0.002140 0.002140 0.002138 0.002136 0.002137
11286 0.003108 0.003103 0.003102 0.972060 0.003105 0.003107 0.003107 0.003104 0.003101 0.003102
11287 0.001578 0.001576 0.001576 0.985808 0.001577 0.001578 0.001578 0.001577 0.001575 0.001576
11288 0.001817 0.001815 0.001814 0.983664 0.001815 0.001817 0.001817 0.001815 0.001813 0.001814
11289 0.001160 0.001158 0.001158 0.989571 0.001159 0.001160 0.001160 0.001159 0.001158 0.001158
11290 0.001375 0.001374 0.001373 0.987634 0.001374 0.001375 0.001375 0.001374 0.001373 0.001373
11291 0.002240 0.002237 0.002236 0.979863 0.002238 0.002239 0.002239 0.002237 0.002235 0.002236
11292 0.002349 0.002346 0.002345 0.978880 0.002347 0.002349 0.002349 0.002346 0.002344 0.002345
11293 0.002470 0.002466 0.002465 0.977796 0.002468 0.002469 0.002469 0.002467 0.002465 0.002465
11294 0.005073 0.005067 0.005065 0.954385 0.005069 0.005073 0.005073 0.005068 0.005063 0.005065
11295 0.001783 0.001781 0.001780 0.983967 0.001782 0.001783 0.001783 0.001781 0.001780 0.001780
11296 0.001553 0.001551 0.001550 0.986037 0.001552 0.001553 0.001553 0.001551 0.001550 0.001550
11297 0.000783 0.000782 0.000781 0.992963 0.000782 0.000783 0.000783 0.000782 0.000781 0.000781
11298 0.000463 0.000462 0.000462 0.995839 0.000462 0.000463 0.000463 0.000462 0.000462 0.000462
11299 0.004821 0.004813 0.004811 0.956666 0.004815 0.004819 0.004819 0.004814 0.004810 0.004811
11300 0.001395 0.001393 0.001393 0.987454 0.001394 0.001395 0.001395 0.001394 0.001393 0.001393
11301 0.001094 0.001093 0.001092 0.990164 0.001093 0.001094 0.001094 0.001093 0.001092 0.001092
11302 0.002470 0.002466 0.002465 0.977796 0.002467 0.002469 0.002469 0.002467 0.002465 0.002465
11303 0.002408 0.002405 0.002404 0.978351 0.002406 0.002408 0.002407 0.002405 0.002403 0.002404
11304 0.000837 0.000836 0.000836 0.992474 0.000836 0.000837 0.000837 0.000836 0.000835 0.000836
11305 0.000315 0.000314 0.000314 0.997172 0.000314 0.000315 0.000314 0.000314 0.000314 0.000314
11306 0.001301 0.001299 0.001299 0.988302 0.001300 0.001301 0.001301 0.001300 0.001298 0.001299
11307 0.002293 0.002290 0.002289 0.979383 0.002291 0.002293 0.002293 0.002290 0.002288 0.002289
11308 0.001632 0.001630 0.001629 0.985327 0.001631 0.001632 0.001632 0.001630 0.001629 0.001629
11309 0.001852 0.001849 0.001849 0.983350 0.001850 0.001852 0.001852 0.001850 0.001848 0.001849
11310 0.001482 0.001479 0.001479 0.986680 0.001480 0.001481 0.001481 0.001480 0.001478 0.001479
11311 0.000124 0.000123 0.000123 0.998889 0.000123 0.000124 0.000124 0.000123 0.000123 0.000123
11312 0.001965 0.001963 0.001962 0.982330 0.001964 0.001965 0.001965 0.001963 0.001961 0.001962
11313 0.000097 0.000097 0.000097 0.999129 0.000097 0.000097 0.000097 0.000097 0.000097 0.000097

11314 rows × 10 columns


In [12]:
term_frequency.shape#[V]
term_frequency


Out[12]:
0        8654
1        6466
2        5345
3        4859
4        4597
5        4295
6        4075
7        3491
8        3370
9        3141
10       3091
11       2792
12       2763
13       2538
14       2518
15       2444
16       2327
17       2296
18       2252
19       2149
20       1975
21       1972
22       1930
23       1918
24       1905
25       1902
26       1866
27       1823
28       1820
29       1817
30       1804
31       1804
32       1799
33       1780
34       1730
35       1702
36       1693
37       1680
38       1627
39       1618
40       1606
41       1581
42       1565
43       1501
44       1482
45       1469
46       1466
47       1448
48       1433
49       1422
50       1401
51       1400
52       1388
53       1387
54       1361
55       1360
56       1359
57       1340
58       1332
59       1329
60       1325
61       1324
62       1315
63       1309
64       1298
65       1288
66       1254
67       1246
68       1246
69       1240
70       1227
71       1222
72       1216
73       1210
74       1204
75       1177
76       1170
77       1164
78       1157
79       1154
80       1148
81       1145
82       1083
83       1075
84       1065
85       1060
86       1054
87       1022
88       1017
89       1010
90       1005
91       1000
92       1000
93        995
94        991
95        988
96        981
97        971
98        964
99        964
100       964
101       959
102       956
103       952
104       950
105       946
106       943
107       931
108       928
109       914
110       897
111       895
112       893
113       887
114       885
115       875
116       870
117       868
118       867
119       867
120       865
121       861
122       855
123       855
124       853
         ... 
49875       1
49876       1
49877       1
49878       1
49879       1
49880       1
49881       1
49882       1
49883       1
49884       1
49885       1
49886       1
49887       1
49888       1
49889       1
49890       1
49891       1
49892       1
49893       1
49894       1
49895       1
49896       1
49897       1
49898       1
49899       1
49900       1
49901       1
49902       1
49903       1
49904       1
49905       1
49906       1
49907       1
49908       1
49909       1
49910       1
49911       1
49912       1
49913       1
49914       1
49915       1
49916       1
49917       1
49918       1
49919       1
49920       1
49921       1
49922       1
49923       1
49924       1
49925       1
49926       1
49927       1
49928       1
49929       1
49930       1
49931       1
49932       1
49933       1
49934       1
49935       1
49936       1
49937       1
49938       1
49939       1
49940       1
49941       1
49942       1
49943       1
49944       1
49945       1
49946       1
49947       1
49948       1
49949       1
49950       1
49951       1
49952       1
49953       1
49954       1
49955       1
49956       1
49957       1
49958       1
49959       1
49960       1
49961       1
49962       1
49963       1
49964       1
49965       1
49966       1
49967       1
49968       1
49969       1
49970       1
49971       1
49972       1
49973       1
49974       1
49975       1
49976       1
49977       1
49978       1
49979       1
49980       1
49981       1
49982       1
49983       1
49984       1
49985       1
49986       1
49987       1
49988       1
49989       1
49990       1
49991       1
49992       1
49993       1
49994       1
49995       1
49996       1
49997       1
49998       1
49999       1
Name: term_frequency, dtype: int64

In [13]:
doc_lengths.shape#[M]
doc_lengths


Out[13]:
0          32
1          47
2          96
3          90
4          44
5          53
6          41
7          36
8          48
9          19
10         89
11         62
12        143
13         87
14        137
15         92
16         41
17        230
18         83
19        416
20         23
21         77
22         59
23         73
24        394
25         57
26        367
27         54
28         43
29        145
30         47
31        141
32         39
33         28
34         38
35         57
36         63
37         89
38         54
39         36
40         74
41        868
42        114
43         16
44         33
45        120
46        209
47         92
48         47
49         66
50        204
51         51
52         32
53        134
54        210
55         23
56         55
57         50
58        123
59         14
60         40
61         75
62         71
63         33
64         45
65         72
66        127
67        233
68        129
69        140
70        122
71        299
72        139
73         35
74         66
75         44
76        208
77         28
78        171
79        102
80         38
81         27
82         58
83       3149
84         63
85         58
86         38
87         96
88        156
89         25
90         70
91         38
92         34
93        103
94         29
95        637
96        142
97         40
98        151
99         49
100        80
101       245
102       120
103        47
104        96
105        58
106        77
107       124
108        70
109       244
110        92
111        72
112        71
113        80
114        44
115        69
116        53
117        76
118        57
119        28
120       132
121        75
122        52
123        56
124        21
         ... 
11189      42
11190     123
11191      77
11192      98
11193      41
11194      58
11195     123
11196      51
11197     192
11198      65
11199      49
11200     111
11201      73
11202      35
11203      66
11204      50
11205      68
11206      99
11207     143
11208      54
11209     314
11210      52
11211      29
11212      28
11213      24
11214      62
11215      41
11216      51
11217      26
11218     241
11219      69
11220      54
11221      20
11222     316
11223      27
11224      19
11225      32
11226      81
11227     159
11228      44
11229     112
11230     251
11231      21
11232     243
11233      57
11234      12
11235     335
11236      63
11237      31
11238      37
11239      47
11240      53
11241      33
11242     115
11243      92
11244      88
11245      67
11246      77
11247      99
11248      87
11249     100
11250      27
11251      54
11252      69
11253      97
11254      65
11255      50
11256      66
11257      46
11258      43
11259     101
11260      46
11261     143
11262      45
11263     102
11264     303
11265      86
11266     124
11267     702
11268      49
11269      65
11270      41
11271      46
11272     117
11273     882
11274     169
11275     716
11276      62
11277      70
11278      30
11279      89
11280      98
11281      25
11282     128
11283      59
11284     268
11285      44
11286      30
11287      60
11288      52
11289      82
11290      69
11291      42
11292      40
11293      38
11294      18
11295      53
11296      61
11297     122
11298     207
11299      19
11300      68
11301      87
11302      38
11303      39
11304     114
11305     305
11306      73
11307      41
11308      58
11309      51
11310      64
11311     778
11312      48
11313     992
Name: doc_length, dtype: int64

In [22]:
print(vocab.shape)#[V]
print(doc_topic_dists.shape)
print(doc_lengths.shape)


(50000,)
(11314, 10)
(11314,)

In [15]:
# Estimated token count per topic: weight each document's topic distribution
# by that document's length, then sum over all documents.
#   doc_topic_dists.T    -> [K x M]
#   ... * doc_lengths    -> [K x M]  (every document column scaled by its length)
#   ....T                -> [M x K]
#   .sum(axis=0)         -> [K]
topic_freq  = (doc_topic_dists.T * doc_lengths).T.sum(axis=0)
print(topic_freq.shape)
topic_freq


(10,)
Out[15]:
topic
0      2350.276861
1      1332.903437
2      2522.722597
3    989058.050533
4      1593.339492
5      1850.267334
6      2001.928812
7      2223.548656
8      1361.881698
9      2757.080580
dtype: float64

In [16]:
# Share of the total token mass attributed to each topic,
# sorted so the most prevalent topic comes first.
topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)
print(topic_proportion.shape)


(10,)

In [17]:
# Topic labels from the most to the least prevalent topic
# (topic_proportion is sorted in descending order where it is computed).
topic_order      = topic_proportion.index
# Reorder every per-topic structure so it follows that ordering.
topic_freq       = topic_freq[topic_order]
# `.ix` was removed in pandas 1.0. topic_order holds index labels, so the
# label-based `.loc` selects exactly the same rows.
topic_term_dists = topic_term_dists.loc[topic_order]
doc_topic_dists  = doc_topic_dists[topic_order]

In [18]:
topic_order


Out[18]:
Int64Index([3, 9, 2, 0, 7, 6, 5, 4, 8, 1], dtype='int64', name='topic')

In [20]:
topic_term_dists.T.shape


Out[20]:
(50000, 10)

In [24]:
# Token counts for each term-topic combination (widths of the red bars):
# every topic's row of topic_term_dists is scaled by that topic's frequency.
#   topic_term_dists.T   -> [V x K]
#   ... * topic_freq     -> [V x K]  (topic_freq aligned on the K topic columns)
#   ....T                -> [K x V]
term_topic_freq = (topic_term_dists.T * topic_freq).T 
term_topic_freq.shape


Out[24]:
(10, 50000)

In [ ]:
# Per-term totals obtained by summing term_topic_freq over the topic axis.
# NOTE(review): this rebinds term_frequency, a name an earlier cell already
# displays -- confirm that replacing the earlier values is intended.
term_frequency = np.sum(term_topic_freq, axis=0)

Relevance Calculation


In [ ]:
def _find_relevance(log_ttd, log_lift, R, lambda_):
    """Rank the terms of every topic by relevance and keep the best ``R``.

    Relevance is ``lambda_ * log_ttd + (1 - lambda_) * log_lift``.

    Parameters
    ----------
    log_ttd, log_lift : DataFrame
        Frames of identical layout; the callers in this notebook pass one row
        per topic and one column per term.
    R : int
        Number of ranked rows to keep.
    lambda_ : float
        Weight given to ``log_ttd``; ``1 - lambda_`` goes to ``log_lift``.

    Returns
    -------
    DataFrame
        One column per topic; row ``i`` holds the label of the term with the
        ``i``-th highest relevance for that topic (first ``R`` rows only).
    """
    def _terms_by_rank(topic_scores):
        # Term labels ordered from the most to the least relevant.
        return topic_scores.sort_values(ascending=False).index

    weighted_prob = lambda_ * log_ttd
    weighted_lift = (1 - lambda_) * log_lift
    relevance = weighted_prob + weighted_lift
    ranked_terms = relevance.T.apply(_terms_by_rank)
    return ranked_terms.head(R)


def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    """Stack the ``_find_relevance`` result for every weight in ``lambda_seq``.

    The per-weight frames are concatenated row-wise, in the order in which the
    weights appear in ``lambda_seq``.
    """
    per_lambda = []
    for weight in lambda_seq:
        per_lambda.append(_find_relevance(log_ttd, log_lift, R, weight))
    return pd.concat(per_lambda)

Let $\phi_{wk}$ (topic_term_dists) denote the probability of term $w \in \{1, ..., V\}$ for topic $k \in \{1, ..., K\}$.

Let $p_w$ (term_proportion) denote the marginal probability of term $w$ in the corpus.

The relevance of term $w$ to topic $k$, given a weight parameter $\lambda$ (where $0 \leq \lambda \leq 1$), is defined as:

$$ r(w,k\ |\ \lambda) = \lambda \log(\phi_{wk}) + (1-\lambda)\log\left(\frac{\phi_{wk}}{p_w}\right) $$

Saliency Calculation

distinctiveness ($w$) = $\sum_T\ P(T\vert w)\ \log(\frac{P(T\vert w)}{P(T)})$

saliency($w$) = $P(w) \cdot$ distinctiveness($w$)

$P(T\vert w)$ ---> topic_given_term
$P(T)$ ---> topic_proportion
$P(w)$ ---> term_proportion


In [ ]:
print(topic_term_dists.shape)
# topic_term_dists

In [ ]:
print(topic_proportion.shape)
# topic_proportion

In [ ]:
# NOTE(review): the only assignment of term_proportion visible near this cell
# is in the cell below, so on a fresh kernel run top-to-bottom this line may
# raise NameError -- confirm the intended cell order.
print(term_proportion.shape)
# term_proportion

In [ ]:
R = 30               # number of terms kept per ranking (see the .head(R) calls)
lambda_step = 0.01   # spacing of the lambda grid built below
n_jobs = -1          # not used in this cell

# marginal distribution over terms (width of blue bars)
term_proportion = term_frequency / term_frequency.sum()

# compute the distinctiveness and saliency of the terms:
# this determines the R terms that are displayed when no topic is selected
topic_given_term = topic_term_dists / topic_term_dists.sum()
kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
distinctiveness = kernel.sum()
saliency = term_proportion * distinctiveness

# Order the terms for the "default" view by decreasing saliency.
default_term_info = (
    pd.DataFrame({'saliency': saliency, 'Term': vocab,
                  'Freq': term_frequency, 'Total': term_frequency,
                  'Category': 'Default'})
    .sort_values(by='saliency', ascending=False)
    .head(R)
    # axis is passed by keyword: pandas 2.0 no longer accepts it positionally
    .drop('saliency', axis=1)
)
# Rounding Freq and Total to integer values to match LDAvis code:
default_term_info['Freq'] = np.floor(default_term_info['Freq'])
default_term_info['Total'] = np.floor(default_term_info['Total'])
# The default view has no real logprob/loglift; both columns carry the
# descending rank R..1 instead.
ranks = np.arange(R, 0, -1)
default_term_info['logprob'] = default_term_info['loglift'] = ranks

## compute relevance and top terms for each topic
log_lift = np.log(topic_term_dists / term_proportion)
log_ttd = np.log(topic_term_dists)

# Grid of lambda weights from 0 to 1 in steps of lambda_step.
lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)

def topic_top_term_df(tup):
    """Build the term table of a single topic.

    ``tup`` is one item of ``enumerate(top_terms.T.iterrows(), 1)``, i.e.
    ``(new_topic_id, (original_topic_id, topic_terms))``. ``topic_terms``
    holds the term labels ranked for that topic across all lambda values;
    duplicates are collapsed before the table is built.
    """
    new_topic_id, (original_topic_id, topic_terms) = tup
    term_ix = topic_terms.unique()
    return pd.DataFrame({'Term': vocab[term_ix],
                         'Freq': term_topic_freq.loc[original_topic_id, term_ix],
                         'Total': term_frequency[term_ix],
                         'logprob': log_ttd.loc[original_topic_id, term_ix].round(4),
                         'loglift': log_lift.loc[original_topic_id, term_ix].round(4),
                         'Category': 'Topic%d' % new_topic_id})

top_terms = _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq)

# One table per topic (numbered from 1 in the new ordering), stacked under
# the default-view table.
topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))
topic_info =  pd.concat([default_term_info] + list(topic_dfs))
topic_info

In [ ]:
# Inlined body of a helper with the signature
#   _token_table(topic_info, term_topic_freq, vocab, term_frequency)
# Last, to compute the areas of the circles when a term is highlighted
# we must gather all unique terms that could show up (for every combination
# of topic and value of lambda) and compute its distribution over topics.

# term-topic frequency table of unique terms across all topics and all values of lambda
term_ix = topic_info.index.unique()
term_ix = np.sort(term_ix)

# keep only the columns (terms) that appear somewhere in topic_info
top_topic_terms_freq = term_topic_freq[term_ix]
# use the new ordering for the topics: rows are relabelled 1..K
K = len(term_topic_freq)
top_topic_terms_freq.index = range(1, K + 1)
top_topic_terms_freq.index.name = 'Topic'

# we filter to Freq >= 0.5 to avoid sending too much data to the browser
# NOTE(review): set_index('term') assumes the column axis of term_topic_freq
# is named 'term' -- confirm against where topic_term_dists is built.
token_table = pd.DataFrame({'Freq': top_topic_terms_freq.unstack()}). \
             reset_index().set_index('term'). \
             query('Freq >= 0.5')

token_table['Freq'] = token_table['Freq'].round()
token_table['Term'] = vocab[token_table.index.values].values
# Normalize token frequencies by each term's total frequency:
token_table['Freq'] = token_table.Freq / term_frequency[token_table.index]
token_table = token_table.sort_values(by=['Term', 'Topic'])

print()
token_table

Topic Coordinates


In [26]:
topic_term_dists


Out[26]:
term 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 ... 49925 49926 49927 49928 49929 49930 49931 49932 49933 49934 49935 49936 49937 49938 49939 49940 49941 49942 49943 49944 49945 49946 49947 49948 49949 49950 49951 49952 49953 49954 49955 49956 49957 49958 49959 49960 49961 49962 49963 49964 49965 49966 49967 49968 49969 49970 49971 49972 49973 49974 49975 49976 49977 49978 49979 49980 49981 49982 49983 49984 49985 49986 49987 49988 49989 49990 49991 49992 49993 49994 49995 49996 49997 49998 49999
topic
0 34.560076 23.052394 35.011782 15.229357 18.862407 21.696336 12.041139 19.608960 10.776458 23.010937 15.508145 14.207191 10.866341 10.992659 10.535417 9.556278 14.158643 18.374985 9.974904 6.319570 7.204872 7.815420 6.280495 13.116165 7.838096 6.460555 10.659491 8.282834 4.431570 15.375777 6.455284 7.225659 9.755187 14.810625 7.427099 3.704050 5.676146 10.384648 8.780114 6.520003 8.981168 7.829013 9.590536 8.193141 5.235972 5.624796 9.294603 7.756465 10.863049 5.229376 12.857805 5.914746 9.216167 7.581182 4.980683 8.968172 5.774038 6.506058 5.813927 7.624951 5.747068 1.924129 2.085067 7.341583 5.151361 8.472655 7.226199 3.960461 8.701531 10.669252 4.907496 6.523535 3.105377 6.238830 5.844484 ... 0.625724 1.029929 1.017088 0.931565 0.920934 0.762889 0.751911 0.873610 0.939472 0.755020 0.825627 0.917172 0.769667 1.021140 0.918545 1.126019 0.896102 1.000272 1.088472 1.022719 0.850058 0.900774 1.258427 1.018967 0.763810 0.897174 0.834409 1.031664 0.821515 0.926126 0.920350 0.728411 0.776758 0.785560 0.769508 1.014240 1.057380 0.964175 0.911873 0.861532 0.801979 0.714600 0.708247 0.891596 0.782352 0.896941 0.806620 0.886933 0.771092 0.976889 1.369195 0.930031 1.149220 0.782902 0.798179 0.706706 0.947690 0.958424 0.892332 0.902915 0.772424 1.085815 0.825636 0.816371 1.380099 0.833757 0.678400 0.792306 0.916583 0.878585 0.929071 0.930318 0.928379 0.851077 0.731892
1 50.337722 40.581697 41.140381 15.087618 33.025241 22.846752 19.277132 18.825574 23.046049 27.702032 22.896842 20.329934 14.287076 18.290987 9.025927 14.427910 15.712707 16.706891 12.434514 10.515107 25.544847 18.070308 11.166432 9.016734 14.800357 7.597889 8.258921 9.401391 10.298447 7.877626 9.106611 21.045833 12.037226 12.942835 16.126885 6.919506 12.485776 12.509401 11.687824 14.292532 10.509668 8.118996 9.468621 12.616041 12.414472 9.431784 12.656220 9.535022 5.577135 7.839322 13.209660 7.978555 10.510647 6.044113 9.088396 7.046644 18.436527 11.436126 6.253316 8.271624 6.530339 13.176651 6.771588 5.434604 19.229041 3.917880 10.120773 16.016697 7.076446 7.626771 6.331805 20.473737 24.325816 6.781279 10.221805 ... 0.839282 0.919271 0.819840 0.772406 0.881008 1.022791 0.979500 0.874041 0.708877 0.969268 0.868584 0.889366 0.764791 0.938343 0.829809 0.956575 0.861433 0.803894 1.454012 0.813326 0.816473 0.737059 0.814294 0.858895 0.973541 1.025707 0.809228 0.928308 1.116179 0.834379 0.873502 0.746603 0.874483 0.997578 0.944364 0.975360 1.014332 0.845255 0.990822 0.815321 0.824637 0.817899 0.940212 0.843902 0.870331 0.966135 0.919941 0.998555 0.889019 0.835961 0.911019 0.890673 1.063340 0.814293 0.926136 0.912161 0.825771 0.989603 0.865833 0.817442 0.899520 0.904136 0.890219 0.957529 0.945074 0.909135 1.013328 0.907039 0.967374 0.707008 0.811445 1.031009 0.952512 1.026861 0.864728
2 34.399639 23.519070 13.655633 24.389804 11.009536 12.211154 15.824990 16.277737 11.225549 30.053909 12.697881 11.624102 5.584157 7.127661 9.204468 5.399320 9.017233 11.528700 7.302804 7.692432 7.276529 8.592872 7.313048 7.735513 5.820175 6.861334 6.030154 9.911728 6.975535 12.711965 4.845529 6.531035 6.221902 6.691088 6.915521 7.526189 5.687589 4.648638 3.196965 5.655204 6.899286 5.018733 6.686863 4.691510 6.616092 5.552907 10.099042 3.983911 9.342515 6.577858 3.712910 4.697486 5.061583 5.112267 6.210239 15.941712 2.585294 8.352803 6.295825 6.035192 5.496462 6.631243 3.856289 6.041307 23.030694 6.096826 5.363703 10.739196 5.496608 5.087616 4.222902 12.748787 2.082916 3.353945 9.055495 ... 0.796024 0.909079 0.926489 0.855503 0.944568 0.896995 0.862610 1.072966 0.738476 0.913067 0.877246 0.797810 0.777069 0.804129 0.973098 0.790396 0.889272 1.044082 0.848854 0.822262 0.962644 0.935022 1.176447 1.123527 0.874715 0.697053 1.098977 0.926931 0.883549 0.782220 0.929458 0.797841 1.009072 0.753575 0.820716 0.864211 0.889806 0.876695 0.797402 0.959936 0.916876 0.907625 0.781050 0.783876 0.917208 0.898019 0.869306 0.924573 1.075971 0.854858 0.821360 1.081370 1.030548 0.971347 0.842678 0.962225 0.822437 0.825612 0.964396 0.843651 0.909091 0.982232 0.974882 0.983783 0.819541 0.865117 1.000803 0.851075 0.898850 0.989933 0.909373 0.795862 0.717025 0.721607 0.954055
3 1029.528805 751.784326 655.640311 540.066993 566.134127 526.255165 460.171499 423.016148 369.444783 369.459691 347.886831 343.305314 342.713871 304.650466 299.409544 299.081958 280.996848 296.683790 285.677603 263.715589 238.912917 187.595707 223.893573 215.325411 221.997247 229.465106 224.574896 238.883827 202.423430 220.845476 228.819394 244.126373 182.529016 216.217914 198.995771 182.554719 215.859120 184.885569 195.653349 184.349345 218.817100 188.416197 175.793018 193.517135 184.795571 189.197386 189.286282 177.265277 166.930898 166.316745 173.668666 171.954935 183.790617 143.281182 158.223858 158.960054 167.227459 157.083343 154.335008 155.998689 150.034526 197.276212 155.415637 155.133175 164.253435 191.317473 145.015560 184.473258 147.251819 148.245306 137.579956 173.830229 191.475584 129.039037 146.246642 ... 1.313064 1.455776 0.907258 0.847273 0.813487 0.853852 0.940162 0.924069 0.851067 0.790654 0.742876 0.819138 1.492818 0.864653 0.886967 1.159601 0.821838 0.931346 0.883935 0.930908 0.877490 0.931882 1.037034 1.057502 0.870280 0.918149 1.405080 1.456379 1.127419 0.873330 0.865133 0.946465 0.936295 0.784012 0.973883 1.329945 0.956890 1.400776 0.758596 0.907328 0.994154 0.874232 0.878277 0.798990 1.088962 1.151723 0.898940 1.421349 0.987259 0.766984 0.987189 0.723241 0.849505 0.877373 1.286612 1.477315 0.954484 1.116482 0.945982 0.873253 0.872454 0.878087 1.436687 0.916522 0.949573 0.855859 0.809900 0.928804 1.487982 0.918141 1.390670 1.031652 0.755490 0.864685 0.925891
4 84.686830 50.569903 29.790239 46.609463 29.066011 32.100742 39.628684 41.644559 18.627329 21.382471 40.316177 26.448719 29.018087 21.111763 16.463535 24.787983 22.299294 16.576938 14.431890 20.670333 9.707518 19.589928 15.344833 29.048718 19.539289 19.608818 15.094792 17.649770 11.077402 8.265068 23.008406 13.799867 21.349845 5.284080 31.803037 33.916721 16.231378 15.762746 17.526287 14.598748 11.415723 22.321454 25.170574 10.124354 12.428174 13.801095 10.034727 23.666243 9.723196 13.367494 13.117179 15.064447 13.802612 11.304332 14.544449 8.693906 10.518363 11.497571 12.575327 9.872245 13.518083 13.345436 8.711541 13.919656 7.661993 14.323676 7.835069 5.923976 10.305760 15.881917 9.607894 13.861560 2.536607 10.120047 9.251958 ... 0.956811 0.908525 0.797821 0.859530 0.917984 0.922327 0.969551 0.789626 0.903601 0.961633 0.885067 0.807612 0.862245 0.876353 0.976884 0.880854 0.791405 0.840732 0.870292 0.718133 0.852915 0.855864 0.789180 0.777344 0.956919 0.891875 0.855127 0.919265 0.878362 0.910308 0.904615 0.971993 0.907860 0.803285 0.977555 0.710161 0.944781 0.967877 0.994523 0.922489 0.954861 0.794897 0.783929 0.881262 0.856886 1.198302 1.081608 1.018026 1.094986 0.808721 0.742701 1.009561 0.822764 0.991128 0.759882 0.823071 0.786464 0.848688 0.913733 0.913808 0.921651 0.841117 0.826702 0.826018 0.875004 0.924390 0.912291 0.766137 0.875925 0.973028 0.957894 0.944202 0.959102 0.802573 1.010276

5 rows × 50000 columns


In [27]:
# Inlined body of a helper with the signature
#   _topic_coordinates(mds, topic_term_dists, topic_proportion)
# mds is the function that maps the K topic rows to 2-D coordinates.
mds = js_PCoA
K = topic_term_dists.shape[0]
mds_res = mds(topic_term_dists)
# the scaling function must return one (x, y) pair per topic
assert mds_res.shape == (K, 2)
# NOTE(review): 'Freq' is a topic-indexed Series while x / y / topics are
# positional; this presumably relies on topic_term_dists rows already being in
# the same order as topic_proportion -- confirm.
mds_df = pd.DataFrame({'x': mds_res[:,0], 'y': mds_res[:,1], 'topics': range(1, K + 1), \
                      'cluster': 1, 'Freq': topic_proportion * 100})
# note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
topic_coordinates = mds_df
topic_coordinates


Out[27]:
Freq cluster topics x y
topic
3 97.957317 1 1 -0.046095 0.003568
4 0.685661 1 2 -0.030449 -0.009181
0 0.662217 1 3 -0.052126 0.000821
2 0.353436 1 4 0.139155 -0.000147
1 0.341370 1 5 -0.010484 0.004940

In [ ]:


In [ ]:
# Shift every topic label in topic_order up by one
# (presumably so the client sees 1-based topic ids -- confirm).
client_topic_order = [x + 1 for x in topic_order]

In [ ]:
# Axis labels; handed to PreparedData below and emitted under the 'plot.opts' key.
plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}

In [ ]:
class PreparedData(namedtuple('PreparedData', ['topic_coordinates', 'topic_info', 'token_table',
                                               'R', 'lambda_step', 'plot_opts', 'topic_order'])):
    """Immutable bundle of everything needed to serialise the visualisation.

    ``topic_coordinates``, ``topic_info`` and ``token_table`` must offer
    ``to_dict(orient='list')`` (pandas DataFrames in this notebook); the
    remaining fields are passed through unchanged.
    """

    def to_dict(self):
        """Return the payload as a plain dict keyed by the serialised field names."""
        frames = {
            'mdsDat': self.topic_coordinates,
            'tinfo': self.topic_info,
            'token.table': self.token_table,
        }
        # Tables become {column: [values...]} mappings.
        payload = {key: frame.to_dict(orient='list') for key, frame in frames.items()}
        payload['R'] = self.R
        payload['lambda.step'] = self.lambda_step
        payload['plot.opts'] = self.plot_opts
        payload['topic.order'] = self.topic_order
        return payload

    def to_json(self):
        """Serialise ``to_dict()`` to a JSON string using ``NumPyEncoder``."""
        payload = self.to_dict()
        return json.dumps(payload, cls=NumPyEncoder)

In [ ]:
# topic_coordinates, topic_info, token_table, R, lambda_step, plot_opts, client_topic_order

In [ ]:
# Bundle all prepared pieces; the last expression displays the full dict form.
pp = PreparedData(topic_coordinates, topic_info, token_table, R, lambda_step, plot_opts, client_topic_order)
pp.to_dict()

In [ ]:
# Write the prepared data as JSON. The context manager guarantees the handle
# is flushed and closed even if serialisation raises.
with open("/tmp/lda1.json", 'w') as fileobj:
    json.dump(pp.to_dict(), fileobj, cls=NumPyEncoder)

In [ ]:
# topic_coordinates.to_dict(orient='list')
# topic_info.to_dict(orient='list')
# token_table.to_dict(orient='list')
topic_order

PCA Implementation Exploration


In [ ]:
def _jensen_shannon(_P, _Q):
    """Average of ``entropy(_P, M)`` and ``entropy(_Q, M)`` with ``M = (_P + _Q) / 2``.

    Assuming ``entropy`` is ``scipy.stats.entropy`` (TODO confirm the import),
    this is the Jensen-Shannon divergence of the two inputs. The value is also
    printed on every call.
    """
    midpoint = 0.5 * (_P + _Q)
    divergences = (entropy(_P, midpoint), entropy(_Q, midpoint))
    res = 0.5 * (divergences[0] + divergences[1])
    print('res ', res)
    return res

In [ ]:
# topic_term_dists

In [ ]:
# Pairwise distances between the topic rows under the custom _jensen_shannon
# metric (assuming pdist is scipy.spatial.distance.pdist, the result is the
# condensed one-dimensional distance vector -- TODO confirm the import).
ss = pdist(topic_term_dists, metric=_jensen_shannon)
ss.shape

In [ ]:
print(ss)

In [ ]:
# (not displayed: only the last expression of a cell is shown)
ss.shape[0]

# Recover the number of points d from the length of ss, which should equal
# d * (d - 1) / 2 (checked in the following cell).
d = int(np.ceil(np.sqrt(ss.shape[0] * 2)))

d

In [ ]:
d * (d - 1) / 2

In [ ]:
ss[:10]

In [ ]:
# Expand the distance vector ss via the squareform name reachable through the
# private pyLDAvis._prepare module (presumably scipy's squareform -- confirm).
pair_dists = pyLDAvis._prepare.squareform(ss)

In [ ]:
pair_dists[:10]

In [ ]:
pyLDAvis._prepare.js_PCoA(topic_term_dists)

In [ ]:
# Make sure the distance matrix is a float64 ndarray before the linear algebra below.
pair_dists = np.asarray(pair_dists, np.float64)

In [ ]:
# Number of rows of the distance matrix.
n = pair_dists.shape[0]
n

In [ ]:
# Centering matrix: identity minus the n x n matrix whose entries are all 1/n.
H = np.eye(n) - np.ones((n, n)) / n
H

In [ ]:
# Double-centre the element-wise squared distances: B = -1/2 * H * D^2 * H.
B = - H.dot(pair_dists ** 2).dot(H) / 2
B

In [ ]:
# Eigen-decomposition of B; column i of eigvecs belongs to eigvals[i].
eigvals, eigvecs = np.linalg.eig(B)
eigvecs

In [ ]:
# Positions of the two largest eigenvalues, largest first.
ix = eigvals.argsort()[::-1][:2]
ix

In [ ]:
# Keep only the two selected eigenvalues.
# NOTE(review): this rebinds eigvals, so the cell is not safe to re-run
# without first re-running the np.linalg.eig cell.
eigvals = eigvals[ix]

In [ ]:
# Keep the eigenvector columns that match the two selected eigenvalues.
eigvecs = eigvecs[:, ix]
eigvecs

In [ ]:
# Snap eigenvalues that np.isclose treats as zero to exactly 0, then show
# whether any remaining eigenvalue is negative.
eigvals[np.isclose(eigvals, 0)] = 0
np.any(eigvals < 0)

In [ ]:
# Negative eigenvalues have no real square root: zero them out together with
# their eigenvector columns.
if (eigvals < 0).any():
    ix_neg = eigvals < 0
    eigvals[ix_neg] = 0.0
    eigvecs[:, ix_neg] = 0.0

In [ ]:
eigvals.shape
type(eigvals)

In [ ]:
eigvecs.shape

In [ ]:
np.sqrt(eigvals) * eigvecs

In [ ]:
eigvecs * np.sqrt(eigvals)

In [ ]:


In [ ]: