In [8]:
# This notebook is used to decide on a tolerable level of corruptableness.
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from scipy.stats import entropy as KL_divergence

from slda.topic_models import SLDA
from modules.helpers import plot_images

In [11]:
# Generate topics
# We assume a vocabulary of 'rows'^2 terms, and create 'rows'*2 "topics",
# where each topic assigns exactly 'rows' consecutive terms equal probability.
rows = 3
V = rows * rows
K = rows * 2
N = K * K
D = 10000
seed = 42
topics = []
topic_base = np.concatenate((np.ones((1, rows)) * (1/rows),
                             np.zeros((rows-1, rows))), axis=0).ravel()
for i in range(rows):
     topics.append(np.roll(topic_base, i * rows))
topic_base = np.concatenate((np.ones((rows, 1)) * (1/rows),
                             np.zeros((rows, rows-1))), axis=1).ravel()
for i in range(rows):
    topics.append(np.roll(topic_base, i))
topics = np.array(topics)

# Generate documents from topics
# We generate D documents from these V topics by sampling D topic
# distributions, one for each document, from a Dirichlet distribution with
# parameter α=(1,…,1)
alpha = np.ones(K)
np.random.seed(seed)
thetas = np.random.dirichlet(alpha, size=D)
topic_assignments = np.array([np.random.choice(range(K), size=N, p=theta)
                              for theta in thetas])
word_assignments = np.array([[np.random.choice(range(V), size=1,
                                               p=topics[topic_assignments[d, n]])[0]
                              for n in range(N)] for d in range(D)])
doc_term_matrix = np.array([np.histogram(word_assignments[d], bins=V,
                                         range=(0, V - 1))[0] for d in range(D)])

# Generate responses
# Choose prameter values
nu2 = K
sigma2 = 1
np.random.seed(seed)
eta = np.random.normal(scale=nu2, size=K)
y = [np.dot(eta, thetas[i]) for i in range(D)] + np.random.normal(scale=sigma2, size=D)

# Estimate parameters
_K = K
_alpha = alpha
_beta = np.repeat(0.01, V)
_mu = 0
_nu2 = nu2
_sigma2 = sigma2
n_iter = 2000
slda = SLDA(_K, _alpha, _beta, _mu, _nu2, _sigma2, n_iter, seed=42)

slda.fit(doc_term_matrix, y)
results = slda.phi


2016-06-17 10:16:11.188787 start iterations
2016-06-17 10:16:11.813727 0:00:00.624940 elapsed, iter   10, LL -216976.8816, 33.93% change from last
2016-06-17 10:16:12.407294 0:00:01.218507 elapsed, iter   20, LL -93085.9421, 57.10% change from last
2016-06-17 10:16:12.972042 0:00:01.783255 elapsed, iter   30, LL -20819.4907, 77.63% change from last
2016-06-17 10:16:13.555434 0:00:02.366647 elapsed, iter   40, LL 6192.5139, 129.74% change from last
2016-06-17 10:16:14.146688 0:00:02.957901 elapsed, iter   50, LL 30247.2120, 388.45% change from last
2016-06-17 10:16:14.722809 0:00:03.534022 elapsed, iter   60, LL 51492.1390, 70.24% change from last
2016-06-17 10:16:15.284716 0:00:04.095929 elapsed, iter   70, LL 68998.2432, 34.00% change from last
2016-06-17 10:16:15.844190 0:00:04.655403 elapsed, iter   80, LL 82676.4953, 19.82% change from last
2016-06-17 10:16:16.402596 0:00:05.213809 elapsed, iter   90, LL 90288.8849, 9.21% change from last
2016-06-17 10:16:16.998246 0:00:05.809459 elapsed, iter  100, LL 96834.4230, 7.25% change from last
2016-06-17 10:16:17.566181 0:00:06.377394 elapsed, iter  110, LL 100722.3202, 4.01% change from last
2016-06-17 10:16:18.132555 0:00:06.943768 elapsed, iter  120, LL 104628.4521, 3.88% change from last
2016-06-17 10:16:18.697737 0:00:07.508950 elapsed, iter  130, LL 107317.3514, 2.57% change from last
2016-06-17 10:16:19.255843 0:00:08.067056 elapsed, iter  140, LL 109722.6454, 2.24% change from last
2016-06-17 10:16:19.833629 0:00:08.644842 elapsed, iter  150, LL 112011.7490, 2.09% change from last
2016-06-17 10:16:20.400912 0:00:09.212125 elapsed, iter  160, LL 111903.3026, -0.10% change from last
2016-06-17 10:16:20.958011 0:00:09.769224 elapsed, iter  170, LL 112354.5135, 0.40% change from last
2016-06-17 10:16:21.537752 0:00:10.348965 elapsed, iter  180, LL 113744.2562, 1.24% change from last
2016-06-17 10:16:22.104733 0:00:10.915946 elapsed, iter  190, LL 113580.6946, -0.14% change from last
2016-06-17 10:16:22.670920 0:00:11.482133 elapsed, iter  200, LL 115195.2296, 1.42% change from last
2016-06-17 10:16:23.232015 0:00:12.043228 elapsed, iter  210, LL 113730.5202, -1.27% change from last
2016-06-17 10:16:23.805497 0:00:12.616710 elapsed, iter  220, LL 113985.2813, 0.22% change from last
2016-06-17 10:16:24.375116 0:00:13.186329 elapsed, iter  230, LL 114241.9406, 0.23% change from last
2016-06-17 10:16:24.938512 0:00:13.749725 elapsed, iter  240, LL 113385.6861, -0.75% change from last
2016-06-17 10:16:25.503695 0:00:14.314908 elapsed, iter  250, LL 114357.6188, 0.86% change from last
2016-06-17 10:16:26.067005 0:00:14.878218 elapsed, iter  260, LL 114628.9424, 0.24% change from last
2016-06-17 10:16:26.630766 0:00:15.441979 elapsed, iter  270, LL 115073.0046, 0.39% change from last
2016-06-17 10:16:27.194091 0:00:16.005304 elapsed, iter  280, LL 116377.7356, 1.13% change from last
2016-06-17 10:16:27.758679 0:00:16.569892 elapsed, iter  290, LL 116289.4610, -0.08% change from last
2016-06-17 10:16:28.322390 0:00:17.133603 elapsed, iter  300, LL 117000.5093, 0.61% change from last
2016-06-17 10:16:28.892049 0:00:17.703262 elapsed, iter  310, LL 117298.7736, 0.25% change from last
2016-06-17 10:16:29.456483 0:00:18.267696 elapsed, iter  320, LL 117501.3540, 0.17% change from last
2016-06-17 10:16:30.028098 0:00:18.839311 elapsed, iter  330, LL 119827.7299, 1.98% change from last
2016-06-17 10:16:30.591490 0:00:19.402703 elapsed, iter  340, LL 119445.0457, -0.32% change from last
2016-06-17 10:16:31.154477 0:00:19.965690 elapsed, iter  350, LL 120013.9414, 0.48% change from last
2016-06-17 10:16:31.718214 0:00:20.529427 elapsed, iter  360, LL 120083.5113, 0.06% change from last
2016-06-17 10:16:32.281092 0:00:21.092305 elapsed, iter  370, LL 119892.7459, -0.16% change from last
2016-06-17 10:16:32.843958 0:00:21.655171 elapsed, iter  380, LL 120507.5201, 0.51% change from last
2016-06-17 10:16:33.407396 0:00:22.218609 elapsed, iter  390, LL 119570.9471, -0.78% change from last
2016-06-17 10:16:33.973088 0:00:22.784301 elapsed, iter  400, LL 120894.4855, 1.11% change from last
2016-06-17 10:16:34.540481 0:00:23.351694 elapsed, iter  410, LL 121060.0436, 0.14% change from last
2016-06-17 10:16:35.105566 0:00:23.916779 elapsed, iter  420, LL 120576.8100, -0.40% change from last
2016-06-17 10:16:35.671219 0:00:24.482432 elapsed, iter  430, LL 121023.0592, 0.37% change from last
2016-06-17 10:16:36.234190 0:00:25.045403 elapsed, iter  440, LL 121259.6824, 0.20% change from last
2016-06-17 10:16:36.799526 0:00:25.610739 elapsed, iter  450, LL 120436.3144, -0.68% change from last
2016-06-17 10:16:37.362917 0:00:26.174130 elapsed, iter  460, LL 120289.7983, -0.12% change from last
2016-06-17 10:16:37.929424 0:00:26.740637 elapsed, iter  470, LL 120208.5852, -0.07% change from last
2016-06-17 10:16:38.496965 0:00:27.308178 elapsed, iter  480, LL 120626.9294, 0.35% change from last
2016-06-17 10:16:39.063448 0:00:27.874661 elapsed, iter  490, LL 120373.1844, -0.21% change from last
2016-06-17 10:16:39.636110 0:00:28.447323 elapsed, iter  500, LL 120246.0116, -0.11% change from last
2016-06-17 10:16:40.203036 0:00:29.014249 elapsed, iter  510, LL 120011.5546, -0.19% change from last
2016-06-17 10:16:40.770946 0:00:29.582159 elapsed, iter  520, LL 119624.3206, -0.32% change from last
2016-06-17 10:16:41.337525 0:00:30.148738 elapsed, iter  530, LL 120638.7601, 0.85% change from last
2016-06-17 10:16:41.906537 0:00:30.717750 elapsed, iter  540, LL 120389.4681, -0.21% change from last
2016-06-17 10:16:42.474658 0:00:31.285871 elapsed, iter  550, LL 121069.6663, 0.56% change from last
2016-06-17 10:16:43.045617 0:00:31.856830 elapsed, iter  560, LL 120809.6212, -0.21% change from last
2016-06-17 10:16:43.612289 0:00:32.423502 elapsed, iter  570, LL 121214.8635, 0.34% change from last
2016-06-17 10:16:44.182653 0:00:32.993866 elapsed, iter  580, LL 121063.2415, -0.13% change from last
2016-06-17 10:16:44.751382 0:00:33.562595 elapsed, iter  590, LL 120240.6529, -0.68% change from last
2016-06-17 10:16:45.318780 0:00:34.129993 elapsed, iter  600, LL 120728.7708, 0.41% change from last
2016-06-17 10:16:45.887735 0:00:34.698948 elapsed, iter  610, LL 120975.4723, 0.20% change from last
2016-06-17 10:16:46.457549 0:00:35.268762 elapsed, iter  620, LL 120991.3317, 0.01% change from last
2016-06-17 10:16:47.024446 0:00:35.835659 elapsed, iter  630, LL 121912.3001, 0.76% change from last
2016-06-17 10:16:47.595030 0:00:36.406243 elapsed, iter  640, LL 122308.5787, 0.33% change from last
2016-06-17 10:16:48.167215 0:00:36.978428 elapsed, iter  650, LL 122454.6992, 0.12% change from last
2016-06-17 10:16:48.739004 0:00:37.550217 elapsed, iter  660, LL 122759.1140, 0.25% change from last
2016-06-17 10:16:49.307458 0:00:38.118671 elapsed, iter  670, LL 122628.7569, -0.11% change from last
2016-06-17 10:16:49.874774 0:00:38.685987 elapsed, iter  680, LL 123879.2584, 1.02% change from last
2016-06-17 10:16:50.442556 0:00:39.253769 elapsed, iter  690, LL 122994.3297, -0.71% change from last
2016-06-17 10:16:51.009017 0:00:39.820230 elapsed, iter  700, LL 122303.9297, -0.56% change from last
2016-06-17 10:16:51.576619 0:00:40.387832 elapsed, iter  710, LL 122589.2333, 0.23% change from last
2016-06-17 10:16:52.146468 0:00:40.957681 elapsed, iter  720, LL 122099.8356, -0.40% change from last
2016-06-17 10:16:52.714296 0:00:41.525509 elapsed, iter  730, LL 122457.3283, 0.29% change from last
2016-06-17 10:16:53.285872 0:00:42.097085 elapsed, iter  740, LL 122396.4282, -0.05% change from last
2016-06-17 10:16:53.855333 0:00:42.666546 elapsed, iter  750, LL 122695.6230, 0.24% change from last
2016-06-17 10:16:54.421517 0:00:43.232730 elapsed, iter  760, LL 122279.5718, -0.34% change from last
2016-06-17 10:16:54.989578 0:00:43.800791 elapsed, iter  770, LL 122641.1034, 0.30% change from last
2016-06-17 10:16:55.558783 0:00:44.369996 elapsed, iter  780, LL 123558.1685, 0.75% change from last
2016-06-17 10:16:56.126617 0:00:44.937830 elapsed, iter  790, LL 122731.6531, -0.67% change from last
2016-06-17 10:16:56.693274 0:00:45.504487 elapsed, iter  800, LL 122640.3098, -0.07% change from last
2016-06-17 10:16:57.261179 0:00:46.072392 elapsed, iter  810, LL 122602.1529, -0.03% change from last
2016-06-17 10:16:57.830787 0:00:46.642000 elapsed, iter  820, LL 122653.9978, 0.04% change from last
2016-06-17 10:16:58.400402 0:00:47.211615 elapsed, iter  830, LL 121917.2254, -0.60% change from last
2016-06-17 10:16:58.970271 0:00:47.781484 elapsed, iter  840, LL 122583.8901, 0.55% change from last
2016-06-17 10:16:59.539856 0:00:48.351069 elapsed, iter  850, LL 121712.4930, -0.71% change from last
2016-06-17 10:17:00.109966 0:00:48.921179 elapsed, iter  860, LL 122224.8495, 0.42% change from last
2016-06-17 10:17:00.678006 0:00:49.489219 elapsed, iter  870, LL 122315.7811, 0.07% change from last
2016-06-17 10:17:01.249135 0:00:50.060348 elapsed, iter  880, LL 123384.3683, 0.87% change from last
2016-06-17 10:17:01.814850 0:00:50.626063 elapsed, iter  890, LL 122764.2635, -0.50% change from last
2016-06-17 10:17:02.382993 0:00:51.194206 elapsed, iter  900, LL 123168.1781, 0.33% change from last
2016-06-17 10:17:02.949200 0:00:51.760413 elapsed, iter  910, LL 123596.9983, 0.35% change from last
2016-06-17 10:17:03.518347 0:00:52.329560 elapsed, iter  920, LL 123437.0300, -0.13% change from last
2016-06-17 10:17:04.086628 0:00:52.897841 elapsed, iter  930, LL 123605.3327, 0.14% change from last
2016-06-17 10:17:04.654543 0:00:53.465756 elapsed, iter  940, LL 123096.7614, -0.41% change from last
2016-06-17 10:17:05.224890 0:00:54.036103 elapsed, iter  950, LL 122856.6831, -0.20% change from last
2016-06-17 10:17:05.792499 0:00:54.603712 elapsed, iter  960, LL 122738.9679, -0.10% change from last
2016-06-17 10:17:06.364477 0:00:55.175690 elapsed, iter  970, LL 122734.7339, -0.00% change from last
2016-06-17 10:17:06.933193 0:00:55.744406 elapsed, iter  980, LL 122913.9439, 0.15% change from last
2016-06-17 10:17:07.506523 0:00:56.317736 elapsed, iter  990, LL 122546.1620, -0.30% change from last
2016-06-17 10:17:08.073995 0:00:56.885208 elapsed, iter 1000, LL 122180.4553, -0.30% change from last
2016-06-17 10:17:08.641358 0:00:57.452571 elapsed, iter 1010, LL 123462.0895, 1.05% change from last
2016-06-17 10:17:09.209821 0:00:58.021034 elapsed, iter 1020, LL 123635.4041, 0.14% change from last
2016-06-17 10:17:09.788433 0:00:58.599646 elapsed, iter 1030, LL 123807.0492, 0.14% change from last
2016-06-17 10:17:10.356125 0:00:59.167338 elapsed, iter 1040, LL 123497.0312, -0.25% change from last
2016-06-17 10:17:10.929351 0:00:59.740564 elapsed, iter 1050, LL 123612.7088, 0.09% change from last
2016-06-17 10:17:11.498376 0:01:00.309589 elapsed, iter 1060, LL 124805.7116, 0.97% change from last
2016-06-17 10:17:12.071363 0:01:00.882576 elapsed, iter 1070, LL 124168.3738, -0.51% change from last
2016-06-17 10:17:12.640186 0:01:01.451399 elapsed, iter 1080, LL 123866.3408, -0.24% change from last
2016-06-17 10:17:13.208869 0:01:02.020082 elapsed, iter 1090, LL 124171.8343, 0.25% change from last
2016-06-17 10:17:13.774358 0:01:02.585571 elapsed, iter 1100, LL 123747.0678, -0.34% change from last
2016-06-17 10:17:14.341657 0:01:03.152870 elapsed, iter 1110, LL 123845.0247, 0.08% change from last
2016-06-17 10:17:14.914067 0:01:03.725280 elapsed, iter 1120, LL 123337.5551, -0.41% change from last
2016-06-17 10:17:15.482667 0:01:04.293880 elapsed, iter 1130, LL 124370.5306, 0.84% change from last
2016-06-17 10:17:16.051912 0:01:04.863125 elapsed, iter 1140, LL 123784.6530, -0.47% change from last
2016-06-17 10:17:16.620955 0:01:05.432168 elapsed, iter 1150, LL 123781.0273, -0.00% change from last
2016-06-17 10:17:17.189656 0:01:06.000869 elapsed, iter 1160, LL 124517.8942, 0.60% change from last
2016-06-17 10:17:17.762109 0:01:06.573322 elapsed, iter 1170, LL 123831.3444, -0.55% change from last
2016-06-17 10:17:18.333590 0:01:07.144803 elapsed, iter 1180, LL 124398.3425, 0.46% change from last
2016-06-17 10:17:18.904589 0:01:07.715802 elapsed, iter 1190, LL 123942.3118, -0.37% change from last
2016-06-17 10:17:19.474686 0:01:08.285899 elapsed, iter 1200, LL 124058.8005, 0.09% change from last
2016-06-17 10:17:20.041413 0:01:08.852626 elapsed, iter 1210, LL 124608.8600, 0.44% change from last
2016-06-17 10:17:20.614130 0:01:09.425343 elapsed, iter 1220, LL 123975.6499, -0.51% change from last
2016-06-17 10:17:21.186124 0:01:09.997337 elapsed, iter 1230, LL 123439.3763, -0.43% change from last
2016-06-17 10:17:21.754895 0:01:10.566108 elapsed, iter 1240, LL 123970.7550, 0.43% change from last
2016-06-17 10:17:22.326081 0:01:11.137294 elapsed, iter 1250, LL 123917.3518, -0.04% change from last
2016-06-17 10:17:22.895774 0:01:11.706987 elapsed, iter 1260, LL 122946.9741, -0.78% change from last
2016-06-17 10:17:23.463239 0:01:12.274452 elapsed, iter 1270, LL 123742.9786, 0.65% change from last
2016-06-17 10:17:24.035142 0:01:12.846355 elapsed, iter 1280, LL 123949.0070, 0.17% change from last
2016-06-17 10:17:24.603165 0:01:13.414378 elapsed, iter 1290, LL 124040.9321, 0.07% change from last
2016-06-17 10:17:25.173732 0:01:13.984945 elapsed, iter 1300, LL 124149.7225, 0.09% change from last
2016-06-17 10:17:25.744579 0:01:14.555792 elapsed, iter 1310, LL 123389.2526, -0.61% change from last
2016-06-17 10:17:26.320648 0:01:15.131861 elapsed, iter 1320, LL 123868.3728, 0.39% change from last
2016-06-17 10:17:26.953728 0:01:15.764941 elapsed, iter 1330, LL 124009.8577, 0.11% change from last
2016-06-17 10:17:27.543241 0:01:16.354454 elapsed, iter 1340, LL 123809.0311, -0.16% change from last
2016-06-17 10:17:28.162216 0:01:16.973429 elapsed, iter 1350, LL 124915.3733, 0.89% change from last
2016-06-17 10:17:28.748286 0:01:17.559499 elapsed, iter 1360, LL 123784.5014, -0.91% change from last
2016-06-17 10:17:29.322062 0:01:18.133275 elapsed, iter 1370, LL 124278.7805, 0.40% change from last
2016-06-17 10:17:29.893340 0:01:18.704553 elapsed, iter 1380, LL 124260.6208, -0.01% change from last
2016-06-17 10:17:30.463504 0:01:19.274717 elapsed, iter 1390, LL 123693.4228, -0.46% change from last
2016-06-17 10:17:31.041047 0:01:19.852260 elapsed, iter 1400, LL 124530.6940, 0.68% change from last
2016-06-17 10:17:31.612726 0:01:20.423939 elapsed, iter 1410, LL 123854.9941, -0.54% change from last
2016-06-17 10:17:32.182300 0:01:20.993513 elapsed, iter 1420, LL 123979.9944, 0.10% change from last
2016-06-17 10:17:32.766072 0:01:21.577285 elapsed, iter 1430, LL 123654.7942, -0.26% change from last
2016-06-17 10:17:33.340431 0:01:22.151644 elapsed, iter 1440, LL 124071.0989, 0.34% change from last
2016-06-17 10:17:33.916788 0:01:22.728001 elapsed, iter 1450, LL 123937.2521, -0.11% change from last
2016-06-17 10:17:34.488277 0:01:23.299490 elapsed, iter 1460, LL 123982.5169, 0.04% change from last
2016-06-17 10:17:35.060151 0:01:23.871364 elapsed, iter 1470, LL 123898.5219, -0.07% change from last
2016-06-17 10:17:35.634876 0:01:24.446089 elapsed, iter 1480, LL 124836.0058, 0.76% change from last
2016-06-17 10:17:36.203002 0:01:25.014215 elapsed, iter 1490, LL 123326.5152, -1.21% change from last
2016-06-17 10:17:36.774103 0:01:25.585316 elapsed, iter 1500, LL 123798.5520, 0.38% change from last
2016-06-17 10:17:37.342503 0:01:26.153716 elapsed, iter 1510, LL 124541.0899, 0.60% change from last
2016-06-17 10:17:37.907779 0:01:26.718992 elapsed, iter 1520, LL 124282.8094, -0.21% change from last
2016-06-17 10:17:38.475469 0:01:27.286682 elapsed, iter 1530, LL 124014.9571, -0.22% change from last
2016-06-17 10:17:39.041498 0:01:27.852711 elapsed, iter 1540, LL 124087.1109, 0.06% change from last
2016-06-17 10:17:39.611847 0:01:28.423060 elapsed, iter 1550, LL 123818.5520, -0.22% change from last
2016-06-17 10:17:40.187002 0:01:28.998215 elapsed, iter 1560, LL 123742.0535, -0.06% change from last
2016-06-17 10:17:40.755004 0:01:29.566217 elapsed, iter 1570, LL 124452.9615, 0.57% change from last
2016-06-17 10:17:41.325921 0:01:30.137134 elapsed, iter 1580, LL 124106.5017, -0.28% change from last
2016-06-17 10:17:41.896237 0:01:30.707450 elapsed, iter 1590, LL 124720.7422, 0.49% change from last
2016-06-17 10:17:42.464571 0:01:31.275784 elapsed, iter 1600, LL 123883.0017, -0.67% change from last
2016-06-17 10:17:43.034572 0:01:31.845785 elapsed, iter 1610, LL 124436.3048, 0.45% change from last
2016-06-17 10:17:43.607062 0:01:32.418275 elapsed, iter 1620, LL 124016.6301, -0.34% change from last
2016-06-17 10:17:44.178616 0:01:32.989829 elapsed, iter 1630, LL 124628.0510, 0.49% change from last
2016-06-17 10:17:44.747825 0:01:33.559038 elapsed, iter 1640, LL 123712.1261, -0.73% change from last
2016-06-17 10:17:45.316638 0:01:34.127851 elapsed, iter 1650, LL 124305.0197, 0.48% change from last
2016-06-17 10:17:45.884755 0:01:34.695968 elapsed, iter 1660, LL 124481.4108, 0.14% change from last
2016-06-17 10:17:46.453457 0:01:35.264670 elapsed, iter 1670, LL 124152.0250, -0.26% change from last
2016-06-17 10:17:47.021559 0:01:35.832772 elapsed, iter 1680, LL 124402.7050, 0.20% change from last
2016-06-17 10:17:47.588500 0:01:36.399713 elapsed, iter 1690, LL 123524.8733, -0.71% change from last
2016-06-17 10:17:48.167620 0:01:36.978833 elapsed, iter 1700, LL 123907.8363, 0.31% change from last
2016-06-17 10:17:48.734350 0:01:37.545563 elapsed, iter 1710, LL 123634.8199, -0.22% change from last
2016-06-17 10:17:49.301021 0:01:38.112234 elapsed, iter 1720, LL 123768.3047, 0.11% change from last
2016-06-17 10:17:49.868024 0:01:38.679237 elapsed, iter 1730, LL 124334.4660, 0.46% change from last
2016-06-17 10:17:50.436252 0:01:39.247465 elapsed, iter 1740, LL 124058.5830, -0.22% change from last
2016-06-17 10:17:51.003365 0:01:39.814578 elapsed, iter 1750, LL 124029.8576, -0.02% change from last
2016-06-17 10:17:51.570498 0:01:40.381711 elapsed, iter 1760, LL 123843.3632, -0.15% change from last
2016-06-17 10:17:52.138269 0:01:40.949482 elapsed, iter 1770, LL 123741.2880, -0.08% change from last
2016-06-17 10:17:52.705147 0:01:41.516360 elapsed, iter 1780, LL 124648.7431, 0.73% change from last
2016-06-17 10:17:53.278104 0:01:42.089317 elapsed, iter 1790, LL 123554.1491, -0.88% change from last
2016-06-17 10:17:53.844801 0:01:42.656014 elapsed, iter 1800, LL 123045.2284, -0.41% change from last
2016-06-17 10:17:54.414140 0:01:43.225353 elapsed, iter 1810, LL 122000.6207, -0.85% change from last
2016-06-17 10:17:54.981097 0:01:43.792310 elapsed, iter 1820, LL 122523.5145, 0.43% change from last
2016-06-17 10:17:55.552681 0:01:44.363894 elapsed, iter 1830, LL 122537.8522, 0.01% change from last
2016-06-17 10:17:56.120585 0:01:44.931798 elapsed, iter 1840, LL 122383.7704, -0.13% change from last
2016-06-17 10:17:56.686831 0:01:45.498044 elapsed, iter 1850, LL 122131.8802, -0.21% change from last
2016-06-17 10:17:57.252694 0:01:46.063907 elapsed, iter 1860, LL 122360.0314, 0.19% change from last
2016-06-17 10:17:57.825187 0:01:46.636400 elapsed, iter 1870, LL 123163.2662, 0.66% change from last
2016-06-17 10:17:58.397877 0:01:47.209090 elapsed, iter 1880, LL 122704.2766, -0.37% change from last
2016-06-17 10:17:58.964630 0:01:47.775843 elapsed, iter 1890, LL 121533.1833, -0.95% change from last
2016-06-17 10:17:59.531877 0:01:48.343090 elapsed, iter 1900, LL 120901.1846, -0.52% change from last
2016-06-17 10:18:00.101577 0:01:48.912790 elapsed, iter 1910, LL 122375.2460, 1.22% change from last
2016-06-17 10:18:00.669404 0:01:49.480617 elapsed, iter 1920, LL 121499.6500, -0.72% change from last
2016-06-17 10:18:01.236049 0:01:50.047262 elapsed, iter 1930, LL 121442.0202, -0.05% change from last
2016-06-17 10:18:01.802181 0:01:50.613394 elapsed, iter 1940, LL 121421.8337, -0.02% change from last
2016-06-17 10:18:02.370661 0:01:51.181874 elapsed, iter 1950, LL 122091.4707, 0.55% change from last
2016-06-17 10:18:02.938054 0:01:51.749267 elapsed, iter 1960, LL 122607.0223, 0.42% change from last
2016-06-17 10:18:03.507370 0:01:52.318583 elapsed, iter 1970, LL 122675.3934, 0.06% change from last
2016-06-17 10:18:04.076950 0:01:52.888163 elapsed, iter 1980, LL 123920.9902, 1.02% change from last
2016-06-17 10:18:04.644315 0:01:53.455528 elapsed, iter 1990, LL 122127.3170, -1.45% change from last

In [12]:
for res in results:
    minimized_KL = 1
    for topic in topics:
        KL = KL_divergence(topic, res)
        if KL < minimized_KL:
            minimized_KL = KL
    print(minimized_KL)


0.000158519287203
0.000747583884782
0.000357432246038
0.000820463963172
0.000313906421071
0.000355526395597

In [ ]:
0.000158519287203
0.000747583884782
0.000357432246038
0.000820463963172
0.000313906421071
0.000355526395597

In [4]:
plot_images(plt, results, (rows, rows), (2, rows))
plt.figure()
plt.plot(slda.loglikelihoods)
plt.figure()
plt.plot(np.diff(slda.loglikelihoods)[-100:])


Out[4]:
[<matplotlib.lines.Line2D at 0x10fe8a748>]

In [42]:
topic = topics[0]

In [43]:
def normalize(lst):
    s = sum(lst)
    return([l/s for l in lst])

In [57]:
# Generate Dirichlet distributions and find the KL divergence
# with the true topic.
size = 1000000
dirichlets = []
KLs = []
for i in range(size):
    dirichlet = normalize(np.random.dirichlet(np.repeat(0.2, V)) + 1e-7)
    KLs.append(KL_divergence(topic, dirichlet))
    dirichlets.append(dirichlet)
dirichlets = np.array(dirichlets)

print('min:', min(KLs), 'idx:', np.argmin(KLs))
print('max:', max(KLs), 'idx:', np.argmax(KLs))
plt.figure()
plot_images(plt, [dirichlets[np.argmin(KLs)], dirichlets[np.argmax(KLs)]], (rows, rows), (1, 2))
plt.figure()
s = pd.Series(KLs)
s[s<0.2].hist(bins=50)


min: 0.00517462040329 idx: 25471
max: 15.0194731496 idx: 456050
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d5dadd8>
<matplotlib.figure.Figure at 0x13721c358>

In [58]:
# Generate Dirichlet distributions where the topic has a non-zero
# probability and find the KL divergence with the true topic.
size = 1000000
dirichlets = []
KLs = []
for i in range(size):
    dirichlet = normalize(np.random.dirichlet(topic) + 1e-7)
    KLs.append(KL_divergence(topic, dirichlet))
    dirichlets.append(dirichlet)
dirichlets = np.array(dirichlets)

print('min:', min(KLs), 'idx:', np.argmin(KLs))
print('max:', max(KLs), 'idx:', np.argmax(KLs))
plt.figure()
plot_images(plt, [dirichlets[np.argmin(KLs)], dirichlets[np.argmax(KLs)]], (rows, rows), (1, 2))
plt.figure()
s = pd.Series(KLs)
s[s<0.2].hist(bins=50)


min: 5.18933020771e-06 idx: 90027
max: 9.64630822163 idx: 804638
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x13eaa6f28>
<matplotlib.figure.Figure at 0x12dd85198>

In [59]:
# Add more and more noise to a topic and find the KL divergence with
# the true topic.
size = 1000000
noise_topic = []
KLs = []
for i in range(0, size):
    r = np.abs(np.random.normal(0, 0.1, V)*(i/size)) + 1e-07
    t = normalize(topic + r)
    noise_topic.append(t)
    KLs.append(KL_divergence(topic, t))
noise_topic = np.array(noise_topic)

print('min:', min(KLs), 'idx:', np.argmin(KLs))
print('max:', max(KLs), 'idx:', np.argmax(KLs))
plt.figure()
plot_images(plt, [noise_topic[np.argmin(KLs)], noise_topic[np.argmax(KLs)]], (rows, rows), (1, 2))
plt.figure()
s = pd.Series(KLs)
s[s<0.2].hist(bins=50)


min: 5.99999640023e-07 idx: 0
max: 0.724789128027 idx: 995567
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x13d5fee80>
<matplotlib.figure.Figure at 0x1161a7240>

In [60]:
# Add more and more noise to a topic but only where the topic is non-zero
# probability and find the KL divergence with the topic.
size = 1000000
noise_topic = []
KLs = []
for i in range(0, size):
    r = np.abs(np.random.normal(0, 0.1, rows)*(i/size)) + 1e-07
    r = np.pad(r, (0, V-rows), 'constant', constant_values=(0,1e-07))
    t = normalize(topic + r)
    noise_topic.append(t)
    KLs.append(KL_divergence(topic, t))
noise_topic = np.array(noise_topic)

print('min:', min(KLs), 'idx:', np.argmin(KLs))
print('max:', max(KLs), 'idx:', np.argmax(KLs))
plt.figure()
plot_images(plt, [noise_topic[np.argmin(KLs)], noise_topic[np.argmax(KLs)]], (rows, rows), (1, 2))
plt.figure()
s = pd.Series(KLs)
s[s<0.2].hist(bins=50)


min: 4.85313449661e-07 idx: 896837
max: 0.0673032571001 idx: 916183
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x145fc0a90>
<matplotlib.figure.Figure at 0x13ef415c0>

In [ ]: