Can I understand what a Normal mixture model is and how to use it?

Los Alamos National Laboratory ISR-1 Space Science and Applications Brian Larsen, Ph.D. balarsen@lanl.gov 505-665-7691
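
A Normal (Gaussian) mixture model treats the observed data as draws from a weighted combination of Normal components,

$$p(x) = \sum_{k=1}^{K} w_k\,\mathcal{N}(x \mid \mu_k, \sigma_k), \qquad w_k \ge 0, \quad \sum_{k=1}^{K} w_k = 1,$$

so fitting one means inferring the weights $w_k$, means $\mu_k$, and standard deviations $\sigma_k$ without ever observing which component produced each point.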

In [1]:
%matplotlib inline
#%matplotlib notebook
%load_ext version_information
%load_ext autoreload

In [51]:
import datetime
import os
import sys
import warnings

warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacepy.datamodel as dm
import spacepy.toolbox as tb
import tqdm
import pymc3 as mc3
import seaborn as sns
sns.set()

%version_information matplotlib, numpy, pandas


Out[51]:
Software      Version
Python        3.7.5 64bit [Clang 4.0.1 (tags/RELEASE_401/final)]
IPython       7.9.0
OS            Darwin 18.7.0 x86_64 i386 64bit
matplotlib    3.1.1
numpy         1.17.3
pandas        0.25.3
Fri Nov 08 16:49:27 2019 CST

In [76]:
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['savefig.dpi'] = plt.rcParams['figure.dpi'] # 72
%config InlineBackend.figure_format = 'retina'

In [81]:
# Let's make two Normal distributions and add them together
locs = np.asarray([3, 10])
sds = np.asarray([2, 2])
D1 = np.random.normal(locs[0], sds[0], size=100)
D2 = np.random.normal(locs[1], sds[1], size=80)

In [82]:
# bins = np.linspace(-24,24, 100)
sns.distplot(D1, kde=True, label='D1')
sns.distplot(D2, kde=True, label='D2')

# plt.hist(D1, bins=bins, histtype='step', lw=3, label='D1')
# plt.hist(D2, bins=bins, histtype='step', lw=3, label='D2')
D = np.append(D1,D2)
sns.distplot(D, kde=True, label='D1+D2')

plt.legend()


Out[82]:
<matplotlib.legend.Legend at 0x1c3510af50>
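
As a sanity check, we can overlay the exact mixture density implied by the generating parameters. A minimal sketch, assuming scipy is available (the 100/80 sample split fixes the true weights):

In [ ]:
from scipy import stats

x = np.linspace(-5, 20, 500)
w_true = np.array([100, 80]) / 180.0  # true component weights from the sample sizes
pdf = sum(w * stats.norm(m, s).pdf(x) for w, m, s in zip(w_true, locs, sds))
plt.plot(x, pdf, 'k--', lw=2, label='true mixture pdf')
plt.legend()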

In [113]:
D.shape


Out[113]:
(180,)

In [114]:
SEED = 20161210
with mc3.Model() as model:
    # flat Dirichlet prior over the mixture weights
    w = mc3.Dirichlet('w', np.ones_like(locs))

    # weakly informative priors on each component's location and spread
    mu = mc3.Uniform('mu', -10., 20., shape=locs.size)
    sd = mc3.Gamma('sd', 1., 1., shape=locs.size)

    x_obs = mc3.NormalMixture('x_obs', w, mu, sd=sd, observed=D)
    trace = mc3.sample(1000, tune=4000, random_seed=SEED)[100:]


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sd, mu, w]
Sampling 2 chains: 100%|██████████| 10000/10000 [00:10<00:00, 925.65draws/s]

In [115]:
mc3.traceplot(trace, combined=False);



In [116]:
mc3.summary(trace)


Out[116]:
        mean       sd        mc_error  hpd_2.5   hpd_97.5   n_eff       Rhat
w__0    0.418748   0.049813  0.001729  0.322656  0.515661   690.123426  0.999978
w__1    0.581252   0.049813  0.001729  0.484339  0.677344   690.123426  0.999978
mu__0   10.570824  0.379134  0.012052  9.827874  11.287996  797.915218  1.000247
mu__1   2.808082   0.321692  0.011146  2.214258  3.486714   660.365005  1.000139
sd__0   2.128989   0.280798  0.009118  1.576299  2.647154   859.876193  0.999844
sd__1   2.223601   0.256571  0.008812  1.753544  2.699334   752.050276  1.000200
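
Mixture components are only identified up to relabeling (here the sampler happened to put the loc-10 component first), so the honest comparison is against the sorted truth. A quick sketch:

In [ ]:
# Compare sorted posterior means to the generating values.
print(np.sort(trace['mu'].mean(axis=0)))  # truth: [3, 10]
print(np.sort(trace['sd'].mean(axis=0)))  # truth: [2, 2]
print(np.sort(trace['w'].mean(axis=0)))   # truth: [80/180, 100/180] ≈ [0.44, 0.56]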

In [117]:
fig, ax = plt.subplots(3, 2, figsize=(10,12))
axs = mc3.plot_posterior(trace, point_estimate='median', ax=ax)
ax = ax.flatten()
# red lines mark the generating values; component 0 came out as the loc-10
# one, so its true weight is 80/(100+80)
ax[0].axvline(80/(100+80), c='r', lw=3)
ax[1].axvline(1 - 80/(100+80), c='r', lw=3)

ax[2].axvline(10, c='r', lw=3)
ax[3].axvline(3, c='r', lw=3)

ax[4].axvline(2, c='r', lw=3)
ax[5].axvline(2, c='r', lw=3)


Out[117]:
<matplotlib.lines.Line2D at 0x1c2ddea590>

In [118]:
ppc = mc3.sample_posterior_predictive(trace, samples=205, model=model)


100%|██████████| 205/205 [00:01<00:00, 147.77it/s]

In [119]:
hist, bins = np.histogram(ppc['x_obs'], bins='doane', density=True)
hist2 = []
for i in tqdm.tqdm_notebook(range(ppc['x_obs'].shape[0])):
    h, _ = np.histogram(ppc['x_obs'][i], bins=bins, density=True)
    hist2.append(h)
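
The same per-draw histograms can also be built without the explicit loop; a minimal sketch (`hist2_alt` is a name introduced here):

In [ ]:
hist2_alt = np.apply_along_axis(
    lambda row: np.histogram(row, bins=bins, density=True)[0], 1, ppc['x_obs'])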




In [120]:
hist2 = np.asarray(hist2)
hist2.shape


Out[120]:
(205, 21)

In [121]:
perc = np.percentile(hist2, [2.5, 97.5, 50], axis=0)
perc.shape


Out[121]:
(3, 21)

In [123]:
# plt.plot(tb.bin_edges_to_center(bins), perc[2,:], lw=4, label='MCMC')
plt.fill_between(tb.bin_edges_to_center(bins), perc[0,:], perc[1,:], color='red', alpha=0.2)
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');
plt.legend();



In [130]:
plt.plot(tb.bin_edges_to_center(bins), hist2.T, c='r', alpha=0.1);
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');


What happens if we try to fit one more component than the data actually contain?


In [93]:
np.ones_like(locs), locs.size


Out[93]:
(array([1, 1]), 2)

In [94]:
SEED = 20161210
with mc3.Model() as model:
    w = mc3.Dirichlet('w', np.ones(3))

    mu = mc3.Uniform('mu', -10., 20., shape=3)
    sd = mc3.Gamma('sd', 1., 1., shape=3)

    x_obs = mc3.NormalMixture('x_obs', w, mu, sd=sd, observed=D)
    trace = mc3.sample(1000, tune=4000, random_seed=SEED)[100:]


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sd, mu, w]
Sampling 2 chains: 100%|██████████| 10000/10000 [00:45<00:00, 220.68draws/s]
The acceptance probability does not match the target. It is 0.7213512362490077, but should be close to 0.8. Try to increase the number of tuning steps.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
The estimated number of effective samples is smaller than 200 for some parameters.

In [95]:
mc3.traceplot(trace, combined=False);



In [96]:
mc3.summary(trace)


Out[96]:
        mean      sd        mc_error  hpd_2.5    hpd_97.5   n_eff       Rhat
w__0    0.312153  0.200202  0.015272  0.001565   0.609614   42.083284   1.056298
w__1    0.363992  0.192752  0.014489  0.001312   0.610719   69.848318   1.041630
w__2    0.323855  0.181285  0.012720  0.001502   0.596365   119.124245  1.001337
mu__0   6.280732  4.023692  0.346288  0.300453   12.667621  30.271210   1.021969
mu__1   6.119546  4.284265  0.383118  -0.177347  12.356174  19.325555   1.088465
mu__2   7.666325  4.152672  0.352901  0.176533   12.730619  31.077220   1.046554
sd__0   1.730737  0.734515  0.050198  0.185583   2.707843   66.163461   1.045203
sd__1   1.952408  0.595265  0.032802  0.434038   2.803932   210.931557  1.006848
sd__2   1.852558  0.658708  0.036656  0.265970   2.873346   198.383152  1.004821

In [97]:
ppc = mc3.sample_posterior_predictive(trace, samples=205, model=model)


100%|██████████| 205/205 [00:01<00:00, 201.07it/s]

In [110]:
hist, bins = np.histogram(ppc['x_obs'], bins='doane', density=True)
hist2 = []
for i in tqdm.tqdm_notebook(range(ppc['x_obs'].shape[0])):
    h, _ = np.histogram(ppc['x_obs'][i], bins=bins, density=True)
    hist2.append(h)
hist2 = np.asarray(hist2)
perc = np.percentile(hist2, [2.5, 97.5, 50], axis=0)




In [111]:
hist2.shape


Out[111]:
(205, 21)

In [112]:
plt.plot(tb.bin_edges_to_center(bins), perc[2,:], lw=4, label='MCMC')
plt.fill_between(tb.bin_edges_to_center(bins), perc[0,:], perc[1,:], color='red', alpha=0.2)
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');
plt.legend();



In [100]:
fig, ax = plt.subplots(5, 2, figsize=(10,12))
axs = mc3.plot_posterior(trace, point_estimate='median', ax=ax)


How about fitting way more components than the data support?


In [101]:
SEED = 20161210
with mc3.Model() as model:
    w = mc3.Dirichlet('w', np.ones(10))

    mu = mc3.Uniform('mu', -10., 20., shape=10)
    sd = mc3.Gamma('sd', 1., 1., shape=10)

    x_obs = mc3.NormalMixture('x_obs', w, mu, sd=sd, observed=D)
    trace = mc3.sample(1000, tune=4000, random_seed=SEED)[100:]


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sd, mu, w]
Sampling 2 chains: 100%|██████████| 10000/10000 [08:22<00:00, 19.88draws/s]
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.7061952590064255, but should be close to 0.8. Try to increase the number of tuning steps.
There were 5 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.6571826146114126, but should be close to 0.8. Try to increase the number of tuning steps.
The estimated number of effective samples is smaller than 200 for some parameters.

In [102]:
ppc = mc3.sample_posterior_predictive(trace, samples=205, model=model)


100%|██████████| 205/205 [00:01<00:00, 162.88it/s]

In [107]:
ppc['x_obs'].shape


Out[107]:
(205, 180)

In [108]:
hist, bins = np.histogram(ppc['x_obs'], bins='doane', density=True)
hist2 = []
for i in tqdm.tqdm_notebook(range(ppc['x_obs'].shape[0])):
    h, _ = np.histogram(ppc['x_obs'][i], bins=bins, density=True)
    hist2.append(h)
hist2 = np.asarray(hist2)
perc = np.percentile(hist2, [2.5, 97.5, 50], axis=0)




In [109]:
plt.plot(tb.bin_edges_to_center(bins), perc[2,:], lw=4, label='MCMC')
plt.fill_between(tb.bin_edges_to_center(bins), perc[0,:], perc[1,:], color='red', alpha=0.2)
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');
plt.legend();



In [106]:
mc3.summary(trace)


Out[106]:
        mean      sd        mc_error  hpd_2.5        hpd_97.5   n_eff       Rhat
w__0    0.103413  0.092281  0.004059  1.230685e-04   0.283837   417.655581  1.003953
w__1    0.112620  0.100001  0.005543  3.837580e-04   0.333868   181.795993  1.001259
w__2    0.096115  0.085142  0.003713  2.631887e-06   0.259823   338.618326  1.000245
w__3    0.102036  0.089301  0.003314  1.783121e-04   0.290462   626.489415  0.999445
w__4    0.105493  0.091619  0.004363  8.718963e-05   0.289211   212.498151  1.011922
w__5    0.091962  0.090634  0.004842  1.484206e-04   0.281233   314.275621  0.999489
w__6    0.091259  0.085936  0.004334  1.467980e-04   0.259275   306.767458  0.999471
w__7    0.097675  0.087628  0.003907  2.979057e-05   0.281999   367.613137  0.999445
w__8    0.099904  0.093334  0.004155  4.213182e-07   0.306513   312.554912  0.999758
w__9    0.099523  0.088337  0.003975  2.293775e-06   0.279621   463.061962  0.999672
mu__0   5.519461  4.823795  0.282729  -2.924454e+00  13.343928  169.515744  0.999611
mu__1   5.771182  4.665659  0.303333  -1.004758e+00  15.083957  100.816117  1.030044
mu__2   6.258603  4.916008  0.307224  -2.825964e+00  13.664397  158.988185  1.002461
mu__3   6.272901  4.810111  0.346107  -1.052012e+00  14.464774  126.530596  0.999456
mu__4   6.769064  4.584259  0.316118  -5.674351e-01  14.181433  121.427436  1.004440
mu__5   5.704304  4.912238  0.317270  -2.819183e+00  13.862016  127.027148  1.021489
mu__6   6.315250  4.845812  0.340286  -2.695756e+00  13.857734  105.854806  0.999463
mu__7   5.950838  4.617742  0.286154  -1.282780e+00  14.839604  192.592435  1.001289
mu__8   5.815389  4.924744  0.349777  -2.672676e+00  13.808999  118.478840  1.002273
mu__9   6.068652  5.037358  0.307899  -2.661900e+00  14.140602  202.949354  1.000678
sd__0   1.248565  0.851715  0.029768  2.529689e-02   2.940917   588.548692  1.000228
sd__1   1.226828  0.820384  0.036100  3.348875e-02   2.742105   244.461101  1.009128
sd__2   1.154684  0.853156  0.035898  4.263769e-02   2.790097   414.619506  0.999522
sd__3   1.215543  0.878116  0.033197  3.586062e-02   2.810847   571.926755  1.000950
sd__4   1.205965  0.875738  0.041868  3.368216e-02   2.828123   284.534333  1.003973
sd__5   1.142458  0.899993  0.042239  2.168344e-02   2.866468   363.169700  1.000789
sd__6   1.139376  0.842455  0.037575  2.854562e-02   2.733343   397.091167  0.999659
sd__7   1.203247  0.889579  0.038163  2.405603e-03   2.864871   439.640356  0.999812
sd__8   1.229480  0.832186  0.042222  1.662147e-02   2.751693   232.588962  0.999590
sd__9   1.240563  0.841788  0.033949  2.906315e-02   2.761328   360.820344  0.999893
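
The posterior-predictive histogram can look fine even when the model is badly over-parameterized; the diffuse, near-identical weights and means above are the giveaway. A penalized predictive criterion is a better guide. A sketch, assuming `mc3.waic` with its usual `(trace, model)` signature:

In [ ]:
# WAIC penalizes the effective number of parameters; computing it for each of
# the 2-, 3-, and 10-component traces (kept under separate names) would let
# us rank the models.
print(mc3.waic(trace, model))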

One way to break the label-switching symmetry directly is to order the means, here by making `mu1` the lower bound of `mu2`:

In [ ]:
with mc3.Model() as model:
    w = mc3.Dirichlet('w', np.ones_like(locs))

    mu1 = mc3.Uniform('mu1', -10., 20.)
    mu2 = mc3.Uniform('mu2', mu1, 20.)
    sd = mc3.Gamma('sd', 1., 1., shape=locs.size)

    x_obs = mc3.NormalMixture('x_obs', w, [mu1, mu2], sd=sd, observed=D)
    start = mc3.find_MAP()
    trace = mc3.sample(5000, n_init=10000, tune=1000, random_seed=SEED, start=start)[1000:]
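
The same ordering can also be imposed with PyMC3's ordered transform instead of nested bounds; a sketch following the pattern in the PyMC3 mixture examples (the Normal prior on the means is an assumption made here):

In [ ]:
with mc3.Model() as ordered_model:
    w = mc3.Dirichlet('w', np.ones_like(locs))
    # the ordered transform keeps mu[0] < mu[1] throughout sampling;
    # testval starts the chain at a valid ordered point
    mu = mc3.Normal('mu', 0., 10., shape=locs.size,
                    transform=mc3.distributions.transforms.ordered,
                    testval=np.array([1., 2.]))
    sd = mc3.Gamma('sd', 1., 1., shape=locs.size)
    x_obs = mc3.NormalMixture('x_obs', w, mu, sd=sd, observed=D)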

In [ ]:
mc3.traceplot(trace, combined=True);
mc3.summary(trace)

In [ ]:
ppc = mc3.sample_posterior_predictive(trace, samples=5000, model=model)
hist, bins = np.histogram(ppc['x_obs'], bins='doane', density=True)
plt.plot(tb.bin_edges_to_center(bins), hist, lw=4, label='MCMC')
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');
plt.legend()

And do the same with the more general `Mixture` class


In [ ]:
D.shape

In [ ]:
with mc3.Model() as model:
    w = mc3.Dirichlet('w', np.ones_like(locs))

    mu = mc3.Uniform('mu', -10., 20., shape=locs.size)
    sd = mc3.Gamma('sd', 1., 1., shape=locs.size)
    
    x_obs = mc3.Mixture('x_obs', w=w, comp_dists=mc3.Normal.dist(mu=mu, sd=sd), observed=D)

    start = mc3.find_MAP()
    trace = mc3.sample(5000, n_init=10000, tune=1000, random_seed=SEED, start=start)[1000:]

In [ ]:
# For reference: PyMC3's own test of Mixture with a list of component
# distributions (copied from pymc3/tests/test_mixture.py, hence the `self`
# and the Metropolis step; not meant to run in this notebook as-is).
def test_mixture_list_of_poissons(self):
    with Model() as model:
        w = Dirichlet('w', np.ones_like(self.pois_w))
 
        mu = Gamma('mu', 1., 1., shape=self.pois_w.size)
 
        x_obs = Mixture('x_obs', w,
                        [Poisson.dist(mu[0]), Poisson.dist(mu[1])],
                        observed=self.pois_x)
 
        step = Metropolis()
        trace = sample(5000, step, random_seed=self.random_seed, progressbar=False)
 
    assert_allclose(np.sort(trace['w'].mean(axis=0)),
                    np.sort(self.pois_w),
                    rtol=0.1, atol=0.1)
    assert_allclose(np.sort(trace['mu'].mean(axis=0)),
                    np.sort(self.pois_mu),
                    rtol=0.1, atol=0.1)
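
Adapting that list-of-components pattern to our two-Normal data would look something like this sketch:

In [ ]:
with mc3.Model() as list_model:
    w = mc3.Dirichlet('w', np.ones_like(locs))
    mu = mc3.Uniform('mu', -10., 20., shape=locs.size)
    sd = mc3.Gamma('sd', 1., 1., shape=locs.size)
    x_obs = mc3.Mixture('x_obs', w,
                        [mc3.Normal.dist(mu[0], sd[0]),
                         mc3.Normal.dist(mu[1], sd[1])],
                        observed=D)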

In [ ]:
ppc = mc3.sample_posterior_predictive(trace, samples=5000, model=model)
hist, bins = np.histogram(ppc['x_obs'], bins='doane', density=True)
plt.plot(tb.bin_edges_to_center(bins), hist, lw=4, label='MCMC')
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');
plt.legend()

In [ ]:
mc3.traceplot(trace, combined=True);
mc3.summary(trace)

In [ ]:
with mc3.Model() as model:
    w = mc3.Dirichlet('w', np.ones_like(locs))

    mu1 = mc3.Uniform('mu1', -10., 20.)
    mu2 = mc3.Uniform('mu2', mu1, 20.)
    
    sd = mc3.Gamma('sd', 1., 1., shape=locs.size)
    
    x_obs = mc3.Mixture('x_obs', w=w, comp_dists=mc3.Normal.dist(mu=[mu1,mu2], sd=sd, shape=locs.size), 
                        observed=D)

    start = mc3.find_MAP()
    trace = mc3.sample(5000, n_init=10000, tune=1000, random_seed=SEED, start=start)[1000:]

In [ ]:
mc3.traceplot(trace, combined=True);
mc3.summary(trace)

In [ ]:
ppc = mc3.sample_posterior_predictive(trace, samples=5000, model=model)
hist, bins = np.histogram(ppc['x_obs'], bins='doane', density=True)
plt.plot(tb.bin_edges_to_center(bins), hist, lw=4, label='MCMC')
plt.hist(D, bins=bins, histtype='step', lw=3, density=True, label='Data');
plt.legend()
