In [1]:
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import matplotlib.pyplot as plt
## Use the ggplot style
plt.style.use('ggplot')
import numpy as np

First example:

Linear Regression, bayesian approach

Taken from the PyMC3 webpage (http://docs.pymc.io/notebooks/getting_started.html) We are interested in predicting outcomes Y as normally-distributed observations with an expected value μ that is a linear function of two predictor variables, X1 and X2

$$ Y \sim N(\mu, \sigma^2) $$$$ \mu = \alpha + \beta_1 X_1 + \beta_2X_2 $$

where α is the intercept, and βi is the coefficient for covariate Xi, while σ represents the observation error. Since we are constructing a Bayesian model, the unknown variables in the model must be assigned a prior distribution. We choose zero-mean normal priors with variance of 100 for both regression coefficients, which corresponds to weak information regarding the true parameter values. We choose a half-normal distribution (normal distribution bounded at zero) as the prior for σ.

$$ \alpha \sim N(0,100) $$$$ \beta_i \sim N(0,100) $$$$ \sigma \sim N(0,1) $$

Simulation


In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Initialize random number generator
np.random.seed(123)

# True parameter values
alpha, sigma = 1, 1
beta = [1, 2.5]

# Size of dataset
size = 100

# Predictor variable
X1 = np.random.randn(size)
X2 = np.random.randn(size) * 0.2

# Simulate outcome variable
Y = alpha + beta[0]*X1 + beta[1]*X2 + np.random.randn(size)*sigma

In [3]:
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(10,4))
axes[0].scatter(X1, Y)
axes[1].scatter(X2, Y)
axes[0].set_ylabel('Y'); axes[0].set_xlabel('X1'); axes[1].set_xlabel('X2');



In [4]:
#import os
#os.environ['MKL_THREADING_LAYER'] = 'GNU'

In [5]:
## Model Specification
import pymc3 as pm


/opt/conda/envs/biospytial/lib/python2.7/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

In [6]:
#basic_model = pm.Model()
#with basic_model:
    # The priors of the "unknown model"
#    alpha = pm.Normal('alpha',mu=0.0,sd=10)
    ## shape tell that it's a vector of size 2
#    beta = pm.Normal('beta',mu=0.0, sd=10,shape=2)
#    sigma = pm.HalfNormal('sigma', sd=1.0)
    # Expected value of outcome
#    mu = alpha + beta[0]*X1 + beta[1]*X2
    
    # Likelihood (sampling observations)
#    Y_obs = pm.Normal('Y_obs',mu=mu,sd=sigma,observed=Y)

    
basic_model = pm.Model()

with basic_model:

    # Priors for unknown model parameters
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=10, shape=2)
    sigma = pm.HalfNormal('sigma', sd=1)

    # Expected value of outcome
    mu = alpha + beta[0]*X1 + beta[1]*X2

    # Likelihood (sampling distribution) of observations
    Y_obs = pm.Normal('Y_obs', mu=mu, sd=sigma, observed=Y)

In [7]:
%time map_estimate = pm.find_MAP(model=basic_model)


logp = -149.58, ||grad|| = 12.242: 100%|██████████| 19/19 [00:00<00:00, 2376.16it/s]  
CPU times: user 2.37 s, sys: 396 ms, total: 2.76 s
Wall time: 3.3 s


In [8]:
map_estimate


Out[8]:
{'alpha': array(0.90660093),
 'beta': array([0.94848596, 2.60711845]),
 'sigma': array(0.96298858),
 'sigma_log__': array(-0.03771373)}

In [9]:
from scipy import optimize

map_estimate = pm.find_MAP(model=basic_model, fmin=optimize.fmin_powell)

map_estimate


/opt/conda/envs/biospytial/lib/python2.7/site-packages/pymc3/tuning/starting.py:92: UserWarning: In future versions, set the optimization algorithm with a string. For example, use `method="L-BFGS-B"` instead of `fmin=sp.optimize.fmin_l_bfgs_b"`.
  warnings.warn('In future versions, set the optimization algorithm with a string. '
logp = -149.47:   3%|▎         | 170/5000 [00:00<00:00, 5665.91it/s]
Optimization terminated successfully.
         Current function value: 148.984564
         Iterations: 4
         Function evaluations: 176
Out[9]:
{'alpha': array(0.90907964),
 'beta': array([0.9514399 , 2.61452795]),
 'sigma': array(0.96568062),
 'sigma_log__': array(-0.03492212)}

The step method (Sampling method)


In [10]:
from scipy import optimize
with basic_model:
    trace = pm.sample()


/opt/conda/envs/biospytial/lib/python2.7/site-packages/pymc3/model.py:384: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if not np.issubdtype(var.dtype, float):
logp = -149.47:   4%|▎         | 176/5000 [00:00<00:03, 1526.93it/s]
100%|██████████| 1000/1000 [00:01<00:00, 669.34it/s]

In [11]:
trace['alpha'][-5:]


Out[11]:
array([0.91413563, 0.99144543, 0.86701888, 0.97633555, 0.97407124])

If we wanted to use the slice sampling algorithm to sigma instead of NUTS (which was assigned automatically), we could have specified this as the step argument for sample.


In [12]:
with basic_model:

    # obtain starting values via MAP
    start = pm.find_MAP(fmin=optimize.fmin_powell)

    # instantiate sampler
    step = pm.Slice()

    # draw 5000 posterior samples
    trace = pm.sample(5000, step=step, start=start)


logp = -149.47:   4%|▎         | 176/5000 [00:00<00:19, 245.67it/s] 
Optimization terminated successfully.
         Current function value: 148.984564
         Iterations: 4
         Function evaluations: 176
100%|██████████| 5500/5500 [00:39<00:00, 139.76it/s]

Posterior analysis

PyMC3 provides plotting and summarization functions for inspecting the sampling output. A simple posterior plot can be created using traceplot.


In [13]:
_ = pm.traceplot(trace)


Case study 1: Stochastic volatility

We present a case study of stochastic volatility, time varying stock market volatility, to illustrate PyMC3’s use in addressing a more realistic problem. The distribution of market returns is highly non-normal, which makes sampling the volatilities significantly more difficult. This example has 400+ parameters so using common sampling algorithms like Metropolis-Hastings would get bogged down, generating highly autocorrelated samples. Instead, we use NUTS, which is dramatically more efficient. The Model

Asset prices have time-varying volatility (variance of day over day returns). In some periods, returns are highly variable, while in others they are very stable. Stochastic volatility models address this with a latent volatility variable, which changes over time. The following model is similar to the one described in the NUTS paper (Hoffman 2014, p. 21).

$$ \sigma \sim exp(50) $$$$ \nu \sim exp(0.1) $$$$ s_i \sim N(s_{i-1},\sigma^{-2}) $$$$ log(y_i) \sim t(\nu,0,exp(-2s_i)) $$

Here, y is the daily return series which is modeled with a Student-t distribution with an unknown degrees of freedom parameter, and a scale parameter determined by a latent process s. The individual si are the individual daily log volatilities in the latent log volatility process.

The Data

Our data consist of daily returns of the S&P 500 during the 2008 financial crisis. Here, we use pandas-datareader to obtain the price data from Yahoo!-Finance; it can be installed with pip install pandas-datareader.


In [15]:
import pandas_datareader.data as web
import pandas as pd

In [41]:
import pandas_datareader.data as web
import datetime

# Original
start = datetime.datetime(2008, 5, 1)
end = datetime.datetime(2009,12,1)

##Meine parameters
start = datetime.datetime(2018, 1, 1)
end = datetime.datetime(2018,2,12)


returns = web.DataReader('SPY', 'google', start, end)['Close'].pct_change()

#f.ix['2010-01-04']

In [42]:
returns.plot(figsize=(10, 6))
plt.ylabel('daily returns in %');



In [43]:
with pm.Model() as sp500_model:
    nu = pm.Exponential('nu', 1./10, testval=5.)
    sigma = pm.Exponential('sigma', 1./.02, testval=.1)

    s = pm.GaussianRandomWalk('s', sigma**-2, shape=len(returns))
    volatility_process = pm.Deterministic('volatility_process', pm.math.exp(-2*s))

    r = pm.StudentT('r', nu, lam=1/volatility_process, observed=returns)

Model Fitting


In [44]:
with sp500_model:
    trace = pm.sample(2000)


ERROR:pymc3:There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
 40%|███▉      | 999/2500 [00:46<01:09, 21.65it/s]ERROR:pymc3:There were 817 divergences after tuning. Increase `target_accept` or reparameterize.
 68%|██████▊   | 1712/2500 [01:31<00:42, 18.75it/s]WARNING:pymc3:The acceptance probability does not match the target. It is 0.3743933679270309, but should be close to 0.8. Try to increase the number of tuning steps.
 95%|█████████▍| 2364/2500 [02:15<00:07, 17.44it/s]ERROR:pymc3:The estimated number of effective samples is smaller than 200 for some parameters.
100%|██████████| 2500/2500 [02:22<00:00, 17.51it/s]

In [45]:
pm.traceplot(trace, [nu, sigma]);



In [46]:
fig, ax = plt.subplots(figsize=(15, 8))
returns.plot(ax=ax)
ax.plot(returns.index, 1/np.exp(trace['s',::5].T), 'r', alpha=.03);
ax.set(title='volatility_process', xlabel='time', ylabel='volatility');
ax.legend(['S&P500', 'stochastic volatility process'])


Out[46]:
<matplotlib.legend.Legend at 0x7f9d7da3ef90>

Case study 2 Coal Mining disasters

Consider the following time series of recorded coal mining disasters in the UK from 1851 to 1962 (Jarrett, 1979). The number of disasters is thought to have been affected by changes in safety regulations during this period. Unfortunately, we also have pair of years with missing data, identified as missing by a NumPy MaskedArray using -999 as the marker value.

Next we will build a model for this series and attempt to estimate when the change occurred. At the same time, we will see how to handle missing data, use multiple samplers and sample from discrete random variables.


In [47]:
disaster_data = np.ma.masked_values([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
                            3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,
                            2, 2, 3, 4, 2, 1, 3, -999, 2, 1, 1, 1, 1, 3, 0, 0,
                            1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
                            0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
                            3, 3, 1, -999, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
                            0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1], value=-999)
year = np.arange(1851, 1962)

plt.plot(year, disaster_data, 'o', markersize=8);
plt.ylabel("Disaster count")
plt.xlabel("Year")


Out[47]:
<matplotlib.text.Text at 0x7f9d7c7df050>
\begin{split}\begin{aligned} D_t &\sim \text{Pois}(r_t), r_t= \begin{cases} l, & \text{if } t \lt s \\ e, & \text{if } t \ge s \end{cases} \\ s &\sim \text{Unif}(t_l, t_h)\\ e &\sim \text{exp}(1)\\ l &\sim \text{exp}(1) \end{aligned}\end{split}

the parameters are defined as follows:

  • Dt: The number of disasters in year t
  • rt: The rate parameter of the Poisson distribution of disasters in year t.
  • s: The year in which the rate parameter changes (the switchpoint).
  • e: The rate parameter before the switchpoint s.
  • l: The rate parameter after the switchpoint s.
  • tl, th: The lower and upper boundaries of year t.

In [54]:
with pm.Model() as disaster_model:

    switchpoint = pm.DiscreteUniform('switchpoint', lower=year.min(), upper=year.max(), testval=1900)

    # Priors for pre- and post-switch rates number of disasters
    early_rate = pm.Exponential('early_rate', 1)
    late_rate = pm.Exponential('late_rate', 1)

    # Allocate appropriate Poisson rates to years before and after current
    rate = pm.math.switch(switchpoint >= year, early_rate, late_rate)

    disasters = pm.Poisson('disasters', rate, observed=disaster_data)

In [55]:
with disaster_model:
    trace = pm.sample(10000)


ERROR:pymc3:Tuning was enabled throughout the whole trace.
  0%|          | 0/10500 [00:00<?, ?it/s]/opt/conda/envs/biospytial/lib/python2.7/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
 33%|███▎      | 3456/10500 [00:27<00:55, 126.58it/s]ERROR:pymc3:Tuning was enabled throughout the whole trace.
 67%|██████▋   | 6998/10500 [00:54<00:27, 128.04it/s]WARNING:pymc3:The number of effective samples is smaller than 10% for some parameters.
100%|█████████▉| 10492/10500 [01:21<00:00, 129.19it/s]/opt/conda/envs/biospytial/lib/python2.7/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
100%|██████████| 10500/10500 [01:21<00:00, 129.18it/s]

In [56]:
pm.traceplot(trace)


Out[56]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f9d7f35fb90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f9d7ac8b850>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f9d7c7a8250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f9d75dbba90>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f9d75d35e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f9d75d0da10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f9d75c86f90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f9d75d68090>]],
      dtype=object)

In [ ]: