In [1]:
import pandas as pd
import statsmodels.api as sm

# Example datasets (statsmodels / Rdatasets), one per kind of response
# variable. Uncomment a print call to read that dataset's documentation.

# Normal response variable: stack loss data.
stackloss_conversion = sm.datasets.get_rdataset("stackloss", "datasets")
# print(stackloss_conversion.__doc__)

# Lognormal response variable: Engel food-expenditure data.
engel_food = sm.datasets.engel.load_pandas()
# print(engel_food.data)

# Binary response variable: Titanic survival table.
titanic_survival = sm.datasets.get_rdataset("Titanic", "datasets")
# print(titanic_survival.__doc__)

# Continuous 0-1 response variable: Duncan occupational prestige.
duncan_prestige = sm.datasets.get_rdataset("Duncan", "car")
# print(duncan_prestige.__doc__)

# Categorical response variable: iris species measurements
# (package defaults to "datasets" when omitted).
iris_flowers = sm.datasets.get_rdataset("iris")
# print(iris_flowers.__doc__)
In [2]:
# Showing plots inline, rather than in a new window
%matplotlib inline
# Modules
from pymc3 import *
import numpy as np
from ggplot import *
# Generating data
size = 200
true_intercept = 1
true_slope = 2
x = np.linspace(0, 1, size)
# y = a + b*x
true_regression_line = true_intercept + true_slope * x
# add noise
y = true_regression_line + np.random.normal(scale=.5, size=size)
# Plotting data
sim_data = pd.DataFrame({"x" : x, "y" : y})
sim_plot = ggplot(sim_data, aes(x="x", y="y")) + geom_point() +\
geom_abline(intercept=true_intercept, slope=true_slope)
print(sim_plot)
In [3]:
with Model() as model:
    # Specify the GLM and pass in the data. The resulting linear model,
    # its likelihood and all its parameters are automatically added to
    # our model. Bug fix: the DataFrame built in the previous cell is
    # named `sim_data`; `data` was never defined (NameError on a fresh
    # kernel). Indentation restored so the statements run inside the
    # model context.
    glm.glm('y ~ x', sim_data)
    step = NUTS()  # Instantiate MCMC sampling algorithm
    # draw 2000 posterior samples using NUTS sampling
    trace = sample(2000, step, progressbar=False)
In [4]:
# Bug fix: `plt` was used but never imported anywhere in this notebook,
# so this cell raised NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Sampling diagnostics: trace and marginal posteriors for each parameter.
plt.figure(figsize=(7, 7))
traceplot(trace)
plt.tight_layout();

# Posterior predictive regression lines plotted over the simulated data
# and the known true regression line.
plt.figure(figsize=(7, 7))
plt.plot(x, y, 'x', label='data')
glm.plot_posterior_predictive(trace, samples=100,
                              label='posterior predictive regression lines')
plt.plot(x, true_regression_line, label='true regression line', lw=3., c='y')
plt.title('Posterior predictive regression lines')
plt.legend(loc=0)
plt.xlabel('x')
plt.ylabel('y');