In [1]:
import pandas as pd
import statsmodels.api as sm

# Example datasets (statsmodels / Rdatasets), one per kind of response
# variable. Uncomment a print call to read that dataset's documentation.

# Normal response variable: stack loss data.
stackloss_conversion = sm.datasets.get_rdataset("stackloss", "datasets")
# print(stackloss_conversion.__doc__)

# Lognormal response variable: Engel food-expenditure data.
engel_food = sm.datasets.engel.load_pandas()
# print(engel_food.data)

# Binary response variable: Titanic survival table.
titanic_survival = sm.datasets.get_rdataset("Titanic", "datasets")
# print(titanic_survival.__doc__)

# Continuous 0-1 response variable: Duncan occupational prestige.
duncan_prestige = sm.datasets.get_rdataset("Duncan", "car")
# print(duncan_prestige.__doc__)

# Categorical response variable: iris species measurements
# (package defaults to "datasets" when omitted).
iris_flowers = sm.datasets.get_rdataset("iris")
# print(iris_flowers.__doc__)
In [2]:
# Showing plots inline, rather than in a new window
%matplotlib inline
# Modules
from pymc3 import *
import numpy as np
from ggplot import *
# Generating data
size = 200
true_intercept = 1
true_slope = 2
x = np.linspace(0, 1, size)
# y = a + b*x
true_regression_line = true_intercept + true_slope * x
# add noise
y = true_regression_line + np.random.normal(scale=.5, size=size)
# Plotting data
sim_data = pd.DataFrame({"x" : x, "y" : y})
sim_plot = ggplot(sim_data, aes(x="x", y="y")) + geom_point() +\
geom_abline(intercept=true_intercept, slope=true_slope)
print(sim_plot)
In [3]:
with Model() as model:
    # Specify the GLM and pass in the data. The resulting linear model,
    # its likelihood and all its parameters are automatically added to
    # our model. Bug fix: the DataFrame built in the previous cell is
    # named `sim_data`; `data` was never defined (NameError on a fresh
    # kernel). Indentation restored so the statements run inside the
    # model context.
    glm.glm('y ~ x', sim_data)
    step = NUTS()  # Instantiate MCMC sampling algorithm
    # draw 2000 posterior samples using NUTS sampling
    trace = sample(2000, step, progressbar=False)
In [4]:
# Bug fix: `plt` was used but never imported anywhere in this notebook,
# so this cell raised NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Sampling diagnostics: trace and marginal posteriors for each parameter.
plt.figure(figsize=(7, 7))
traceplot(trace)
plt.tight_layout();

# Posterior predictive regression lines plotted over the simulated data
# and the known true regression line.
plt.figure(figsize=(7, 7))
plt.plot(x, y, 'x', label='data')
glm.plot_posterior_predictive(trace, samples=100,
                              label='posterior predictive regression lines')
plt.plot(x, true_regression_line, label='true regression line', lw=3., c='y')
plt.title('Posterior predictive regression lines')
plt.legend(loc=0)
plt.xlabel('x')
plt.ylabel('y');