This notebook explores and models data from recordings of the natural acoustic environment along the urban-rural gradient near Innsbruck, Austria. The models are Bayesian multilevel models implemented with the PyMC3 probabilistic programming library.
References:
- https://github.com/fonnesbeck/multilevel_modeling
- Gelman, A., & Hill, J. (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models (1st ed.). Cambridge University Press.
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
import pandas
import numpy
from os import path
In [6]:
%matplotlib inline
from matplotlib import pyplot
from matplotlib.patches import Rectangle
import seaborn
import mpld3
from mpld3 import plugins
In [7]:
from pymc3 import (glm, Model, NUTS, sample, stats,
                   forestplot, traceplot, plot_posterior, summary,
                   Normal, Uniform, Deterministic, StudentT)
from pymc3.backends import SQLite
In [8]:
from matplotlib import rcParams
In [9]:
rcParams['font.sans-serif']  # inspect the current sans-serif font preference list
In [10]:
rcParams['font.sans-serif'] = ['Helvetica', 'Arial', 'Bitstream Vera Sans',
                               'DejaVu Sans', 'Lucida Grande', 'Verdana',
                               'Geneva', 'Lucid', 'Avant Garde', 'sans-serif']
In [11]:
data_filepath = "/Users/Jake/OneDrive/Documents/alpine soundscapes/data/dataset.csv"
In [12]:
trace_output_path = "/Users/Jake/OneDrive/Documents/alpine soundscapes/data/model traces/biophony"
In [13]:
seaborn_blue = seaborn.color_palette()[0]
In [14]:
data = pandas.read_csv(data_filepath)
data = data.loc[data.site <= 30]  # keep only sites 1 through 30
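As a quick sanity check (a hypothetical cell, assuming each of the 30 sites was visited repeatedly), confirm the number of sites and the range of visits per site:
In [ ]:
# hypothetical check: number of sites and min/max visits per site
data.site.nunique(), data.groupby('site').size().min(), data.groupby('site').size().max()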
Sort the data by site and then by visit (the `sound` column).
In [16]:
data_sorted = data.sort_values(by=['site', 'sound']).reset_index(drop=True)
Transform the variables by centering each on its mean.
In [18]:
column_list = ['sel', 'sel_anthrophony', 'sel_biophony', 'biophony', 'week',
               'building_50m', 'pavement_50m', 'forest_50m', 'field_50m',
               'building_100m', 'pavement_100m', 'forest_100m', 'field_100m',
               'building_200m', 'pavement_200m', 'forest_200m', 'field_200m',
               'building_500m', 'pavement_500m', 'forest_500m', 'field_500m',
               'd2n_50m', 'd2n_100m', 'd2n_200m', 'd2n_500m',
               'temperature', 'wind_speed', 'precipitation', 'pressure', 'bus_stop',
               'construction', 'crossing', 'cycleway', 'elevator', 'escape', 'footway',
               'living_street', 'motorway', 'motorway_link', 'path', 'pedestrian',
               'platform', 'primary_road', 'primary_link', 'proposed', 'residential',
               'rest_area', 'secondary', 'secondary_link', 'service', 'services',
               'steps', 'tertiary', 'tertiary_link', 'track', 'unclassified', 'combo']
data_centered = data_sorted.copy()
for column in column_list:
    data_centered[column] = data_sorted[column] - data_sorted[column].mean()
# copy land_composite values
#land_composites = ['land_composite_50m', 'land_composite_100m', 'land_composite_200m', 'land_composite_500m']
#for column in land_composites:
# data_centered[column] = data_sorted[column]
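As a hedged check that the centering worked, the centered columns should now have means of (numerically) zero:
In [ ]:
# hypothetical check: the largest absolute column mean should be ~0
data_centered[column_list].mean().abs().max()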
Create a zero-based `sites` index variable for the PyMC3 models.
In [20]:
sites = numpy.copy(data_sorted.site.values) - 1  # convert 1-based site numbers to 0-based indices
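A minimal sketch to verify the index before using it to address per-site parameters (assumes site numbers 1 through 30 are contiguous):
In [ ]:
# hypothetical check: index should run 0..29, one value per site
assert sites.min() == 0 and sites.max() == len(set(sites)) - 1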
In [22]:
with Model() as model0:
    # Priors
    mu_grand = Normal('mu_grand', mu=0., tau=0.0001)
    sigma_a = Uniform('sigma_a', lower=0, upper=100)
    tau_a = sigma_a**-2
    # Random intercepts
    a = Normal('a', mu=mu_grand, tau=tau_a, shape=len(set(sites)))
    # Model error
    sigma_y = Uniform('sigma_y', lower=0, upper=100)
    tau_y = sigma_y**-2
    # Expected value
    y_hat = a[sites]
    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, tau=tau_y, observed=data_centered.biophony)
    # sample model
    backend = SQLite(path.join(trace_output_path, "model0.sqlite"))
    model0_samples = sample(draws=10000, step=NUTS(), random_seed=1, trace=backend)
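Before interpreting the per-site intercepts, it is worth confirming the sampler mixed well. A minimal sketch using the `traceplot` and `summary` functions imported above; discarding the first 5,000 draws as warm-up is an assumption, not a setting from the original analysis:
In [ ]:
# hypothetical diagnostics: traces and posterior summaries for the hyperparameters
traceplot(model0_samples[5000:], varnames=['mu_grand', 'sigma_a', 'sigma_y'])
summary(model0_samples[5000:], varnames=['mu_grand', 'sigma_a', 'sigma_y'])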
In [26]:
fig, ax = pyplot.subplots()
# organize results
model0_data = pandas.DataFrame({'site': data_sorted.site.unique(),
                                'site_name': data_sorted.site_name.unique()})
model0_data['d2n_200m'] = data_sorted.d2n_200m.unique()
model0_data['quantiles'] = [stats.quantiles(model0_samples.a[:5000, i])
                            for i in range(len(set(sites)))]
# plot quantiles
for i, row in model0_data.sort_values(by='d2n_200m').iterrows():
    x = row['d2n_200m']
    ax.plot([x, x], [row['quantiles'][2.5], row['quantiles'][97.5]], color='black', linewidth=0.5)
    ax.plot([x, x], [row['quantiles'][25], row['quantiles'][75]], color='black', linewidth=1)
    ax.scatter([x], [row['quantiles'][50]], color='black', marker='o')
# format plot
l1 = ax.set_xlim([-0.1, 0.75])
xl = ax.set_xlabel("d2n within 200 meters (percent area)")
yl = ax.set_ylabel("percent biophony (difference from grand mean)")
In [31]:
site_predictors = [
    # 'building_50m', 'pavement_50m', 'forest_50m', 'field_50m',
    # 'building_100m', 'pavement_100m', 'forest_100m', 'field_100m',
    # 'building_200m', 'pavement_200m', 'forest_200m', 'field_200m',
    # 'building_500m', 'pavement_500m', 'forest_500m', 'field_500m',
    'd2n_50m', 'd2n_100m', 'd2n_200m', 'd2n_500m',
]
for predictor in site_predictors:
    with Model() as model_1:
        # intercept
        g_a = Normal('g_a', mu=0, tau=0.001)
        g_as = Normal('g_as', mu=0, tau=0.001)
        sigma_a = Uniform('sigma_a', lower=0, upper=100)
        tau_a = sigma_a**-2
        mu_a = g_a + (g_as * data_centered.groupby('site')[predictor].mean())
        a = Normal('a', mu=mu_a, tau=tau_a, shape=len(set(sites)))
        # slope
        g_b = Normal('g_b', mu=0, tau=0.001)
        g_bs = Normal('g_bs', mu=0, tau=0.001)
        sigma_b = Uniform('sigma_b', lower=0, upper=100)
        tau_b = sigma_b**-2
        mu_b = g_b + (g_bs * data_centered.groupby('site')[predictor].mean())
        b = Normal('b', mu=mu_b, tau=tau_b, shape=len(set(sites)))
        # model error (data-level)
        sigma_y = Uniform('sigma_y', lower=0, upper=100)
        tau_y = sigma_y**-2
        # expected values
        y_hat = a[sites] + (b[sites] * data_centered.week)
        # likelihood
        y_like = Normal('y_like', mu=y_hat, tau=tau_y, observed=data_centered.biophony)
        # simulated
        #y_sim = Normal('y_sim', mu=y_hat, tau=tau_y, shape=y_hat.tag.test_value.shape)
        # sample model
        backend = SQLite(path.join(trace_output_path, "model1_{}.sqlite".format(predictor)))
        model_1_samples = sample(draws=10000, step=NUTS(), random_seed=1, trace=backend)
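Because the loop reuses `model_1_samples`, only the trace for the last predictor remains in memory; the rest live in their SQLite files. A hedged sketch for reloading one of the saved traces (this assumes PyMC3's SQLite loader and a structurally compatible model in context; the filename follows the `model1_{predictor}` pattern above):
In [ ]:
# hypothetical reload of a saved trace; the model context supplies variable shapes
from pymc3.backends import sqlite
with model_1:
    model_1_trace = sqlite.load(path.join(trace_output_path, "model1_d2n_200m.sqlite"))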
In [32]:
len(model_1_samples['a'][:5000][:, 1])  # sanity check: number of retained draws for one site
In [34]:
fig = pyplot.figure()
fig.set_figwidth(6.85)
fig.set_figheight(6.85 / 2)
ax_a = pyplot.subplot2grid((1, 2), (0, 0), rowspan=1, colspan=1)
ax_b = pyplot.subplot2grid((1, 2), (0, 1), rowspan=1, colspan=1, sharex=ax_a)
fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
# organize results
model_1_data = pandas.DataFrame({'site': data_sorted.site.unique(),
                                 'site_name': data_sorted.site_name.unique()})
model_1_data['d2n_200m'] = data_sorted.d2n_200m.unique()
model_1_data['quantiles_a'] = [stats.quantiles(model_1_samples['a'][:5000][:, i])
                               for i in range(len(set(sites)))]
model_1_data['quantiles_b'] = [stats.quantiles(model_1_samples['b'][:5000][:, i])
                               for i in range(len(set(sites)))]
# plot intercept quantiles
for i, row in model_1_data.sort_values(by='d2n_200m').iterrows():
    x = row['d2n_200m']
    #ax_a.plot([x, x], [row['quantiles_a'][2.5], row['quantiles_a'][97.5]], color='black', linewidth=0.5)
    ax_a.plot([x, x], [row['quantiles_a'][25], row['quantiles_a'][75]], color='black', linewidth=1)
    ax_a.scatter([x], [row['quantiles_a'][50]], color='black', marker='o')
# format intercept plot
l1 = ax_a.set_xlim([0, 1])
xl = ax_a.set_xlabel("d2n within 200 meters (percent area)")
yl = ax_a.set_ylabel("percent biophony (difference from grand mean)")
# plot slope quantiles
for i, row in model_1_data.sort_values(by='d2n_200m').iterrows():
    x = row['d2n_200m']
    #ax_b.plot([x, x], [row['quantiles_b'][2.5], row['quantiles_b'][97.5]], color='black', linewidth=0.5)
    ax_b.plot([x, x], [row['quantiles_b'][25], row['quantiles_b'][75]], color='black', linewidth=1)
    ax_b.scatter([x], [row['quantiles_b'][50]], color='black', marker='o')
# format slope plot
l1 = ax_b.set_xlim([0, 1])
xl = ax_b.set_xlabel("d2n within 200 meters (percent area)")
yl = ax_b.set_ylabel("rate of change of biophony (percent/week)")
In [ ]:
measurement_predictors = [
    'temperature', 'wind_speed', 'precipitation', 'pressure',
]
for predictor in measurement_predictors:
    with Model() as model2a:
        # intercept
        g_a = Normal('g_a', mu=0, tau=0.001)
        g_as = Normal('g_as', mu=0, tau=0.001)
        sigma_a = Uniform('sigma_a', lower=0, upper=100)
        tau_a = sigma_a**-2
        mu_a = g_a + (g_as * data_centered.groupby('site')['forest_200m'].mean())
        a = Normal('a', mu=mu_a, tau=tau_a, shape=len(set(sites)))
        # time slope
        g_b = Normal('g_b', mu=0, tau=0.001)
        g_bs = Normal('g_bs', mu=0, tau=0.001)
        sigma_b = Uniform('sigma_b', lower=0, upper=100)
        tau_b = sigma_b**-2
        mu_b = g_b + (g_bs * data_centered.groupby('site')['forest_200m'].mean())
        b = Normal('b', mu=mu_b, tau=tau_b, shape=len(set(sites)))
        # predictor slope (pooled across sites; varying-slope version commented out)
        #g_c = Normal('g_c', mu=0, tau=0.001)
        #g_cs = Normal('g_cs', mu=0, tau=0.001)
        #sigma_c = Uniform('sigma_c', lower=0, upper=100)
        #tau_c = sigma_c**-2
        #mu_c = g_c + (g_cs * data_centered.groupby('site')['forest_200m'].mean())
        #c = Normal('c', mu=mu_c, tau=tau_c, shape=len(set(sites)))
        c = Uniform('c', lower=-100, upper=100)
        # model error (data-level)
        sigma_y = Uniform('sigma_y', lower=0, upper=100)
        tau_y = sigma_y**-2
        # expected values
        y_hat = a[sites] + (b[sites] * data_centered.week) + (c * data_centered[predictor])
        # likelihood
        y_like = Normal('y_like', mu=y_hat, tau=tau_y, observed=data_centered.biophony)
        # simulated
        #y_sim = Normal('y_sim', mu=y_hat, tau=tau_y, shape=y_hat.tag.test_value.shape)
        # sample model
        backend = SQLite(path.join(trace_output_path, "model2a_{0}.sqlite".format(predictor)))
        model_2_samples = sample(draws=10000, step=NUTS(), random_seed=1, trace=backend)
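As with model 1, each pass through the loop overwrites `model_2_samples`, so the trace in memory belongs to the last predictor sampled. A quick hedged look at the pooled predictor slope `c` (the warm-up cutoff is again an assumption):
In [ ]:
# hypothetical check: posterior summary of the pooled measurement-predictor slope
summary(model_2_samples[5000:], varnames=['c'])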
In [ ]:
measurement_predictors = [
    'temperature', 'wind_speed', 'precipitation', 'pressure',
]
for predictor in measurement_predictors:
    with Model() as model2b:
        # intercept
        g_a = Normal('g_a', mu=0, tau=0.001)
        g_as = Normal('g_as', mu=0, tau=0.001)
        sigma_a = Uniform('sigma_a', lower=0, upper=100)
        tau_a = sigma_a**-2
        mu_a = g_a + (g_as * data_centered.groupby('site')['forest_200m'].mean())
        a = Normal('a', mu=mu_a, tau=tau_a, shape=len(set(sites)))
        # time slope
        g_b = Normal('g_b', mu=0, tau=0.001)
        g_bs = Normal('g_bs', mu=0, tau=0.001)
        sigma_b = Uniform('sigma_b', lower=0, upper=100)
        tau_b = sigma_b**-2
        mu_b = g_b + (g_bs * data_centered.groupby('site')['forest_200m'].mean())
        b = Normal('b', mu=mu_b, tau=tau_b, shape=len(set(sites)))
        # predictor slope
        g_c = Normal('g_c', mu=0, tau=0.001)
        g_cs = Normal('g_cs', mu=0, tau=0.001)
        sigma_c = Uniform('sigma_c', lower=0, upper=100)
        tau_c = sigma_c**-2
        mu_c = g_c + (g_cs * data_centered.groupby('site')['forest_200m'].mean())
        c = Normal('c', mu=mu_c, tau=tau_c, shape=len(set(sites)))
        # model error (data-level)
        sigma_y = Uniform('sigma_y', lower=0, upper=100)
        tau_y = sigma_y**-2
        # expected values
        y_hat = a[sites] + (b[sites] * data_centered.week) + (c[sites] * data_centered[predictor])
        # likelihood
        y_like = Normal('y_like', mu=y_hat, tau=tau_y, observed=data_centered.biophony)
        # simulated
        #y_sim = Normal('y_sim', mu=y_hat, tau=tau_y, shape=y_hat.tag.test_value.shape)
        # sample model
        backend = SQLite(path.join(trace_output_path, "model2b_{0}.sqlite".format(predictor)))
        model_2_samples = sample(draws=10000, step=NUTS(), random_seed=1, trace=backend)
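For this varying-slope version, a forest plot gives a compact view of the per-site predictor slopes; a minimal sketch using the `forestplot` function imported above:
In [ ]:
# hypothetical diagnostic: per-site slopes for the last predictor sampled
forestplot(model_2_samples[5000:], varnames=['c'])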
In [ ]:
fig, ax = pyplot.subplots()
# organize results
model_2_data = pandas.DataFrame({'site': data_sorted.site.unique(),
                                 'site_name': data_sorted.site_name.unique()})
model_2_data['forest_200m'] = data_sorted.forest_200m.unique()
model_2_data['quantiles'] = [stats.quantiles(model_2_samples['c'][:1000][:, i])
                             for i in range(len(set(sites)))]
# plot quantiles
for i, row in model_2_data.sort_values(by='forest_200m').iterrows():
    x = row['forest_200m']
    ax.plot([x, x], [row['quantiles'][2.5], row['quantiles'][97.5]], color='black', linewidth=0.5)
    ax.plot([x, x], [row['quantiles'][25], row['quantiles'][75]], color='black', linewidth=1)
    ax.scatter([x], [row['quantiles'][50]], color='black', marker='o')
# format plot
l1 = ax.set_xlim([0, 100])
xl = ax.set_xlabel("forest land cover within 200 meters (percent area)")
yl = ax.set_ylabel("predictor slope (change in percent biophony per unit)")