This notebook explores and models data from recordings of the natural acoustic environment along the urban-rural gradient near Innsbruck, Austria. The models are Bayesian multilevel models implemented with the PyMC3 probabilistic programming library.
References:
https://github.com/fonnesbeck/multilevel_modeling
Gelman, A., & Hill, J. (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models (1st ed.). Cambridge University Press.
In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
import pandas
import numpy
from os import path
In [3]:
%matplotlib inline
from matplotlib import pyplot
from matplotlib.patches import Rectangle
import seaborn
import mpld3
from mpld3 import plugins
In [4]:
from pymc3 import (glm, Model, NUTS, sample, stats,
                   forestplot, traceplot, plot_posterior, summary,
                   Normal, Uniform, Deterministic, StudentT)
from pymc3.backends import SQLite
In [5]:
from matplotlib import rcParams
In [6]:
rcParams['font.sans-serif']
Out[6]:
In [7]:
rcParams['font.sans-serif'] = ['Helvetica',
                               'Arial',
                               'Bitstream Vera Sans',
                               'DejaVu Sans',
                               'Lucida Grande',
                               'Verdana',
                               'Geneva',
                               'Lucid',
                               'Avant Garde',
                               'sans-serif']
In [8]:
data_filepath = "/Users/Jake/OneDrive/Documents/alpine soundscapes/data/dataset.csv"
In [9]:
trace_output_path = "/Users/Jake/OneDrive/Documents/alpine soundscapes/data/model traces/biophony"
In [10]:
seaborn_blue = seaborn.color_palette()[0]
In [11]:
data = pandas.read_csv(data_filepath)
# keep only the 30 study sites
data = data.loc[data.site <= 30]
Sort the data by site and then by recording (the `sound` column).
In [12]:
data_sorted = data.sort_values(by=['site', 'sound']).reset_index(drop=True)
Create composite land cover variables (forest cover minus building and pavement cover) at each radius.
In [13]:
data_sorted['land_composite_50m'] = data_sorted.forest_50m - (data_sorted.building_50m + data_sorted.pavement_50m)
data_sorted['land_composite_100m'] = data_sorted.forest_100m - (data_sorted.building_100m + data_sorted.pavement_100m)
data_sorted['land_composite_200m'] = data_sorted.forest_200m - (data_sorted.building_200m + data_sorted.pavement_200m)
data_sorted['land_composite_500m'] = data_sorted.forest_500m - (data_sorted.building_500m + data_sorted.pavement_500m)
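As a quick sanity check: assuming the land cover columns are fractions of the surrounding area in [0, 1], each composite variable should lie between -1 (entirely buildings and pavement) and 1 (entirely forest).
In [ ]:
# inspect the range of the new composite land cover variables
data_sorted.filter(like='land_composite').describe()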
Transform the variables by subtracting their means (mean-centering).
In [14]:
column_list = ['anthrophony', 'biophony', 'total', 'week',
               'building_50m', 'pavement_50m', 'forest_50m', 'field_50m',
               'building_100m', 'pavement_100m', 'forest_100m', 'field_100m',
               'building_200m', 'pavement_200m', 'forest_200m', 'field_200m',
               'building_500m', 'pavement_500m', 'forest_500m', 'field_500m',
               'land_composite_50m', 'land_composite_100m', 'land_composite_200m', 'land_composite_500m',
               'temperature', 'wind_speed', 'pressure', 'bus_stop',
               'construction', 'crossing', 'cycleway', 'elevator', 'escape', 'footway',
               'living_street', 'motorway', 'motorway_link', 'path', 'pedestrian',
               'platform', 'primary_road', 'primary_link', 'proposed', 'residential',
               'rest_area', 'secondary', 'secondary_link', 'service', 'services',
               'steps', 'tertiary', 'tertiary_link', 'track', 'unclassified', 'combo']
# copy the DataFrame so the original (uncentered) values are preserved
data_centered = data_sorted.copy()
for column in column_list:
    data_centered[column] = data_sorted[column] - data_sorted[column].mean()
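A quick check that the centering worked: every transformed column should now have a mean of (numerically) zero.
In [ ]:
# all centered columns should have means within floating-point error of zero
assert numpy.allclose(data_centered[column_list].mean(), 0)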
Create a zero-based site index variable for the PyMC3 models.
In [15]:
# convert site numbers (1-30) to zero-based indices for indexing model parameters
sites = numpy.copy(data_sorted.site.values) - 1
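The first model is a varying-intercept, varying-slope multilevel model: biophony at each site varies linearly with week, and the site-level intercepts and slopes are themselves modeled as linear functions of the composite land cover within 200 m. In the notation of Gelman and Hill (2006), with the `g_*` variables in the code below playing the role of the $\gamma$ coefficients:

$$y_i \sim \mathrm{Normal}\left(a_{j[i]} + b_{j[i]}\,\mathrm{week}_i,\ \sigma_y^2\right)$$

$$a_j \sim \mathrm{Normal}\left(\gamma_a + \gamma_{as}\,\mathrm{land}_j,\ \sigma_a^2\right) \qquad b_j \sim \mathrm{Normal}\left(\gamma_b + \gamma_{bs}\,\mathrm{land}_j,\ \sigma_b^2\right)$$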
In [70]:
with Model() as model_1:
    # intercept (site-level regression on composite land cover)
    g_a = Normal('g_a', mu=0, tau=0.001)
    g_as = Normal('g_as', mu=0, tau=0.001)
    sigma_a = Uniform('sigma_a', lower=0, upper=100)
    tau_a = sigma_a**-2
    mu_a = g_a + (g_as * data_centered.groupby('site')['land_composite_200m'].mean())
    a = Normal('a', mu=mu_a, tau=tau_a, shape=len(set(sites)))

    # slope (site-level regression on composite land cover)
    g_b = Normal('g_b', mu=0, tau=0.001)
    g_bs = Normal('g_bs', mu=0, tau=0.001)
    sigma_b = Uniform('sigma_b', lower=0, upper=100)
    tau_b = sigma_b**-2
    mu_b = g_b + (g_bs * data_centered.groupby('site')['land_composite_200m'].mean())
    b = Normal('b', mu=mu_b, tau=tau_b, shape=len(set(sites)))

    # model error (data-level)
    sigma_y = Uniform('sigma_y', lower=0, upper=100)
    tau_y = sigma_y**-2

    # expected values
    y_hat = a[sites] + (b[sites] * data_centered.week)

    # likelihood
    y_like = Normal('y_like', mu=y_hat, tau=tau_y, observed=data_centered.biophony)

    # simulated data (for posterior predictive checks)
    y_sim = Normal('y_sim', mu=y_hat, tau=tau_y, shape=y_hat.tag.test_value.shape)

    # sample model
    backend = SQLite(path.join(trace_output_path, "model3.sqlite"))
    model_1_samples = sample(draws=10000, step=NUTS(), random_seed=1, trace=backend)
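Before interpreting the samples, it is worth inspecting the trace. A minimal sketch using the `summary` and `traceplot` functions imported above; discarding the first 5000 draws as burn-in is an assumption, not something established by the analysis.
In [ ]:
# posterior summaries and trace plots for the group-level parameters
burn = 5000  # assumed burn-in; adjust after inspecting the trace
summary(model_1_samples[burn:], varnames=['g_a', 'g_as', 'g_b', 'g_bs', 'sigma_y'])
traceplot(model_1_samples[burn:], varnames=['g_a', 'g_as', 'g_b', 'g_bs'])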
In [68]:
# posterior means of the group-level parameters
g_a = model_1_samples['g_a'].mean()
g_as = model_1_samples['g_as'].mean()
g_b = model_1_samples['g_b'].mean()
g_bs = model_1_samples['g_bs'].mean()

# calculate predicted biophony from the group-level regression
y_hat = numpy.empty(len(data_centered))
for i, row in data_centered.iterrows():
    a = g_a + (g_as * row['land_composite_200m'])
    b = g_b + (g_bs * row['land_composite_200m'])
    y_hat[i] = a + (b * row['week'])

fig, ax = pyplot.subplots()
fig.set_figwidth(15)
fig.set_figheight(5)
#ax.scatter(data_centered['land_composite_200m'], y_hat, color='black')
# plot residuals against observed biophony for 20 randomly chosen simulated draws
for s in numpy.random.randint(0, 4999, 20):
    y_sim = model_1_samples['y_sim'][s, :]
    y_e = y_hat - y_sim
    ax.scatter(data_centered['biophony'], y_e, color=seaborn_blue, alpha=0.1)
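If the model fits well, these simulated residuals should scatter symmetrically around zero across the range of observed biophony; a visible trend in this plot would indicate systematic misfit of the group-level regression.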