In [148]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns

%matplotlib inline
In [49]:
# Pre-declared names of the data columns.
data_columns = [
    'tracker_id',
    'dB',
    'year', 'month', 'day',
    'hour', 'minute', 'second',
    'distance',
]
In [50]:
# Function to convert the data's date columns into a single datetime object.
def to_datetime(x):
    """
    Combine a row's separate date/time fields into a single datetime object.

    Parameters
    ==========
    - x : a row in the dataframe of data (anything indexable by the keys
          'year', 'month', 'day', 'hour', 'minute' and 'second').

    Returns
    =======
    - datetime version of the date.

    Raises
    ======
    - ValueError if a field is NaN or is out of range for a calendar date.
    """
    fields = ('year', 'month', 'day', 'hour', 'minute', 'second')
    # Cast every field to int: pandas stores a column as float as soon as it
    # holds a missing value, and datetime() refuses float arguments.
    # Non-integral values are truncated by int().
    return datetime(*(int(x[field]) for field in fields))
def read_data(handle):
    """
    Load one CSV of tracker readings and index it by timestamp.

    Parameters
    ==========
    - handle : path to the data (passed straight to pandas.read_csv)

    Returns
    =======
    - df : pandas dataframe indexed by 'date', a timestamp assembled from the
           'year', 'month', 'day', 'hour', 'minute' and 'second' columns.
           Those component columns are kept in the frame.
    """
    date_parts = ['year', 'month', 'day', 'hour', 'minute', 'second']
    df = pd.read_csv(handle)
    # Assemble the timestamps in one vectorised call rather than building a
    # Python datetime row by row with DataFrame.apply.
    df['date'] = pd.to_datetime(df[date_parts])
    # Return a re-indexed frame instead of mutating with inplace=True.
    return df.set_index('date')
In [51]:
# Load the three CSV files; read_data returns each frame indexed by 'date'.
southern, origin, eastern = (
    read_data(csv_path)
    for csv_path in ('southern_pi.csv', 'origin_pi.csv', 'eastern_pi.csv')
)
In [93]:
# Preview the first five rows of `origin` that contain no missing values.
origin.dropna().head(5)
Out[93]:
In [71]:
# Pair every data frame with the title used for its panel, so the data for
# each device can be plotted per base station.
dfs = list(zip(
    [origin, eastern, southern],
    ['origin', 'eastern', 'southern'],
))
def plot_signal_vs_distance(device, frames=None):
    """
    Scatter-plot dB against distance for one device, one panel per data frame.

    Parameters
    ==========
    - device : tracker_id value whose rows are plotted
    - frames : optional list of (dataframe, title) pairs; when omitted, the
               notebook-level `dfs` list is used.

    Returns
    =======
    - None; a new matplotlib figure is created and drawn.

    Raises
    ======
    - ValueError if there are no data frames to plot.
    """
    if frames is None:
        frames = dfs
    n_panels = len(frames)
    if n_panels == 0:
        raise ValueError('no data frames to plot')

    # One row with a 3x3 inch panel per frame; this reproduces the original
    # 9x3 figure for three frames but no longer breaks for other counts.
    fig, axes = plt.subplots(1, n_panels, figsize=(3 * n_panels, 3), squeeze=False)
    for ax, (df, title) in zip(axes[0], frames):
        complete = df.dropna()
        # A boolean mask avoids interpolating the id into a query string,
        # so an id containing quote characters cannot break the expression.
        selected = complete[complete['tracker_id'] == device]
        selected.plot(x='distance', y='dB', kind='scatter', ax=ax)
        ax.set_title(title)
    fig.tight_layout()
# First device to inspect.
device = "68:9E:19:11:A6:DB"
plot_signal_vs_distance(device=device)
In [73]:
# Second device, for comparison.
device = "F4:B8:5E:C4:56:22"
plot_signal_vs_distance(device=device)
As we can see here, the data for device `68:9E:19:11:A6:DB` is much cleaner than the data for device `F4:B8:5E:C4:56:22`. Signal strength (dB) appears to be more strongly correlated with distance for device `68` than for device `F4`. I will therefore attempt Bayesian modelling to quantify the uncertainty around each distance measurement for device `68`.
In [160]:
# In order to have more data put together, concatenate the complete rows of
# the three data frames and keep only the readings of a single device.
device = '68:9E:19:11:A6:DB'
# device = 'F4:B8:5E:C4:56:22'
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0. `@device` binds the Python variable
# inside the query instead of formatting it into the expression string.
all_data = (
    pd.concat([origin.dropna(), eastern.dropna(), southern.dropna()])
    .query('tracker_id == @device')
)
all_data.plot(x='distance', y='dB', kind='scatter', title='{0} data points'.format(len(all_data)))
Out[160]:
In [161]:
# Distribution of dB readings at each measured distance.
sns.violinplot(x='distance', y='dB', data=all_data)
Out[161]:
Looking at the distribution of data points, it looks like it will be difficult for us to resolve distances less than 24 ft.
I think a second experiment is in order.
In [181]:
# Linear model of signal strength against distance:
#     dB ~ Normal(A * distance + B + error, sd=10)
# pymc3 is imported as `pm` together with the other imports at the top of the
# notebook rather than in the middle of the analysis.
with pm.Model() as model:
    # Hyperparameters for A and B (currently disabled)
    # mu_a = pm.Normal('mu_A', mu=0, sd=10)
    # sig_a = pm.Uniform('sigma_A', lower=0, upper=10)
    # mu_b = pm.Normal('mu_B', mu=0, sd=10)
    # sig_b = pm.Uniform('sigma_B', lower=0, upper=10)

    # Priors for the slope (A) and the intercept (B)
    A = pm.Normal('A', mu=-0.2, sd=3)
    B = pm.Normal('B', mu=-50, sd=3)

    # Error term
    # NOTE(review): 'error' enters the mean as a second constant offset next
    # to B, so the data appear to constrain only the sum B + error. Confirm
    # this is intended, or model the noise through the likelihood's sd instead.
    err = pm.Normal('error', mu=-30, sd=10)

    # Model prediction:
    dB = A * all_data['distance'] + B + err

    # dB likelihood
    db_like = pm.Normal('dB', mu=dB, sd=10, observed=all_data['dB'])
In [182]:
with model:
    # MAP estimate, used as the starting point of the sampler.
    start = pm.find_MAP()
    step = pm.NUTS()
    # `start` was previously computed but never handed to pm.sample, so the
    # MAP search had no effect on sampling. The fixed random_seed makes the
    # trace reproducible across Restart & Run All.
    trace = pm.sample(10000, step, start=start, random_seed=42)
In [183]:
# Plot the sampled trace of the model's variables to check the sampler visually.
pm.traceplot(trace)
Out[183]:
In [184]:
# Summary statistics of the sampled trace.
pm.summary(trace)
In [ ]: