An example of exploratory data analysis on data collected from the Etcheverry Hall roof. The data can be downloaded from the URLs used in the cells below.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import bokeh
import numpy as np
import sklearn
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.models import HoverTool
from bokeh.plotting import figure
output_notebook()
from scipy.stats import pearsonr
import scipy
from math import isnan
import seaborn as sns # makes matplotlib beautiful
In [2]:
# pocket geiger (radiation) readings recorded on the Etcheverry Hall roof,
# served as CSV by the Berkeley RadWatch / DoseNet project
etch_roof_pg_url = 'https://radwatch.berkeley.edu/sites/default/files/dosenet/etch_roof.csv' # pocket geiger data
data_pg = pd.read_csv(etch_roof_pg_url)
data_pg.head()
Out[2]:
In [3]:
# CO2 concentration readings from the same rooftop station
etch_roof_co2_url = 'https://radwatch.berkeley.edu/sites/default/files/dosenet/etch_roof_adc.csv' # CO2 data
data_co2 = pd.read_csv(etch_roof_co2_url)
data_co2.head()
Out[3]:
In [4]:
# weather data (temperature, humidity, pressure -- see later cells) from the
# same rooftop station
etch_roof_weather_url = 'https://radwatch.berkeley.edu/sites/default/files/dosenet/etch_roof_weather.csv'
data_weather = pd.read_csv(etch_roof_weather_url)
data_weather.head()
Out[4]:
In [5]:
# interactive time-series plot of the rooftop temperature
data_weather['datetime'] = pd.to_datetime(data_weather['deviceTime_utc'])
temp_fig = figure(plot_width=960, plot_height=480,
                  title='Etcheverry Rooftop Temperature',
                  x_axis_type='datetime')
temp_fig.xaxis.axis_label = 'Time'
temp_fig.yaxis.axis_label = 'Temperature'
temp_fig.line(data_weather['datetime'], data_weather['temperature'], line_width=2)
hover = HoverTool(tooltips=[('Time', '$x'), ('Temp', '$y')])
temp_fig.add_tools(hover)
show(temp_fig)
In [6]:
def near_time_search(list_, time, delta):
    """Find the timestamp in the sorted sequence `list_` closest to `time`.

    Uses a binary search (`bisect_left`), so it runs in O(log n).

    The previous hand-rolled binary search had two bugs: `hi = mid - 1`
    could discard the element at `mid` even when it was the closest match,
    and the fallback `list_[mid - 1]` with `mid == 0` wrapped around to the
    LAST element via Python's negative indexing.

    :param list_: SORTED (ascending) list/array of timestamps
    :param time: timestamp being searched for
    :param delta: maximum allowed |candidate - time| for a match
    :return: NaN if no timestamp found within `delta` of `time`,
        otherwise the timestamp closest to `time` (ties broken towards
        the later timestamp)
    >>> near_time_search([1, 2, 3], 2, 1)
    2
    >>> near_time_search([3.4, 3.8, 4, 6], 4.1, 0.3)
    4
    >>> near_time_search([3.4, 3.8, 4, 6], 4.1, 0.05)
    nan
    >>> near_time_search([3.4, 3.8, 4, 6], 10, 3.5)
    nan
    >>> near_time_search([3.4, 3.8, 4, 6], 10, 7) # should return `6`, not `4`
    6
    >>> near_time_search([3.4, 3.8, 4, 6], 5, 7) # should return `6`, not `4`
    6
    >>> near_time_search([1, 5, 9], 4, 2) # nearest match may sit above `time`
    5
    >>> near_time_search([10, 20, 30], 5, 26) # must not wrap to the last element
    10
    >>> near_time_search(np.ones((1_000_000,)), 0.8, 0.2)
    1.0
    """
    from bisect import bisect_left  # stdlib; works on lists and numpy arrays

    if len(list_) == 0:
        return float('nan')
    # index of the first element >= `time`; the nearest timestamp is either
    # this element or the one immediately before it
    idx = bisect_left(list_, time)
    candidates = []
    if idx < len(list_):
        candidates.append(list_[idx])
    if idx > 0:
        candidates.append(list_[idx - 1])
    # ties go to candidates[0] (the later timestamp), matching the doctests
    best = min(candidates, key=lambda ts: abs(ts - time))
    return best if abs(best - time) <= delta else float('nan')
def nts_with_arr(list_, time_arr, delta):
    """Vectorised wrapper around `near_time_search`: return, for every
    timestamp in `time_arr`, the closest timestamp in the sorted sequence
    `list_` (or NaN when none lies within `delta`)."""
    return [near_time_search(list_, ts, delta) for ts in time_arr]
# run doctests to see whether the function behaves as it should
import doctest
doctest.testmod()
Out[6]:
First, we need to find corresponding $temperature$ and $CO_2$ observations based on the timestamps of the observations. We will use the `near_time_search`
function for this.
In [7]:
# the data needs to be sorted in ascending order according to time (it is
# sorted in descending order); avoid `inplace=True` and rebind instead
data_co2 = data_co2.sort_values('deviceTime_unix')
data_weather = data_weather.sort_values('deviceTime_unix')
# one row per CO2 reading; `temp_time` is the nearest weather timestamp
# within a 300 s window (NaN when there is none)
co2_temp_df = pd.DataFrame(columns=['co2_time', 'temp_time', 'co2', 'temp'])
co2_temp_df['co2_time'] = data_co2['deviceTime_unix']
co2_temp_df['temp_time'] = nts_with_arr(
    data_weather['deviceTime_unix'].tolist(),
    co2_temp_df['co2_time'].tolist(),
    delta=300,
)
time_diff = co2_temp_df['temp_time'] - co2_temp_df['co2_time']
print(time_diff)
Interestingly, there are many instances where the $temperature$ and $CO_2$ timestamps are different by about 300 seconds. These probably correspond to instances where one of the sensors did not make a measurement, so we would want these to be NaN
and thus excluded from our calculation of correlation.
In [8]:
# let's see how many differences are +-300 or NaN
# (NaN means no weather reading was found within the 300 s window)
time_diff.value_counts(dropna=False)
Out[8]:
We don't want to consider elements with time difference more than a few seconds, because those will spuriously lead us to count the same observation twice (think why!). Pandas lets us easily remove rows with duplicate values using the DataFrame.drop_duplicates
function.
In [9]:
# re-match the weather timestamps with a tighter window: only readings within
# 30 s count as the same observation (a 300 s window lets one weather reading
# pair with several CO2 readings, double-counting observations).  The original
# cell said "delta=30 instead of 300" in a comment but still passed 300.
co2_temp_df['temp_time'] = nts_with_arr(data_weather['deviceTime_unix'].tolist(),
                                        co2_temp_df['co2_time'].tolist(),
                                        delta=30)
# Now let's remove the duplicates: keep only the first CO2 row matched to
# each weather timestamp
co2_temp_df = co2_temp_df.drop_duplicates(subset=['temp_time'], keep='first')
print(co2_temp_df)
Now let's fill in the $temperature$ and $CO_2$ concentration values.
In [10]:
# first let's remove rows where `temp_time` is `NaN`.
# Vectorised version of the original row-by-row loop: blank the paired
# `co2_time` wherever `temp_time` is NaN -- the `co2`/`temp` columns are
# still all-NaN at this point, so the whole row becomes NaN --
# then drop rows that are filled with `NaN`s only
co2_temp_df.loc[co2_temp_df['temp_time'].isna(), 'co2_time'] = float('nan')
co2_temp_df = co2_temp_df.dropna(how='all')
In [11]:
# now let's fill in the values.
# `co2_temp_df` shares its row index with `data_co2`, so the CO2 readings
# align by index.  The temperature, however, must be looked up by the matched
# weather timestamp (`temp_time`) -- the original loop indexed `data_weather`
# by the CO2 frame's row labels, pairing each CO2 reading with an unrelated
# weather row.
co2_temp_df['co2'] = data_co2['co2_ppm']
temp_by_time = (data_weather.drop_duplicates('deviceTime_unix')
                            .set_index('deviceTime_unix')['temperature'])
# unmatched (NaN) temp_times map to NaN, mirroring the old KeyError skip
co2_temp_df['temp'] = co2_temp_df['temp_time'].map(temp_by_time)
co2_temp_df
Out[11]:
Let's clean up the remaining `NaN`s one last time before we find the ~correlations~!!!
In [12]:
# drop any rows that still contain NaNs before computing correlations
co2_temp_df = co2_temp_df.dropna()
co2_temp_df
Out[12]:
Now let's find the correlation between $temperature$ and $CO_2$ concentration (finally!!!) using the handy scipy.stats.pearsonr
function, which also returns the two-tailed p-value for the two series of data being analyzed.
In [13]:
# Pearson correlation between temperature and CO2 concentration;
# `pearsonr` also returns the two-tailed p-value for the pair of series
r_value, p_value = pearsonr(co2_temp_df['temp'], co2_temp_df['co2'])
print(f"Pearson's correlation coefficient: {r_value}")
print(f"Two-tailed p-value: {p_value}")
A correlation coefficient $<0.30$ suggests there is no correlation between temperature and $CO_2$ concentration. Let's verify that graphically.
In [14]:
# scatter plot to visually confirm the (lack of) correlation
plt.scatter(co2_temp_df['temp'], co2_temp_df['co2'])
plt.xlabel("Temperature (degrees Celsius)")  # fixed typo "Celcius" and stray ')'
plt.ylabel("CO2 concentration (ppm)")
plt.show()
Clearly, there is no correlation between temperature and $CO_2$ concentration.
In [15]:
# add empty humidity and pressure columns, to be filled from the weather data;
# a scalar NaN broadcasts to every row (same result as the np.full() series)
co2_temp_df = co2_temp_df.assign(humidity=float('nan'), pressure=float('nan'))
In [16]:
# fill humidity and pressure from the weather frame, keyed on the matched
# weather timestamp.  Fixes two defects in the original loop: the two columns
# were SWAPPED ('pressure' was assigned humidity and vice versa), and
# `data_weather` was indexed by the CO2 frame's row labels instead of by the
# matched time.
weather_by_time = (data_weather.drop_duplicates('deviceTime_unix')
                               .set_index('deviceTime_unix'))
co2_temp_df['humidity'] = co2_temp_df['temp_time'].map(weather_by_time['humidity'])
co2_temp_df['pressure'] = co2_temp_df['temp_time'].map(weather_by_time['pressure'])
In [17]:
# `dropna()` returns a new frame; the original discarded the result, so the
# correlation below silently kept rows with NaNs -- assign it back
co2_temp_df = co2_temp_df.dropna()
co2_temp_df['temp'] = co2_temp_df['temp'].astype(float)
co2_temp_df['co2'] = co2_temp_df['co2'].astype(float)
correlation_matrix = co2_temp_df.corr()
# heatmap of pairwise correlations between co2, temp, humidity and pressure
with sns.axes_style("dark"):
    sns.heatmap(data=correlation_matrix.loc['co2': 'pressure', 'co2': 'pressure'],
                vmin=-1.0, vmax=1.0, annot=True, cbar=True, linecolor='#000000FF', linewidth='0.1')
Okay, so there is weak correlation ($0.7\geq|r|\geq0.3$) between $humidity$ and $pressure$, and very weak correlation ($r<0.3$) for the rest.
In [18]:
# add an (as yet unfilled) radiation column for the pocket-geiger data
co2_temp_df = co2_temp_df.assign(radiation=pd.Series(np.full(co2_temp_df.shape[0], float('nan'))).values)
# co2_temp_df['pg_time'] = nts_with_arr(co2_temp_df['co2_time'].astype(int).tolist(),
# data_pg['deviceTime_unix'].tolist(),
# delta=300)
# NOTE(review): the argument order below is reversed relative to the earlier
# calls -- here the CO2 timestamps are searched for each pocket-geiger
# timestamp, so `pg_tt` has one entry per PG reading, not per CO2 row.
# Confirm this is intended before relying on `pg_time`.
pg_tt = pd.Series(nts_with_arr(co2_temp_df['co2_time'].astype(int).tolist(),
data_pg['deviceTime_unix'].tolist(),
delta=300))
# time_diff = (data_pg['deviceTime_unix'] - co2_temp_df['co2_time'])
# keep only the unique CO2 timestamps that matched some PG reading
pg_tt.dropna(inplace=True)
pg_tt.drop_duplicates(inplace=True)
# second pass: match each CO2 row back to the filtered timestamps
co2_temp_df['pg_time'] = nts_with_arr(pg_tt.tolist(),
co2_temp_df['co2_time'].astype(int).tolist(),
delta=300)
print(pg_tt.value_counts(dropna=False))
In [19]:
# raw pocket-geiger timestamps, shown for comparison with the matched times
data_pg['deviceTime_unix']
Out[19]:
In [20]:
# distribution of matched pocket-geiger timestamps (NaN = no match in window)
co2_temp_df['pg_time'].value_counts(dropna=False)
Out[20]:
In [21]:
# TODO(review): unfinished cell -- apparently intended to fill the `radiation`
# column by looking up, for each row, the pocket-geiger reading nearest in
# time.  Note the incomplete `near_time_search(list_=)` call and the lookup of
# 'humidity' from `data_pg`, which presumably should be the radiation column.
# for idx, c02_time, _, _, _, _, _, _ in co2_temp_df.itertuples():
#     try:
#         co2_temp_df.loc[idx, 'radiation'] = data_pg.loc[near_time_search(list_=), 'humidity']
#     except KeyError as ke:
#         print('Skipped index %d' % idx)
#         continue