In [1]:
%matplotlib inline
import logging
import pickle
import pandas as pd
import numpy as np
import math
logger = logging.getLogger()
logger.setLevel(logging.INFO)
In [2]:
# Load the pre-parsed datasets produced by earlier parsing steps.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
readings = pickle.load(open('data/parsed/readings_dataset_utc.p', 'rb'))
weather = pickle.load(open('data/parsed/weather_dataset_utc.p', 'rb'))
# presumably the fully-processed readings for reference; not used below in this
# notebook — verify before removing
readings_dataset = pickle.load(open('data/parsed/readings_dataset_final.p', 'rb'))
In [3]:
# Sanity check: (rows, columns) of each loaded frame.
# print() call form replaces the Python-2-only bare print statements;
# output is identical and the cell now runs on both Python 2 and 3.
print(readings.shape)
print(stations.shape)
print(weather.shape)
In [4]:
fill_gaps = True
In [48]:
def find_next(df, start_loc):
    """Return the row after positional index `start_loc`, or None at the end.

    Uses .iloc (positional) instead of .loc: the bounds check is based on
    len(df), i.e. positions, so label lookup was only correct for a default
    RangeIndex. .iloc is equivalent there and correct for any index.
    """
    if start_loc + 1 == len(df):
        return None
    return df.iloc[start_loc + 1]


def get_fillings(df):
    """Build artificial 5-minute readings filling gaps between consecutive rows.

    df: readings for a single station, sorted by Timestamp, with the index
    reset to 0..n-1 so iterrows() indices are positional.

    Returns a list of dicts {'Id', 'Timestamp', 'Source': 'ARTIFICIAL'},
    one per synthetic 5-minute timestamp inside every gap longer than
    5 minutes.
    """
    fillings = []
    for idx, start in df.iterrows():
        end = find_next(df, idx)
        if end is None:
            break
        # total_seconds() gives the full gap length; the original `.seconds`
        # attribute is only the sub-day component and wraps at 24h, so gaps
        # of a day or more were silently missed.
        big_gap = (end.Timestamp - start.Timestamp).total_seconds() > (60 * 5)
        if big_gap:
            gap_fillings = pd.date_range(start=start.Timestamp, end=end.Timestamp, freq='5min', tz='UTC')[1:]
            # Drop the last synthetic stamp when it falls within 2.5 minutes
            # of the real next reading, to avoid a near-duplicate.
            if (end.Timestamp - gap_fillings[-1]).total_seconds() < (60 * 2 + 30):
                gap_fillings = gap_fillings[:-1]
            for timestamp in gap_fillings:
                fillings.append({'Id': start.Id, 'Timestamp': timestamp, 'Source': 'ARTIFICIAL'})
    return fillings
In [6]:
if fill_gaps:
    # prepare to find gaps: tag every existing row as a real observation
    readings['Source'] = 'REAL'
    readings.sort_values(by=['Timestamp'], inplace=True)
    stations_ids = stations.Id.unique()
    # find the gaps for each station
    fillings = []
    for station_id in stations_ids:
        # reset_index so get_fillings can treat iterrows() labels as positions
        station_df = readings[readings.Id == station_id].reset_index(drop=True)
        station_fillings = get_fillings(station_df)
        fillings.append(station_fillings)
    # add the gaps to the original dataset
    # (sum(fillings, []) flattens the per-station lists of filling dicts)
    readings = pd.concat([readings, pd.DataFrame(sum(fillings, []))])
    # fill the missing values using a fill forward strategy; sorting by Id
    # first keeps the forward-fill within each station — artificial rows
    # always sit strictly between two real readings of the same station,
    # so a station group never starts with a NaN row.
    # NOTE(review): fillna(method='ffill') is deprecated in newer pandas;
    # .ffill() is the modern spelling.
    readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
    readings.fillna(method='ffill', inplace=True)
    readings.reset_index(drop=True, inplace=True)
Use binary search to find the weather timestamp closest to each reading's timestamp.
In [9]:
def binarySearch(data, val):
    """Return the positional index of the element of `data` closest to `val`.

    `data` must be sorted ascending and support positional access via .iat
    (a pandas Series here). Every midpoint probed is compared against the
    best candidate so far, so the nearest element is among those checked.
    """
    lo, hi = 0, len(data) - 1
    best_ind = lo
    while lo <= hi:
        # Floor division keeps `mid` an int on Python 3 too; the original
        # `/ 2` yields a float there, which breaks .iat positional indexing.
        mid = lo + (hi - lo) // 2
        if data.iat[mid] < val:
            lo = mid + 1
        elif data.iat[mid] > val:
            hi = mid - 1
        else:
            # exact match — cannot do better
            best_ind = mid
            break
        # keep the midpoint if it is closer to val than the current best
        if abs(data.iat[mid] - val) < abs(data.iat[best_ind] - val):
            best_ind = mid
    return best_ind
In [30]:
readings.head()
Out[30]:
In [44]:
weather[15:25]
Out[44]:
In [57]:
readings.head()
Out[57]:
In [45]:
readings['Timestamp'][0:5].apply(lambda val: weather['Timestamp'].index[binarySearch(weather['Timestamp'], val.tz_localize('UTC'))])
Out[45]:
In [46]:
readings['WeatherIdx'] = readings['Timestamp'].apply(lambda val: weather['Timestamp'].index[binarySearch(weather['Timestamp'], val.tz_localize('UTC'))])
In [49]:
# Join each reading to its nearest weather observation (via WeatherIdx), then
# measure match quality as the absolute time difference in seconds.
readings_weather = pd.merge(readings, weather, right_index=True, left_on='WeatherIdx')
# np.timedelta64 directly: the pd.np alias is deprecated and removed in
# pandas 2.0 (np is already imported at the top of the notebook).
# NOTE(review): Timestamp_x looks naive and Timestamp_y UTC-aware (see the
# tz_localize above) — modern pandas raises on naive-minus-aware; confirm.
readings_weather['DifferenceS'] = (readings_weather['Timestamp_x'] - readings_weather['Timestamp_y']) / np.timedelta64(1, 's')
# Vectorized .abs() replaces the elementwise math.fabs apply (same values).
readings_weather['DifferenceS'] = readings_weather['DifferenceS'].abs()
In [50]:
readings_weather_view = readings_weather[['Timestamp_x', 'Timestamp_y', 'DifferenceS']]
In [51]:
readings_weather_view.sort_values(by=['DifferenceS'], ascending=False).head()
Out[51]:
In [52]:
readings_weather_view.describe()
Out[52]:
In [53]:
# Keep the reading timestamp as the canonical one and drop the join helpers.
readings_weather.rename(columns={'Timestamp_x': 'Timestamp'}, inplace=True)
readings_weather.drop(['Timestamp_y', 'WeatherIdx', 'DifferenceS'], axis=1, inplace=True)
In [54]:
readings_weather.info()
In [56]:
pickle.dump(readings_weather, open("data/parsed/readings_weather_filled_dataset.p", "wb"))