In [1]:
    
%matplotlib inline
import logging
import pickle
import pandas as pd
import numpy as np
import math
# Configure the root logger (getLogger() with no name) to emit INFO-level
# messages and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
    
In [2]:
    
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on files produced by trusted code.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Load the parsed datasets from disk.
stations = _load_pickle('data/parsed/stations_dataset_final.p')
readings = _load_pickle('data/parsed/readings_dataset_utc.p')
weather = _load_pickle('data/parsed/weather_dataset_utc.p')
readings_dataset = _load_pickle('data/parsed/readings_dataset_final.p')
    
In [3]:
    
# Sanity-check the shape of each loaded dataset.
# print() with a single argument produces the same output on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(readings.shape)
print(stations.shape)
print(weather.shape)
    
    
In [4]:
    
# Switch for the gap-filling step: when True, the `if fill_gaps:` cell adds
# 'ARTIFICIAL' rows to `readings` and forward-fills their missing values.
fill_gaps = True
    
In [48]:
    
def find_next(df, start_loc):
    """Return the row labelled `start_loc + 1`, or None if `start_loc` is the last row.

    The lookup is label-based (`df.loc`), so `df` must carry a default
    0..n-1 index, e.g. after `reset_index(drop=True)`.
    """
    if start_loc + 1 == len(df):
        return None
    return df.loc[start_loc + 1]


def get_fillings(df):
    """Build artificial readings for the gaps between consecutive rows of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least the columns `Id` and `Timestamp`, ordered by
        `Timestamp` and indexed 0..n-1 (see `find_next`).

    Returns
    -------
    list of dict
        One `{'Id', 'Timestamp', 'Source'}` record per inserted 5-minute
        step, with `Source` set to 'ARTIFICIAL'. Empty when no gap between
        consecutive rows exceeds 5 minutes.
    """
    max_gap_seconds = 60 * 5
    # A filling closer than this to the next real reading is dropped.
    min_tail_seconds = 60 * 2 + 30

    fillings = []
    for idx, start in df.iterrows():
        end = find_next(df, idx)
        if end is None:
            break

        # total_seconds() measures the whole duration. The `.seconds`
        # attribute only holds the sub-day component, so a gap of
        # "1 day + 2 minutes" would look like 120 s and never be filled.
        big_gap = (end.Timestamp - start.Timestamp).total_seconds() > max_gap_seconds
        if big_gap:
            # Skip the first element of the range: it is the real `start` reading.
            gap_fillings = pd.date_range(start=start.Timestamp, end=end.Timestamp, freq='5min', tz='UTC')[1:]
            if (end.Timestamp - gap_fillings[-1]).total_seconds() < min_tail_seconds:
                gap_fillings = gap_fillings[:-1]

            for timestamp in gap_fillings:
                fillings.append({'Id': start.Id, 'Timestamp': timestamp, 'Source': 'ARTIFICIAL'})

    return fillings
    
In [6]:
    
if fill_gaps:
    # Tag every original row as a real measurement and order the rows by time
    # so that consecutive rows can be compared when looking for gaps.
    readings['Source'] = 'REAL'
    readings.sort_values(by=['Timestamp'], inplace=True)

    stations_ids = stations.Id.unique()

    # Find the gaps for each station; `fillings` holds one list per station.
    fillings = []
    for station_id in stations_ids:
        station_df = readings[readings.Id == station_id].reset_index(drop=True)
        station_fillings = get_fillings(station_df)
        fillings.append(station_fillings)

    # Flatten in a single linear pass; sum(fillings, []) re-copies the
    # accumulated list for every station and is quadratic.
    all_fillings = [filling for station_fillings in fillings for filling in station_fillings]

    # Add the artificial rows to the original dataset.
    readings = pd.concat([readings, pd.DataFrame(all_fillings)])

    # Fill the missing values with a forward-fill strategy: once sorted by
    # station and time, each artificial row follows a row of the same station.
    readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
    # ffill() replaces the deprecated fillna(method='ffill').
    readings.ffill(inplace=True)
    readings.reset_index(drop=True, inplace=True)
    
Use binary search to find the weather record whose timestamp is closest to a given reading's timestamp.
In [9]:
    
def binarySearch(data, val):
    """Find the position of the element of `data` closest to `val`.

    Parameters
    ----------
    data : pandas.Series
        Values in ascending order; elements are read positionally via `.iat`.
    val : scalar
        Value to look for; must support comparison with and subtraction
        from the elements of `data`.

    Returns
    -------
    int
        Position (not index label) of the closest element among those probed
        by the search. An exact match is returned immediately. Returns 0 for
        an empty `data`.
    """
    lo, hi = 0, len(data) - 1
    best_ind = lo
    while lo <= hi:
        # Floor division keeps `mid` an integer on Python 3 as well as on
        # Python 2; with `/` Python 3 produces a float that `.iat` rejects.
        mid = lo + (hi - lo) // 2
        current = data.iat[mid]
        if current < val:
            lo = mid + 1
        elif current > val:
            hi = mid - 1
        else:
            best_ind = mid
            break
        # Keep the probed element if it is closer to val than the best so far.
        if abs(current - val) < abs(data.iat[best_ind] - val):
            best_ind = mid
    return best_ind
    
In [30]:
    
# Preview the first five rows of the readings.
readings.head(n=5)
    
    Out[30]:
In [44]:
    
# Display the weather rows selected by the slice 15:25 for inspection.
weather[15:25]
    
    Out[44]:
In [57]:
    
# Preview the first five rows of the readings.
readings.head(n=5)
    
    Out[57]:
In [45]:
    
# Try the lookup on the first five readings: for each reading timestamp
# (localized to UTC), show the index label of the closest weather timestamp.
weather_timestamps = weather['Timestamp']
readings['Timestamp'][0:5].apply(
    lambda val: weather_timestamps.index[binarySearch(weather_timestamps, val.tz_localize('UTC'))]
)
    
    Out[45]:
In [46]:
    
# For every reading, store the index label of the weather row whose
# timestamp is closest to the reading's timestamp (localized to UTC).
weather_timestamps = weather['Timestamp']
readings['WeatherIdx'] = readings['Timestamp'].apply(
    lambda val: weather_timestamps.index[binarySearch(weather_timestamps, val.tz_localize('UTC'))]
)
    
In [49]:
    
# Attach to each reading the weather row referenced by its 'WeatherIdx'.
readings_weather = pd.merge(readings, weather, right_index=True, left_on='WeatherIdx')

# Absolute distance, in seconds, between each reading's timestamp and the
# timestamp of its matched weather row.
# `np.timedelta64` is used directly because the `pd.np` alias is deprecated
# and was removed in pandas 2.0.
readings_weather['DifferenceS'] = (
    (readings_weather['Timestamp_x'] - readings_weather['Timestamp_y']) / np.timedelta64(1, 's')
).abs()
    
In [50]:
    
# Keep only the two timestamps and their distance, to inspect the match quality.
view_columns = ['Timestamp_x', 'Timestamp_y', 'DifferenceS']
readings_weather_view = readings_weather[view_columns]
    
In [51]:
    
# Show the five matches with the largest time difference.
readings_weather_view.sort_values(by='DifferenceS', ascending=False).head(5)
    
    Out[51]:
In [52]:
    
# Summary statistics of the matched timestamps and their differences.
readings_weather_view.describe()
    
    Out[52]:
In [53]:
    
# Restore the reading timestamp's original column name and remove the
# columns that were only needed to perform and check the match.
helper_columns = ['Timestamp_y', 'WeatherIdx', 'DifferenceS']
readings_weather.rename(columns={'Timestamp_x': 'Timestamp'}, inplace=True)
readings_weather.drop(helper_columns, axis=1, inplace=True)
    
In [54]:
    
# Print a concise summary (columns, dtypes, non-null counts) of the merged frame.
readings_weather.info()
    
    
In [56]:
    
# Persist the merged dataset. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("data/parsed/readings_weather_filled_dataset.p", "wb") as output_file:
    pickle.dump(readings_weather, output_file)