Merge Datasets

Set Up


In [1]:
%matplotlib inline

import logging
import pickle
import pandas as pd
import numpy as np
import math

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Load the intermediate datasets produced by the upstream parsing notebooks.
# FIX: the original called pickle.load(open(...)) and never closed the file
# handles; context managers release them deterministically.
# NOTE(review): pickle.load executes arbitrary code if a file is untrusted;
# acceptable here only because these files are locally generated.
with open('data/parsed/stations_dataset_final.p', 'rb') as f:
    stations = pickle.load(f)
with open('data/parsed/readings_dataset_utc.p', 'rb') as f:
    readings = pickle.load(f)
with open('data/parsed/weather_dataset_utc.p', 'rb') as f:
    weather = pickle.load(f)
with open('data/parsed/readings_dataset_final.p', 'rb') as f:
    readings_dataset = pickle.load(f)

In [3]:
# Sanity-check the raw dataset dimensions: (rows, columns).
# FIX: `print x` is Python-2-only syntax; `print(x)` with a single argument
# behaves identically under Python 2 and is valid Python 3.
print(readings.shape)
print(stations.shape)
print(weather.shape)


(1483149, 6)
(779, 13)
(3008, 16)

Fill Gaps


In [4]:
# Toggle for the gap-filling step below; set to False to skip inserting
# artificial 5-minute readings into large time gaps.
fill_gaps = True

In [48]:
def find_next(df, start_loc):
    """Return the row after position ``start_loc``, or None past the end.

    Assumes ``df`` has a 0..len-1 RangeIndex (callers call
    ``reset_index(drop=True)`` first), so label-based ``.loc`` lookup is
    effectively positional here.
    """
    if start_loc + 1 == len(df):
        return None
    return df.loc[start_loc + 1]

def get_fillings(df):
    """Build artificial 5-minute readings for gaps in one station's history.

    ``df`` must contain 'Id' and 'Timestamp' columns, be sorted ascending by
    Timestamp, and have a reset RangeIndex.  Returns a list of dicts with
    keys 'Id', 'Timestamp', and 'Source' (= 'ARTIFICIAL'); the remaining
    columns are left absent/NaN so a later forward-fill can populate them.
    """
    fillings = []
    for idx, start in df.iterrows():
        end = find_next(df, idx)
        if end is None:
            break

        # BUG FIX: the original used Timedelta.seconds, which is only the
        # wrapped seconds component (0..86399) -- a gap of e.g. 1 day + 10s
        # reported 10 and was silently skipped.  total_seconds() is the
        # full gap length.
        gap_seconds = (end.Timestamp - start.Timestamp).total_seconds()
        if gap_seconds > 60 * 5:
            gap_fillings = pd.date_range(start=start.Timestamp, end=end.Timestamp,
                                         freq='5min', tz='UTC')[1:]
            # Drop the last artificial point when it lands within 2.5 minutes
            # of the real reading closing the gap (it would be redundant).
            if (end.Timestamp - gap_fillings[-1]).total_seconds() < (60 * 2 + 30):
                gap_fillings = gap_fillings[:-1]

            for timestamp in gap_fillings:
                fillings.append({'Id': start.Id, 'Timestamp': timestamp, 'Source': 'ARTIFICIAL'})

    return fillings

In [6]:
if fill_gaps:
    # prepare to find gaps: tag existing rows so artificial ones inserted
    # below remain distinguishable via the Source column
    readings['Source'] = 'REAL'
    readings.sort_values(by=['Timestamp'], inplace=True)
    
    stations_ids = stations.Id.unique()
    
    # find the gaps for each station
    fillings = []
    for station_id in stations_ids:
        # reset_index(drop=True) produces the 0..n-1 RangeIndex that
        # find_next/get_fillings rely on for their .loc lookups
        station_df = readings[readings.Id == station_id].reset_index(drop=True)
        station_fillings = get_fillings(station_df)
        fillings.append(station_fillings)
        
    # add the gaps to the original dataset (sum(..., []) flattens the
    # per-station lists of dicts into one list)
    readings = pd.concat([readings, pd.DataFrame(sum(fillings, []))])
    
    # fill the missing values using a fill forward strategy
    # NOTE(review): ffill runs over the whole frame across station
    # boundaries; this looks safe only because artificial rows are inserted
    # strictly between real readings, so each station's first row (after
    # sorting by Id, Timestamp) should be REAL -- confirm.
    readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
    readings.fillna(method='ffill', inplace=True)
    readings.reset_index(drop=True, inplace=True)

Merge Readings and Weather

Use binary search over the (sorted) weather timestamps to find, for each reading, the weather observation whose timestamp is closest to the reading's timestamp.


In [9]:
def binarySearch(data, val):
    """Return the position in ``data`` of the value closest to ``val``.

    ``data`` is a sorted (ascending) pandas Series -- here the weather
    Timestamp column.  Runs in O(log n).  On an exact match the matching
    position is returned; otherwise the position minimizing
    ``abs(data.iat[i] - val)`` among the probed midpoints, which for a
    sorted series is the overall closest element.
    """
    lo, hi = 0, len(data) - 1
    best_ind = lo
    while lo <= hi:
        # BUG FIX: floor division keeps mid an int under Python 3
        # (true division would yield a float and break .iat indexing);
        # under Python 2 int / int already floored, so behavior is unchanged.
        mid = lo + (hi - lo) // 2
        if data.iat[mid] < val:
            lo = mid + 1
        elif data.iat[mid] > val:
            hi = mid - 1
        else:
            best_ind = mid  # exact match
            break
        # keep whichever probed candidate is closest to val so far
        if abs(data.iat[mid] - val) < abs(data.iat[best_ind] - val):
            best_ind = mid
    return best_ind

In [30]:
# Inspect the readings frame after gap filling (Source marks REAL vs ARTIFICIAL).
readings.head()


Out[30]:
Id NbBikes NbDocks NbEmptyDocks NbUnusableDocks Source Timestamp
0 BikePoints_1 12.0 19.0 6.0 1.0 REAL 2016-05-16 05:41:16.870000128+00:00
1 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:46:16.870000128+00:00
2 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:51:16.870000128+00:00
3 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:56:16.870000128+00:00
4 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 06:01:16.870000128+00:00

In [44]:
# Spot-check a slice of the weather observations around the first readings.
weather[15:25]


Out[44]:
Condition DewPt Fog Hail Humidity Pressure Rain Snow Temp Thunder Timestamp Tornado Visibility WindDirD WindDirE WindSpeed
15 NaN 8.0 True True 92.0 1021.0 True True 9.0 True 2016-05-16 04:00:00+00:00 True 10.0 30.0 NNE 3.7
16 Unknown 7.0 True True 93.0 1021.0 True True 8.0 True 2016-05-16 04:20:00+00:00 True 10.0 350.0 North 7.4
17 Unknown 7.0 True True 93.0 1021.0 True True 8.0 True 2016-05-16 04:50:00+00:00 True 9.0 360.0 North 7.4
18 NaN 7.0 True True 92.0 1021.0 True True 9.0 True 2016-05-16 05:00:00+00:00 True 11.0 10.0 North 9.3
19 Partly Cloudy 7.0 True True 93.0 1021.0 True True 8.0 True 2016-05-16 05:20:00+00:00 True 10.0 360.0 North 7.4
20 Unknown 7.0 True True 87.0 1021.0 True True 9.0 True 2016-05-16 05:50:00+00:00 True 10.0 350.0 North 9.3
21 NaN 8.0 True True 83.0 1022.0 True True 10.0 True 2016-05-16 06:00:00+00:00 True 11.0 10.0 North 9.3
22 Partly Cloudy 7.0 True True 82.0 1021.0 True True 10.0 True 2016-05-16 06:20:00+00:00 True 10.0 360.0 North 13.0
23 Partly Cloudy 7.0 True True 82.0 1021.0 True True 10.0 True 2016-05-16 06:50:00+00:00 True 10.0 20.0 NNE 11.1
24 Scattered Clouds 8.0 True True 77.0 1022.0 True True 11.0 True 2016-05-16 07:00:00+00:00 True 13.0 20.0 NNE 9.3

In [57]:
# Re-inspect readings; the shown output (Out[57]) already includes the
# WeatherIdx column added by a later cell -- this cell was re-run out of order.
readings.head()


Out[57]:
Id NbBikes NbDocks NbEmptyDocks NbUnusableDocks Source Timestamp WeatherIdx
0 BikePoints_1 12.0 19.0 6.0 1.0 REAL 2016-05-16 05:41:16.870000128+00:00 20
1 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:46:16.870000128+00:00 20
2 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:51:16.870000128+00:00 20
3 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:56:16.870000128+00:00 21
4 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 06:01:16.870000128+00:00 21

In [45]:
# Dry run on the first five readings: map each reading timestamp to the index
# of the nearest weather observation.
# NOTE(review): tz_localize('UTC') assumes the values are tz-naive here; it
# raises on already tz-aware timestamps -- confirm against the actual dtype.
readings['Timestamp'][0:5].apply(lambda val: weather['Timestamp'].index[binarySearch(weather['Timestamp'], val.tz_localize('UTC'))])


Out[45]:
0    20
1    20
2    20
3    21
4    21
Name: Timestamp, dtype: int64

In [46]:
# Full run: attach the nearest weather observation's index to every reading.
# NOTE(review): per-row apply + tz_localize over ~9M rows is slow; a
# merge_asof-style join would be faster -- left as-is to preserve behavior.
readings['WeatherIdx'] = readings['Timestamp'].apply(lambda val: weather['Timestamp'].index[binarySearch(weather['Timestamp'], val.tz_localize('UTC'))])

In [49]:
# Join each reading to its nearest weather observation, then compute the
# absolute time difference in seconds between the matched timestamps.
readings_weather = pd.merge(readings, weather, right_index=True, left_on='WeatherIdx')
# FIX: pd.np was deprecated and removed in pandas 2.0 -- use the numpy
# module imported at the top of the notebook directly.  The vectorized
# .abs() also replaces the slower per-element math.fabs apply.
readings_weather['DifferenceS'] = (readings_weather['Timestamp_x'] - readings_weather['Timestamp_y']) / np.timedelta64(1, 's')
readings_weather['DifferenceS'] = readings_weather['DifferenceS'].abs()

In [50]:
# Narrow view for auditing how far each reading is from its matched weather row.
readings_weather_view = readings_weather[['Timestamp_x', 'Timestamp_y', 'DifferenceS']]

In [51]:
# Worst matches first: the largest reading-to-weather time differences.
readings_weather_view.sort_values(by=['DifferenceS'], ascending=False).head()


Out[51]:
Timestamp_x Timestamp_y DifferenceS
7076497 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
4700828 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
5160747 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
3048556 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
1016565 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023

In [52]:
# Distribution of the matching error.  NOTE(review): the max of ~4009s
# (~67 min) presumably reflects a gap in the weather history -- verify.
readings_weather_view.describe()


Out[52]:
DifferenceS
count 9.221613e+06
mean 3.518722e+02
std 2.386331e+02
min 9.999872e-03
25% 1.507070e+02
50% 3.048800e+02
75% 5.156200e+02
max 4.009023e+03

In [53]:
# Keep the reading timestamp as the canonical one and drop the helper
# columns that were only needed for the merge and its quality check.
readings_weather.rename(columns={'Timestamp_x': 'Timestamp'}, inplace=True)
readings_weather.drop(['Timestamp_y', 'WeatherIdx', 'DifferenceS'], axis=1, inplace=True)

In [54]:
# Final structure of the merged dataset (per output: ~9.2M rows, 22 columns).
readings_weather.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9221613 entries, 0 to 8696776
Data columns (total 22 columns):
Id                 object
NbBikes            float64
NbDocks            float64
NbEmptyDocks       float64
NbUnusableDocks    float64
Source             object
Timestamp          datetime64[ns, UTC]
Condition          object
DewPt              float32
Fog                bool
Hail               bool
Humidity           float32
Pressure           float32
Rain               bool
Snow               bool
Temp               float32
Thunder            bool
Tornado            bool
Visibility         float32
WindDirD           float32
WindDirE           object
WindSpeed          float32
dtypes: bool(6), datetime64[ns, UTC](1), float32(7), float64(4), object(4)
memory usage: 1002.6+ MB

In [56]:
# Persist the merged, gap-filled dataset for downstream notebooks.
# FIX: the original passed an open() handle that was never closed; a context
# manager guarantees the file is flushed and closed.
with open("data/parsed/readings_weather_filled_dataset.p", "wb") as f:
    pickle.dump(readings_weather, f)