Merge Datasets

Set Up


In [1]:
%matplotlib inline

import logging
import pickle
import pandas as pd
import numpy as np
import math

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Load the intermediate datasets produced by the upstream parsing notebooks.
# FIX: the original called pickle.load(open(...)) and never closed the file
# handles; context managers release them deterministically.
# NOTE(review): pickle.load executes arbitrary code if a file is untrusted;
# acceptable here only because these files are locally generated.
with open('data/parsed/stations_dataset_final.p', 'rb') as f:
    stations = pickle.load(f)
with open('data/parsed/readings_dataset_utc.p', 'rb') as f:
    readings = pickle.load(f)
with open('data/parsed/weather_dataset_utc.p', 'rb') as f:
    weather = pickle.load(f)
with open('data/parsed/readings_dataset_final.p', 'rb') as f:
    readings_dataset = pickle.load(f)

In [3]:
# Sanity-check the raw dataset dimensions: (rows, columns).
# FIX: `print x` is Python-2-only syntax; `print(x)` with a single argument
# behaves identically under Python 2 and is valid Python 3.
print(readings.shape)
print(stations.shape)
print(weather.shape)


(1483149, 6)
(779, 13)
(3008, 16)

Fill Gaps


In [4]:
# Toggle for the gap-filling step below; set to False to skip inserting
# artificial 5-minute readings into large time gaps.
fill_gaps = True

In [48]:
def find_next(df, start_loc):
    """Return the row after position ``start_loc``, or None past the end.

    Assumes ``df`` has a 0..len-1 RangeIndex (callers call
    ``reset_index(drop=True)`` first), so label-based ``.loc`` lookup is
    effectively positional here.
    """
    if start_loc + 1 == len(df):
        return None
    return df.loc[start_loc + 1]

def get_fillings(df):
    """Build artificial 5-minute readings for gaps in one station's history.

    ``df`` must contain 'Id' and 'Timestamp' columns, be sorted ascending by
    Timestamp, and have a reset RangeIndex.  Returns a list of dicts with
    keys 'Id', 'Timestamp', and 'Source' (= 'ARTIFICIAL'); the remaining
    columns are left absent/NaN so a later forward-fill can populate them.
    """
    fillings = []
    for idx, start in df.iterrows():
        end = find_next(df, idx)
        if end is None:
            break

        # BUG FIX: the original used Timedelta.seconds, which is only the
        # wrapped seconds component (0..86399) -- a gap of e.g. 1 day + 10s
        # reported 10 and was silently skipped.  total_seconds() is the
        # full gap length.
        gap_seconds = (end.Timestamp - start.Timestamp).total_seconds()
        if gap_seconds > 60 * 5:
            gap_fillings = pd.date_range(start=start.Timestamp, end=end.Timestamp,
                                         freq='5min', tz='UTC')[1:]
            # Drop the last artificial point when it lands within 2.5 minutes
            # of the real reading closing the gap (it would be redundant).
            if (end.Timestamp - gap_fillings[-1]).total_seconds() < (60 * 2 + 30):
                gap_fillings = gap_fillings[:-1]

            for timestamp in gap_fillings:
                fillings.append({'Id': start.Id, 'Timestamp': timestamp, 'Source': 'ARTIFICIAL'})

    return fillings

In [6]:
if fill_gaps:
    # prepare to find gaps: tag existing rows so artificial ones inserted
    # below remain distinguishable via the Source column
    readings['Source'] = 'REAL'
    readings.sort_values(by=['Timestamp'], inplace=True)
    
    stations_ids = stations.Id.unique()
    
    # find the gaps for each station
    fillings = []
    for station_id in stations_ids:
        # reset_index(drop=True) produces the 0..n-1 RangeIndex that
        # find_next/get_fillings rely on for their .loc lookups
        station_df = readings[readings.Id == station_id].reset_index(drop=True)
        station_fillings = get_fillings(station_df)
        fillings.append(station_fillings)
        
    # add the gaps to the original dataset (sum(..., []) flattens the
    # per-station lists of dicts into one list)
    readings = pd.concat([readings, pd.DataFrame(sum(fillings, []))])
    
    # fill the missing values using a fill forward strategy
    # NOTE(review): ffill runs over the whole frame across station
    # boundaries; this looks safe only because artificial rows are inserted
    # strictly between real readings, so each station's first row (after
    # sorting by Id, Timestamp) should be REAL -- confirm.
    readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
    readings.fillna(method='ffill', inplace=True)
    readings.reset_index(drop=True, inplace=True)

Merge Readings and Weather

Use binary search over the (sorted) weather timestamps to find, for each reading, the weather observation whose timestamp is closest to the reading's timestamp.


In [9]:
def binarySearch(data, val):
    """Return the position in ``data`` of the value closest to ``val``.

    ``data`` is a sorted (ascending) pandas Series -- here the weather
    Timestamp column.  Runs in O(log n).  On an exact match the matching
    position is returned; otherwise the position minimizing
    ``abs(data.iat[i] - val)`` among the probed midpoints, which for a
    sorted series is the overall closest element.
    """
    lo, hi = 0, len(data) - 1
    best_ind = lo
    while lo <= hi:
        # BUG FIX: floor division keeps mid an int under Python 3
        # (true division would yield a float and break .iat indexing);
        # under Python 2 int / int already floored, so behavior is unchanged.
        mid = lo + (hi - lo) // 2
        if data.iat[mid] < val:
            lo = mid + 1
        elif data.iat[mid] > val:
            hi = mid - 1
        else:
            best_ind = mid  # exact match
            break
        # keep whichever probed candidate is closest to val so far
        if abs(data.iat[mid] - val) < abs(data.iat[best_ind] - val):
            best_ind = mid
    return best_ind

In [30]:
# Inspect the readings frame after gap filling (Source marks REAL vs ARTIFICIAL).
readings.head()


Out[30]:
Id NbBikes NbDocks NbEmptyDocks NbUnusableDocks Source Timestamp
0 BikePoints_1 12.0 19.0 6.0 1.0 REAL 2016-05-16 05:41:16.870000128+00:00
1 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:46:16.870000128+00:00
2 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:51:16.870000128+00:00
3 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:56:16.870000128+00:00
4 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 06:01:16.870000128+00:00

In [44]:
# Spot-check a slice of the weather observations around the first readings.
weather[15:25]


Out[44]:
Condition DewPt Fog Hail Humidity Pressure Rain Snow Temp Thunder Timestamp Tornado Visibility WindDirD WindDirE WindSpeed
15 NaN 8.0 True True 92.0 1021.0 True True 9.0 True 2016-05-16 04:00:00+00:00 True 10.0 30.0 NNE 3.7
16 Unknown 7.0 True True 93.0 1021.0 True True 8.0 True 2016-05-16 04:20:00+00:00 True 10.0 350.0 North 7.4
17 Unknown 7.0 True True 93.0 1021.0 True True 8.0 True 2016-05-16 04:50:00+00:00 True 9.0 360.0 North 7.4
18 NaN 7.0 True True 92.0 1021.0 True True 9.0 True 2016-05-16 05:00:00+00:00 True 11.0 10.0 North 9.3
19 Partly Cloudy 7.0 True True 93.0 1021.0 True True 8.0 True 2016-05-16 05:20:00+00:00 True 10.0 360.0 North 7.4
20 Unknown 7.0 True True 87.0 1021.0 True True 9.0 True 2016-05-16 05:50:00+00:00 True 10.0 350.0 North 9.3
21 NaN 8.0 True True 83.0 1022.0 True True 10.0 True 2016-05-16 06:00:00+00:00 True 11.0 10.0 North 9.3
22 Partly Cloudy 7.0 True True 82.0 1021.0 True True 10.0 True 2016-05-16 06:20:00+00:00 True 10.0 360.0 North 13.0
23 Partly Cloudy 7.0 True True 82.0 1021.0 True True 10.0 True 2016-05-16 06:50:00+00:00 True 10.0 20.0 NNE 11.1
24 Scattered Clouds 8.0 True True 77.0 1022.0 True True 11.0 True 2016-05-16 07:00:00+00:00 True 13.0 20.0 NNE 9.3

In [57]:
# Re-inspect readings; the shown output (Out[57]) already includes the
# WeatherIdx column added by a later cell -- this cell was re-run out of order.
readings.head()


Out[57]:
Id NbBikes NbDocks NbEmptyDocks NbUnusableDocks Source Timestamp WeatherIdx
0 BikePoints_1 12.0 19.0 6.0 1.0 REAL 2016-05-16 05:41:16.870000128+00:00 20
1 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:46:16.870000128+00:00 20
2 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:51:16.870000128+00:00 20
3 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 05:56:16.870000128+00:00 21
4 BikePoints_1 12.0 19.0 6.0 1.0 ARTIFICIAL 2016-05-16 06:01:16.870000128+00:00 21

In [45]:
# Dry run on the first five readings: map each reading timestamp to the index
# of the nearest weather observation.
# NOTE(review): tz_localize('UTC') assumes the values are tz-naive here; it
# raises on already tz-aware timestamps -- confirm against the actual dtype.
readings['Timestamp'][0:5].apply(lambda val: weather['Timestamp'].index[binarySearch(weather['Timestamp'], val.tz_localize('UTC'))])


Out[45]:
0    20
1    20
2    20
3    21
4    21
Name: Timestamp, dtype: int64

In [46]:
# Full run: attach the nearest weather observation's index to every reading.
# NOTE(review): per-row apply + tz_localize over ~9M rows is slow; a
# merge_asof-style join would be faster -- left as-is to preserve behavior.
readings['WeatherIdx'] = readings['Timestamp'].apply(lambda val: weather['Timestamp'].index[binarySearch(weather['Timestamp'], val.tz_localize('UTC'))])

In [49]:
# Join each reading to its nearest weather observation, then compute the
# absolute time difference in seconds between the matched timestamps.
readings_weather = pd.merge(readings, weather, right_index=True, left_on='WeatherIdx')
# FIX: pd.np was deprecated and removed in pandas 2.0 -- use the numpy
# module imported at the top of the notebook directly.  The vectorized
# .abs() also replaces the slower per-element math.fabs apply.
readings_weather['DifferenceS'] = (readings_weather['Timestamp_x'] - readings_weather['Timestamp_y']) / np.timedelta64(1, 's')
readings_weather['DifferenceS'] = readings_weather['DifferenceS'].abs()

In [50]:
# Narrow view for auditing how far each reading is from its matched weather row.
readings_weather_view = readings_weather[['Timestamp_x', 'Timestamp_y', 'DifferenceS']]

In [51]:
# Worst matches first: the largest reading-to-weather time differences.
readings_weather_view.sort_values(by=['DifferenceS'], ascending=False).head()


Out[51]:
Timestamp_x Timestamp_y DifferenceS
7076497 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
4700828 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
5160747 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
3048556 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023
1016565 2016-06-26 23:56:49.023000064+00:00 2016-06-26 22:50:00+00:00 4009.023

In [52]:
# Distribution of the matching error.  NOTE(review): the max of ~4009s
# (~67 min) presumably reflects a gap in the weather history -- verify.
readings_weather_view.describe()


Out[52]:
DifferenceS
count 9.221613e+06
mean 3.518722e+02
std 2.386331e+02
min 9.999872e-03
25% 1.507070e+02
50% 3.048800e+02
75% 5.156200e+02
max 4.009023e+03

In [53]:
# Keep the reading timestamp as the canonical one and drop the helper
# columns that were only needed for the merge and its quality check.
readings_weather.rename(columns={'Timestamp_x': 'Timestamp'}, inplace=True)
readings_weather.drop(['Timestamp_y', 'WeatherIdx', 'DifferenceS'], axis=1, inplace=True)

In [54]:
# Final structure of the merged dataset (per output: ~9.2M rows, 22 columns).
readings_weather.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9221613 entries, 0 to 8696776
Data columns (total 22 columns):
Id                 object
NbBikes            float64
NbDocks            float64
NbEmptyDocks       float64
NbUnusableDocks    float64
Source             object
Timestamp          datetime64[ns, UTC]
Condition          object
DewPt              float32
Fog                bool
Hail               bool
Humidity           float32
Pressure           float32
Rain               bool
Snow               bool
Temp               float32
Thunder            bool
Tornado            bool
Visibility         float32
WindDirD           float32
WindDirE           object
WindSpeed          float32
dtypes: bool(6), datetime64[ns, UTC](1), float32(7), float64(4), object(4)
memory usage: 1002.6+ MB

In [56]:
# Persist the merged, gap-filled dataset for downstream notebooks.
# FIX: the original passed an open() handle that was never closed; a context
# manager guarantees the file is flushed and closed.
with open("data/parsed/readings_weather_filled_dataset.p", "wb") as f:
    pickle.dump(readings_weather, f)