Build Modelling Dataset

Set Up


In [1]:
%matplotlib inline

import logging
import cPickle as pickle
import pandas as pd
import numpy as np
import time

from datetime import datetime

logger = logging.getLogger()
logger.setLevel(logging.INFO)

Clean Stations


In [2]:
readings = pickle.load(open('data/parsed/readings_dataset_utc.p', 'rb'))
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb')).set_index('Id')

readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
readings.set_index(['Id', 'Timestamp'], inplace=True)

In [3]:
# remove stations that have less than 29 days of data
stations_to_remove = []
for station_id in readings.index.get_level_values('Id').unique():
    nb_days = len(np.unique(readings.loc[station_id].index.strftime('%Y-%m-%d')))
    if nb_days < 29:
        stations_to_remove.append(station_id)
    
readings.drop(stations_to_remove, inplace=True)
stations.drop(stations_to_remove, inplace=True)
print 'Removed stations %s' % stations_to_remove


Removed stations [u'BikePoints_237', u'BikePoints_497', u'BikePoints_796', u'BikePoints_798', u'BikePoints_799', u'BikePoints_802', u'BikePoints_803', u'BikePoints_805', u'BikePoints_807', u'BikePoints_809', u'BikePoints_810', u'BikePoints_811', u'BikePoints_814', u'BikePoints_817']
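
The same day-count filter can also be written without the explicit loop; a vectorized sketch (an alternative, not the code that produced the output above):

days_per_station = (readings.reset_index()
                            .groupby('Id').Timestamp
                            .apply(lambda s: s.dt.normalize().nunique()))
stations_to_remove = days_per_station[days_per_station < 29].index.tolist()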

In [6]:
pickle.dump(stations, open("data/parsed/stations_clean.p", "wb"))
print 'Saved'


Saved

Closest Stations


In [4]:
from geopy.distance import vincenty

# get the locations of all the stations in tuples
stations_locations = pd.Series(zip(stations.Latitude, stations.Longitude), index=stations.index)

# create a DF to hold the nearest stations
surrounding_stations = pd.DataFrame(columns=['Near%d' % (i+1) for i in range(0,20)], index=stations.index)
for station_id, station_loc in stations_locations.iteritems():
    # rank every station by distance; index 0 is the station itself, so keep the next 20
    ranking = stations_locations.apply(lambda x: vincenty(station_loc, x).kilometers).sort_values()
    surrounding_stations.loc[station_id] = ranking.index.values[1:21]
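
vincenty() measures distance on the WGS-84 ellipsoid. Newer geopy releases deprecate it in favour of geodesic(); a drop-in sketch for the ranking line, assuming geopy >= 1.13:

from geopy.distance import geodesic

ranking = stations_locations.apply(lambda x: geodesic(station_loc, x).kilometers).sort_values()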

Resample Weather


In [5]:
weather = pickle.load(open('data/parsed/weather_dataset_utc.p', 'rb'))

weather.set_index(['Timestamp'], inplace=True)
weather = weather.resample('5min').mean().ffill()
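
resample('5min').mean() snaps the irregular weather observations onto the same 5-minute grid as the readings, and ffill() carries the last observation across empty bins. A toy illustration with hypothetical values:

s = pd.Series([10.0, 12.0], index=pd.to_datetime(['2016-06-01 00:00', '2016-06-01 00:12']))
s.resample('5min').mean().ffill()
# 00:00    10.0
# 00:05    10.0   (empty bin, forward-filled)
# 00:10    12.0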

Preprocess Redistribution


In [6]:
distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
distributed = distributed.query('NbBikes != 0').drop(['Name'], axis=1)
distributed.columns = ['DistNbBikes']

In [7]:
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))
collected = collected.query('NbBikes != 0').drop(['Name'], axis=1)
collected.columns = ['CollNbBikes']

Merge Readings


In [8]:
dfs = []
for station_id in readings.index.get_level_values('Id').unique():
    # resample with a freq of 5 mins
    resampled_readings = readings.loc[station_id].resample('5min').mean().ffill()
    
    # merge weather
    merged = resampled_readings.merge(weather, how='left', left_index=True, right_index=True)
    
    # merge distributed
    if station_id in distributed.index:
        dist_resampled = distributed.loc[station_id].resample('5min').sum()
        merged = merged.merge(dist_resampled, how='left', left_index=True, right_index=True)
        
    # merge collected
    if station_id in collected.index:
        coll_resampled = collected.loc[station_id].resample('5min').sum()
        merged = merged.merge(coll_resampled, how='left', left_index=True, right_index=True)
        
    # merge surrounding
    surrounding_stations_nbbikes = []
    for surrounding_station_id in surrounding_stations.loc[station_id].values:
        surrounding_stations_nbbikes.append(readings.loc[surrounding_station_id].NbBikes.resample('5min').mean().ffill())
    surrounding_stations_nbbikes = pd.concat(surrounding_stations_nbbikes, axis=1)
    surrounding_stations_nbbikes.columns = surrounding_stations.columns
    merged = merged.merge(surrounding_stations_nbbikes, how='left', left_index=True, right_index=True)
    
    # set the id
    merged['Id'] = station_id

    dfs.append(merged.reset_index())

readings_weather = pd.concat(dfs, ignore_index=True)
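
A quick sanity check on the merged frame (a sketch, not part of the original run): every station should contribute at most one row per 5-minute slot.

assert not readings_weather.duplicated(subset=['Id', 'Timestamp']).any()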

Reduce Column Size


In [9]:
start_time = time.time()

readings_weather.NbBikes = readings_weather.NbBikes.astype('int16')
readings_weather.NbDocks = readings_weather.NbDocks.astype('int16')
readings_weather.NbEmptyDocks = readings_weather.NbEmptyDocks.astype('int16')
readings_weather.NbUnusableDocks = readings_weather.NbUnusableDocks.astype('int16')
# DistNbBikes/CollNbBikes are NaN where a station had no redistribution events
readings_weather.DistNbBikes = readings_weather.DistNbBikes.fillna(0).astype('int16')
readings_weather.CollNbBikes = readings_weather.CollNbBikes.fillna(0).astype('int16')

# integer dtypes cannot hold NaN, so the nearest-station columns must stay float
for col in surrounding_stations.columns:
    readings_weather[col] = readings_weather[col].astype('float16')
    
readings_weather.DewPt = readings_weather.DewPt.astype('float16')
readings_weather.Humidity = readings_weather.Humidity.astype('float16')
readings_weather.Pressure = readings_weather.Pressure.astype('float16')
readings_weather.Temp = readings_weather.Temp.astype('float16')
readings_weather.Visibility = readings_weather.Visibility.astype('float16')
readings_weather.WindDirD = readings_weather.WindDirD.astype('float16')
readings_weather.WindSpeed = readings_weather.WindSpeed.astype('float16')

readings_weather.Fog = readings_weather.Fog.astype('int8')
readings_weather.Rain = readings_weather.Rain.astype('int8')
readings_weather.Hail = readings_weather.Hail.astype('int8')
readings_weather.Snow = readings_weather.Snow.astype('int8')
readings_weather.Thunder = readings_weather.Thunder.astype('int8')
readings_weather.Tornado = readings_weather.Tornado.astype('int8')

end_time = time.time()
print 'Modifying data types took %s' % (end_time - start_time)


Modifying data types took 4.92580604553
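
One caveat with float16: it keeps roughly 3 significant digits and represents integers exactly only up to 2048, which is ample for dock counts and London weather readings. A guard (a sketch) that nothing overflowed to inf during the downcast:

assert not np.isinf(readings_weather.select_dtypes(include=['float16'])).any().any()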

Modify Timestamp


In [10]:
start_time = time.time()

# 30 May 2016 was the UK spring bank holiday, the only public holiday in the sampled period
readings_weather['Holiday'] = readings_weather.Timestamp.apply(lambda x: x.month == 5 and x.day == 30).astype('int8')
readings_weather['Weekday'] = readings_weather.Timestamp.apply(lambda x: x.dayofweek < 5).astype('int8')
readings_weather['Weekend'] = readings_weather.Timestamp.apply(lambda x: x.dayofweek > 4).astype('int8')
readings_weather['TimeOfYear'] = readings_weather.Timestamp.apply(lambda x: (x - datetime(2016,1,1)).total_seconds())
readings_weather['TimeOfDay'] = readings_weather.Timestamp.apply(lambda x: (x - x.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds())
readings_weather['WeekOfYear'] = readings_weather.Timestamp.apply(lambda x: x.isocalendar()[1])
readings_weather['DayOfWeek'] = readings_weather.Timestamp.apply(lambda x: x.weekday())

end_time = time.time()
print 'Changing timestamp took %s' % (end_time - start_time)


Changing timestamp took 386.990791082
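
The row-wise apply above took over six minutes; the same features can be computed in vectorized form through the .dt accessor. A sketch, assuming Timestamp is a tz-naive datetime64 column:

ts = readings_weather.Timestamp
readings_weather['Holiday'] = ((ts.dt.month == 5) & (ts.dt.day == 30)).astype('int8')
readings_weather['Weekday'] = (ts.dt.dayofweek < 5).astype('int8')
readings_weather['Weekend'] = (ts.dt.dayofweek > 4).astype('int8')
readings_weather['TimeOfYear'] = (ts - pd.Timestamp('2016-01-01')).dt.total_seconds()
readings_weather['TimeOfDay'] = (ts - ts.dt.normalize()).dt.total_seconds()
readings_weather['WeekOfYear'] = ts.dt.weekofyear  # ISO week, as isocalendar()[1]
readings_weather['DayOfWeek'] = ts.dt.dayofweek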

Re-arrange


In [11]:
readings_weather.sort_values(by=['Id', 'Timestamp'], inplace=True)
readings_weather.set_index(['Id', 'Timestamp'], inplace=True)

In [12]:
pickle.dump(readings_weather, open("data/parsed/readings_resampled.p", "wb"))
print 'Saved'


Saved

Autoregressive


In [2]:
readings_weather = pickle.load(open('data/parsed/readings_resampled.p', 'rb'))

In [3]:
def shift_cols(df, cols, periods, mode):
    """Add lagged copies of `cols`, named e.g. TempTMinus2 for mode='TMinus'
    and period=2. One period is one 5-minute slot, so TMinus12 is the value
    an hour earlier. Each station is shifted independently."""
    for period in periods:
        target_cols = []
        for col in cols:
            label = '%s%s%d' % (col, mode, period)
            target_cols.append(label)
            df[label] = df[col]

        # shift within each station so values never leak across stations
        station_ids = df.index.get_level_values('Id').unique()
        for station_id in station_ids:
            shifted = df.loc[station_id, target_cols].shift(periods=period)
            df.loc[station_id, target_cols] = shifted.values
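
A minimal illustration of what shift_cols produces, on a hypothetical two-station frame:

toy = pd.DataFrame({'NbBikes': [5, 6, 7, 1, 2, 3]},
                   index=pd.MultiIndex.from_product(
                       [['A', 'B'], pd.date_range('2016-06-01', periods=3, freq='5min')],
                       names=['Id', 'Timestamp']))
shift_cols(toy, ['NbBikes'], [1], 'TMinus')
# toy.NbBikesTMinus1 is [NaN, 5, 6, NaN, 1, 2]: the lag restarts at each
# station boundary instead of leaking station A's last value into station B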

In [4]:
start_time = time.time()

shift_cols(readings_weather, ['Temp', 'Humidity', 'Rain', 'Fog'], [2, 12], 'TMinus')

readings_weather.TempTMinus2 = readings_weather.TempTMinus2.astype('float16')
readings_weather.TempTMinus12 = readings_weather.TempTMinus12.astype('float16')
readings_weather.HumidityTMinus2 = readings_weather.HumidityTMinus2.astype('float16')
readings_weather.HumidityTMinus12 = readings_weather.HumidityTMinus12.astype('float16')
readings_weather.RainTMinus2 = readings_weather.RainTMinus2.astype('float16')
readings_weather.RainTMinus12 = readings_weather.RainTMinus12.astype('float16')
readings_weather.FogTMinus2 = readings_weather.FogTMinus2.astype('float16')
readings_weather.FogTMinus12 = readings_weather.FogTMinus12.astype('float16')

end_time = time.time()
print 'Adding previous weather columns took %s' % (end_time - start_time)


Adding previous weather columns took 226.483350039

In [5]:
start_time = time.time()

shift_cols(readings_weather, ['NbBikes'], [1, 2, 3, 12, 18], 'TMinus')

readings_weather.NbBikesTMinus1 = readings_weather.NbBikesTMinus1.astype('float16')
readings_weather.NbBikesTMinus2 = readings_weather.NbBikesTMinus2.astype('float16')
readings_weather.NbBikesTMinus3 = readings_weather.NbBikesTMinus3.astype('float16')
readings_weather.NbBikesTMinus12 = readings_weather.NbBikesTMinus12.astype('float16')
readings_weather.NbBikesTMinus18 = readings_weather.NbBikesTMinus18.astype('float16')

end_time = time.time()
print 'Adding autoregressive columns took %s' % (end_time - start_time)


Adding autoregressive columns took 335.951253891

In [6]:
start_time = time.time()
# lag only the 10 nearest stations; Near11-Near20 are intentionally left out
for i in range(1, 11):
    shift_cols(readings_weather, ['Near%d' % i], [2, 3, 12, 18], 'TMinus')
end_time = time.time()

print 'Adding nearest-station lag columns took %s' % (end_time - start_time)


Adding nearest-station lag columns took 1853.17854595

In [7]:
pickle.dump(readings_weather, open("data/parsed/readings_autoregressive.p", "wb"))
print 'Saved'


Saved

Add Historic Avg


In [2]:
readings_weather = pickle.load(open('data/parsed/readings_autoregressive.p', 'rb'))

In [3]:
start_time = time.time()

readings_weather['HistAvg'] = np.NaN

for station_id in readings_weather.index.get_level_values('Id').unique():
    window = readings_weather.loc[station_id]
    
    station_hist_avgs = window.groupby(['Holiday', 'Weekday', 'TimeOfDay'])[['NbBikes']].mean()    
    
    station_ts = window.reset_index().set_index(['Holiday', 'Weekday', 'TimeOfDay'])[['Timestamp']]
    hist_avg_ts = station_ts.merge(station_hist_avgs, how='inner', left_index=True, right_index=True)
    hist_avg_ts = hist_avg_ts.reset_index(drop=True).set_index('Timestamp').sort_index().round().NbBikes

    # assign through .loc on the full frame; chained indexing writes to a copy
    readings_weather.loc[station_id, 'HistAvg'] = hist_avg_ts.values
    
readings_weather.HistAvg = readings_weather.HistAvg.astype('float16')

end_time = time.time()
print 'Adding historic average column took %s' % (end_time - start_time)


Adding historic average column took 87.8633668423

Add Cumulative Redistribution


In [4]:
def sum_cumulative(series, n):
    """Trailing sum of the current value plus the previous `n` values
    (missing lags at the start of the series count as 0)."""
    cumulative_sum = series
    for i in range(1, n + 1):
        cumulative_sum = series.shift(i).fillna(0) + cumulative_sum
    return cumulative_sum.values
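
A worked example on hypothetical values: sum_cumulative(s, 2) adds each reading to its two predecessors (a trailing 3-slot sum).

s = pd.Series([1, 2, 3, 4])
sum_cumulative(s, 2)
# array([ 1.,  3.,  6.,  9.])  e.g. the last entry is 4 + 3 + 2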

In [5]:
start_time = time.time()

readings_weather['CollNbBikesCum2'] = np.NaN
readings_weather['CollNbBikesCum6'] = np.NaN
readings_weather['DistNbBikesCum2'] = np.NaN
readings_weather['DistNbBikesCum6'] = np.NaN

for station_id in readings_weather.index.get_level_values('Id').unique():
    window = readings_weather.loc[station_id]

    collected = window.CollNbBikes
    readings_weather.loc[station_id, 'CollNbBikesCum2'] = sum_cumulative(collected, 2)
    readings_weather.loc[station_id, 'CollNbBikesCum6'] = sum_cumulative(collected, 6)

    distributed = window.DistNbBikes
    readings_weather.loc[station_id, 'DistNbBikesCum2'] = sum_cumulative(distributed, 2)
    readings_weather.loc[station_id, 'DistNbBikesCum6'] = sum_cumulative(distributed, 6)
    
readings_weather.CollNbBikesCum2 = readings_weather.CollNbBikesCum2.astype('int16')
readings_weather.CollNbBikesCum6 = readings_weather.CollNbBikesCum6.astype('int16')
readings_weather.DistNbBikesCum2 = readings_weather.DistNbBikesCum2.astype('int16')
readings_weather.DistNbBikesCum6 = readings_weather.DistNbBikesCum6.astype('int16')
    
end_time = time.time()
print 'Adding cumulative redistribution columns took %s' % (end_time - start_time)


Adding cumulative redistribution columns took 228.005467892

In [6]:
pickle.dump(readings_weather, open("data/parsed/readings_full.p", "wb"))
print 'Saved'


Saved

Clean

Remove readings recorded on days when a station was inactive.


In [2]:
readings_weather = pickle.load(open('data/parsed/readings_full.p', 'rb'))

In [3]:
missing_days = pickle.load(open('data/parsed/missing_days.p', 'rb'))

In [4]:
invalid_readings = []
for idx, row in missing_days.iterrows():
    if idx not in readings_weather.index:
        continue
    
    for missing_day in row.MissingDays:
        labels = readings_weather.loc[idx].loc[missing_day:missing_day.replace(hour=23, minute=59, second=59, microsecond=999999)].index
        for label in labels:
            invalid_readings.append((idx, label))

readings_weather.drop(invalid_readings, inplace=True)
print 'Removed %d invalid readings' % len(invalid_readings)


Removed 22464 invalid readings

Save


In [5]:
readings_weather.info(memory_usage='deep', verbose=True)


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9280524 entries, (BikePoints_1, 2016-05-15 13:35:00+00:00) to (BikePoints_99, 2016-06-26 23:25:00+00:00)
Data columns (total 104 columns):
CollNbBikes         int16
DewPt               float16
DistNbBikes         int16
Fog                 int8
Hail                int8
Humidity            float16
NbBikes             int16
NbDocks             int16
NbEmptyDocks        int16
NbUnusableDocks     int16
Near1               float16
Near10              float16
Near11              float16
Near12              float16
Near13              float16
Near14              float16
Near15              float16
Near16              float16
Near17              float16
Near18              float16
Near19              float16
Near2               float16
Near20              float16
Near3               float16
Near4               float16
Near5               float16
Near6               float16
Near7               float16
Near8               float16
Near9               float16
Pressure            float16
Rain                int8
Snow                int8
Temp                float16
Thunder             int8
Tornado             int8
Visibility          float16
WindDirD            float16
WindSpeed           float16
Holiday             int8
Weekday             int8
Weekend             int8
TimeOfYear          float64
TimeOfDay           float64
WeekOfYear          int64
DayOfWeek           int64
TempTMinus2         float16
HumidityTMinus2     float16
RainTMinus2         float16
FogTMinus2          float16
TempTMinus12        float16
HumidityTMinus12    float16
RainTMinus12        float16
FogTMinus12         float16
NbBikesTMinus1      float16
NbBikesTMinus2      float16
NbBikesTMinus3      float16
NbBikesTMinus12     float16
NbBikesTMinus18     float16
Near1TMinus2        float16
Near1TMinus3        float16
Near1TMinus12       float16
Near1TMinus18       float16
Near2TMinus2        float16
Near2TMinus3        float16
Near2TMinus12       float16
Near2TMinus18       float16
Near3TMinus2        float16
Near3TMinus3        float16
Near3TMinus12       float16
Near3TMinus18       float16
Near4TMinus2        float16
Near4TMinus3        float16
Near4TMinus12       float16
Near4TMinus18       float16
Near5TMinus2        float16
Near5TMinus3        float16
Near5TMinus12       float16
Near5TMinus18       float16
Near6TMinus2        float16
Near6TMinus3        float16
Near6TMinus12       float16
Near6TMinus18       float16
Near7TMinus2        float16
Near7TMinus3        float16
Near7TMinus12       float16
Near7TMinus18       float16
Near8TMinus2        float16
Near8TMinus3        float16
Near8TMinus12       float16
Near8TMinus18       float16
Near9TMinus2        float16
Near9TMinus3        float16
Near9TMinus12       float16
Near9TMinus18       float16
Near10TMinus2       float16
Near10TMinus3       float16
Near10TMinus12      float16
Near10TMinus18      float16
HistAvg             float16
CollNbBikesCum2     int16
CollNbBikesCum6     int16
DistNbBikesCum2     int16
DistNbBikesCum6     int16
dtypes: float16(81), float64(2), int16(10), int64(2), int8(9)
memory usage: 2.3 GB

In [6]:
pickle.dump(readings_weather, open("data/parsed/readings_clean.p", "wb"))
print 'Saved'


Saved
