In [1]:
%matplotlib inline
import logging
import cPickle as pickle
import pandas as pd
import numpy as np
import time
from datetime import datetime
logger = logging.getLogger()
logger.setLevel(logging.INFO)
In [2]:
readings = pickle.load(open('data/parsed/readings_dataset_utc.p', 'rb'))
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb')).set_index('Id')
readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
readings.set_index(['Id', 'Timestamp'], inplace=True)
In [3]:
# remove stations that have fewer than 29 days of data
stations_to_remove = []
for station_id in readings.index.get_level_values('Id').unique():
    nb_days = len(np.unique(readings.loc[station_id].index.strftime('%Y-%m-%d')))
    if nb_days < 29:
        stations_to_remove.append(station_id)
readings.drop(stations_to_remove, inplace=True)
stations.drop(stations_to_remove, inplace=True)
print 'Removed stations %s' % stations_to_remove
In [6]:
pickle.dump(stations, open("data/parsed/stations_clean.p", "wb"))
print 'Saved'
In [4]:
from geopy.distance import vincenty
# get the locations of all the stations as (latitude, longitude) tuples
stations_locations = pd.Series(zip(stations.Latitude, stations.Longitude), index=stations.index)
# create a DF to hold the 20 nearest stations of each station
surrounding_stations = pd.DataFrame(columns=['Near%d' % (i+1) for i in range(0, 20)], index=stations.index)
for station_id, station_loc in stations_locations.iteritems():
    # rank all stations by their Vincenty distance from the current station
    ranking = stations_locations.apply(lambda x: vincenty(station_loc, x).kilometers).sort_values()
    # skip the first entry (the station itself) and keep the next 20
    surrounding_stations.loc[station_id] = ranking.index.values[1:21]
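# note: vincenty() exists in the geopy version used here; it was removed in
# geopy 2.x, where geopy.distance.geodesic is the drop-in replacement if this
# cell is rerun on a newer environment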
In [5]:
weather = pickle.load(open('data/parsed/weather_dataset_utc.p', 'rb'))
weather.set_index(['Timestamp'], inplace=True)
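# the weather observations are sparser than the station readings, so they are
# brought onto the same 5-minute grid (mean within each bin, then forward-fill)
# to allow an index-aligned merge with the resampled readings below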
weather = weather.resample('5min').mean().ffill()
In [6]:
distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
distributed = distributed.query('NbBikes != 0').drop(['Name'], axis=1)
distributed.columns = ['DistNbBikes']
In [7]:
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))
collected = collected.query('NbBikes != 0').drop(['Name'], axis=1)
collected.columns = ['CollNbBikes']
In [8]:
dfs = []
for station_id in readings.index.get_level_values('Id').unique():
    # resample with a freq of 5 mins
    resampled_readings = readings.loc[station_id].resample('5min').mean().ffill()
    # merge weather
    merged = resampled_readings.merge(weather, how='left', left_index=True, right_index=True)
    # merge distributed
    if station_id in distributed.index:
        dist_resampled = distributed.loc[station_id].resample('5min').sum()
        merged = merged.merge(dist_resampled, how='left', left_index=True, right_index=True)
    # merge collected
    if station_id in collected.index:
        coll_resampled = collected.loc[station_id].resample('5min').sum()
        merged = merged.merge(coll_resampled, how='left', left_index=True, right_index=True)
    # merge the bike counts of the surrounding stations
    surrounding_stations_nbbikes = []
    for surrounding_station_id in surrounding_stations.loc[station_id].values:
        surrounding_stations_nbbikes.append(readings.loc[surrounding_station_id].NbBikes.resample('5min').mean().ffill())
    surrounding_stations_nbbikes = pd.concat(surrounding_stations_nbbikes, axis=1)
    surrounding_stations_nbbikes.columns = surrounding_stations.columns
    merged = merged.merge(surrounding_stations_nbbikes, how='left', left_index=True, right_index=True)
    # set the id
    merged['Id'] = station_id
    dfs.append(merged.reset_index())
readings_weather = pd.concat(dfs, ignore_index=True)
In [9]:
start_time = time.time()
readings_weather.NbBikes = readings_weather.NbBikes.astype('int16')
readings_weather.NbDocks = readings_weather.NbDocks.astype('int16')
readings_weather.NbEmptyDocks = readings_weather.NbEmptyDocks.astype('int16')
readings_weather.NbUnusableDocks = readings_weather.NbUnusableDocks.astype('int16')
readings_weather.CollNbBikes = readings_weather.CollNbBikes.astype('float16')
readings_weather.DistNbBikes = readings_weather.DistNbBikes.astype('float16')
readings_weather.DistNbBikes = readings_weather.DistNbBikes.fillna(0).astype('int16')
readings_weather.CollNbBikes = readings_weather.CollNbBikes.fillna(0).astype('int16')
# the nearest-station columns can contain NaN, and integer dtypes cannot hold
# NaN in this pandas version, so they have to stay float
for col in surrounding_stations.columns:
    readings_weather[col] = readings_weather[col].astype('float16')
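# (newer pandas versions offer nullable integer dtypes such as 'Int16' that can
# hold missing values directly, which would avoid this float workaround)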
readings_weather.DewPt = readings_weather.DewPt.astype('float16')
readings_weather.Humidity = readings_weather.Humidity.astype('float16')
readings_weather.Pressure = readings_weather.Pressure.astype('float16')
readings_weather.Temp = readings_weather.Temp.astype('float16')
readings_weather.Visibility = readings_weather.Visibility.astype('float16')
readings_weather.WindDirD = readings_weather.WindDirD.astype('float16')
readings_weather.WindSpeed = readings_weather.WindSpeed.astype('float16')
readings_weather.Fog = readings_weather.Fog.astype('int8')
readings_weather.Rain = readings_weather.Rain.astype('int8')
readings_weather.Hail = readings_weather.Hail.astype('int8')
readings_weather.Snow = readings_weather.Snow.astype('int8')
readings_weather.Thunder = readings_weather.Thunder.astype('int8')
readings_weather.Tornado = readings_weather.Tornado.astype('int8')
end_time = time.time()
print 'Modifying data types took %s' % (end_time - start_time)
In [10]:
start_time = time.time()
readings_weather['Holiday'] = readings_weather.Timestamp.apply(lambda x: x.month == 5 and x.day == 30).astype('int8')
readings_weather['Weekday'] = readings_weather.Timestamp.apply(lambda x: x.dayofweek < 5).astype('int8')
readings_weather['Weekend'] = readings_weather.Timestamp.apply(lambda x: x.dayofweek > 4).astype('int8')
readings_weather['TimeOfYear'] = readings_weather.Timestamp.apply(lambda x: (x - datetime(2016,1,1)).total_seconds())
readings_weather['TimeOfDay'] = readings_weather.Timestamp.apply(lambda x: (x - x.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds())
readings_weather['WeekOfYear'] = readings_weather.Timestamp.apply(lambda x: x.isocalendar()[1])
readings_weather['DayOfWeek'] = readings_weather.Timestamp.apply(lambda x: x.weekday())
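# Holiday flags 30 May 2016 (the Spring bank holiday), Weekday/Weekend are
# complementary indicators, TimeOfYear is seconds elapsed since 2016-01-01,
# TimeOfDay is seconds since midnight, WeekOfYear is the ISO week number and
# DayOfWeek runs from 0 (Monday) to 6 (Sunday)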
end_time = time.time()
print 'Changing timestamp took %s' % (end_time - start_time)
In [11]:
readings_weather.sort_values(by=['Id', 'Timestamp'], inplace=True)
readings_weather.set_index(['Id', 'Timestamp'], inplace=True)
In [12]:
pickle.dump(readings_weather, open("data/parsed/readings_resampled.p", "wb"))
print 'Saved'
In [2]:
readings_weather = pickle.load(open('data/parsed/readings_resampled.p', 'rb'))
In [3]:
def shift_cols(df, cols, periods, mode):
    # for each requested period, create lagged copies of the given columns
    # (labelled e.g. 'TempTMinus2'), shifting each station's series independently
    for period in periods:
        target_cols = []
        for col in cols:
            label = '%s%s%d' % (col, mode, period)
            target_cols.append(label)
            df[label] = df[col]
        station_ids = df.index.get_level_values('Id').unique()
        for station_id in station_ids:
            shifted = df.loc[station_id, target_cols].shift(periods=period)
            df.loc[station_id, target_cols] = shifted.values
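# example (values are on the 5-minute grid built above):
#   shift_cols(readings_weather, ['Temp'], [2], 'TMinus')
# adds a 'TempTMinus2' column holding, for every station independently, the
# temperature observed 2 periods (10 minutes) earlier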
In [4]:
start_time = time.time()
shift_cols(readings_weather, ['Temp', 'Humidity', 'Rain', 'Fog'], [2, 12], 'TMinus')
readings_weather.TempTMinus2 = readings_weather.TempTMinus2.astype('float16')
readings_weather.TempTMinus12 = readings_weather.TempTMinus12.astype('float16')
readings_weather.HumidityTMinus2 = readings_weather.HumidityTMinus2.astype('float16')
readings_weather.HumidityTMinus12 = readings_weather.HumidityTMinus12.astype('float16')
readings_weather.RainTMinus2 = readings_weather.RainTMinus2.astype('float16')
readings_weather.RainTMinus12 = readings_weather.RainTMinus12.astype('float16')
readings_weather.FogTMinus2 = readings_weather.FogTMinus2.astype('float16')
readings_weather.FogTMinus12 = readings_weather.FogTMinus12.astype('float16')
end_time = time.time()
print 'Adding previous weather columns took %s' % (end_time - start_time)
In [5]:
start_time = time.time()
shift_cols(readings_weather, ['NbBikes'], [1, 2, 3, 12, 18], 'TMinus')
readings_weather.NbBikesTMinus1 = readings_weather.NbBikesTMinus1.astype('float16')
readings_weather.NbBikesTMinus2 = readings_weather.NbBikesTMinus2.astype('float16')
readings_weather.NbBikesTMinus3 = readings_weather.NbBikesTMinus3.astype('float16')
readings_weather.NbBikesTMinus12 = readings_weather.NbBikesTMinus12.astype('float16')
readings_weather.NbBikesTMinus18 = readings_weather.NbBikesTMinus18.astype('float16')
end_time = time.time()
print 'Adding autoregressive columns took %s' % (end_time - start_time)
In [6]:
start_time = time.time()
# only the 10 nearest stations are lagged; Near11-Near20 are left as-is
for i in range(1, 11):
    shift_cols(readings_weather, ['Near%d' % i], [2, 3, 12, 18], 'TMinus')
end_time = time.time()
print 'Adding nearest-station autoregressive columns took %s' % (end_time - start_time)
In [7]:
pickle.dump(readings_weather, open("data/parsed/readings_autoregressive.p", "wb"))
print 'Saved'
In [2]:
readings_weather = pickle.load(open('data/parsed/readings_autoregressive.p', 'rb'))
In [3]:
start_time = time.time()
readings_weather['HistAvg'] = np.NaN
for station_id in readings_weather.index.get_level_values('Id').unique():
    window = readings_weather.loc[station_id]
    # historic average: mean NbBikes for the same (Holiday, Weekday, TimeOfDay)
    # combination at this station, rounded to the nearest whole bike
    station_hist_avgs = window.groupby(['Holiday', 'Weekday', 'TimeOfDay'])[['NbBikes']].mean()
    station_ts = window.reset_index().set_index(['Holiday', 'Weekday', 'TimeOfDay'])[['Timestamp']]
    hist_avg_ts = station_ts.merge(station_hist_avgs, how='inner', left_index=True, right_index=True)
    hist_avg_ts = hist_avg_ts.reset_index(drop=True).set_index('Timestamp').sort_index().round().NbBikes
    # assign via .loc with the column label; assigning to the copy returned by
    # .loc[station_id] would be silently lost
    readings_weather.loc[station_id, 'HistAvg'] = hist_avg_ts.values
readings_weather.HistAvg = readings_weather.HistAvg.astype('float16')
end_time = time.time()
print 'Adding historic average column took %s' % (end_time - start_time)
In [4]:
def sum_cumulative(series, n):
    # trailing sum of the current value and the previous n values,
    # treating values before the start of the series as 0
    cumulative_sum = series
    for i in range(1, n + 1):
        cumulative_sum = series.shift(i).fillna(0) + cumulative_sum
    return cumulative_sum.values
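# worked example with a hypothetical series (not from the dataset):
#   s = pd.Series([1, 2, 3, 4])
#   sum_cumulative(s, 2)  ->  array([1, 3, 6, 9])
# i.e. each value plus the previous two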
In [5]:
start_time = time.time()
readings_weather['CollNbBikesCum2'] = np.NaN
readings_weather['CollNbBikesCum6'] = np.NaN
readings_weather['DistNbBikesCum2'] = np.NaN
readings_weather['DistNbBikesCum6'] = np.NaN
for station_id in readings_weather.index.get_level_values('Id').unique():
    window = readings_weather.loc[station_id]
    # trailing sums of collected/distributed bikes over the last 2 and 6
    # periods (10 and 30 minutes at the 5-minute frequency); assign via .loc
    # with the column label so the values are written back to readings_weather
    collected_series = window.CollNbBikes
    readings_weather.loc[station_id, 'CollNbBikesCum2'] = sum_cumulative(collected_series, 2)
    readings_weather.loc[station_id, 'CollNbBikesCum6'] = sum_cumulative(collected_series, 6)
    distributed_series = window.DistNbBikes
    readings_weather.loc[station_id, 'DistNbBikesCum2'] = sum_cumulative(distributed_series, 2)
    readings_weather.loc[station_id, 'DistNbBikesCum6'] = sum_cumulative(distributed_series, 6)
readings_weather.CollNbBikesCum2 = readings_weather.CollNbBikesCum2.astype('int16')
readings_weather.CollNbBikesCum6 = readings_weather.CollNbBikesCum6.astype('int16')
readings_weather.DistNbBikesCum2 = readings_weather.DistNbBikesCum2.astype('int16')
readings_weather.DistNbBikesCum6 = readings_weather.DistNbBikesCum6.astype('int16')
end_time = time.time()
print 'Adding cumulative redistribution columns took %s' % (end_time - start_time)
In [6]:
pickle.dump(readings_weather, open("data/parsed/readings_full.p", "wb"))
print 'Saved'
In [2]:
readings_weather = pickle.load(open('data/parsed/readings_full.p', 'rb'))
In [3]:
missing_days = pickle.load(open('data/parsed/missing_days.p', 'rb'))
In [4]:
invalid_readings = []
for idx, row in missing_days.iterrows():
    if idx not in readings_weather.index:
        continue
    for missing_day in row.MissingDays:
        # all readings of this station that fall within the missing day
        labels = readings_weather.loc[idx].loc[missing_day:missing_day.replace(hour=23, minute=59, second=59, microsecond=999999)].index
        for label in labels:
            invalid_readings.append((idx, label))
readings_weather.drop(invalid_readings, inplace=True)
print 'Removed %d invalid readings' % len(invalid_readings)
In [5]:
readings_weather.info(memory_usage='deep', verbose=True)
In [6]:
pickle.dump(readings_weather, open("data/parsed/readings_clean.p", "wb"))
print 'Saved'
In [ ]: