In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.dates as mdates
from matplotlib import pyplot as plt
import seaborn as sns
# Set random
np.random.seed(42)
In [3]:
import sys
sys.path.append('../')
from prediction import (datareader, complete_data, cleanup, bikes_probability,
time_resampling)
In [4]:
%load_ext watermark
In [5]:
%watermark -d -v -p numpy,pandas,matplotlib -g -m -w
In [6]:
def plot_features_station(result, station, features_to_plot='paa', nb_row=350, draw_type='-'):
"""Plot available bikes and bike stands for a given station"""
data = result[result.station == station].tail(nb_row).copy()
fig, ax = plt.subplots(figsize=(18,5))
plt.plot(data.index, data.probability, draw_type, label='probability', alpha=0.8)
plt.plot(data.index, data[features_to_plot], draw_type, label=features_to_plot, alpha=0.6)
ax = plt.gca()
# set major ticks location every day
ax.xaxis.set_major_locator(mdates.DayLocator())
# set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n\n\n%a %d.%m.%Y'))
# set minor ticks location every one hours
ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
# set minor ticks format
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
plt.setp(ax.xaxis.get_minorticklabels(), rotation=45)
plt.legend(loc='best')
In [7]:
def plot_features_station_train_test(train, test, station, features_to_plot='paa', nb_row=350, draw_type='-'):
"""Plot available bikes and bike stands for a given station for a train / test dataset"""
train = train[train.station == station].tail(nb_row).copy()
test = test[test.station == station].copy()
fig, ax = plt.subplots(figsize=(18,5))
plt.plot(train.index, train.probability, draw_type, label=' Train probability', alpha=0.8)
plt.plot(train.index, train[features_to_plot], draw_type, label='Train ' + features_to_plot, alpha=0.6)
plt.plot(test.index, test.probability, draw_type, label=' Test probability', alpha=0.8)
plt.plot(test.index, test[features_to_plot], draw_type, label='Test ' + features_to_plot, alpha=0.6)
ax = plt.gca()
# set major ticks location every day
ax.xaxis.set_major_locator(mdates.DayLocator())
# set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n\n\n%a %d.%m.%Y'))
# set minor ticks location every one hours
ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
# set minor ticks format
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
plt.setp(ax.xaxis.get_minorticklabels(), rotation=45)
plt.legend(loc='best')
In [8]:
DATAFILE = '../data/lyon.csv'
In [9]:
raw = datareader(DATAFILE)
In [10]:
df_clean = cleanup(raw)
In [11]:
df_clean.head()
Out[11]:
In [12]:
df = (df_clean.pipe(time_resampling)
.pipe(complete_data)
.pipe(bikes_probability))
In [13]:
df.head()
Out[13]:
In [14]:
df.shape
Out[14]:
In [15]:
df.info()
In [16]:
# params of learning dataset creation
start = pd.Timestamp("2017-08-01T02:00:00") # Tuesday
predict_date = pd.Timestamp("2017-09-22T09:00:00") # wednesday
# predict the next 30 minutes
freq = '1H'
# number of predictions at 'predict_date'.
# Here, the next 30 minutes and the next hour (30 minutes + 30 minutes).
# If you want to predict the next 3 hours, every 30 minutes, thus set periods=6
periods = 1
Create a bool (0 classic day / 1 holidays)
In [17]:
from prediction import get_summer_holiday
In [18]:
df = get_summer_holiday(df.copy())
In [19]:
df.head(2)
Out[19]:
In [20]:
df.tail(2)
Out[20]:
Count day before and after special holiday (like assomption on 15/08)
In [21]:
from prediction import get_public_holiday
In [22]:
df = get_public_holiday(df.copy(), count_day=5)
In [23]:
df[df.ts >='2017-08-14 23:50:00'].head()
Out[23]:
In [24]:
from prediction import cluster_station_lyon
Cluster of station by ativite (mean on bike by hours of day)
You can find the process of clustering in file ../clustering-Lyon-Armand.ipynb
In [25]:
df = cluster_station_lyon(df.copy(), path_file='../data/cluster_lyon_armand.csv')
In [26]:
df.head()
Out[26]:
In [27]:
from prediction import cluster_station_geo_lyon
Cluster of Lyon's station by geography position
You can find the process of clustering in file ../Clustering-Lyon-geo-Armand.ipynb
In [28]:
df = cluster_station_lyon(df.copy(), path_file='../data/station_cluster_geo_armand.csv')
In [29]:
df.head()
Out[29]:
In [33]:
from prediction import mapping_hours
In [34]:
df['hours_binned'] = df.hour.apply(mapping_hours)
In [35]:
df.head()
Out[35]:
Ratio of open station by time
In [30]:
from prediction import get_statio_ratio_open_by_time
In [31]:
df_temp_1 = get_statio_ratio_open_by_time(df.copy())
In [32]:
df_temp_1.head()
Out[32]:
In [32]:
from prediction import get_statio_cluster_geo_ratio_open_by_time
Ratio of open station (on geography cluster) by hours
In [33]:
df_temp_2 = get_statio_cluster_geo_ratio_open_by_time(df.copy())
In [34]:
df_temp_2.head()
Out[34]:
In [39]:
data = df.sort_values(['station', 'ts']).set_index(["ts", "station"])
observation = 'probability'
label = data[observation].copy()
label.name = "future"
label = (label.reset_index(level=1)
.shift(-1, freq=freq)
.reset_index()
.set_index(["ts", "station"]))
result = data.merge(label, left_index=True, right_index=True)
result.reset_index(level=1, inplace=True)
if start is not None:
result = result[result.index >= start]
Step by step :
First step is to sort dataset by station and time ('ts')
In [36]:
data.head(15)
Out[36]:
Creating 'label' to shift probability to one hour.
probability
at 2017-07-09 01:00:00
become futur
at 2017-07-09 00:00:00
In [37]:
label[6:11]
Out[37]:
Merging label and data to create the learning dataset with target shifting
In [38]:
result[result.station == 1001][['station', 'bikes', 'stands', 'probability', 'future']].head(15)
Out[38]:
In [39]:
# Original learning dataset :
result.head()
Out[39]:
In [74]:
from prediction import create_shift_features
In [75]:
df_temp_3 = create_shift_features(result.copy(), features_name='bikes_shift_'+str(freq.replace('H', 'bin')), feature_to_shift='bikes',
features_grp='station', nb_shift=periods)
In [76]:
df_temp_3[['station', 'bikes', 'bikes_shift_1bin']].head(15)
Out[76]:
In [43]:
from prediction import create_cumul_trend_features
In [44]:
# Need to use df_temp with 'bikes_shift_1bin' values
df_temp_4 = create_cumul_trend_features(df_temp_3, features_name='bikes_shift_'+str(freq.replace('H', 'bin')))
In [45]:
df_temp_4[df_temp_4.station == 1001][['station', 'bikes', 'bikes_shift_1bin',
'cumsum_trend_sup', 'cumsum_trend_inf', 'cumsum_trend_equal']].head(8)
Out[45]:
In [46]:
from prediction import get_station_recently_closed
Sometime station are closed for maintenance, so they can't be use by users. Trying to catch this information to help the learning process
In [47]:
df[254350:254361][['station', 'ts', 'bikes', 'is_open', 'probability']]
Out[47]:
In [48]:
df_temp_5 = get_station_recently_closed(result, nb_hours=4)
In [49]:
df_temp_5[['station', 'bikes', 'is_open', 'probability', 'was_recently_open']].tail()
Out[49]:
If station is open since 4 hours, was_recently_open
has a value of 24 (4 * 6 (bin is egal to 10 min))
In [69]:
from prediction import filling_bike_on_geo_cluster
Create a features on filling bike by geo station. This give information if some zone (cluster) are empty or full
In [86]:
df_temp_6 = filling_bike_on_geo_cluster(df_temp_3.copy(), features_name='bikes_shift_'+str(freq.replace('H', 'bin')))
In [87]:
df_temp_6.tail()
Out[87]:
In [88]:
from prediction import get_paa_transformation
In [89]:
df_temp_7 = get_paa_transformation(result.copy(), features_to_compute='probability', segments=10)
In [91]:
df_temp_7[df_temp_7.station == 1005][['station', 'bikes', 'probability', 'future', 'paa']].tail(22)
Out[91]:
In [92]:
plot_features_station(df_temp_7, station=1001, features_to_plot='paa', nb_row=120, draw_type='-o')
In [101]:
plot_features_station(df_temp_7, station=1001, features_to_plot='paa', nb_row=29, draw_type='-o')
In [98]:
df_temp_7[df_temp_7.station == 1001][['station', 'probability', 'future', 'paa']].tail(15)
Out[98]:
In [97]:
df_temp_7[df_temp_7.station == 1001][['station', 'probability', 'future', 'paa']][-26:-16]
Out[97]:
In [99]:
df_temp_7[df_temp_7.station == 1001].probability[-26:-16].mean()
Out[99]:
There is data leak in this features (PAA). At 09:40, there is a probability
of 0.062 (target is the same one hour later). But PAA is going to mean the next 9 values. So PAA will se the increase in the futur (0.187 / 0.187 / 0.125 - 30 min later) and PAA will be highter. Algorithm will catch it as a win information but can't see it in a production vision.
In [102]:
from prediction import get_sax_transformation
In [103]:
df_temp_8 = get_sax_transformation(result.copy(), features_to_compute='probability', segments=10, symbols=8)
In [104]:
df_temp_8[df_temp_8.station == 1001].tail(22)
Out[104]:
In [105]:
plot_features_station(df_temp_8, station=1001, features_to_plot='sax', nb_row=35, draw_type='-o')
As PAA, SAX transformation give data leak.
As PAA & SAX can give data leak, we will mean our probability on x bin (rolling mean)
In [17]:
# Original
result.head()
Out[17]:
In [18]:
from prediction import create_rolling_mean_features
In [19]:
df_temp_9 = create_rolling_mean_features(result,
features_name='mean_6',
feature_to_mean='probability',
features_grp='station',
nb_shift=6)
In [20]:
df_temp_9[df_temp_9.station == 1001].tail(15)
Out[20]:
In [33]:
plot_features_station(df_temp_9, station=1001, features_to_plot='mean_6', nb_row=40, draw_type='-o')
Here there is no leak of information in the future. You only take past informations to give context to our algorithm
Sometime station's bike don't move too much, and sometime it's crazy time. By given this indicator, we want to help our algorithm with context awareness
In [22]:
from prediction import create_rolling_std_features
In [23]:
df_temp_10 = create_rolling_std_features(result,
features_name='std_9',
feature_to_std='probability',
features_grp='station',
nb_shift=9)
In [24]:
plot_features_station(df_temp_10, station=4012, features_to_plot='std_9', nb_row=35, draw_type='-o')
In [32]:
from prediction import create_rolling_median_features
In [33]:
df_temp_11 = create_rolling_median_features(result,
features_name='median_6',
feature_to_median='probability',
features_grp='station',
nb_shift=6)
In [34]:
plot_features_station(df_temp_11, station=4012, features_to_plot='median_6', nb_row=40, draw_type='-o')
In [ ]:
In [ ]:
def create_bool_empty_full_station(df):
"""
Create a bool features "warning_empty_full"
If bike <= 2 --> 1
If Proba >= 0.875 --> 1
else --> 0
"""
df['warning_empty_full'] = 0
df.loc[df['bikes'] <= 2, 'warning_empty_full'] = 1
df.loc[df['probability'] >= 0.875, 'warning_empty_full'] = 1
return df
In [47]:
df_temp_12 = result.copy()
In [48]:
df_temp_12.head()
Out[48]:
In [49]:
df_temp_12['warning_empty_full'] = 0
df_temp_12.loc[df_temp_12['bikes'] <= 2, 'warning_empty_full'] = 1
df_temp_12.loc[df_temp_12['probability'] >= 0.875, 'warning_empty_full'] = 1
In [156]:
feature_event_to_plot='warning_empty_full'
features_event_value=1
In [157]:
df_temp_13[df_temp_13[feature_event_to_plot] == features_event_value].head()
Out[157]:
In [50]:
def plot_event_station(result, station, feature_event_to_plot='bike', features_event_value=1,
nb_row=350, point_type='*'):
"""Plot available bikes and bike stands for a given station"""
data = reswult[result.station == station].tail(nb_row).copy()
fig, ax = plt.subplots(figsize=(18,5))
plt.plot(data.index, data.probability, '-', label='probability', alpha=0.8)
plt.plot(data[data[feature_event_to_plot] == features_event_value].index,
data[data[feature_event_to_plot] == features_event_value].probability,
point_type, markerfacecolor='k',
label=feature_event_to_plot, alpha=0.6)
ax = plt.gca()
# set major ticks location every day
ax.xaxis.set_major_locator(mdates.DayLocator())
# set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n\n\n%a %d.%m.%Y'))
# set minor ticks location every one hours
ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
# set minor ticks format
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
plt.setp(ax.xaxis.get_minorticklabels(), rotation=45)
plt.legend(loc='best')
In [64]:
plot_event_station(df_temp_12, station=10101, feature_event_to_plot='warning_empty_full',
features_event_value=1, nb_row=350, point_type='*')
We split our dataset to create on date :
- A trainning Dataset
- A test Dataset
In [40]:
# to have same value
date = predict_date
print ('date : ' + str(date))
cut = date - pd.Timedelta(freq.replace('T', 'm'))
stop = date + periods * pd.Timedelta(freq.replace('T', 'm'))
print ('cut : ' + str(cut))
print ('stop : ' + str(stop))
In [41]:
train = result[result.index <= cut].copy()
mask = np.logical_and(result.index >= date, result.index <= stop)
test = result[mask].copy()
In [42]:
print('train shape : ' + str(train.shape))
print('test shape : ' + str(test.shape))
In [43]:
train.head()
Out[43]:
We need to create our binned hours mapping
In [44]:
from prediction import create_mean_by_sta_day_binned_hours
In [45]:
train_temp_1, test_temp_1 = create_mean_by_sta_day_binned_hours(train.copy(), test.copy(),
features_name='proba_mean_by_sta_day_binned_hour',
feature_to_mean='probability',
features_grp=['station', 'day', 'hours_binned'])
In [60]:
plot_features_station_train_test(train_temp_1, test_temp_1, station=1036, features_to_plot='proba_mean_by_sta_day_binned_hour',
nb_row=450, draw_type='-o')
In [ ]:
In [85]:
df_temp7 = result.copy()
df_temp7['ts'] = df_temp7.index
df_temp7 = df_temp7.sort_values(['station', 'ts'])
In [86]:
df_temp7.head()
Out[86]:
In [87]:
df_temp7['prob_shit'] = df_temp7.groupby(['station'])['probability'].apply(lambda x: x.shift(1))
df_temp7['prob_diff'] = np.abs(df_temp7['prob_shit'] - df_temp7['probability'])
In [89]:
df_temp7.tail()
Out[89]:
In [96]:
df_temp7[df_temp7.prob_diff >= 0.5].tail(6)
Out[96]:
In [113]:
plot_features_station(df_temp7, station=1036, features_to_plot='prob_diff', nb_row=300, draw_type='-')
In [116]:
df_temp7[(df_temp7.station == 1036) & (df_temp7.ts >= '2017-09-26 02:30:00')].head(6)
Out[116]:
In 10 min at 03:00, 13 bikes have been loaded here. It's impossible to predict it as it's bike company who reload this station
In [103]:
df_temp7['ano'] = 0
df_temp7.loc[df_temp7['prob_diff'] > 0.5, 'ano'] = 1
In [112]:
df_temp7[(df_temp7.station == 1036) & (df_temp7['ano']==1)][['prob_diff','day', 'hour', 'minute', 'bikes', 'prob_diff']].tail(50)
Out[112]:
In [108]:
df_temp7[df_temp7['ano']==1].station.value_counts()
Out[108]:
In [ ]:
Exact weather
In [117]:
lyon_meteo = pd.read_csv('../data/lyon_weather.csv', parse_dates=['date'])
lyon_meteo.rename(columns={'date':'ts'}, inplace=True)
In [118]:
lyon_meteo.head()
Out[118]:
Forcast weather
In [119]:
lyon_forecast = pd.read_csv('../data/lyon_forecast.csv', parse_dates=['forecast_at', 'ts'])
lyon_forecast['delta'] = lyon_forecast['ts'] - lyon_forecast['forecast_at']
In [123]:
lyon_forecast.tail()
Out[123]:
In [126]:
lyon_forecast[(lyon_forecast.rain_3h >= 1) & (lyon_forecast.delta == '1H')].tail()
Out[126]:
In [127]:
lyon_forecast[(lyon_forecast.ts >= '2017-09-14 14:00:00') & (lyon_forecast.delta == '1H')].head(15)
Out[127]:
In [ ]: