Use a supervised machine learning algorithm to predict the availability for each bicyle-sharing stations in Lyon (France) based on the history data.
I use the tree method XGBoost to predict a "probability" of bikes availability for each station. A number close to 1. means that you have several available bikes. A number close to 0. means you don't have many bikes.
In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
import graphviz
from xgboost import plot_tree, plot_importance, to_graphviz
In [36]:
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
In [5]:
import folium
In [6]:
%load_ext watermark
In [7]:
%watermark -d -v -p numpy,pandas,xgboost,matplotlib,folium -g -m -w
The module prediction.py
contains some functions dedicated to the bicyle-sharing stations predictions.
In [8]:
from sources.prediction import (datareader, complete_data, cleanup, bikes_probability,
time_resampling, prepare_data_for_training, fit, prediction)
In [9]:
DATAFILE = '../data/lyon.csv'
In [10]:
raw = datareader(DATAFILE)
In [11]:
raw.head()
Out[11]:
Min and max dates of the timeseries
In [12]:
print(raw.last_update.min())
print(raw.last_update.max())
Clean up some columns : drop some lines with the 'CLOSED' status, drop duplicates, remove missing values, etc.
In [13]:
df_clean = cleanup(raw)
In [14]:
df_clean.head()
Out[14]:
Pipe some processing data functions :
10T
)num_avail_bikes / total
In [15]:
df = (df_clean.pipe(time_resampling)
.pipe(complete_data)
.pipe(bikes_probability))
In [16]:
df.head()
Out[16]:
This is the final dataset. For further prediction, I could add some weather forecasts data to these features.
Let's select a time window (start, stop) to a single prediction.
In [17]:
start = pd.Timestamp("2017-07-11T00:00:00") # Tuesday
predict_date = pd.Timestamp("2017-07-26T10:00:00") # wednesday
# predict the next 30 minutes
freq = '30T'
# number of predictions at 'predict_date'.
# Here, the next 30 minutes and the next hour (30 minutes + 30 minutes).
# If you want to predict the next 3 hours, every 30 minutes, thus set periods=6
periods = 2
In [18]:
train_X, train_Y, test_X, test_Y = prepare_data_for_training(df,
predict_date,
freq=freq,
start=start,
periods=periods,
observation='probability')
In [19]:
train_X.head()
Out[19]:
In [20]:
# Observation with a shift of T+30 minutes in comparison to train_X.
# This is the 'future' availability used for traning.
train_Y.head()
Out[20]:
In [21]:
train_X.iloc[3200:3210]
Out[21]:
In [67]:
print(train_X.tail())
print(test_X.head())
The fit
function create some data structure for the XGBoost from the train and test DataFrames (i.e. xgb.DMatrix)
, configure the model and launch it with the objective: 'reg:logistic'
. It's a regression, not a classification.
In [22]:
# the 'booster'
bst, train_prg = fit(train_X, train_Y, test_X, test_Y)
In [42]:
train_prg = pd.DataFrame({"train": train_prg["train"]['rmse'], "test": train_prg["test"]['rmse']})
In [45]:
colors = sns.color_palette('Set1', 2)
sns.palplot(colors)
In [54]:
with sns.axes_style("darkgrid", {'xtick.major.size': 8.0}):
fig, ax = plt.subplots(figsize=(10,6))
for k, label, color in zip(train_prg.values.T, range(2), colors):
print(k)
plt.plot(100*k, color=color, label=label)
plt.legend(train_prg.columns)
plt.xlabel('XGBoost iteration')
plt.ylabel("Error (%)")
plt.xticks(np.linspace(0, 25, 6))
plt.yticks(np.linspace(0, 30, 6))
sns.despine()
plt.tight_layout()
plt.savefig("../images/lyon_prediction_training_curves.png")
In [56]:
# compute the prediction from test_*
pred = prediction(bst, test_X, test_Y)
In [57]:
pred[:5]
Out[57]:
In [58]:
print("Number of predictions: {}".format(len(pred)))
In [59]:
# Compute the RMSE
rmse = np.sqrt(np.mean((pred - test_Y)**2))
rmse
Out[59]:
In [60]:
# must install graphviz
# plot_tree(bst)
In [61]:
result = test_X.copy()
result['ts_future'] = test_Y.index.shift(1, freq=freq)
result['observation'] = test_Y.copy()
result['ts_future'] = test_Y.index.shift(1, freq=freq)
result['prediction'] = pred
result['error'] = pred - test_Y
result['relative_error'] = 100. * np.abs(pred - test_Y) / test_Y
result['quad_error'] = (pred - test_Y)**2
result.to_csv("prediction-freq-{}-{}.csv".format(freq, predict_date))
In [62]:
result.head(10)
Out[62]:
CSV file with station coordinates
In [ ]:
locations = pd.read_csv("../data/lyon-stations.csv")
In [ ]:
locations.shape
Some stations were removed when the data were cleaned up. Remove them from the location data.
In [ ]:
mask = locations['idstation'].isin(result.station.unique())
In [ ]:
mask.sum()
In [ ]:
locations = locations[mask]
In [ ]:
locations = locations.rename_axis({'idstation': 'station'}, axis=1)
In [ ]:
locations.head()
Some station names contains the '
character. Replace it by the HTML code for folium.
In [ ]:
locations["nom"] = locations['nom'].str.replace("'", "'")
Select the prediction data for a specific timestamp
In [ ]:
data_to_plot = result.loc[predict_date]
In [ ]:
data_to_plot.shape
In [ ]:
data_to_plot.head()
In [ ]:
yhat = data_to_plot[['station', 'prediction']].merge(locations, on='station')
yhat.head()
In [ ]:
y = data_to_plot[['station', 'observation']].merge(locations, on='station')
In [ ]:
error = data_to_plot[['station', 'error']].merge(locations, on='station')
In [ ]:
colormap = 'RdYlBu'
cmap = plt.get_cmap(colormap)
In [ ]:
# show the colormap use to plot the stations, values [0, 1]
gradient = np.linspace(0, 1, 256)
gradient = np.vstack((gradient, gradient))
fig, ax = plt.subplots(1)
fig.subplots_adjust(top=0.95, bottom=0.80, left=0.2, right=0.99)
ax.set_xticks([0., 64, 128, 192, 256])
ax.set_xticklabels([0., 0.25, 0.5, 0.75, 1.])
ax.set_xlabel('Bikes Availability')
ax.imshow(gradient, aspect='auto', cmap=cmap, vmin=0, vmax=1)
plt.title('Colormap used to plot stations')
In [ ]:
color = lambda x: mpl.colors.to_hex(cmap(x))
In [ ]:
# Lyon (France) Position
position = [45.750000, 4.850000]
In [ ]:
mp_pred = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
In [ ]:
# Map of the predicted values
for _,row in yhat.iterrows():
folium.CircleMarker(
location=[row['lat'], row['lon']],
radius=2,
popup=row['nom'],
color=color(row['prediction']),
fill=True,
fill_opacity=0.3,
fill_color=color(row['prediction'])
).add_to(mp_pred)
In [ ]:
mp_pred
In [ ]:
# Map for the observation
mp_obs = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
In [ ]:
# Map of the observations
for _,row in y.iterrows():
folium.CircleMarker(
location=[row['lat'], row['lon']],
radius=2,
popup=row['nom'],
color=color(row['observation']),
fill=True,
fill_opacity=0.3,
fill_color=color(row['observation'])
).add_to(mp_obs)
In [ ]:
mp_obs
In [ ]:
# Colormap for error (by default, the color map fits for [0, 1] values)
norm = mpl.colors.Normalize(vmin=-1, vmax=1)
color_error = lambda x: mpl.colors.to_hex(cmap(norm(x)))
In [ ]:
# Map for the errors
mp_error = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
In [ ]:
# Map of the errors
for _,row in error.iterrows():
folium.CircleMarker(
location=[row['lat'], row['lon']],
radius=2,
popup=row['nom'],
color=color_error(row['error']),
fill=True,
fill_opacity=0.3,
fill_color=color_error(row['error'])
).add_to(mp_error)
In [ ]:
mp_error