Use a supervised machine learning algorithm to predict the availability of each bicycle-sharing station in Lyon (France) based on historical data.
I use the tree-based method XGBoost to predict a "probability" of bike availability for each station. A value close to 1 means that several bikes are available; a value close to 0 means that few bikes are available.
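To make this target concrete, here is a tiny illustrative snippet (not part of the notebook's pipeline, just an assumption of how the ratio looks) computing the availability ratio from the bikes / stands counts that appear later in the notebook:
import pandas as pd

# Hypothetical snapshot of two stations: one almost full, one almost empty.
snapshot = pd.DataFrame({'bikes': [12, 1], 'stands': [4, 19]})
snapshot['probability'] = snapshot['bikes'] / (snapshot['bikes'] + snapshot['stands'])
# -> 0.75 for the first station, 0.05 for the second one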
In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, train_test_split
In [3]:
import xgboost as xgb
from xgboost import plot_tree
In [4]:
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
In [5]:
import folium
In [6]:
%load_ext watermark
In [7]:
%watermark -d -v -p numpy,pandas,xgboost,matplotlib,folium -g -m -w
The module prediction.py
contains some functions dedicated to the bicycle-sharing station predictions.
In [8]:
from prediction import (datareader, complete_data, cleanup, bikes_probability,
                        time_resampling, prepare_data_for_training, fit, prediction,
                        get_weather)
np.random.seed(42)
In [9]:
DATAFILE = './data/lyon.csv'
In [11]:
raw = datareader(DATAFILE)
In [12]:
raw[raw['number'] == 1005].tail()
Out[12]:
Min and max dates of the timeseries
In [13]:
print(raw.last_update.min())
print(raw.last_update.max())
In [14]:
raw[(raw.number == 1001) & (raw.last_update >= "2017-07-26 09:00:00")].head(10)
Out[14]:
In [15]:
df_clean = cleanup(raw)
Pipe some data processing functions: resample the data every 10 minutes ('10T'), complete the missing values, and compute the bikes availability probability (num_avail_bikes / total).
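As a reference, here is a minimal sketch of what these piped steps roughly do; the real implementations live in prediction.py and handle more corner cases, and the column names ('station', 'ts', 'bikes', 'stands') are assumed from the cleaned data:
# Sketch only -- the actual functions live in prediction.py.
def time_resampling_sketch(df, freq='10T'):
    """Resample each station's records on a regular 10-minute grid."""
    return (df.set_index('ts')
              .groupby('station')[['bikes', 'stands']]
              .resample(freq)
              .mean()
              .bfill()
              .reset_index())

def bikes_probability_sketch(df):
    """Availability 'probability' = num_avail_bikes / total."""
    df['probability'] = df['bikes'] / (df['bikes'] + df['stands'])
    return df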
In [16]:
df_clean.head()
Out[16]:
In [17]:
df = (df_clean.pipe(time_resampling)
.pipe(complete_data)
.pipe(bikes_probability))
In [18]:
import gc
del df_clean
del raw
gc.collect()
Out[18]:
In [19]:
df.head()
Out[19]:
In [20]:
df.shape
Out[20]:
In [21]:
df.info()
WIP: Fourier extrapolation of the availability signal (exploratory).
In [ ]:
from numpy import fft
def fourierExtrapolation(x, segments, harmo):
    n = x.size
    n_harm = round((n * harmo / 100) - 0.5)  # number of harmonics in the model
    print(n_harm)
    t = np.arange(0, n)
    p = np.polyfit(t, x, 1)  # find linear trend in x (p[0] is the slope)
    x_notrend = x - p[0] * t  # detrended x
    x_freqdom = fft.fft(x_notrend)  # detrended x in the frequency domain
    f = fft.fftfreq(n)  # frequencies
    indexes = list(range(n))
    # sort indexes by frequency, lower -> higher
    indexes.sort(key=lambda i: np.absolute(f[i]))
    t = np.arange(0, n + segments)
    restored_sig = np.zeros(t.size)
    for i in indexes[:1 + n_harm * 2]:
        ampli = np.absolute(x_freqdom[i]) / n  # amplitude
        phase = np.angle(x_freqdom[i])  # phase
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)
    return restored_sig + p[0] * t
In [ ]:
def plot_fourrier(data, station, segments=5, harmo=10, return_df=False):
    lags = 550
    my_station = data.loc[(data.station == station)].copy()
    X1 = my_station[:-segments].tail(lags).copy()
    X2 = my_station[-segments:].tail(lags).copy()
    extrapolation = fourierExtrapolation(X1.probability, segments, harmo)
    X1['fourrier'] = extrapolation[:-segments]
    X2['fourrier'] = extrapolation[-segments:]
    # plot
    fig, (axis1) = plt.subplots(1, figsize=(15, 5))
    # Axis 1
    axis1.plot(X1.ts, X1.probability, '-', label='Historical')
    axis1.plot(X2.ts, X2.probability, '-', label='Reality')
    axis1.plot(X1.ts, X1.fourrier, '-', label='Fourier train')
    axis1.plot(X2.ts, X2.fourrier, '-', label='Fourier extrapolation')
    plt.legend(loc='best')
    #plt.title(title)
    if return_df is True:
        return my_station
In [ ]:
lol = plot_fourrier(df, station=12001, segments=12, harmo=10, return_df=True)
In [ ]:
ok = fourierExtrapolation(lol.probability, segments=10, harmo=5)
len(ok)
In [195]:
lol['ff'] = ok[0:len(lol)]
In [199]:
lag = 1000
plt.plot(lol.ts.tail(lag), lol.probability.tail(lag))
plt.plot(lol.ts.tail(lag), lol.ff.tail(lag))
Out[199]:
In [179]:
ok
Out[179]:
In [176]:
ok
Out[176]:
In [171]:
lol['ff'] = ok
Out[171]:
In [150]:
len(ok)
Out[150]:
In [151]:
len(lol)
Out[151]:
In [144]:
fft.fft(lol.probability)[0:2]
Out[144]:
In [137]:
lol.tail()
Out[137]:
In [136]:
plt.plot(lol[120:].ts, lol[120:].probability)
plt.plot(lol[120:].ts, lol[120:].fft)
Out[136]:
In [79]:
data = df[df.station == 1001]
In [80]:
data.head()
Out[80]:
In [81]:
lol = fft.rfft(data.probability)
In [82]:
data.shape
Out[82]:
In [83]:
x = data.probability
In [86]:
x[0:5]
Out[86]:
In [96]:
n = x.size
n_harm = 3 # number of harmonics in model
t = np.arange(0, n)
p = np.polyfit(t, x, 2) # find linear trend in x
x_notrend = x - p[0] * t # detrended x
x_freqdom = fft.fft(x_notrend) # detrended x in frequency domain
f = fft.fftfreq(n) # frequencies
indexes = list(range(n))
In [95]:
range(2005 - 3, 2005 + 4, 1)
Out[95]:
This is the final dataset. For further prediction, I could add some weather forecast data to these features.
Let's select a time window (start, stop) for a single prediction.
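The actual work is done by prepare_data_for_training in prediction.py (feature engineering, target shifting, NaN handling). As a hedged sketch of the underlying idea only, and assuming a DataFrame with 'ts', 'station' and 'probability' columns, the supervised framing looks roughly like this:
# Hypothetical sketch -- NOT the actual prepare_data_for_training implementation.
def naive_split_sketch(df, start, predict_date):
    data = df.set_index('ts').sort_index().loc[start:].copy()
    # the label is the probability observed one time step later, per station
    data['label'] = data.groupby('station')['probability'].shift(-1)
    data = data.dropna(subset=['label'])
    train = data[data.index < predict_date]
    test = data[data.index >= predict_date]
    return (train.drop(columns='label'), train['label'],
            test.drop(columns='label'), test['label'])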
In [21]:
#start = pd.Timestamp("2017-08-01T00:10:00") # Tuesday OLD
start = pd.Timestamp("2017-08-01T02:00:00") # Tuesday
predict_date = pd.Timestamp("2017-09-22T09:00:00") # Friday
# prediction frequency, i.e. the horizon of a single prediction step
freq = '1H'
# number of predictions starting at 'predict_date'.
# With freq='1H' and periods=1, we make a single prediction one hour ahead.
# To predict the next 3 hours, every hour, set periods=3.
periods = 1
In [22]:
prepare_data_for_training??
In [22]:
#train_X, train_Y, test_X, test_Y
#X, y
#train_X, train_Y, test_X, test_Y
train_X, train_Y, test_X, test_Y = prepare_data_for_training(df,
                                                              predict_date,
                                                              freq=freq,
                                                              start=start,
                                                              periods=periods,
                                                              observation='probability',
                                                              # how=None
                                                              )
In [ ]:
def fit_rolling(train_X, train_Y, test_X, test_Y, param, num_round=25):
    """Train the xgboost model
    Return the booster trained model
    """
    # param = {'objective': 'reg:linear'}
    #if param
    #param = {'objective': 'reg:logistic'}
    #param['eta'] = 0.2
    #param['max_depth'] = 4 # 6 original
    #param['silent'] = 1
    #param['nthread'] = 4
    # use num_class only for classification (e.g. a level of availability)
    # param = {'objective': 'multi:softmax'}
    # param['num_class'] = train_Y.nunique()
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_test = xgb.DMatrix(test_X, label=test_Y)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    evals_result = {}
    bst = xgb.train(param, xg_train, num_round, watchlist, evals_result=evals_result)
    return bst, evals_result
In [ ]:
In [3]:
result = []  # macro results by fold
learning_progress = {}  # micro results (learning curves) by fold
pred_serie = np.full(X.shape[0], np.nan)  # store predictions during the rolling windows
fold_serie = np.full(X.shape[0], np.nan)  # store fold number during the rolling windows
tscv = TimeSeriesSplit(n_splits=3)
for (train_index, test_index), k in zip(tscv.split(X), range(1, tscv.n_splits + 1, 1)):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    print("Train : ")
    print(X_train.index.min())
    print(X_train.index.max())
    print(len(X_train))
    print("Test : ")
    print(X_test.index.min())
    print(X_test.index.max())
    print(len(X_test))
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Split the training set for XGBoost (learning / early validation)
    X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train,
                                              test_size=0.33, random_state=42)
    # Adding weather features
    # Historical weather for the learning set, forecasts for the others
    X_tr = get_weather(X_tr, how='learning')
    X_te = get_weather(X_te, how='forecast', freq=freq)
    X_test = get_weather(X_test, how='forecast', freq=freq)
    # Training XGBoost
    # the 'booster' parameters
    param = {"objective": "reg:logistic",
             "booster": "gbtree",
             "eta": 0.2,
             "max_depth": 9,
             "silent": 1,
             "seed": 42}
    bst, evals_result = fit_rolling(X_tr, y_tr, X_te, y_te, param, num_round=60)
    pred = prediction(bst, X_test, y_test)
    rmse = np.sqrt(np.mean((pred - y_test)**2))
    pred_serie[test_index] = pred
    fold_serie[test_index] = k
    learning_progress[k] = evals_result  # keep the learning curve of this fold
    result.append({'fold': k,
                   'train_min_date': X_train.index.min(),
                   'train_max_date': X_train.index.max(),
                   'len_train': len(X_train),
                   'test_min_date': X_test.index.min(),
                   'test_max_date': X_test.index.max(),
                   'len_test': len(X_test),
                   'rmse_train': evals_result['train']['rmse'][-1],
                   'rmse_test': evals_result['test']['rmse'][-1],
                   'rmse_validation': rmse})
    print("---------------------------------------------")
df_result = pd.DataFrame(result)
X['pred'] = pred_serie
X['fold'] = fold_serie
In [45]:
df_result.head()
Out[45]:
In [201]:
df_result.head()
Out[201]:
In [202]:
X['y'] = y
In [203]:
X.tail()
Out[203]:
In [239]:
#import matplotlib.ticker as ticker
import matplotlib.dates as mdates
def plot_station_pred(result, station):
    """Plot available bikes and bike stands for a given station"""
    data = result[result.station == station].copy()
    nb_fold = data.fold.nunique()
    fig, axarr = plt.subplots(nb_fold, figsize=(17, 10))
    for i in range(0, nb_fold):
        fold_data = data[data.fold == i + 1]
        rmse_fold = np.round(np.sqrt(np.mean((fold_data.pred - fold_data.y)**2)), 3)
        axarr[i].plot(fold_data[fold_data.is_open == 1].index, fold_data[fold_data.is_open == 1].y,
                      '-', label='Open', alpha=0.6)
        axarr[i].plot(fold_data[fold_data.is_open == 0].index, fold_data[fold_data.is_open == 0].y,
                      '-', label='Closed', alpha=0.6)
        axarr[i].plot(fold_data.index, fold_data.pred, 'r-', label='Prediction')
        axarr[i].set_title('Fold ' + str(i + 1) + ' : RMSE ' + str(rmse_fold), fontsize=25)
        ##ax = plt.gca()
        # set major ticks location every day
        ##ax.xaxis.set_major_locator(mdates.DayLocator())
        # set major ticks format
        ##ax.xaxis.set_major_formatter(mdates.DateFormatter('\n\n\n%a %d.%m.%Y'))
        # set minor ticks location every two hours
        ##ax.xaxis.set_minor_locator(mdates.HourLocator(interval=2))
        # set minor ticks format
        ##ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
        ##plt.setp(ax.xaxis.get_minorticklabels(), rotation=45)
    plt.legend(loc='best')
    fig.tight_layout()
    # print("TRAIN")
    # print(train.tail(5))
    # print("DATA")
    # print(data)
In [243]:
plot_station_pred(X, station=2005)
The fit function creates the XGBoost data structures (xgb.DMatrix) from the train and test DataFrames, configures the model and trains it with the objective 'reg:logistic'. It is a regression, not a classification.
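For reference, here is a minimal sketch of what fit presumably looks like, based on the fit_rolling helper defined above; the actual implementation lives in prediction.py:
def fit_sketch(train_X, train_Y, test_X, test_Y, param, num_round=25):
    """Train an XGBoost booster and monitor the train/test RMSE."""
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_test = xgb.DMatrix(test_X, label=test_Y)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    # 'reg:logistic' keeps the predictions in [0, 1], matching the probability-like target
    return xgb.train(param, xg_train, num_round, watchlist)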
In [23]:
# Splitting the training set into learning / validation sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y,
                                                     test_size=0.33, random_state=42)
In [24]:
X_train.tail()
Out[24]:
In [25]:
# Adding weather features
# Historical weather for the learning set
X_train = get_weather(X_train, how='learning')
X_test = get_weather(X_test, how='forecast', freq=freq)
# Forecast values for the future (validation set)
test_X = get_weather(test_X, how='forecast', freq=freq)
In [26]:
# the 'booster'
param ={"objective": "reg:logistic",
"booster" : "gbtree",
"eta": 0.2,
"max_depth": 9,
"silent": 1,
"seed": 42}
bst = fit(X_train, y_train, X_test, y_test, param, num_round=60)
In [27]:
# compute the prediction from test_* (Validation)
pred = prediction(bst, test_X, test_Y)
In [28]:
pred[:5]
Out[28]:
In [29]:
test_Y[:5].tolist()
Out[29]:
In [30]:
print("Number of predictions: {}".format(len(pred)))
In [31]:
# Compute the RMSE
rmse = np.sqrt(np.mean((pred - test_Y)**2))
rmse
Out[31]:
In [32]:
# New time period
#start = pd.Timestamp("2017-08-01T00:10:00") # Tuesday
#predict_date = pd.Timestamp("2017-09-22T09:00:00") # wednesday
# predict the next 30 minutes
#freq = '1H'
# number of predictions at 'predict_date'.
# Here, the next 30 minutes and the next hour (30 minutes + 30 minutes).
# If you want to predict the next 3 hours, every 30 minutes, thus set periods=6
#periods = 1
# train-rmse:0.134421 test-rmse:0.134516 / 0.14425 : original : max_depth 6 / num_round = 25
# train-rmse:0.133858 test-rmse:0.133971 / 0.14436 : shift features
# train-rmse:0.132937 test-rmse:0.133034 / 0.14285 : Trending features
# train-rmse:0.132937 test-rmse:0.133034 / 0.14285 : Was recently open features (No improve at all, no use of features
# train-rmse:0.126300 test-rmse:0.127283 / 0.13869 : Idem max_depth 8 / num_round = 60. Still no use of recently_open
# train-rmse:0.123439 test-rmse:0.124447 / 0.13477 : Cluster Lyon station activity
# train-rmse:0.122884 test-rmse:0.123860 / 0.13571 : Cluster Lyon geo
# train-rmse:0.122716 test-rmse:0.123831 / 0.13432 : filling_station_by_geo_cluster & Total stand
# train-rmse:0.105716 test-rmse:0.107058 / 0.10607 : Add PAA tranformation (segments = 10 )
# train-rmse:0.104273 test-rmse:0.105701 / 0.10400 : Add Sax transformation (segments=10, symbols=8)
# train-rmse:0.104693 test-rmse:0.106081 / 0.10391 : Add public_holiday
# train-rmse:0.104431 test-rmse:0.105907 / 0.10435 : add public_holiday_count (delta from x last holiday)
# train-rmse:0.104448 test-rmse:0.105840 / 0.10461 : Add summer holiday
# train-rmse:0.104113 test-rmse:0.105526 / 0.10457 : absolute public_holiday_count
# train-rmse:0.104328 test-rmse:0.106016 / 0.09623 : start time = "2017-08-01T02:00:00" (old "2017-08-01T00:10:00") for forecast weather matching
# train-rmse:0.104084 test-rmse:0.106086 / 0.09596 : Add weather ['temp', 'humidity', 'weather_desc']
# train-rmse:0.104084 test-rmse:0.106086 / 0.09596 : add ratio_station_open by time (not use)
# train-rmse:0.101491 test-rmse:0.104886 / 0.09667 : max_depth = 9 (ratio_station_open still not use)
# train-rmse:0.101491 test-rmse:0.104886 / 0.09667 : drop ratio_station_open (Features add nothing)
# train-rmse:0.101676 test-rmse:0.105003 / 0.09540 : add ratio_station_geo_cluster_open features
# train-rmse:0.101713 test-rmse:0.104963 / 0.09545 : Fix minutes
# train-rmse:0.101713 test-rmse:0.104963 / 0.09545 : [X] Binned hours of day (7 grp) (Didn't use it)
# train-rmse:0.101835 test-rmse:0.105203 / 0.09624 : [X]"subsample" : 0.85
# train-rmse:0.101666 test-rmse:0.105105 / 0.09575 : [X] Binned hours of day (5 grp) use it but not worth
# train-rmse:0.095662 test-rmse:0.099130 / 0.08941 : Interaction features with paa & sax
# Data leak with paa & sax, so delete them (with the interaction features)
# Back to 'Add public_holiday' without paa & sax, with #start = pd.Timestamp("2017-08-01T00:10:00") # Tuesday
# -----------------------
# train-rmse:0.122716 test-rmse:0.123831 / 0.13432 : filling_station_by_geo_cluster & Total stand (Old one)
# train-rmse:0.119564 test-rmse:0.121620 / 0.13353 : Add absolute public_holiday_count
# train-rmse:0.119425 test-rmse:0.121437 / 0.13446 : Add summer holiday
# train-rmse:0.119737 test-rmse:0.121806 / 0.13486 : start time = "2017-08-01T02:00:00" (old "2017-08-01T00:10:00") for forecast weather matching
# train-rmse:0.119416 test-rmse:0.122441 / 0.13383 : Add weather ['temp', 'humidity', 'weather_desc']
# train-rmse:0.119327 test-rmse:0.122504 / 0.13391 : Add weather cloudiness [+/-]
# train-rmse:0.119563 test-rmse:0.122719 / 0.13406 : [x] filling_bike_on_geo_cluster on bike (not bike shift feature)
# train-rmse:0.11972 test-rmse:0.122786 / 0.13378 : [X] Drop ratio_station_geo_cluster_open
# train-rmse:0.119536 test-rmse:0.122669 / 0.13558 : Add rolling mean (6)
# train-rmse:0.11953 test-rmse:0.122635 / 0.13620 : Add rolling std (9)
# train-rmse:0.119579 test-rmse:0.122756 / 0.13582 : Add median (6)
# train-rmse:0.119772 test-rmse:0.122874 / 0.13605 : add Interaction features with mean_6 & median_6
# train-rmse:0.119234 test-rmse:0.122487 / 0.13655 : [?1] add bin hours (5 groups: night / morning / noon / afternoon / evening)
# train-rmse:0.119772 test-rmse:0.122874 / 0.13605 : [X] update bin hours (7 groups) [Algo don't use it as feature]
# train-rmse:0.119772 test-rmse:0.122874 / 0.13605 : [V?1] Fix bug on binned hours (afternoon 17 -> 18). Correcting this bug (creating NaN values for 17:00) gives a worse result ???
# train-rmse:0.117679 test-rmse:0.121040 / 0.13530 : [/!\]add create_mean_by_sta_day_binned_hours (grp by : 'station', 'day', 'hours_binned'. Only open station)
# train-rmse:0.117875 test-rmse:0.121231 / 0.13542 : Add minus create_mean_by_sta_day_binned_hours by proba
# train-rmse:0.117794 test-rmse:0.121224 / 0.13598 : [/!\ v] Fix create_mean_by_sta_day_binned_hours (train test leak)
# To create this feature we have to keep 'probability' in train / test
# train-rmse:0.117999 test-rmse:0.121282 / 0.13548 : [X] Add minus create_mean_by_sta_day_binned_hours by proba
# train-rmse:0.117996 test-rmse:0.121267 / 0.13576 : create_bool_empty_full_station
# train-rmse:0.117877 test-rmse:0.121241 / 0.13600 : create_rolling_median_features empty_full_station (nb_shift = 9)
# train-rmse:0.117877 test-rmse:0.121241 / : create_rolling_median_features empty_full_station (nb_shift = 6)
In [37]:
#start = pd.Timestamp("2017-07-11T00:00:00") # Tuesday
#predict_date = pd.Timestamp("2017-07-26T10:00:00") # wednesday
# predict the next 30 minutes
#freq = '30T'
# number of predictions at 'predict_date'.
# Here, the next 30 minutes and the next hour (30 minutes + 30 minutes).
# If you want to predict the next 3 hours, every 30 minutes, thus set periods=6
#periods = 2
#0.092800 original direct on test set
# Learning on : 501686 rows
# Testing on : 247100 rows
# Validation on : 2366 rows
## train / test / validation : comment
# train-rmse:0.087365 test-rmse:0.091362 / 0.09200 :
# train-rmse:0.086774 test-rmse:0.090772 / 0.09222 : shift 30 min
# train-rmse:0.086267 test-rmse:0.091045 / 0.092387 : trending KPI
# train-rmse:0.084771 test-rmse:0.089825 / 0.091426 : Cluster Lyon station activity
# train-rmse:0.084847 test-rmse:0.089818 / 0.091285 : was recently open
# train-rmse:0.083754 test-rmse:0.089106 / 0.091335 : Cluster Lyon geo
# train-rmse:0.08413 test-rmse:0.089307 / 0.0915887 : Cluster Lyon geo implement (Checking problem with -1)
# train-rmse:0.084369 test-rmse:0.089433 / 0.091028 : Fix nan was recently open with min_periods=1
# train-rmse:0.083312 test-rmse:0.089216 / 0.091014 : Adding total_stand; total_stand_geo_cluster; bikes_shift_30Tmin_geo_cluster; filling_station_by_geo_cluster
# train-rmse:0.083598 test-rmse:0.089433 / 0.091099 : DROP total_stand (features in filling_bike_on_geo_cluster)
# train-rmse:0.083386 test-rmse:0.089155 / 0.091078 : DROP total_stand_geo_cluster (...)
# train-rmse:0.083296 test-rmse:0.089036 / 0.090905 : DROP bikes_shift_30Tmin_geo_cluster (...)
# train-rmse:0.08354 test-rmse:0.089057 / 0.090567 : [V] Keep only filling_station_by_geo_cluster
# train-rmse:0.083809 test-rmse:0.089234 / 0.090531 : [X] Change recently open (0->24) to boolean: 24->1 else 0 (e.g. 1 if the station has been open for at least 4 hours)
# train-rmse:0.083872 test-rmse:0.089464 / 0.090993 : [X] filling_station and filling_station_minus_filling_geo_station
# train-rmse:0.083918 test-rmse:0.089440 / 0.09133 : [X] DROP filling_station
# train-rmse:0.085091 test-rmse:0.090080 / 0.091526 : [X] DROP Station for more general learning
# train-rmse:0.063492 test-rmse:0.069996 / 0.059212 : Add PAA & SAX transformation
#
In [38]:
#start = pd.Timestamp("2017-07-01T00:00:00") #
#predict_date = pd.Timestamp("2017-07-27T16:00:00") # Thursday
# predict the next 30 minutes
#freq = '1H'
# number of predictions at 'predict_date'.
# Here, the next 30 minutes and the next hour (30 minutes + 30 minutes).
# If you want to predict the next 3 hours, every 30 minutes, thus set periods=6
#periods = 1
# train-rmse:0.113644 test-rmse:0.117976 / 0.17470 : Same as previous best run
# train-rmse:0.114026 test-rmse:0.118588 / 0.1756795 : adding paa (segment=20)
# train-rmse:0.113203 test-rmse:0.117770 / 0.175806 : adding paa (segment=6 - 1 hours)
# train-rmse:0.098235 test-rmse:0.103058 / 0.133363 : Fix Paa function (bad value before)
# train-rmse:0.097559 test-rmse:0.102510 / 0.132045 : Sax function (drop paa)
# train-rmse:0.096851 test-rmse:0.102011 / 0.130106 : PAA and SAX
In [45]:
# must install graphviz
# plot_tree(bst)
In [33]:
def plot_features_importance(bst):
    """
    Plot feature importances for an xgboost model
    """
    df_importance = pd.DataFrame([bst.get_score()]).T.reset_index()
    df_importance.columns = ['features', 'importance']
    df_importance.sort_values('importance', ascending=0, inplace=True)
    plt.figure(figsize=(10, 6))
    sns.barplot('importance', 'features', data=df_importance)
    plt.xlabel('Relative importance')
    plt.ylabel('Features importance')
In [34]:
plot_features_importance(bst)
In [66]:
result = test_X.copy()
result['ts_future'] = test_Y.index.shift(1, freq=freq)
result['observation'] = test_Y.copy()
result['ts_future'] = test_Y.index.shift(1, freq=freq) ##??
result['prediction'] = pred
result['error'] = pred - test_Y
result['error_abs'] = np.abs(pred - test_Y)
result['relative_error'] = 100. * np.abs(pred - test_Y) / test_Y
result['quad_error'] = (pred - test_Y)**2
result.to_csv("prediction-freq-{}-{}.csv".format(freq, predict_date))
In [67]:
result.sort_values('error_abs', ascending=1).tail(10)
Out[67]:
In [68]:
trainning = X_train.copy()
train_pred = prediction(bst, X_train, y_train)
trainning['train_pred'] = train_pred
trainning['y'] = y_train
# Sorting on time
trainning = trainning.sort_index()
In [69]:
trainning.head()
Out[69]:
In [70]:
result['proba'] = result['bikes'] / (result['bikes'] + result['stands'])
In [120]:
from sklearn.metrics import mean_absolute_error
def plot_station(result, trainning, station, lag=750):
    """Plot available bikes and bike stands for a given station"""
    train = trainning[trainning.station == station].tail(lag)
    data = result[result.station == station].copy()
    fig, ax = plt.subplots(figsize=(18, 5))
    plt.plot(train.index, train.y, '-', label='training target', alpha=0.6)
    plt.plot(train.index, train.train_pred, 'r--', label='training pred', alpha=0.4)
    #plt.plot(train.index, train.paa, '-', label='PAA', alpha=0.6)
    #plt.plot(train.index, train.sax, '-', label='SAX', alpha=0.6)
    plt.plot(data.index, data.observation, 'g-', label='Real')
    plt.plot(data.index, data.prediction, 'r-', label='Prediction')
    train['total_stand'] = train['bikes'] + train['stands']
    train['bikes_pred'] = np.round(train['total_stand'] * train['train_pred'])
    train['bikes_true'] = train['total_stand'] * train['y']
    data['total_stand'] = data['bikes'] + data['stands']
    data['bikes_pred'] = np.round(data['total_stand'] * data['prediction'])
    data['bikes_true'] = data['total_stand'] * data['observation']
    rmse_train = np.sqrt(np.mean((train.train_pred - train.y)**2))
    mae_train = mean_absolute_error(train.bikes_true, train.bikes_pred)
    rmse_test = np.sqrt(np.mean((data.prediction - data.observation)**2))
    mae_test = mean_absolute_error(data.bikes_true, data.bikes_pred)
    ax = plt.gca()
    # set major ticks location every day
    ax.xaxis.set_major_locator(mdates.DayLocator())
    # set major ticks format
    ax.xaxis.set_major_formatter(mdates.DateFormatter('\n\n\n%a %d.%m.%Y'))
    # set minor ticks location every two hours
    ax.xaxis.set_minor_locator(mdates.HourLocator(interval=2))
    # set minor ticks format
    ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
    plt.setp(ax.xaxis.get_minorticklabels(), rotation=45)
    plt.legend(loc='best')
    print("RMSE train : " + str(np.round(rmse_train, 4)) + " - MAE : " + str(np.round(mae_train, 4)))
    print("RMSE test : " + str(np.round(rmse_test, 4)) + " - MAE : " + str(np.round(mae_test, 4)))
In [124]:
# It seems that we don't have all the data here
plot_station(result, trainning, 10101, lag=350)
In [57]:
testing = result[result.station == 10113].copy()
In [58]:
testing = testing.reset_index()
In [59]:
testing.head()
Out[59]:
In [60]:
station_result = result.groupby('station')[['bikes_true','bikes_pred']].apply(lambda row: mean_absolute_error(row.bikes_true, row.bikes_pred))
station_result_df = pd.DataFrame(station_result).reset_index()
station_result_df.columns = ['station', 'mae']
In [61]:
station_result_df.sort_values('mae')
Out[61]:
In [160]:
testing.tail()
Out[160]:
In [50]:
data.head(10)
Out[50]:
In [69]:
def plot_station_transfo(data, station):
    """Plot available bikes and bike stands for a given station"""
    nb_row = 750
    data = data[data.station == station].tail(nb_row).copy()
    fig, ax = plt.subplots(figsize=(18, 5))
    #plt.plot(train.index, train.sax, '-', label='SAX', alpha=0.6)
    plt.plot(data.index, data.y, 'g-', label='Real')
    plt.plot(data.index, data.paa, '-', label='PAA', alpha=0.6)
    #plt.plot(data.index, data.prediction, 'r-', label='Prediction')
    ax = plt.gca()
    # set major ticks location every day
    ax.xaxis.set_major_locator(mdates.DayLocator())
    # set major ticks format
    ax.xaxis.set_major_formatter(mdates.DateFormatter('\n\n\n%a %d.%m.%Y'))
    # set minor ticks location every two hours
    ax.xaxis.set_minor_locator(mdates.HourLocator(interval=2))
    # set minor ticks format
    ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
    plt.setp(ax.xaxis.get_minorticklabels(), rotation=45)
    plt.legend(loc='best')
In [70]:
plot_station_transfo(data, 12001)
In [68]:
data[data.station == 12001][['ts', 'bikes', 'stands', 'y', 'paa']].tail(22)
Out[68]:
In [66]:
data.loc[2528599]
Out[66]:
In [90]:
lyon_meteo = pd.read_csv('data/lyon_weather.csv', parse_dates=['date'])
In [91]:
lyon_meteo.rename(columns={'date':'ts'}, inplace=True)
In [92]:
lyon_meteo[55:62]
Out[92]:
In [67]:
lyon_meteo.pressure.describe()
Out[67]:
In [70]:
lyon_meteo[lyon_meteo.pressure >= 1020.000000].tail(10)
Out[70]:
In [58]:
lyon_meteo.weather_desc.unique()
Out[58]:
In [59]:
In [60]:
LE = LabelEncoder()
In [61]:
lyon_meteo['weather_desc'] = LE.fit_transform(lyon_meteo['weather_desc'])
In [100]:
joblib.dump(LE, 'model/Label_Encoder_Weather.pkl')
Out[100]:
In [101]:
clf = joblib.load('model/Label_Encoder_Weather.pkl')
In [104]:
clf.transform(['Cleare'])
In [62]:
lyon_meteo.tail()
Out[62]:
In [75]:
clean_lyon_meteo = lyon_meteo.resample("10T", on="ts").mean().bfill().reset_index()
In [76]:
clean_lyon_meteo[55:62]
Out[76]:
In [65]:
train_X.reset_index(inplace=True)
In [77]:
train_X.head()
Out[77]:
In [78]:
train_X.shape
Out[78]:
In [79]:
lol = train_X.merge(clean_lyon_meteo[['ts', 'temp', 'humidity', 'weather_desc']], on='ts', how='left')
In [80]:
pd.isnull(lol[['temp', 'humidity']]).sum()
Out[80]:
In [81]:
lol.shape
Out[81]:
In [82]:
lol.head()
Out[82]:
In [435]:
print(lyon_meteo.date.min())
print(lyon_meteo.date.max())
In [44]:
lyon_meteo = pd.read_csv('data/lyon_weather.csv', parse_dates=['date'])
lyon_meteo.rename(columns={'date':'ts'}, inplace=True)
# have to labelencode weather_desc
LE = LabelEncoder()
lyon_meteo['weather_desc'] = LE.fit_transform(lyon_meteo['weather_desc'])
# Dump LabelEncoder
joblib.dump(LE, 'model/Label_Encoder_Weather.pkl')
# Resample data on a 10-minute basis
clean_lyon_meteo = lyon_meteo.resample("10T", on="ts").mean().bfill().reset_index()
In [45]:
clean_lyon_meteo.head()
Out[45]:
In [46]:
X_train.head()
Out[46]:
In [214]:
def get_weather(df, how='learning', freq=None):
    """
    Match the time series with weather data.
    If how == 'learning':
        match with historical weather data
    If how == 'forecast':
        match with forecast data; freq must be filled in with this option
    """
    df = df.reset_index()
    # Check params
    if how not in ['learning', 'forecast']:
        logger.error('Bad option for get_weather. You must choose between learning or forecast')
        return df
    if how == 'forecast' and freq is None:
        logger.error("For the forecast option, you must specify freq, e.g. freq='1H'")
    # Process for learning matching
    if how == 'learning':
        lyon_meteo = pd.read_csv('data/lyon_weather.csv', parse_dates=['date'])
        lyon_meteo.rename(columns={'date': 'ts'}, inplace=True)
        # have to label-encode weather_desc
        LE = LabelEncoder()
        lyon_meteo['weather_desc'] = LE.fit_transform(lyon_meteo['weather_desc'])
        # Dump the LabelEncoder
        joblib.dump(LE, 'model/Label_Encoder_Weather.pkl')
        # Resample data on a 10-minute basis
        clean_lyon_meteo = lyon_meteo.resample("10T", on="ts").mean().bfill().reset_index()
        df = df.merge(clean_lyon_meteo[['ts', 'temp', 'humidity', 'weather_desc']], on='ts', how='left')
        df = df.set_index('ts')
        return df
    # Process for forecast matching
    if how == 'forecast':
        lyon_forecast = pd.read_csv('data/lyon_forecast.csv', parse_dates=['forecast_at', 'ts'])
        lyon_forecast['delta'] = lyon_forecast['ts'] - lyon_forecast['forecast_at']
        # Filter on delta with freq
        lyon_forecast = lyon_forecast[lyon_forecast['delta'] == freq]
        lyon_forecast.drop_duplicates(subset=['ts', 'delta'], keep='first', inplace=True)
        # Label-encode weather_desc
        LE = joblib.load('model/Label_Encoder_Weather.pkl')
        lyon_forecast['weather_desc'] = LE.transform(lyon_forecast['weather_desc'])
        # Merging
        # We take the last forecast (on freq) using backward merging
        df = df.sort_values('ts')
        df = pd.merge_asof(left=df, right=lyon_forecast[['ts', 'temp', 'humidity', 'weather_desc']], on='ts', direction='backward')
        df = df.set_index('ts')
        return df
In [53]:
del lol
In [88]:
lyon_forecast = pd.read_csv('data/lyon_forecast.csv', parse_dates=['forecast_at', 'ts'])
In [89]:
lyon_forecast.head()
Out[89]:
In [90]:
lyon_forecast.weather_desc.unique()
Out[90]:
In [91]:
lyon_forecast.info()
In [92]:
lyon_forecast['delta'] = lyon_forecast['ts'] - lyon_forecast['forecast_at']
In [93]:
lyon_forecast = lyon_forecast[lyon_forecast['delta'] == "1H"]
lyon_forecast.drop_duplicates(subset=['ts', 'delta'], keep='first', inplace=True)
In [126]:
LE = joblib.load('model/Label_Encoder_Weather.pkl')
lyon_forecast['weather_desc'] = LE.transform(lyon_forecast['weather_desc'])
In [229]:
lyon_forecast[(lyon_forecast.ts >= "2017-08-01 07:00:00") & (lyon_forecast.delta == "1H")].head()
Out[229]:
In [153]:
#lyon_forecast[(lyon_forecast.ts >= "2017-08-01 00:10:00") & (lyon_forecast.delta == "1H")]
In [215]:
X_test.head(2)
Out[215]:
In [216]:
lol = get_weather(X_test, how='forecast', freq='1H')
In [235]:
lol[lol.index.isin(['2017-08-01 10:00:00','2017-08-01 11:00:00'])].tail()
Out[235]:
In [232]:
lol.head()
Out[232]:
In [ ]:
In [222]:
lyon_forecast[(lyon_forecast.ts >= "2017-08-01 07:00:00")].head(5)
Out[222]:
In [166]:
#X_test = X_test.reset_index()
#Merging
# We take the last forecast (on freq) using backward merging
#X_test = X_test.sort_values('ts')
lol = pd.merge_asof(left=X_test, right=lyon_forecast[['ts','temp', 'humidity', 'weather_desc']], on='ts', direction='backward')
In [ ]:
In [176]:
print(lyon_forecast.ts.min())
print(lyon_forecast.ts.max())
In [ ]:
CSV file with station coordinates
In [48]:
locations = pd.read_csv("./data/lyon-stations.csv")
In [49]:
locations.shape
Out[49]:
Some stations were removed when the data were cleaned up. Remove them from the location data.
In [50]:
mask = locations['idstation'].isin(result.station.unique())
In [51]:
mask.sum()
Out[51]:
In [52]:
locations = locations[mask]
In [53]:
locations = locations.rename(columns={'idstation': 'station'})
In [54]:
locations.head()
Out[54]:
Some station names contain the apostrophe character. Replace it with its HTML code so the folium popups render correctly.
In [78]:
locations["nom"] = locations['nom'].str.replace("'", "'")
Select the prediction data for a specific timestamp
In [79]:
data_to_plot = result.loc[predict_date]
In [80]:
data_to_plot.shape
Out[80]:
In [81]:
data_to_plot.head()
Out[81]:
In [83]:
yhat = data_to_plot[['station', 'prediction']].merge(locations, on='station')
yhat.head()
Out[83]:
In [104]:
y = data_to_plot[['station', 'observation']].merge(locations, on='station')
In [140]:
error = data_to_plot[['station', 'error']].merge(locations, on='station')
In [122]:
colormap = 'RdYlBu'
cmap = plt.get_cmap(colormap)
In [137]:
# show the colormap used to plot the stations, values in [0, 1]
gradient = np.linspace(0, 1, 256)
gradient = np.vstack((gradient, gradient))
fig, ax = plt.subplots(1)
fig.subplots_adjust(top=0.95, bottom=0.80, left=0.2, right=0.99)
ax.set_xticks([0., 64, 128, 192, 256])
ax.set_xticklabels([0., 0.25, 0.5, 0.75, 1.])
ax.set_xlabel('Bikes Availability')
ax.imshow(gradient, aspect='auto', cmap=cmap, vmin=0, vmax=1)
plt.title('Colormap used to plot stations')
Out[137]:
In [95]:
color = lambda x: mpl.colors.to_hex(cmap(x))
In [89]:
# Lyon (France) Position
position = [45.750000, 4.850000]
In [106]:
mp_pred = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
In [107]:
# Map of the predicted values
for _, row in yhat.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=2,
        popup=row['nom'],
        color=color(row['prediction']),
        fill=True,
        fill_opacity=0.3,
        fill_color=color(row['prediction'])
    ).add_to(mp_pred)
In [102]:
mp_pred
Out[102]:
In [108]:
# Map for the observation
mp_obs = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
In [109]:
# Map of the observations
for _, row in y.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=2,
        popup=row['nom'],
        color=color(row['observation']),
        fill=True,
        fill_opacity=0.3,
        fill_color=color(row['observation'])
    ).add_to(mp_obs)
In [138]:
mp_obs
Out[138]:
In [153]:
# Colormap for the error (by default, the colormap fits values in [0, 1])
norm = mpl.colors.Normalize(vmin=-1, vmax=1)
color_error = lambda x: mpl.colors.to_hex(cmap(norm(x)))
In [154]:
# Map for the errors
mp_error = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
In [155]:
# Map of the errors
for _, row in error.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=2,
        popup=row['nom'],
        color=color_error(row['error']),
        fill=True,
        fill_opacity=0.3,
        fill_color=color_error(row['error'])
    ).add_to(mp_error)
In [156]:
mp_error
Out[156]:
In [23]:
station_lyon = pd.read_csv('data/lyon-stations.csv')
In [24]:
station_lyon.head()
Out[24]:
In [25]:
X = station_lyon[['lat', 'lon']].copy()
In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
In [27]:
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)
k_means = KMeans(init='k-means++', n_clusters=12).fit(X_scale)
In [28]:
station_lyon['station_cluster_geo'] = k_means.labels_
In [29]:
sns.lmplot(x="lat", y="lon", data=station_lyon, hue='station_cluster_geo', scatter=True, fit_reg=False, aspect=2)
Out[29]:
In [30]:
# export
station_lyon.head()
station_lyon.rename(columns={'idstation':'station'}, inplace=True)
station_lyon = station_lyon[['station', 'station_cluster_geo']]
station_lyon.to_csv('data/station_cluster_geo_armand.csv', index=False)
In [ ]: