Time series data about the number of bikes and bike stands in the city of Lyon (France), between 2017-07-11 and 2017-09-26.
In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
from sklearn.cluster import KMeans
In [4]:
from matplotlib import pyplot as plt
In [5]:
import seaborn as sns
sns.set_context('talk')
In [6]:
%load_ext watermark
In [7]:
%watermark -d -v -p numpy,pandas,sklearn,matplotlib,seaborn -g -m -w
In [9]:
DATA = "../data/lyon.csv"
In [10]:
raw = pd.read_csv(DATA, parse_dates=['last_update'])
In [11]:
raw.drop_duplicates(inplace=True)
In [12]:
raw = raw.sort_values(["number", "last_update"])
In [13]:
MIN_DATE = '2017-07-11'
MAX_DATE = '2017-09-26'
In [14]:
print(raw.shape)
In [15]:
drop_columns = ["availability", "bonus", "status", "bike_stands", "available_bike_stands", "availabilitycode"]
In [16]:
data = (raw.copy()
           .query("last_update >= '{}' and last_update <= '{}'".format(MIN_DATE, MAX_DATE))
           .drop(drop_columns, axis=1)
           .rename(columns={"available_bikes": "bikes", "number": "station", "last_update": "ts"}))
In [17]:
data.head()
Out[17]:
Some stations look wrong: their maximum number of available bikes over the whole period is 0.
In [19]:
max_bikes = data.groupby("station")["bikes"].max()
max_bikes
Out[19]:
In [20]:
wrong_stations = max_bikes[max_bikes == 0].index.tolist()
wrong_stations
Out[20]:
In [21]:
# keep only the stations that reported at least one available bike
well_station_mask = np.logical_not(data['station'].isin(wrong_stations))
In [22]:
data = data[well_station_mask]
print(data.shape)
Resample the data to 5-minute periods.
In [23]:
# average the number of bikes over 5-minute periods for each station,
# back-filling the gaps left by the resampling
df = (data.set_index("ts")
          .groupby("station")["bikes"]
          .resample("5T")
          .mean()
          .bfill())
In [24]:
df.head(10)
Out[24]:
In [25]:
# pivot: one column per station, timestamps as index
df = df.unstack(0)
In [26]:
df.head()
Out[26]:
Get rid of Saturday and Sunday.
In [27]:
weekday = df.index.weekday  # Monday=0 ... Sunday=6
mask = weekday < 5          # keep weekdays only
In [28]:
df = df[mask]
In [29]:
df['hour'] = df.index.hour
In [30]:
# average weekday profile per station: one value per hour of the day
df = df.groupby("hour").mean()
In [31]:
df.head()
Out[31]:
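Each column now holds a station's average weekday profile over the 24 hours of the day. A quick sanity check on the shape (an addition, not part of the original run):
In [ ]:
# 24 hourly rows, one column per remaining station
assert df.shape[0] == 24
df.shape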
In [32]:
n_clusters = 4
Normalization
In [33]:
df_norm = df / df.max()
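Dividing by the per-station maximum scales every profile into [0, 1], so k-means compares the shapes of the daily patterns rather than the station capacities. A sanity check (not in the original notebook, and assuming the back-filling left no NaN):
In [ ]:
# each normalized station profile peaks at exactly 1.0
assert np.allclose(df_norm.max(), 1.0)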
In [34]:
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(df_norm.T)
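n_clusters is fixed to 4 above. If one wanted to double-check that choice, a quick elbow inspection could look like this (a sketch, not part of the original analysis):
In [ ]:
# inertia for a range of cluster counts; look for the "elbow"
inertias = {k: KMeans(n_clusters=k, random_state=0).fit(df_norm.T).inertia_
            for k in range(2, 9)}
pd.Series(inertias).plot(marker='o')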
In [35]:
label = pd.Series(kmeans.labels_)
Number of stations for each label, i.e. for each usage pattern.
In [36]:
label.groupby(label).count()
Out[36]:
Colors for each cluster
In [59]:
colors = sns.color_palette('Set1', n_clusters)
In [60]:
sns.palplot(colors)
Daily profile
In [75]:
pd.DataFrame(kmeans.cluster_centers_).to_csv("../data/lyon_clusters.csv", index=False)
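Saving the centers means the daily profiles can be reused elsewhere without re-running the clustering; reading them back is a one-liner (a usage sketch, same path as above):
In [ ]:
centers = pd.read_csv("../data/lyon_clusters.csv")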
In [69]:
with sns.axes_style("darkgrid", {'xtick.major.size': 8.0}):
    fig, ax = plt.subplots(figsize=(10, 6))
    for center, cluster, color in zip(kmeans.cluster_centers_, range(n_clusters), colors):
        plt.plot(100 * center, color=color, label=cluster)
    plt.legend()
    plt.xlabel('Hour')
    plt.xticks(np.linspace(0, 24, 13))
    plt.yticks(np.linspace(0, 100, 11))
    plt.ylabel("Available bikes (%)")
    sns.despine()
    plt.savefig("../images/lyon-pattern.png")
Station names and lat/lon coordinates.
In [41]:
locations = pd.read_csv("../data/lyon-stations.csv")
In [42]:
mask = np.logical_not(locations['idstation'].isin(wrong_stations))
In [43]:
locations = locations[mask]
In [44]:
locations.head()
Out[44]:
In [45]:
dflabel = pd.DataFrame({"label": kmeans.labels_}, index=df.columns)
In [46]:
locations = locations.merge(dflabel, right_index=True, left_on='idstation')
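merge defaults to an inner join, so any clustered station missing from the locations file would be dropped silently. A check to make that assumption explicit (an addition, not in the original notebook):
In [ ]:
# every clustered station should have coordinates
assert len(locations) == len(dflabel)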
In [47]:
locations.head()
Out[47]:
In [48]:
locations["nom"] = locations['nom'].str.replace("'", "'")
In [49]:
import folium
In [50]:
# Lyon (France) Position
position = [45.750000, 4.850000]
In [51]:
mp = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
#mp = folium.Map(location=position, zoom_start=13)
In [52]:
hex_colors = colors.as_hex()
In [70]:
for _, row in locations.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=5,
        popup=row['nom'],
        color=hex_colors[row['label']],
        fill=True,
        fill_opacity=0.5,
        fill_color=hex_colors[row['label']]
    ).add_to(mp)
In [71]:
mp.save("../images/lyon-map-n_clusters-{}.html".format(n_clusters))
In [72]:
mp
Out[72]: