Analyze the daily profile for each station from 2017-07-09 to 2017-09-26.
In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
from sklearn.cluster import KMeans
In [4]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context('notebook')
In [5]:
%load_ext watermark
In [6]:
%watermark -d -v -p numpy,pandas,sklearn,matplotlib,seaborn -g -m -w
In [8]:
# Path to the CSV file holding the bike-availability time series.
DATA = "../data/bordeaux.csv"
In [10]:
# Load the raw records; the `ts` column is parsed as timestamps.
raw = pd.read_csv(DATA, parse_dates=['ts'])
In [11]:
# Preview the first rows of the raw records.
raw.head()
Out[11]:
In [12]:
# Number of (rows, columns) in the raw table.
print(raw.shape)
Keep only the records whose state is `CONNECTEE`, i.e. the station is connected (not closed).
In [13]:
# Keep only the readings taken while a station is connected, drop the columns
# the analysis does not use, shorten the remaining column names, and order the
# rows by station and timestamp.
data = (
    raw.copy()
    .query('state == "CONNECTEE"')
    .drop(['gid', 'available_stand', 'type', 'state'], axis=1)
    # `rename_axis` with a label mapping was deprecated in pandas 0.21 and is
    # rejected by current releases; `rename(columns=...)` is the supported way
    # to relabel columns and works on old and new pandas alike.
    .rename(columns={"available_bike": "bikes", "ident": "station"})
    .drop_duplicates()
    .sort_values(["station", "ts"])
)
In [14]:
# Shape and first rows of the filtered, renamed table.
print(data.shape)
data.head()
Out[14]:
Resample the data to one value every 5 minutes for each station.
In [15]:
# Average the bike counts of each station over regular 5-minute bins, back-fill
# the bins that received no reading, then pivot to one column per station.
df = (
    data.set_index("ts")
    .groupby("station")["bikes"]
    # "5min" is the portable spelling of a 5-minute frequency; the "T" alias
    # is deprecated since pandas 2.2.
    .resample("5min")
    .mean()
    .bfill()
    .unstack(0)
)
In [16]:
# Shape and first rows of the resampled table (timestamps x stations).
print(df.shape)
df.head()
Out[16]:
In [17]:
# Day of the week of every timestamp (pandas convention: Monday=0, Sunday=6).
weekday = df.index.weekday
In [18]:
# True for Monday to Friday, i.e. weekdays only.
mask = weekday < 5
In [19]:
# Number of rows that fall on a weekday.
mask.sum()
Out[19]:
In [20]:
# Keep working days only (Monday=0 ... Friday=4). The filter is recomputed from
# the frame's own index so the cell can be re-run without a length mismatch,
# and the explicit copy lets later cells add columns without triggering a
# SettingWithCopyWarning.
df = df.loc[df.index.weekday < 5].copy()
In [21]:
# Shape after the weekday filter.
print(df.shape)
In [22]:
# Tag every row with the hour of day taken from its timestamp index.
df = df.assign(hour=df.index.hour)
In [23]:
# Preview the table with the added `hour` column.
df.head()
Out[23]:
In [24]:
# Mean number of bikes per station for each hour of the day.
profile = df.groupby("hour").mean()
In [25]:
# Preview the hourly profile (hours x stations).
profile.head()
Out[25]:
Use the KMeans algorithm on the daily profile.
In [26]:
# Number of usage patterns (clusters) to look for.
n_clusters = 4
In [27]:
# Normalization: scale each station column by its own peak so that every
# profile is expressed as a fraction of that station's maximum.
df_norm = profile.div(profile.max(), axis="columns")
In [28]:
# Cluster the stations: transposing puts one station per row and one hourly
# value per column. `n_init` is pinned explicitly because scikit-learn changed
# its default from 10 to "auto" in 1.4, which would otherwise make the fitted
# clusters depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(df_norm.T)
In [29]:
# Cluster id assigned to each station, in the row order of the matrix given to KMeans.
labels = pd.Series(kmeans.labels_)
Number of stations for each cluster (i.e. usage pattern).
In [30]:
# Size of each cluster, indexed by cluster id.
label_count = labels.groupby(labels).size()
label_count
Out[30]:
Choose some colors.
In [69]:
# One colour per cluster, taken from seaborn's "Set1" palette.
colors = sns.color_palette('Set1', n_clusters)
In [71]:
# Display the chosen palette as a row of colour swatches.
sns.palplot(colors)
In [70]:
# One bar per cluster, drawn with that cluster's colour.
ax = sns.barplot(x=label_count.index, y=label_count, palette=colors)
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of stations')
ax.set_title('Number of stations for each cluster')
Out[70]:
Plot the daily profile of available bikes (%) for each cluster.
In [95]:
# Save the cluster centres to CSV (one row per cluster, without the index column).
pd.DataFrame(kmeans.cluster_centers_).to_csv("../data/bordeaux_clusters.csv", index=False)
In [94]:
# Cluster centres: one row per cluster, one value per feature the model was fitted on.
kmeans.cluster_centers_
Out[94]:
In [78]:
# Create the axes under seaborn's "darkgrid" style with longer x tick marks.
# NOTE(review): the cell's original indentation is not visible in this export;
# only the axes creation is assumed to sit inside the style context - confirm.
with sns.axes_style("darkgrid", {'xtick.major.size': 8.0}):
    fig, ax = plt.subplots(figsize=(10, 6))

# One curve per cluster centre, rescaled from a fraction to a percentage.
for label, center in enumerate(kmeans.cluster_centers_):
    ax.plot(100 * center, color=colors[label], label=label)

ax.legend()
ax.set_xlabel('Hour')
ax.set_xticks(np.linspace(0, 24, 13))
ax.set_yticks(np.linspace(0, 100, 11))
ax.set_ylabel("available bikes%")

# Remove the top and right spines, then write the figure to disk.
sns.despine(fig=fig)
fig.savefig("../images/bordeaux-pattern.png")
Get the station lat/lon coordinates.
In [36]:
# Station metadata, including the coordinates used on the map below.
locations = pd.read_csv("../data/bordeaux-stations.csv")
In [37]:
# Preview the station metadata.
locations.head()
Out[37]:
In [38]:
# One cluster label per station, indexed by the station ids found in the columns of `profile`.
dflabel = pd.DataFrame({"label": kmeans.labels_}, index=profile.columns)
In [39]:
# Preview the station -> cluster label table.
dflabel.head()
Out[39]:
Get the label, i.e. the cluster id, for each station.
In [40]:
# Attach each station's cluster label by matching the `ident` column of the
# coordinates table against the station index of `dflabel`.
locations = pd.merge(locations, dflabel, left_on='ident', right_index=True)
In [41]:
# Preview the station metadata with the cluster label attached.
locations.head()
Out[41]:
In [42]:
# Escape apostrophes in station names. The previous call replaced "'" with
# "'", which is a no-op - most likely an HTML entity that was lost when the
# notebook was exported. Presumably the intent is to keep apostrophes from
# breaking the quoted popup text embedded in the generated map (confirm
# against the folium version in use). "&#39;" renders as a plain apostrophe
# in the browser.
locations["nom"] = locations["nom"].str.replace("'", "&#39;")
In [43]:
import folium
In [44]:
# Bordeaux (France) position, passed to folium as the map centre.
# NOTE(review): presumably [latitude, longitude] - confirm against folium's `location` convention.
position = [44.836151, -0.580816]
In [79]:
# Base map centred on `position`, using the "cartodbpositron" tile set.
mp = folium.Map(location=position, zoom_start=12, tiles='cartodbpositron')
In [80]:
# Hex-string versions of the cluster colours, indexed by cluster id in the marker loop.
hex_colors = colors.as_hex()
In [84]:
# Draw one circle per station, coloured according to its cluster label.
for lat, lon, nom, label in zip(locations['lat'], locations['lon'],
                                locations['nom'], locations['label']):
    cluster_color = hex_colors[label]
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        popup=nom,
        color=cluster_color,
        fill=False,
        fill_opacity=0.5,
        fill_color=cluster_color,
    ).add_to(mp)
In [85]:
# Write the interactive map to an HTML file named after the number of clusters.
map_path = "../images/bordeaux-map-n_clusters-{}.html".format(n_clusters)
mp.save(map_path)
In [86]:
# Render the map inline as the cell output.
mp
Out[86]: