Time series data about the number of bikes and bike stands in the city of Lyon (France), between 2017-07-11 and 2017-09-26.
In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
from sklearn.cluster import KMeans
In [4]:
from matplotlib import pyplot as plt
In [5]:
import seaborn as sns
sns.set_context('talk')
In [6]:
%load_ext watermark
In [7]:
%watermark -d -v -p numpy,pandas,sklearn,matplotlib,seaborn -g -m -w
In [8]:
DATA = "./data/lyon.csv"
In [9]:
raw = pd.read_csv(DATA, parse_dates=['last_update'])
In [10]:
# Remove rows that are exact duplicates of an earlier row; the result is
# rebound to `raw` rather than mutating the frame in place.
raw = raw.drop_duplicates()
In [11]:
raw = raw.sort_values(["number", "last_update"])
In [12]:
MIN_DATE = '2017-07-11'
MAX_DATE = '2017-09-26'
In [13]:
print(raw.shape)
In [14]:
drop_columns = ["availability", "bonus", "status", "bike_stands", "available_bike_stands", "availabilitycode"]
In [15]:
# Keep only the records inside the [MIN_DATE, MAX_DATE] window, drop the
# columns that are not needed for the analysis and switch to shorter column
# names.
#
# `DataFrame.rename(columns=...)` is used to relabel the columns: passing a
# mapping to `rename_axis` for that purpose was deprecated in pandas 0.21 and
# is rejected by later releases.
# `query` already returns a new frame, so no explicit copy of `raw` is needed.
#
# NOTE(review): the bounds are compared against timestamps, so a date-only
# MAX_DATE presumably means midnight of that day and later records of the same
# day are excluded -- confirm this is intended.
data = (
    raw
    .query("last_update >= '{}' and last_update <= '{}'".format(MIN_DATE, MAX_DATE))
    .drop(drop_columns, axis=1)
    .rename(columns={"available_bikes": "bikes", "number": "station", "last_update": "ts"})
)
In [16]:
data.head()
Out[16]:
Some stations look anomalous: their maximum number of available bikes over the whole period is 0.
In [17]:
# Largest number of available bikes ever recorded at each station
bikes_by_station = data.groupby("station")["bikes"]
max_bikes = bikes_by_station.agg("max")
max_bikes
Out[17]:
In [18]:
# Identifiers of the stations whose recorded maximum is zero bikes
never_had_bikes = max_bikes.eq(0)
wrong_stations = max_bikes.loc[never_had_bikes].index.tolist()
wrong_stations
Out[18]:
In [19]:
# Boolean mask: True for the rows whose station is NOT one of `wrong_stations`
well_station_mask = ~data["station"].isin(wrong_stations)
In [20]:
# Keep only the rows selected by the mask and report the resulting size
data = data.loc[well_station_mask]
print(data.shape)
Get data every 10 minutes
In [21]:
# Build a regular 10-minute series for every station: average the readings
# that fall into each 10-minute bin, then back-fill the empty bins with the
# next available value.
#
# The "10min" alias replaces "10T": both denote a 10-minute frequency, but the
# "T" spelling is deprecated in recent pandas releases.
df = (
    data.set_index("ts")
    .groupby("station")["bikes"]
    .resample("10min")
    .mean()
    .bfill()
)
In [22]:
df.head(10)
Out[22]:
In [23]:
# Move the station level of the index (the groupby key, level 0) into the
# columns: one column per station, one row per timestamp.
df = df.unstack(level="station")
In [24]:
# Add the hour of each timestamp as an extra "hour" column; it is the
# grouping key used to build the per-hour profile.
df = df.assign(hour=df.index.hour)
In [25]:
df.head()
Out[25]:
In [26]:
# Mean number of bikes of every station for each hour of the day
profile = df.groupby(by="hour").agg("mean")
profile.head()
Out[26]:
In [36]:
n_clusters = 4
In [28]:
#df = df.fillna(method='bfill')
In [29]:
#df = df.fillna(method='ffill')
Normalization
In [30]:
# Divide every station column by its own maximum so that each hourly profile
# peaks at 1, whatever the size of the station.
station_peak = profile.max()
df_norm = profile.div(station_peak, axis="columns")
In [37]:
# Cluster the stations (the rows of the transposed frame) on their normalized
# hourly profile; `random_state` is fixed so that reruns give the same result.
kmeans = KMeans(random_state=0, n_clusters=n_clusters)
kmeans.fit(df_norm.T)
In [34]:
df_norm.T.head()
Out[34]:
In [35]:
# Elbow curve: fit k-means for 1 to 14 clusters and plot the distortion
# (the model's inertia) against the number of clusters.
# 4 clusters are chosen.
cluster_counts = range(1, 15)
distortions = []
for i in cluster_counts:
    km = KMeans(random_state=0, n_clusters=i).fit(df_norm.T)
    distortions.append(km.inertia_)
plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of cluster')
plt.ylabel('Distortion')
plt.show()
In [38]:
label = pd.Series(kmeans.labels_)
Number of stations for each label, i.e. usage pattern
In [39]:
label.groupby(label).count()
Out[39]:
Colors for each cluster
In [40]:
colors = sns.color_palette('Set1', n_clusters)
In [41]:
sns.palplot(colors)
Daily profile
In [42]:
# Plot the centroid of every cluster, i.e. its typical hourly profile, scaled
# by 100 so that the normalized values read as percentages.
# NOTE(review): the exported source lost its indentation; only the figure
# creation is assumed to sit inside the `with` block -- confirm.
with sns.axes_style("darkgrid", {'xtick.major.size': 5.0}):
    fig, ax = plt.subplots(figsize=(10, 6))

# The loop variable is deliberately not named `label`: that would overwrite
# the `label` Series holding the cluster id of every station.
for centroid, cluster_id, color in zip(kmeans.cluster_centers_, range(n_clusters), colors):
    ax.plot(100 * centroid, color=color, label=cluster_id)
ax.legend()
ax.set_xlabel('Hour')
ax.set_ylabel("available bikes%")
sns.despine()
In [43]:
len(kmeans.labels_)
Out[43]:
In [44]:
df_norm.T.shape
Out[44]:
In [45]:
# Two-column table giving, for every station, the cluster it was assigned to.
# The labels are attached to the transposed frame (one row per station), the
# station index is turned back into a column, and only the two columns of
# interest are kept.
cluster_station = (
    df_norm.T
    .assign(cluster=kmeans.labels_)
    .reset_index()
    .loc[:, ['station', 'cluster']]
)
In [52]:
cluster_station.head()
Out[52]:
In [54]:
cluster_station.station.nunique()
Out[54]:
In [146]:
cluster_station.to_csv('data/cluster_lyon_armand.csv', index=False)
Station names and lat/lon coordinates.
In [64]:
locations = pd.read_csv("./data/lyon-stations.csv")
In [65]:
# Boolean mask: True for the locations whose id is NOT one of `wrong_stations`
mask = ~locations["idstation"].isin(wrong_stations)
In [66]:
locations = locations[mask]
In [67]:
locations.head()
Out[67]:
In [68]:
# One cluster label per station, indexed by station id so that it can be
# merged with the station locations.
#
# The index comes from `df_norm.columns`, i.e. the station ids of the frame
# the model was fitted on (`df_norm.T`), in the same order as
# `kmeans.labels_`. `df.columns` must not be used here: it also holds the
# extra "hour" column, so it is one element longer than `kmeans.labels_` and
# the constructor fails on the length mismatch.
dflabel = pd.DataFrame({"label": kmeans.labels_}, index=df_norm.columns)
In [69]:
locations = locations.merge(dflabel, right_index=True, left_on='idstation')
In [70]:
locations.head()
Out[70]:
In [71]:
# Escape the apostrophes of the station names, which are later used as map
# popups.
# As written, the original call replaced "'" with "'" and therefore changed
# nothing; presumably the replacement was an HTML entity that got decoded when
# the notebook was exported -- TODO confirm. The numeric entity is written
# explicitly so that the statement has an effect.
locations["nom"] = locations["nom"].str.replace("'", "&#39;")
In [72]:
import folium
In [73]:
# Lyon (France) Position
position = [45.750000, 4.850000]
In [76]:
mp = folium.Map(location=position, zoom_start=13, tiles='cartodbpositron')
#mp = folium.Map(location=position, zoom_start=13)
In [74]:
hex_colors = colors.as_hex()
In [77]:
# Add one circle marker per station to the map, colored after the cluster the
# station belongs to and showing the station name as popup.
for _, station in locations.iterrows():
    cluster_color = hex_colors[station['label']]
    marker = folium.CircleMarker(
        location=[station['lat'], station['lon']],
        radius=5,
        popup=station['nom'],
        color=cluster_color,
        fill=True,
        fill_opacity=0.7,
        fill_color=cluster_color,
    )
    marker.add_to(mp)
In [78]:
mp.save("lyon-map-n_clusters-{}.html".format(n_clusters))
In [79]:
mp
Out[79]: