In [1]:
%matplotlib inline
import itertools
import logging
import pickle
import folium
import calendar
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.colors as clrs
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn import preprocessing
from datetime import datetime, date, timedelta
from palettable.colorbrewer.sequential import Oranges_9
from src.data.visualization import create_london_map, draw_stations_map
sns.set_context("notebook")
logger = logging.getLogger()
logger.setLevel(logging.INFO)
In [2]:
readings = pickle.load(open("data/parsed/readings_weather_filled_dataset.p", "rb"))
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))
In [3]:
readings = readings.query('Source == "REAL"')
In [4]:
import time
from pytz import timezone
start_time = time.time()
readings['Timestamp'] = readings.Timestamp.dt.tz_convert(timezone('Europe/London'))
readings['Timestamp'] = readings.Timestamp.dt.tz_localize(None)
end_time = time.time()
print('Modifying timestamp took %s seconds' % (end_time - start_time))
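As a quick sanity check (a sketch; the expected offset assumes the timestamp falls within British Summer Time), a UTC reading from this period should shift by one hour before the timezone information is dropped:
In [ ]:
# Hypothetical check: a UTC timestamp in late May maps to BST (UTC+1)
ts_utc = pd.Timestamp('2016-05-16 07:00', tz='UTC')
print(ts_utc.tz_convert(timezone('Europe/London')).tz_localize(None))
# expected: 2016-05-16 08:00:00, i.e. naive London wall-clock time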
In [5]:
def get_day_start_end(day, hours=None):
    start = day.replace(hour=7, minute=0, second=0, microsecond=0)
    if hours is None:
        return (start, day.replace(hour=23, minute=0, second=0, microsecond=0))
    else:
        return (start, start + timedelta(hours=hours))

def get_full_day_range(timestamp):
    return timestamp.replace(hour=0, minute=0, second=0, microsecond=0), timestamp.replace(hour=23, minute=59, second=59, microsecond=999999)

def filter_by_time(df, d1, d2):
    timestamp = df['Timestamp']
    selector = (timestamp >= d1) & (timestamp < d2)
    return df[selector]

def filter_by_id(df, idval):
    return df[df['Id'] == idval]

global_start = datetime(2016, 5, 16)
global_end = datetime(2016, 6, 26)
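For reference, a minimal usage sketch of these helpers (the station id below is hypothetical):
In [ ]:
# Readings for one (hypothetical) station during the first day's 07:00-23:00 window
day_start, day_end = get_day_start_end(global_start)
sample = filter_by_time(filter_by_id(readings, 'BikePoints_1'), day_start, day_end)
print(len(sample))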
In [6]:
def map_priority_color(priority):
    if priority == 1:
        return '#ff1a1a', '#cc0000'
    elif priority == 2:
        return '#3333ff', '#0000cc'
    else:
        return '#ffff1a', '#b3b300'

def create_priority_marker(station):
    colors = map_priority_color(station['Priority'])
    label = "%s - %s" % (station['Id'], station['Name'])
    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=50,
                               popup=label, color=colors[1], fill_color=colors[0])
priority_map = draw_stations_map(stations, create_priority_marker)
folium.Map.save(priority_map, 'reports/maps/stations_priorities.html')
We believe the readings are updated as follows:
In [7]:
readings2 = readings.set_index(['Id', 'Timestamp']).sort_index()
In [8]:
readings2.iloc[2:7][['NbBikes','NbDocks','NbEmptyDocks','NbUnusableDocks']]
Out[8]:
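To back this up, one quick (hedged) check of the update cadence is the gap between consecutive readings of each station:
In [ ]:
# Sketch: distribution of time deltas between consecutive readings per station
deltas = readings2.reset_index().groupby('Id')['Timestamp'].diff().dropna()
deltas.describe()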
In [118]:
def plot_station_readings(readings, station_id, ycols1, ycols2, d1, d2, station_name=None):
    date_range = pd.date_range(d1, d2, freq='d')
    date_pairs = [[date_range[i], date_range[i+1]] for i in range(len(date_range)-1)]
    station_readings = filter_by_id(readings, station_id)
    slices = slice_by(station_readings, 'Timestamp', date_pairs)
    # set plot properties
    ncols = 3
    nrows = len(slices) // ncols
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
    fig.suptitle('Bikes Available in station %s' % (station_name if station_name is not None else station_id))
    fig.set_size_inches(18.5, 10.5)
    fig.text(0.1, 0.5, 'Number of Available Bikes', va='center', rotation='vertical')
    plot_in_grid(axes, slices, ycols1, ycols2, nrows, ncols)

def slice_by(df, col_name, date_pairs):
    groups = []
    for pair in date_pairs:
        groups.append(filter_by_time(df, pair[0], pair[1]))
    return groups

def plot_in_grid(axes, slices, ycols1, ycols2, nrows, ncols):
    i = 0
    for row in range(nrows):
        for col in range(ncols):
            ax = axes[row, col]
            # x axis set up
            day_start, day_end = get_day_start_end(slices[i].iloc[0]['Timestamp'])
            xticks = pd.date_range(start=day_start, end=day_end, freq='2h')
            xlim = (day_start, day_end)
            # y axis set up
            ylim = (0, max(slices[i]['NbDocks']) + 5)
            # set up data
            cols = ycols1 + ycols2 + ['Timestamp']
            df = slices[i][cols]
            times = df.Timestamp.iloc[len(df.Timestamp) // 2]
            # plot the entry
            sub_ax = df.plot(x='Timestamp', ax=ax, xticks=xticks,
                             legend=False, sharex=False, sharey=True,
                             xlim=xlim, ylim=ylim, secondary_y=ycols2)
            sub_ax.set_xlabel(calendar.day_name[times.weekday()])
            sub_ax.set_xticklabels(sub_ax.get_xticklabels(), rotation=90)
            sub_ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))
            i += 1
In [119]:
def plot_timeline(df, station_id, start, end):
    df = filter_by_time(filter_by_id(df, station_id), start, end)[['Timestamp']]
    df['Day'] = df['Timestamp'].apply(lambda x: x.strftime("%d/%m"))
    df['Timestamp'] = df['Timestamp'].apply(lambda x: x.replace(year=1970, month=1, day=1))
    ax = sns.stripplot(data=df, x='Timestamp', y='Day', orient='h')
    ax.set_xlim(get_day_start_end(df['Timestamp'].iloc[0]))
    ax.set_title('Readings of Station %s' % station_id)
    return ax
In [143]:
ycols1 = ['NbBikes']
ycols2 = []
start = datetime(2016,5,16)
end = datetime(2016,5,23)
BikePoints_340 - Bank Of England Museum, Bank (City Center)
In [144]:
station_id = 'BikePoints_340'
plot_timeline(readings, station_id, start, end)
Out[144]:
In [145]:
plot_station_readings(readings, station_id, ycols1, ycols2, start, end, 'BikePoints_340 - Bank Of England Museum, Bank')
plt.savefig('reports/images/individual-availability-center.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
BikePoints_298 - Curlew Street, Shad Thames (Outside City Center, located in Bermondsey)
In [146]:
station_id = 'BikePoints_298'
In [147]:
plot_timeline(readings, station_id, start, end)
Out[147]:
In [148]:
plot_station_readings(readings, station_id, ycols1, ycols2, start, end)
plt.savefig('reports/images/individual-availability-outside.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [141]:
stations.sort_values(by=['Name']).to_csv('stations.csv')
In [18]:
distributed = distributed.reset_index()
In [19]:
print(len(distributed))
print(len(collected))
print(len(distributed.Id.unique()))
print(len(collected.Id.unique()))
In [20]:
distributed['Day'] = distributed.Timestamp.apply(lambda x: x.strftime('%d %m %Y'))
distributed.groupby('Day').mean().mean()
Out[20]:
In [21]:
distributed = distributed.reset_index()
collected = collected.reset_index()
stations = stations.reset_index()
In [22]:
stations.Priority = stations.Priority.fillna(3).astype(int)
In [23]:
# merge with the stations dataset
stations_redistribution = stations.merge(distributed.groupby('Id').sum(), left_on='Id', how='left', right_index=True)
stations_redistribution.rename(columns={'NbBikes': 'NbBikesDist'}, inplace=True)
stations_redistribution = stations_redistribution.merge(collected.groupby('Id').sum(), left_on='Id', how='left', right_index=True)
stations_redistribution.rename(columns={'NbBikes': 'NbBikesColl'}, inplace=True)
stations_redistribution.drop(['TerminalName','PlaceType','Installed','Temporary','Locked','RemovalDate','InstallDate','ShortName'], axis=1, inplace=True)
# replace zeros and missing values with a small epsilon so the log transform below is defined
stations_redistribution['NbBikesDist'].replace({0: 0.0001}, inplace=True)
stations_redistribution['NbBikesDist'].fillna(0.0001, inplace=True)
stations_redistribution['NbBikesColl'].replace({0: 0.0001}, inplace=True)
stations_redistribution['NbBikesColl'].fillna(0.0001, inplace=True)
# log-scale, then min-max scale, so the values map cleanly onto the colormap
min_max_scaler = preprocessing.MinMaxScaler()
stations_redistribution['NbBikesDistS'] = min_max_scaler.fit_transform(stations_redistribution['NbBikesDist'].apply(np.log).values.reshape(-1, 1))
stations_redistribution['NbBikesCollS'] = min_max_scaler.fit_transform(stations_redistribution['NbBikesColl'].apply(np.log).values.reshape(-1, 1))
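The log transform before scaling matters because the totals are heavily right-skewed (see the histograms below): without it, a handful of busy stations would occupy the top of the colormap and everything else would collapse near zero. A tiny illustration with made-up values:
In [ ]:
# Hypothetical values: min-max scaling with and without a prior log transform
vals = np.array([1.0, 10.0, 100.0, 10000.0]).reshape(-1, 1)
print(min_max_scaler.fit_transform(vals).ravel())          # ~[0, 0.001, 0.01, 1]
print(min_max_scaler.fit_transform(np.log(vals)).ravel())  # [0, 0.25, 0.5, 1]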
In [24]:
def plot_redistribution_grid(df, start, end, ids=None):
    df = df[df['Id'].isin(ids)]
    df = filter_by_time(df, start, end).copy()
    df['Day'] = df['Timestamp'].apply(lambda x: x.strftime("%d/%m"))
    df['Timestamp'] = df['Timestamp'].apply(lambda x: x.replace(year=1970, month=1, day=1))
    day_start, day_end = get_full_day_range(df.iloc[0]['Timestamp'])
    g = sns.FacetGrid(df, col="Id", col_wrap=4, size=3, xlim=(day_start, day_end), sharex=True, sharey=True, col_order=ids)
    [ax.xaxis_date() for ax in g.axes]
    [ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M")) for ax in g.axes]
    g = g.map(sns.stripplot, "Timestamp", "Day", "Day", orient='h', palette=sns.xkcd_palette(["windows blue"]))
    g = g.set(xticks=pd.date_range(start=day_start, end=day_end, freq='6h'))
    return g
In [25]:
f, axes = plt.subplots(1, 2, figsize=(15, 5))
plt.suptitle('Histograms of the Total Number of Bicycles Distributed and Collected from Each Station')
stations_redistribution['NbBikesDist'].hist(ax=axes[0])
stations_redistribution['NbBikesColl'].hist(ax=axes[1])
axes[0].set_title('Distributed')
axes[1].set_title('Collected')
axes[0].set_ylabel('Number of Bicycles')
axes[1].set_ylabel('Number of Bicycles')
plt.savefig('reports/images/bicycles-distributed-collected.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [26]:
top_distributed = stations_redistribution.sort_values(by=['NbBikesDist'], ascending=False)
top_distributed.set_index('Id', inplace=True)
top_distributed[0:20]
Out[26]:
In [35]:
def cmap_to_hex(cmap, value):
    rgb = cmap(value)[:3]
    return clrs.rgb2hex(rgb)

def create_redistribution_marker(col_name):
    def create_marker(station):
        fill_color = cmap_to_hex(plt.get_cmap('seismic_r'), station[col_name])
        label = "%s - %s" % (station.name, station['Name'])
        return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=100,
                                   popup=label, color=fill_color, fill_color=fill_color, fill_opacity=0.9)
    return create_marker
In [36]:
distributed_map = draw_stations_map(top_distributed, create_redistribution_marker('NbBikesDistS'))
folium.TileLayer('stamentoner').add_to(distributed_map)
folium.Map.save(distributed_map, 'reports/maps/distributed_map.html')
distributed_map
Out[36]:
In [37]:
plot_redistribution_grid(distributed, datetime(2016,5,16), datetime(2016,6,10), ids=top_distributed.reset_index().Id[0:12])
plt.savefig('reports/images/bicycles-distributed.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [38]:
top_collected = stations_redistribution.sort_values(by=['NbBikesColl'], ascending=False)
top_collected.set_index('Id', inplace=True)
top_collected[0:20]
Out[38]:
In [39]:
collected_map = draw_stations_map(top_collected, create_redistribution_marker('NbBikesCollS'))
folium.TileLayer('stamentoner').add_to(collected_map)
folium.Map.save(collected_map, 'reports/maps/collected_map.html')
collected_map
Out[39]:
In [32]:
plot_redistribution_grid(collected, datetime(2016,5,16), datetime(2016,6,6), ids=top_collected.reset_index().Id[0:12])
plt.savefig('reports/images/bicycles-collected.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [83]:
%run src/data/periods.py
In [84]:
%run src/data/helper.py
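Neither periods.py nor helper.py is shown in this notebook, so as a rough sketch only (an assumption, not the actual implementation), find_zero_periods presumably flags the rows where the given column is zero and labels consecutive runs per station, something like:
In [ ]:
# Hypothetical sketch of find_zero_periods: label consecutive runs of zeros
def find_zero_periods_sketch(df, col):
    df = df.sort_values(by=['Id', 'Timestamp'])
    is_zero = df[col] == 0
    # a new run starts whenever the zero state or the station changes
    run_id = ((is_zero != is_zero.shift()) | (df['Id'] != df['Id'].shift())).cumsum()
    out = df[is_zero].copy()
    out['GroupId'] = run_id[is_zero]
    return out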
In [87]:
readings2 = readings[readings.Source == 'REAL']
readings2 = readings2.sort_values(by=['Id', 'Timestamp'], ascending=True)
In [88]:
empty_entries = find_zero_periods(readings2, 'NbBikes')
empty_groups = get_ellapsed_time(empty_entries, by='GroupId').sort_values(by=['Ellapsed'], ascending=False)
full_entries = find_zero_periods(readings2, 'NbEmptyDocks')
full_groups = get_ellapsed_time(full_entries, by='GroupId').sort_values(by=['Ellapsed'], ascending=False)
In [89]:
empty_all = empty_groups.groupby('Id').sum()
empty_all.columns=['EmptyTotalAll']
full_all = full_groups.groupby('Id').sum()
full_all.columns=['FullTotalAll']
In [90]:
invalid_threshold = 720
invalid_group_ids = empty_groups[empty_groups.Ellapsed >= invalid_threshold].GroupId
empty_entries = empty_entries[~empty_entries.GroupId.isin(invalid_group_ids)]
invalid_group_ids = full_groups[full_groups.Ellapsed >= invalid_threshold].GroupId
full_entries = full_entries[~full_entries.GroupId.isin(invalid_group_ids)]
In [91]:
empty_periods = get_ellapsed_time(empty_entries, by='PeriodId')
empty_periods = add_station_info(empty_periods, stations, ['Priority', 'Id'])
empty_periods['Day'] = empty_periods['Period'].apply(lambda x: get_period_day(x))
empty_periods['PeakHours'] = empty_periods['Period'].apply(lambda x: is_peaktime(x)[1])
full_periods = get_ellapsed_time(full_entries, by='PeriodId')
full_periods = add_station_info(full_periods, stations, ['Priority', 'Id'])
full_periods['Day'] = full_periods['Period'].apply(lambda x: get_period_day(x))
full_periods['PeakHours'] = full_periods['Period'].apply(lambda x: is_peaktime(x)[1])
In [92]:
empty_periods_df = empty_periods.groupby(['Id', 'PeakHours']).sum().unstack()
empty_periods_df.columns = empty_periods_df.columns.droplevel()
empty_periods_df.columns.name = None
empty_periods_df.columns = ['EmptyEveningPeak', 'EmptyMorningPeak', 'EmptyNonPeak']
empty_periods_df['EmptyTotal'] = empty_periods_df.EmptyEveningPeak + empty_periods_df.EmptyMorningPeak + empty_periods_df.EmptyNonPeak
In [93]:
full_periods_df = full_periods.groupby(['Id', 'PeakHours']).sum().unstack()
full_periods_df.columns = full_periods_df.columns.droplevel()
full_periods_df.columns.name = None
full_periods_df.columns = ['FullEveningPeak', 'FullMorningPeak', 'FullNonPeak']
full_periods_df['FullTotal'] = full_periods_df.FullEveningPeak + full_periods_df.FullMorningPeak + full_periods_df.FullNonPeak
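The positional renames in the two cells above assume unstack emits the PeakHours levels in alphabetical order ('EveningPeak', 'MorningPeak', 'NonPeak'). A name-based rename is more robust (a sketch, assuming those are the actual level values):
In [ ]:
# Safer: build the column names from the actual PeakHours values
tmp = full_periods.groupby(['Id', 'PeakHours']).sum().unstack()
tmp.columns = ['Full%s' % c for c in tmp.columns.droplevel(0)]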
In [94]:
periods = empty_periods_df.merge(full_periods_df, right_index=True, left_index=True)
In [95]:
readings2['WeightedNbBikesStd'] = readings2.NbBikes / readings2.NbDocks
weighted_std = readings2.groupby('Id').std()
In [97]:
stats = readings2.groupby('Id').agg({'Timestamp': 'count', 'NbDocks': 'max'})
stats.columns = ['Count', 'NbDocks']
stats = stats.merge(periods, how='left', right_index=True, left_index=True).fillna(0)
stats = stats.merge(top_collected[['NbBikesColl']], how='left', right_index=True, left_index=True).fillna(0)
stats = stats.merge(top_distributed[['NbBikesDist']], how='left', right_index=True, left_index=True).fillna(0)
stats = stats.merge(weighted_std[['WeightedNbBikesStd']], how='left', right_index=True, left_index=True).fillna(0)
stats = add_station_info(stats, stations.set_index(stations.Id), use_indexes=True).drop(['Id', 'Name'], axis=1)
stats.Priority = stats.Priority.fillna(3).astype('int8')
stats
Out[97]:
In [101]:
stats.sort_values(by=['Count'], ascending=False)
Out[101]:
In [ ]:
pickle.dump(stats, open("data/parsed/stations_statistics.p", "wb"))
In [ ]:
from sklearn.cluster import KMeans
empty_pred = KMeans(n_clusters=3).fit_predict(stats[['EmptyEveningPeak', 'EmptyMorningPeak', 'EmptyNonPeak']].values)
full_pred = KMeans(n_clusters=3).fit_predict(stats[['FullEveningPeak', 'FullMorningPeak', 'FullNonPeak']].values)
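The choice of n_clusters=3 is a judgment call; a quick, hedged way to sanity-check it is the silhouette score over a small range of k:
In [ ]:
from sklearn.metrics import silhouette_score
X_empty = stats[['EmptyEveningPeak', 'EmptyMorningPeak', 'EmptyNonPeak']].values
for k in range(2, 7):
    print(k, silhouette_score(X_empty, KMeans(n_clusters=k).fit_predict(X_empty)))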
In [ ]:
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
fig = plt.figure(figsize=plt.figaspect(0.5))
###############################################################################
# Empty Minutes Clusters
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.set_title('Empty Minutes Clusters')
ax.set_xlabel('EmptyEveningPeak')
ax.set_ylabel('EmptyMorningPeak')
ax.set_zlabel('EmptyNonPeak')
ax.set_xlim(0, max(stats.EmptyEveningPeak))
ax.set_ylim(0, max(stats.EmptyMorningPeak))
ax.set_zlim(0, max(stats.EmptyNonPeak))
ax.view_init(elev=30, azim=230)
ax.dist=12
ax.scatter(stats.EmptyEveningPeak, stats.EmptyMorningPeak, stats.EmptyNonPeak,
c=empty_pred, color='red', marker='o', s=30)
###############################################################################
# Full Minutes Clusters
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_title('Full Minutes Clusters')
ax.set_xlabel('FullEveningPeak')
ax.set_ylabel('FullMorningPeak')
ax.set_zlabel('FullNonPeak')
ax.set_xlim(0, max(stats.FullEveningPeak))
ax.set_ylim(0, max(stats.FullMorningPeak))
ax.set_zlim(0, max(stats.FullNonPeak))
ax.view_init(elev=30, azim=230)
ax.dist=12
ax.scatter(stats.FullEveningPeak, stats.FullMorningPeak, stats.FullNonPeak,
c=full_pred, color='blue', marker='o', s=30)
plt.show()
In [ ]:
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(stats.Count, stats.NbDocks, c=stats.Priority)
plt.title("Incorrect Number of Blobs")
In [ ]:
from sklearn.cluster import MeanShift, estimate_bandwidth
###############################################################################
# Generate sample data
X = stats[['Count', 'NbDocks']].values
###############################################################################
# Compute clustering with MeanShift
# The bandwidth can be estimated automatically from the data
bandwidth = estimate_bandwidth(X, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
###############################################################################
# Plot result
from itertools import cycle
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
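The quantile=0.2 passed to estimate_bandwidth is similarly a judgment call: smaller quantiles yield a narrower bandwidth and more clusters. A quick sensitivity sketch:
In [ ]:
# How the estimated bandwidth (and the resulting cluster count) varies with the quantile
for q in (0.1, 0.2, 0.3, 0.5):
    bw = estimate_bandwidth(X, quantile=q)
    n = len(np.unique(MeanShift(bandwidth=bw, bin_seeding=True).fit(X).labels_))
    print('quantile=%.1f  bandwidth=%.1f  clusters=%d' % (q, bw, n))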
In [ ]:
stats_clustered = stats.copy()
stats_clustered['EmptyCluster'] = empty_pred
stats_clustered['FullCluster'] = full_pred
stats_clustered['ActivityCluster'] = ms.labels_
In [ ]:
stats_clustered
In [ ]: