Exploratory Data Analysis

Set up


In [1]:
%matplotlib inline

import itertools
import logging
import pickle
import folium
import calendar
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.colors as clrs
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn import preprocessing
from datetime import datetime, date, timedelta
from palettable.colorbrewer.sequential import Oranges_9

from src.data.visualization import create_london_map, draw_stations_map

sns.set_context("notebook")

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
readings = pickle.load(open("data/parsed/readings_weather_filled_dataset.p", "rb"))
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))

In [3]:
readings = readings.query('Source == "REAL"')

In [4]:
import time
from pytz import timezone

start_time = time.time()

readings['Timestamp'] = readings.Timestamp.dt.tz_convert(timezone('Europe/London'))
readings['Timestamp'] = readings.Timestamp.dt.tz_localize(None)

end_time = time.time()
print 'Modifying timestamp took %s' % (end_time - start_time)


Modifying timestamp took 0.159476041794

In [5]:
def get_day_start_end(day, hours=None):
    start = day.replace(hour=7, minute=0, second=0, microsecond=0)
    if hours is None:
        return (start, day.replace(hour=23, minute=0, second=0, microsecond=0))
    else:
        return (start, start + timedelta(hours=hours))
    
def get_full_day_range(timestamp):
    return timestamp.replace(hour=0, minute=0, second=0, microsecond=0), timestamp.replace(hour=23, minute=59, second=59, microsecond=999)

def filter_by_time(df, d1, d2):
    timestamp = df['Timestamp']
    selector = (timestamp >= d1) & (timestamp < d2)
    return df[selector]
    
def filter_by_id(df, idval):
    return df[df['Id'] == idval]

global_start = datetime(2016,5,16)
global_end = datetime(2016,6,26)

Station Priority


In [6]:
def map_priority_color(priority):
    if priority == 1:
        return '#ff1a1a', '#cc0000'
    elif priority == 2:
        return '#3333ff', '#0000cc'
    else: 
        return '#ffff1a', '#b3b300'
    
def create_priority_marker(station):
    colors = map_priority_color(station['Priority'])
    label = "%s - %s" % (station['Id'], station['Name'])

    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=50,
                        popup=label, color=colors[1], fill_color=colors[0])

priority_map = draw_stations_map(stations, create_priority_marker)

folium.Map.save(priority_map, 'reports/maps/stations_priorities.html')

Bicycle Availability Readings

Why are the readings not continuous?

We believe the readings are updated as follows:

  • The station publishes its readings only when an change in the station occurred in the last 5 minutes
  • This means most of the readings will be different than the previous one
  • When this is not the case, we suspect that an equal number of bikes left and arrived to the station, leaving the station in the same state as it was before but triggering an update to be published

In [7]:
readings2 = readings.set_index(['Id', 'Timestamp']).sort_index()

In [8]:
readings2.iloc[2:7][['NbBikes','NbDocks','NbEmptyDocks','NbUnusableDocks']]


Out[8]:
NbBikes NbDocks NbEmptyDocks NbUnusableDocks
Id Timestamp
BikePoints_1 2016-05-16 08:01:29.163000064 10.0 19.0 8.0 1.0
2016-05-16 08:11:30.432999936 8.0 19.0 10.0 1.0
2016-05-16 08:16:30.956999936 7.0 19.0 11.0 1.0
2016-05-16 08:26:32.369999872 5.0 19.0 13.0 1.0
2016-05-16 08:31:33.192999936 6.0 19.0 12.0 1.0

High vs Low Priority


In [118]:
def plot_station_readings(readings, station_id, ycols1, ycols2, d1, d2, station_name=None):
    date_range = pd.date_range(d1, d2, freq='d')
    date_pairs = [[date_range[i], date_range[i+1]] for i in range(len(date_range)-1)]

    station_readings = filter_by_id(readings, station_id)
    slices = slice_by(station_readings, 'Timestamp', date_pairs)  
    
    # set plot properties
    ncols = 3
    nrows = int(len(slices) / ncols)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols)  
    fig.suptitle('Bikes Available in station %s' % (station_name if station_name is not None else station_id))
    fig.set_size_inches(18.5, 10.5)
    fig.text(0.1, 0.5, 'Number of Available Bikes', va='center', rotation='vertical')
        
    plot_in_grid(axes, slices, ycols1, ycols2, nrows, ncols)

def slice_by(df, col_name, date_pairs):
    groups = []
    for pair in date_pairs:
        groups.append(filter_by_time(df, pair[0], pair[1]))
    return groups

import calendar

def plot_in_grid(axes, slices, ycols1, ycols2, nrows, ncols):
    i = 0    
    for row in range(nrows):
        for col in range(ncols):
            ax = axes[row,col]
                        
            # x axis set up            
            day_start, day_end = get_day_start_end(slices[i].iloc[0]['Timestamp'])
            xticks = pd.date_range(start=day_start,end=day_end, freq='2h')
            xlim = (day_start, day_end)

            # y axis set up
            ylim = (0, max(slices[i]['NbDocks']) + 5)
                        
            # set up data
            cols = ycols1 + ycols2
            cols.append('Timestamp')
            df = slices[i][cols]
            
            times =  df.Timestamp.iloc[len(df.Timestamp) / 2]            
                        
            # plot the entry                        
            sub_ax = df.plot(x='Timestamp', ax=ax, xticks=xticks,
                           legend=False, sharex=False, sharey=True,
                           xlim=xlim, ylim=ylim, secondary_y=ycols2)
                                    
            sub_ax.set_xlabel(calendar.day_name[times.weekday()])
            sub_ax.set_xticklabels(sub_ax.get_xticklabels(), rotation=90)
            sub_ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))
            i+=1

In [119]:
def plot_timeline(df, station_id, start, end):
    df = filter_by_time(filter_by_id(df, station_id), start, end)[['Timestamp']]

    df['Day'] = df['Timestamp'].apply(lambda x: x.strftime("%d/%m"))
    df['Timestamp'] = df['Timestamp'].apply(lambda x: x.replace(year=1970, month=1, day=1))
    ax = sns.stripplot(data=df, x='Timestamp', y='Day', orient='h');
    ax.set_xlim(get_day_start_end(df['Timestamp'].iloc[0]))
    ax.set_title('Readings of Station %s' % (station_id))
    
    return ax

In [143]:
ycols1 = ['NbBikes']
ycols2 = []
start = datetime(2016,5,16)
end = datetime(2016,5,23)

High Activity Stations

BikePoints_340 - Bank Of England Museum, Bank City Center


In [144]:
station_id = 'BikePoints_340'
plot_timeline(readings, station_id, start, end)


Out[144]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e7ced7690>

In [145]:
g = plot_station_readings(readings, station_id, ycols1, ycols2, start, end, 'BikePoints_340 - Bank Of England Museum, Bank')
plt.savefig('reports/images/individual-availability-center.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)


BikePoints_298 - Curlew Street, Shad Thames Outside City Center, located in Bermondsey


In [146]:
station_id = 'BikePoints_298'

In [147]:
plot_timeline(readings, station_id, start, end)


Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e9cdbcb90>

In [148]:
plot_station_readings(readings, station_id, ycols1, ycols2, start, end)
plt.savefig('reports/images/individual-availability-outside.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)



In [141]:
stations.sort_values(by=['Name']).to_csv('stations.csv')

In [18]:
distributed = distributed.reset_index()

Bicycle Redistribution


In [19]:
print len(distributed)
print len(collected)
print len(distributed.Id.unique())
print len(collected.Id.unique())


39677
43396
768
756

In [20]:
distributed['Day'] = distributed.Timestamp.apply(lambda x: x.strftime('%d %m %Y'))
distributed.groupby('Day').mean().mean()


Out[20]:
NbBikes    9.932787
dtype: float64

In [21]:
distributed = distributed.reset_index()
collected = collected.reset_index()
stations = stations.reset_index()

In [22]:
stations.Priority = stations.Priority.fillna(3).astype(int)

In [23]:
# merge with the stations dataset
stations_redistribution = stations.merge(distributed.groupby('Id').sum(), left_on='Id', how='left', right_index=True)
stations_redistribution.rename(columns = {'NbBikes':'NbBikesDist'}, inplace = True)
stations_redistribution = stations_redistribution.merge(collected.groupby('Id').sum(), left_on='Id', how='left', right_index=True)
stations_redistribution.rename(columns = {'NbBikes':'NbBikesColl'}, inplace = True)
stations_redistribution.drop(['TerminalName','PlaceType','Installed','Temporary','Locked','RemovalDate','InstallDate','ShortName'], axis=1, inplace=True)

# fill missing values with 0
stations_redistribution['NbBikesDist'].replace({0: 0.0001}, inplace=True)
stations_redistribution['NbBikesDist'].fillna(0.0001, inplace=True)
stations_redistribution['NbBikesColl'].replace({0: 0.0001}, inplace=True)
stations_redistribution['NbBikesColl'].fillna(0.0001, inplace=True)

# scale to use easily the colormap
min_max_scaler = preprocessing.MinMaxScaler()
max_abs_scaler = preprocessing.MaxAbsScaler()
stations_redistribution['NbBikesDistS'] = min_max_scaler.fit_transform(stations_redistribution['NbBikesDist'].apply(np.log).values.reshape(-1, 1))
stations_redistribution['NbBikesCollS'] = min_max_scaler.fit_transform(stations_redistribution['NbBikesColl'].apply(np.log).values.reshape(-1, 1))

In [24]:
def plot_redistribution_grid(df, start, end, ids=None):
    df = df[df['Id'].isin(ids)]
    df = filter_by_time(df, start, end).copy()
    df['Day'] = df['Timestamp'].apply(lambda x: x.strftime("%d/%m"))
    df['Timestamp'] = df['Timestamp'].apply(lambda x: x.replace(year=1970, month=1, day=1))
    
    day_start, day_end = get_full_day_range(df.iloc[0]['Timestamp'])
    g = sns.FacetGrid(df, col="Id", col_wrap=4, size=3, xlim=(day_start, day_end), sharex=True, sharey=True, col_order=ids)
    [ax.xaxis_date() for ax in g.axes]
    [ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M")) for ax in g.axes]
    g = g.map(sns.stripplot, "Timestamp", "Day", "Day", orient='h', palette=sns.xkcd_palette(["windows blue"]))
    g = g.set(xticks=pd.date_range(start=day_start,end=day_end, freq='6h'))
    
    return g

In [25]:
f, axes = plt.subplots(1, 2, figsize=(15, 5))
plt.suptitle('Histograms of the Total Number of Bicycles Distributed and Collected from Each Station')

stations_redistribution['NbBikesDist'].hist(ax=axes[0])
stations_redistribution['NbBikesColl'].hist(ax=axes[1])

axes[0].set_title('Distributed')
axes[1].set_title('Collected')

axes[0].set_ylabel('Number of Bicycles')
axes[1].set_ylabel('Number of Bicycles')

plt.savefig('reports/images/bicycles-distributed-collected.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)


Distributed


In [26]:
top_distributed = stations_redistribution.sort_values(by=['NbBikesDist'], ascending=False)
top_distributed.set_index('Id', inplace=True)
top_distributed[0:20]


Out[26]:
level_0 index_x Name Latitude Longitude Priority index_y NbBikesDist index NbBikesColl NbBikesDistS NbBikesCollS
Id
BikePoints_374 293 293 Waterloo Station 1, Waterloo 51.504027 -0.113864 1 18705933.0 13720.0 18514768.0 11324.0 1.000000 1.000000
BikePoints_361 280 280 Waterloo Station 2, Waterloo 51.503919 -0.113426 1 14328975.0 11542.0 12275968.0 7939.0 0.990774 0.980850
BikePoints_20 110 110 Drummond Street, Euston 51.527736 -0.135273 2 2207577.0 4246.0 1832787.0 2330.0 0.937403 0.914745
BikePoints_574 499 499 Eagle Wharf Road, Hoxton 51.533560 -0.093150 2 8910450.0 3848.0 999891.0 211.0 0.932150 0.785235
BikePoints_154 61 61 Waterloo Station 3, Waterloo 51.503791 -0.112824 1 937332.0 3778.0 827816.0 2568.0 0.931170 0.919989
BikePoints_436 352 352 Red Lion Street, Holborn 51.518240 -0.116550 1 7492014.0 3390.0 10821538.0 3237.0 0.925387 0.932474
BikePoints_25 163 163 Doric Way, Somers Town 51.528833 -0.132250 2 2800477.0 3216.0 2281365.0 1780.0 0.922575 0.900226
BikePoints_45 365 365 Boston Place, Marylebone 51.522511 -0.162298 2 5843275.0 3041.0 4306647.0 1433.0 0.919588 0.888533
BikePoints_427 344 344 Cheapside, Bank 51.513970 -0.092940 1 5993520.0 3020.0 7929618.0 3263.0 0.919219 0.932905
BikePoints_272 187 187 Baylis Road, Waterloo 51.501444 -0.110699 1 3055325.0 2708.0 4883148.0 3276.0 0.913399 0.933119
BikePoints_214 125 125 Endsleigh Gardens, Euston 51.526838 -0.130504 2 1568352.0 2591.0 2324336.0 2676.0 0.911042 0.922211
BikePoints_14 45 45 Belgrove Street, Kings Cross 51.529943 -0.123616 1 574668.0 2520.0 770385.0 2523.0 0.909559 0.919036
BikePoints_17 78 78 Hatton Wall, Holborn 51.521661 -0.109006 1 1179917.0 2476.0 1872339.0 2803.0 0.908619 0.924711
BikePoints_579 503 503 Queen Street 2, Bank 51.511246 -0.093051 1 6603190.0 2259.0 15745827.0 5269.0 0.903723 0.958745
BikePoints_108 10 10 Abbey Orchard Street, Westminster 51.498125 -0.132102 1 123933.0 2253.0 273245.0 3975.0 0.903581 0.943548
BikePoints_326 245 245 Graham Street, Angel 51.532661 -0.099981 2 2505522.0 2149.0 482220.0 182.0 0.901059 0.777262
BikePoints_273 188 188 Belvedere Road, South Bank 51.506133 -0.114686 2 2151435.0 2134.0 3899987.0 3072.0 0.900685 0.929653
BikePoints_66 591 591 Holborn Circus, Holborn 51.517950 -0.108657 1 6573965.0 2082.0 11224161.0 3974.0 0.899369 0.943535
BikePoints_253 167 167 Shoreditch Park, Hoxton 51.534042 -0.086379 2 1768921.0 2080.0 253606.0 96.0 0.899317 0.742770
BikePoints_93 772 772 Cloudesley Road, Angel 51.534408 -0.109025 2 7418645.0 2067.0 814150.0 191.0 0.898983 0.779865

In [35]:
from palettable.colorbrewer.diverging import RdYlBu_10

def cmap_to_hex(cmap, value):
    rgb = cmap(value)[:3]
    return clrs.rgb2hex(rgb)

def create_redistribution_marker(col_name):
    def create_marker(station):
        line_color = map_priority_color(station['Priority'])[1]        

        fill_color = cmap_to_hex(plt.get_cmap('seismic_r'), station[col_name])
        
        label = "%s - %s" % (station.name, station['Name'])

        return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=100,
                        popup=label, color=fill_color, fill_color=fill_color, fill_opacity=0.9)
    
    return create_marker

In [36]:
distributed_map = draw_stations_map(top_distributed, create_redistribution_marker('NbBikesDistS'))
folium.TileLayer('stamentoner').add_to(distributed_map)
folium.Map.save(distributed_map, 'reports/maps/distributed_map.html')
distributed_map


Out[36]:

In [37]:
plot_redistribution_grid(distributed, datetime(2016,5,16), datetime(2016,6,10), ids=top_distributed.reset_index().Id[0:12])
plt.savefig('reports/images/bicycles-distributed.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)


Collected


In [38]:
top_collected = stations_redistribution.sort_values(by=['NbBikesColl'], ascending=False)
top_collected.set_index('Id', inplace=True)
top_collected[0:20]


Out[38]:
level_0 index_x Name Latitude Longitude Priority index_y NbBikesDist index NbBikesColl NbBikesDistS NbBikesCollS
Id
BikePoints_374 293 293 Waterloo Station 1, Waterloo 51.504027 -0.113864 1 18705933.0 13720.0 18514768.0 11324.0 1.000000 1.000000
BikePoints_194 104 104 Hop Exchange, The Borough 51.504627 -0.091773 1 52756.0 83.0 5091275.0 8859.0 0.727396 0.986762
BikePoints_361 280 280 Waterloo Station 2, Waterloo 51.503919 -0.113426 1 14328975.0 11542.0 12275968.0 7939.0 0.990774 0.980850
BikePoints_228 140 140 St. Jamess Square, St. Jamess 51.507424 -0.134621 1 924165.0 1081.0 5214238.0 5495.0 0.864387 0.961009
BikePoints_579 503 503 Queen Street 2, Bank 51.511246 -0.093051 1 6603190.0 2259.0 15745827.0 5269.0 0.903723 0.958745
BikePoints_251 165 165 Brushfield Street, Liverpool Street 51.518908 -0.079249 1 1212462.0 883.0 6043235.0 4760.0 0.853590 0.953266
BikePoints_64 569 569 William Iv Street, Strand 51.509462 -0.124749 1 1032848.0 219.0 16651180.0 4634.0 0.779178 0.951820
BikePoints_278 193 193 Tooley Street, Bermondsey 51.503493 -0.079620 1 462910.0 223.0 7713012.0 4136.0 0.780144 0.945689
BikePoints_108 10 10 Abbey Orchard Street, Westminster 51.498125 -0.132102 1 123933.0 2253.0 273245.0 3975.0 0.903581 0.943548
BikePoints_66 591 591 Holborn Circus, Holborn 51.517950 -0.108657 1 6573965.0 2082.0 11224161.0 3974.0 0.899369 0.943535
BikePoints_428 345 345 Exhibition Road, Knightsbridge 51.499917 -0.174554 1 4317495.0 1505.0 11100729.0 3633.0 0.882048 0.938697
BikePoints_341 262 262 Craven Street, Strand 51.508103 -0.126021 1 2901730.0 1664.0 7903373.0 3552.0 0.887408 0.937481
BikePoints_336 256 256 Concert Hall Approach 2, South Bank 51.504942 -0.115533 1 3484215.0 1895.0 7683984.0 3472.0 0.894346 0.936253
BikePoints_703 640 640 St. Bride Street, Holborn 51.515059 -0.105344 2 2003721.0 502.0 10246217.0 3453.0 0.823450 0.935957
BikePoints_193 103 103 Bankside Mix, Bankside 51.505817 -0.100186 1 269370.0 505.0 2100735.0 3435.0 0.823768 0.935675
BikePoints_71 647 647 Newgate Street, St. Pauls 51.515418 -0.098850 1 2653125.0 792.0 10192427.0 3313.0 0.847785 0.933725
BikePoints_335 255 255 Tavistock Street, Covent Garden 51.511968 -0.120718 1 823075.0 403.0 6767800.0 3301.0 0.811726 0.933529
BikePoints_272 187 187 Baylis Road, Waterloo 51.501444 -0.110699 1 3055325.0 2708.0 4883148.0 3276.0 0.913399 0.933119
BikePoints_427 344 344 Cheapside, Bank 51.513970 -0.092940 1 5993520.0 3020.0 7929618.0 3263.0 0.919219 0.932905
BikePoints_109 11 11 Soho Square, Soho 51.515631 -0.132328 1 27183.0 406.0 259249.0 3238.0 0.812122 0.932490

In [39]:
collected_map = draw_stations_map(top_collected, create_redistribution_marker('NbBikesCollS'))
folium.TileLayer('stamentoner').add_to(collected_map)
folium.Map.save(collected_map, 'reports/maps/collected_map.html')
collected_map


Out[39]:

In [32]:
plot_redistribution_grid(collected, datetime(2016,5,16), datetime(2016,6,6), ids=top_collected.reset_index().Id[0:12])
plt.savefig('reports/images/bicycles-collected.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)


Clusters


In [83]:
%run src/data/periods.py

In [84]:
%run src/data/helper.py

In [87]:
readings2 = readings[readings.Source == 'REAL']
readings2 = readings2.sort_values(by=['Id', 'Timestamp'], ascending=True)

In [88]:
empty_entries = find_zero_periods(readings2, 'NbBikes')
empty_groups = get_ellapsed_time(empty_entries, by='GroupId').sort_values(by=['Ellapsed'], ascending=False)
full_entries = find_zero_periods(readings2, 'NbEmptyDocks')
full_groups = get_ellapsed_time(full_entries, by='GroupId').sort_values(by=['Ellapsed'], ascending=False)

In [89]:
empty_all = empty_groups.groupby('Id').sum()
empty_all.columns=['EmptyTotalAll']

full_all = full_groups.groupby('Id').sum()
full_all.columns=['FullTotalAll']

In [90]:
invalid_threshold = 720
invalid_group_ids = empty_groups[empty_groups.Ellapsed >= invalid_threshold].GroupId
empty_entries = empty_entries[~empty_entries.GroupId.isin(invalid_group_ids)]
invalid_group_ids = full_groups[full_groups.Ellapsed >= invalid_threshold].GroupId
full_entries = full_entries[~full_entries.GroupId.isin(invalid_group_ids)]

In [91]:
empty_periods = get_ellapsed_time(empty_entries, by='PeriodId')
empty_periods = add_station_info(empty_periods, stations, ['Priority', 'Id'])
empty_periods['Day'] = empty_periods['Period'].apply(lambda x: get_period_day(x))
empty_periods['PeakHours'] = empty_periods['Period'].apply(lambda x: is_peaktime(x)[1])

full_periods = get_ellapsed_time(full_entries, by='PeriodId')
full_periods = add_station_info(full_periods, stations, ['Priority', 'Id'])
full_periods['Day'] = full_periods['Period'].apply(lambda x: get_period_day(x))
full_periods['PeakHours'] = full_periods['Period'].apply(lambda x: is_peaktime(x)[1])

In [92]:
empty_periods_df = empty_periods.groupby(['Id', 'PeakHours']).sum().unstack()
empty_periods_df.columns = empty_periods_df.columns.droplevel()
empty_periods_df.columns.name = None
empty_periods_df.columns = ['EmptyEveningPeak', 'EmptyMorningPeak', 'EmptyNonPeak']
empty_periods_df['EmptyTotal'] = empty_periods_df.EmptyEveningPeak + empty_periods_df.EmptyMorningPeak + empty_periods_df.EmptyNonPeak

In [93]:
full_periods_df = full_periods.groupby(['Id', 'PeakHours']).sum().unstack()
full_periods_df.columns = full_periods_df.columns.droplevel()
full_periods_df.columns.name = None
full_periods_df.columns = ['FullEveningPeak', 'FullMorningPeak', 'FullNonPeak']
full_periods_df['FullTotal'] = full_periods_df.FullEveningPeak + full_periods_df.FullMorningPeak + full_periods_df.FullNonPeak

In [94]:
periods = empty_periods_df.merge(full_periods_df, right_index=True, left_index=True)

In [95]:
readings2['WeightedNbBikesStd'] = readings.NbBikes / readings.NbDocks
weighted_std = readings2.groupby('Id').std()

In [97]:
stats = readings2.groupby('Id').agg({'Timestamp': 'count', 'NbDocks': 'max'})
stats.columns = ['Count', 'NbDocks']

stats = stats.merge(periods, how='left', right_index=True, left_index=True).fillna(0)
stats = stats.merge(top_collected[['NbBikesColl']], how='left', right_index=True, left_index=True).fillna(0)
stats = stats.merge(top_distributed[['NbBikesDist']], how='left', right_index=True, left_index=True).fillna(0)
stats = stats.merge(weighted_std[['WeightedNbBikesStd']], how='left', right_index=True, left_index=True).fillna(0)

stats = add_station_info(stats, stations.set_index(stations.Id), use_indexes=True).drop(['Id', 'Name'], axis=1)
stats.Priority = stats.Priority.fillna(3).astype('int8')
stats


Out[97]:
Count NbDocks EmptyEveningPeak EmptyMorningPeak EmptyNonPeak EmptyTotal FullEveningPeak FullMorningPeak FullNonPeak FullTotal NbBikesColl NbBikesDist WeightedNbBikesStd Latitude Longitude Priority
Id
BikePoints_1 1348 19.0 716.0 1289.0 3600.0 5605.0 118.0 5.0 977.0 1100.0 9.0 1121.0 0.249440 51.529163 -0.109970 2
BikePoints_10 1750 18.0 455.0 457.0 4055.0 4967.0 167.0 0.0 797.0 0.0 341.0 125.0 0.276474 51.505974 -0.092754 2
BikePoints_100 1956 24.0 290.0 36.0 910.0 1236.0 26.0 437.0 645.0 1108.0 257.0 450.0 0.240004 51.490435 -0.122806 2
BikePoints_101 3244 22.0 788.0 878.0 15559.0 17225.0 183.0 890.0 1035.0 2108.0 0.0 0.0 0.296015 51.511553 -0.092940 1
BikePoints_102 1712 17.0 1761.0 897.0 16005.0 18663.0 270.0 729.0 3105.0 4104.0 351.0 150.0 0.334019 51.513406 -0.076793 2
BikePoints_103 1340 18.0 753.0 1721.0 4788.0 7262.0 5.0 135.0 105.0 245.0 26.0 516.0 0.237761 51.504723 -0.192538 2
BikePoints_104 3014 34.0 155.0 1119.0 4281.0 5555.0 273.0 66.0 732.0 1071.0 2119.0 201.0 0.290370 51.511594 -0.077121 1
BikePoints_105 2168 26.0 694.0 166.0 1535.0 2395.0 17.0 440.0 3265.0 3722.0 78.0 340.0 0.273838 51.515529 -0.190240 2
BikePoints_106 1510 21.0 751.0 2028.0 19186.0 21965.0 130.0 21.0 1153.0 1304.0 142.0 196.0 0.296680 51.514105 -0.147301 2
BikePoints_107 3023 20.0 437.0 1397.0 2458.0 4292.0 381.0 45.0 568.0 994.0 284.0 627.0 0.273600 51.526008 -0.096317 2
BikePoints_108 2419 29.0 614.0 94.0 5342.0 6050.0 236.0 441.0 1556.0 2233.0 3975.0 2253.0 0.279009 51.498125 -0.132102 1
BikePoints_109 3576 57.0 74.0 485.0 10722.0 11281.0 132.0 101.0 1587.0 1820.0 3238.0 406.0 0.305477 51.515631 -0.132328 1
BikePoints_11 2085 24.0 821.0 1056.0 9174.0 11051.0 50.0 0.0 176.0 0.0 413.0 281.0 0.247016 51.523951 -0.122502 2
BikePoints_110 1592 17.0 591.0 1140.0 5432.0 7163.0 89.0 68.0 1196.0 1353.0 9.0 594.0 0.265295 51.533043 -0.172528 2
BikePoints_111 1540 28.0 2274.0 1435.0 12866.0 16575.0 25.0 0.0 120.0 0.0 153.0 386.0 0.240716 51.510017 -0.157275 2
BikePoints_112 1455 42.0 352.0 519.0 11432.0 12303.0 16.0 888.0 1335.0 2239.0 1567.0 224.0 0.352896 51.515809 -0.105270 2
BikePoints_113 1922 19.0 274.0 421.0 2049.0 2744.0 642.0 25.0 2030.0 2697.0 361.0 159.0 0.273892 51.496462 -0.183289 2
BikePoints_114 2724 22.0 462.0 753.0 4172.0 5387.0 220.0 38.0 1688.0 1946.0 0.0 0.0 0.277506 51.524517 -0.158963 2
BikePoints_115 2734 34.0 942.0 593.0 13097.0 14632.0 18.0 11.0 577.0 606.0 157.0 285.0 0.268528 51.514233 -0.073537 2
BikePoints_116 3336 21.0 925.0 1048.0 17305.0 19278.0 236.0 525.0 2317.0 3078.0 2471.0 760.0 0.307579 51.514499 -0.141423 1
BikePoints_117 1954 25.0 90.0 581.0 1586.0 2257.0 138.0 0.0 4116.0 0.0 172.0 1159.0 0.250183 51.492880 -0.114934 2
BikePoints_118 1574 13.0 761.0 574.0 7597.0 8932.0 182.0 937.0 2151.0 3270.0 970.0 532.0 0.279275 51.495827 -0.135478 2
BikePoints_119 2895 18.0 359.0 949.0 7775.0 9083.0 253.0 52.0 768.0 1073.0 402.0 254.0 0.258200 51.525893 -0.090847 2
BikePoints_12 3183 49.0 193.0 212.0 10122.0 10527.0 0.0 0.0 275.0 0.0 1892.0 1434.0 0.274449 51.521680 -0.130431 1
BikePoints_120 1463 17.0 1696.0 675.0 18388.0 20759.0 0.0 1815.0 1422.0 0.0 1099.0 1063.0 0.305374 51.515735 -0.093080 2
BikePoints_121 1664 22.0 2753.0 1019.0 17616.0 21388.0 0.0 15.0 15.0 0.0 71.0 325.0 0.209431 51.518913 -0.156166 2
BikePoints_122 2393 23.0 984.0 609.0 18666.0 20259.0 33.0 470.0 888.0 1391.0 819.0 188.0 0.304171 51.521113 -0.078869 2
BikePoints_123 1757 18.0 604.0 647.0 1784.0 3035.0 25.0 0.0 635.0 0.0 89.0 829.0 0.230401 51.528360 -0.104724 2
BikePoints_124 1650 20.0 196.0 147.0 1634.0 1977.0 296.0 109.0 797.0 1202.0 202.0 259.0 0.250577 51.496544 -0.150905 2
BikePoints_125 2093 21.0 73.0 45.0 224.0 342.0 125.0 369.0 1106.0 1600.0 176.0 220.0 0.242163 51.500694 -0.094524 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
BikePoints_805 1 27.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18.0 0.000000 51.520069 -0.206338 3
BikePoints_807 93 24.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 26.0 0.270029 51.521200 -0.208888 3
BikePoints_808 773 33.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 157.0 33.0 0.167695 51.473486 -0.122555 3
BikePoints_809 1 27.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 51.516277 -0.118272 3
BikePoints_81 2373 19.0 1217.0 1620.0 11289.0 14126.0 0.0 364.0 2161.0 0.0 287.0 98.0 0.319367 51.520253 -0.141327 2
BikePoints_810 1352 30.0 271.0 245.0 3316.0 3832.0 50.0 139.0 868.0 1057.0 89.0 31.0 0.282491 51.506725 -0.098807 3
BikePoints_811 12 36.0 0.0 0.0 40.0 0.0 0.0 0.0 20.0 0.0 0.0 15.0 0.014848 51.505703 -0.027772 3
BikePoints_814 698 30.0 0.0 0.0 25.0 0.0 94.0 32.0 3646.0 3772.0 0.0 0.0 0.184637 51.471433 -0.123670 3
BikePoints_815 2269 35.0 30.0 0.0 361.0 0.0 460.0 458.0 2339.0 3257.0 380.0 44.0 0.232019 51.500089 -0.116628 3
BikePoints_817 8 26.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18.0 0.018767 51.481335 -0.138212 3
BikePoints_818 1551 13.0 96.0 85.0 469.0 650.0 1911.0 2703.0 15328.0 19942.0 0.0 0.0 0.245217 51.503127 -0.078655 3
BikePoints_82 1585 15.0 1421.0 938.0 17094.0 19453.0 32.0 804.0 2282.0 3118.0 800.0 939.0 0.297723 51.514274 -0.111257 2
BikePoints_83 2487 21.0 119.0 2607.0 12611.0 15337.0 1212.0 67.0 3843.0 5122.0 993.0 112.0 0.309105 51.509639 -0.131510 2
BikePoints_84 1453 25.0 1120.0 217.0 15599.0 16936.0 16.0 341.0 1280.0 1637.0 619.0 576.0 0.306582 51.515937 -0.111778 2
BikePoints_85 2533 41.0 45.0 0.0 15.0 0.0 149.0 140.0 2273.0 2562.0 365.0 196.0 0.228608 51.500647 -0.078600 2
BikePoints_86 876 24.0 146.0 48.0 652.0 846.0 72.0 0.0 2594.0 0.0 162.0 625.0 0.271581 51.489479 -0.115156 2
BikePoints_87 2189 16.0 994.0 553.0 16582.0 18129.0 68.0 267.0 740.0 1075.0 253.0 249.0 0.284451 51.516468 -0.079684 2
BikePoints_88 2444 25.0 763.0 624.0 15300.0 16687.0 14.0 583.0 2133.0 2730.0 596.0 262.0 0.328718 51.518587 -0.132053 2
BikePoints_89 2139 19.0 466.0 944.0 3188.0 4598.0 52.0 73.0 842.0 967.0 268.0 858.0 0.256537 51.526250 -0.123509 2
BikePoints_9 2244 19.0 655.0 245.0 3585.0 4485.0 110.0 541.0 1724.0 2375.0 750.0 601.0 0.310481 51.507385 -0.096440 2
BikePoints_90 744 27.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.199771 51.533019 -0.139174 2
BikePoints_91 1546 20.0 0.0 991.0 2201.0 0.0 312.0 59.0 5015.0 5386.0 192.0 1058.0 0.276452 51.493686 -0.111014 2
BikePoints_92 1432 41.0 5.0 0.0 0.0 0.0 39.0 42.0 669.0 750.0 177.0 322.0 0.214318 51.498898 -0.100440 2
BikePoints_93 1771 37.0 20.0 964.0 1577.0 2561.0 0.0 97.0 264.0 0.0 191.0 2067.0 0.238161 51.534408 -0.109025 2
BikePoints_94 2333 28.0 44.0 738.0 2281.0 3063.0 201.0 462.0 8768.0 9431.0 347.0 1044.0 0.297407 51.495061 -0.085814 2
BikePoints_95 3406 15.0 923.0 496.0 10585.0 12004.0 138.0 438.0 801.0 1377.0 2244.0 1204.0 0.265512 51.520841 -0.097340 1
BikePoints_96 1914 26.0 144.0 206.0 774.0 1124.0 214.0 0.0 1560.0 0.0 123.0 532.0 0.254120 51.530950 -0.078505 2
BikePoints_97 2557 18.0 115.0 984.0 4488.0 5587.0 91.0 112.0 1328.0 1531.0 318.0 129.0 0.258679 51.497924 -0.183834 2
BikePoints_98 2515 54.0 204.0 128.0 2519.0 2851.0 0.0 0.0 5.0 0.0 364.0 2045.0 0.204667 51.525542 -0.138231 2
BikePoints_99 2380 16.0 1498.0 770.0 11663.0 13931.0 192.0 229.0 638.0 1059.0 222.0 156.0 0.284076 51.514577 -0.158264 2

780 rows × 16 columns


In [101]:
stats.sort_values(by=['Count'], ascending=False)


Out[101]:
Count NbDocks EmptyEveningPeak EmptyMorningPeak EmptyNonPeak EmptyTotal FullEveningPeak FullMorningPeak FullNonPeak FullTotal NbBikesColl NbBikesDist WeightedNbBikesStd Latitude Longitude Priority
Id
BikePoints_14 4750 48.0 30.0 632.0 892.0 1554.0 99.0 9.0 1466.0 1574.0 2523.0 2520.0 0.237829 51.529943 -0.123616 1
BikePoints_194 4585 56.0 0.0 41.0 1976.0 0.0 371.0 53.0 1359.0 1783.0 8859.0 83.0 0.269856 51.504627 -0.091773 1
BikePoints_132 4203 38.0 79.0 595.0 4795.0 5469.0 228.0 0.0 467.0 0.0 1149.0 963.0 0.270609 51.523648 -0.074754 1
BikePoints_191 4194 28.0 977.0 257.0 9278.0 10512.0 336.0 312.0 999.0 1647.0 462.0 414.0 0.248440 51.503117 -0.153520 1
BikePoints_39 4191 41.0 75.0 654.0 3949.0 4678.0 0.0 0.0 15.0 0.0 1092.0 864.0 0.245162 51.526377 -0.078130 1
BikePoints_251 4183 34.0 460.0 10.0 11606.0 12076.0 15.0 436.0 680.0 1131.0 4760.0 883.0 0.289847 51.518908 -0.079249 1
BikePoints_341 4091 23.0 75.0 525.0 4373.0 4973.0 1042.0 20.0 2401.0 3463.0 3552.0 1664.0 0.281550 51.508103 -0.126021 1
BikePoints_553 4064 24.0 300.0 1388.0 2368.0 4056.0 525.0 72.0 6403.0 7000.0 923.0 1589.0 0.298380 51.535678 -0.062546 1
BikePoints_73 4012 37.0 314.0 332.0 8377.0 9023.0 46.0 0.0 100.0 0.0 1396.0 251.0 0.230097 51.525726 -0.088486 1
BikePoints_307 3989 24.0 202.0 679.0 2846.0 3727.0 737.0 112.0 7573.0 8422.0 134.0 1088.0 0.290867 51.509908 -0.187842 1
BikePoints_374 3875 36.0 55.0 852.0 208.0 1115.0 1233.0 255.0 3486.0 4974.0 11324.0 13720.0 0.287078 51.504027 -0.113864 1
BikePoints_356 3759 15.0 613.0 278.0 10219.0 11110.0 152.0 252.0 1260.0 1664.0 1479.0 466.0 0.270791 51.494412 -0.173881 1
BikePoints_213 3751 36.0 517.0 311.0 7533.0 8361.0 117.0 20.0 764.0 901.0 599.0 1363.0 0.277135 51.502740 -0.149569 1
BikePoints_407 3741 20.0 1344.0 727.0 8641.0 10712.0 76.0 322.0 1968.0 2366.0 181.0 428.0 0.219490 51.512303 -0.159988 1
BikePoints_303 3733 28.0 597.0 364.0 6533.0 7494.0 555.0 687.0 3361.0 4603.0 748.0 177.0 0.294630 51.502953 -0.158456 1
BikePoints_217 3729 16.0 489.0 725.0 12627.0 13841.0 718.0 510.0 875.0 2103.0 2530.0 346.0 0.283340 51.516154 -0.082422 1
BikePoints_732 3699 21.0 147.0 508.0 6884.0 7539.0 744.0 268.0 1492.0 2504.0 823.0 119.0 0.279776 51.506304 -0.087262 2
BikePoints_229 3671 24.0 251.0 207.0 7848.0 8306.0 110.0 46.0 685.0 841.0 1942.0 624.0 0.286519 51.506543 -0.123179 1
BikePoints_64 3621 26.0 22.0 711.0 7385.0 8118.0 270.0 53.0 1924.0 2247.0 4634.0 219.0 0.279594 51.509462 -0.124749 1
BikePoints_71 3617 34.0 164.0 153.0 12660.0 12977.0 45.0 15.0 310.0 370.0 3313.0 792.0 0.260924 51.515418 -0.098850 1
BikePoints_541 3599 28.0 505.0 566.0 16361.0 17432.0 177.0 623.0 858.0 1658.0 2602.0 957.0 0.288411 51.506613 -0.142844 1
BikePoints_225 3584 30.0 637.0 726.0 7561.0 8924.0 0.0 18.0 72.0 0.0 162.0 1125.0 0.224723 51.509353 -0.196422 1
BikePoints_278 3583 17.0 983.0 82.0 4026.0 5091.0 294.0 482.0 1449.0 2225.0 4136.0 223.0 0.273558 51.503493 -0.079620 1
BikePoints_109 3576 57.0 74.0 485.0 10722.0 11281.0 132.0 101.0 1587.0 1820.0 3238.0 406.0 0.305477 51.515631 -0.132328 1
BikePoints_74 3558 17.0 199.0 143.0 577.0 919.0 458.0 509.0 4009.0 4976.0 69.0 124.0 0.254167 51.485917 -0.124469 2
BikePoints_270 3555 31.0 0.0 291.0 233.0 0.0 641.0 460.0 4223.0 5324.0 1798.0 1007.0 0.268686 51.486343 -0.122492 1
BikePoints_428 3543 20.0 1401.0 317.0 14290.0 16008.0 43.0 364.0 1844.0 2251.0 3633.0 1505.0 0.288183 51.499917 -0.174554 1
BikePoints_130 3534 24.0 271.0 644.0 9295.0 10210.0 236.0 44.0 1328.0 1608.0 1601.0 177.0 0.280330 51.509506 -0.075459 1
BikePoints_695 3519 21.0 56.0 1670.0 1550.0 3276.0 1089.0 27.0 3597.0 4713.0 434.0 1384.0 0.311042 51.536384 -0.102757 2
BikePoints_737 3479 26.0 52.0 71.0 219.0 342.0 810.0 521.0 4250.0 5581.0 226.0 187.0 0.246781 51.479932 -0.194116 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
BikePoints_586 563 31.0 92.0 0.0 162.0 0.0 373.0 235.0 1943.0 2551.0 78.0 80.0 0.273803 51.490645 -0.014582 2
BikePoints_704 551 18.0 739.0 607.0 3052.0 4398.0 93.0 43.0 1140.0 1276.0 14.0 161.0 0.263221 51.456820 -0.202802 2
BikePoints_629 541 28.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 150.0 77.0 0.136196 51.459953 -0.190184 2
BikePoints_777 530 21.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.173024 51.461923 -0.165297 2
BikePoints_689 523 30.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.202046 51.459225 -0.180884 2
BikePoints_678 521 30.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 70.0 35.0 0.151384 51.462312 -0.211468 2
BikePoints_476 514 29.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 171.0 10.0 0.182647 51.489096 -0.009205 2
BikePoints_519 514 33.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 65.0 0.0 0.125163 51.518811 -0.011662 2
BikePoints_637 483 25.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30.0 60.0 0.260802 51.457870 -0.174691 2
BikePoints_8 481 18.0 292.0 143.0 804.0 1239.0 60.0 0.0 0.0 0.0 43.0 227.0 0.208989 51.528341 -0.170134 2
BikePoints_796 481 29.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 45.0 185.0 0.132927 51.524000 -0.126409 3
BikePoints_659 472 57.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 360.0 130.0 0.135441 51.464370 -0.174619 2
BikePoints_504 450 30.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.129877 51.496454 -0.009506 2
BikePoints_473 439 19.0 115.0 0.0 81.0 0.0 633.0 1147.0 3454.0 5234.0 65.0 19.0 0.279441 51.496137 -0.019355 2
BikePoints_554 439 16.0 418.0 8.0 1394.0 1820.0 0.0 10.0 0.0 0.0 7.0 36.0 0.253525 51.513548 -0.005659 2
BikePoints_482 356 26.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 66.0 25.0 0.257298 51.509843 -0.023770 2
BikePoints_752 332 29.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 221.0 0.161063 51.458164 -0.206002 2
BikePoints_474 326 39.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0 71.0 0.193888 51.498125 -0.011457 2
BikePoints_672 317 28.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 441.0 36.0 0.196234 51.464688 -0.173656 2
BikePoints_608 289 29.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 50.0 36.0 0.151252 51.491093 -0.216493 2
BikePoints_555 214 56.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 352.0 46.0 0.157360 51.509424 -0.219712 3
BikePoints_237 157 30.0 25.0 28.0 558.0 611.0 0.0 15.0 0.0 0.0 170.0 108.0 0.239742 51.509786 -0.068161 3
BikePoints_494 154 36.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 88.0 44.0 0.101297 51.501960 -0.016251 2
BikePoints_497 99 27.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.190349 51.526177 -0.027467 3
BikePoints_807 93 24.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 26.0 0.270029 51.521200 -0.208888 3
BikePoints_811 12 36.0 0.0 0.0 40.0 0.0 0.0 0.0 20.0 0.0 0.0 15.0 0.014848 51.505703 -0.027772 3
BikePoints_817 8 26.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18.0 0.018767 51.481335 -0.138212 3
BikePoints_791 7 0.0 180.0 0.0 361.0 0.0 180.0 0.0 361.0 0.0 0.0 0.0 0.000000 NaN NaN 3
BikePoints_805 1 27.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18.0 0.000000 51.520069 -0.206338 3
BikePoints_809 1 27.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 51.516277 -0.118272 3

780 rows × 16 columns


In [ ]:
pickle.dump(stats, open("data/parsed/stations_statistics.p", "wb"))

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

empty_pred = KMeans(n_clusters=3).fit_predict(stats[['EmptyEveningPeak', 'EmptyMorningPeak', 'EmptyNonPeak']].values)
full_pred = KMeans(n_clusters=3).fit_predict(stats[['FullEveningPeak', 'FullMorningPeak', 'FullNonPeak']].values)

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=plt.figaspect(0.5))

###############################################################################
# Empty Minutes Clusters

ax = fig.add_subplot(1, 2, 1, projection='3d')

ax.set_title('Empty Minutes Clusters')
ax.set_xlabel('EmptyEveningPeak')
ax.set_ylabel('EmptyMorningPeak')
ax.set_zlabel('EmptyNonPeak')

ax.set_xlim(0, max(stats.EmptyEveningPeak))
ax.set_ylim(0, max(stats.EmptyMorningPeak))
ax.set_zlim(0, max(stats.EmptyNonPeak))

ax.view_init(elev=30, azim=230)
ax.dist=12                  

ax.scatter(stats.EmptyEveningPeak, stats.EmptyMorningPeak, stats.EmptyNonPeak,
           c=empty_pred, color='red', marker='o', s=30)

###############################################################################
# Full Minutes Clusters

ax = fig.add_subplot(1, 2, 2, projection='3d')

ax.set_title('Full Minutes Clusters')
ax.set_xlabel('FullEveningPeak')
ax.set_ylabel('FullMorningPeak')
ax.set_zlabel('FullNonPeak')

ax.set_xlim(0, max(stats.FullEveningPeak))
ax.set_ylim(0, max(stats.FullMorningPeak))
ax.set_zlim(0, max(stats.FullNonPeak))

ax.view_init(elev=30, azim=230)
ax.dist=12                  

ax.scatter(stats.FullEveningPeak, stats.FullMorningPeak, stats.FullNonPeak,
           c=full_pred, color='blue', marker='o', s=30)

plt.show()

In [ ]:
plt.figure(figsize=(12, 12))

plt.subplot(221)
plt.scatter(stats.Count, stats.NbDocks, c=stats.Priority)
plt.title("Incorrect Number of Blobs")

In [ ]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets.samples_generator import make_blobs

###############################################################################
# Generate sample data
X = stats[['Count', 'NbDocks']].values

###############################################################################
# Compute clustering with MeanShift

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

###############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [ ]:
chido = stats.copy()
chido['EmptyCluster'] = empty_pred
chido['FullCluster'] = full_pred
chido['ActivityCluster'] = ms.labels_

In [ ]:
chido

In [ ]: