In [1]:
%matplotlib inline

import math
import pytz
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import cPickle as pickle

In [2]:
%run src/data/helper.py

In [3]:
%run src/data/visualization.py

In [4]:
start_time = time.time()

stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
readings = pickle.load(open('data/parsed/readings_clean.p', 'rb'))

end_time = time.time()
print 'Opening redistribution data took %s seconds' % (end_time - start_time)


Opening redistribution data took 307.107703924 seconds

DayViews


In [25]:
# NAB: number of available bikes as a fraction of the station's dock capacity
readings['NAB'] = readings.NbBikes / (readings.NbBikes + readings.NbEmptyDocks)
# NAS: absolute change in bikes since the previous reading, normalised by capacity
readings['NAS'] = (readings.NbBikesTMinus1 - readings.NbBikes).apply(math.fabs) / (readings.NbBikes + readings.NbEmptyDocks)
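
A quick, hypothetical sanity check of the two measures (made-up numbers, not from the dataset): a station with 6 bikes and 14 empty docks that held 9 bikes at the previous reading gets NAB = 6/20 = 0.3 and NAS = |9 - 6|/20 = 0.15.


In [ ]:
# Hypothetical single reading, for illustration only
nb_bikes, nb_empty, nb_bikes_tminus1 = 6.0, 14.0, 9.0
print nb_bikes / (nb_bikes + nb_empty)                                # NAB -> 0.3
print math.fabs(nb_bikes_tminus1 - nb_bikes) / (nb_bikes + nb_empty)  # NAS -> 0.15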

In [26]:
dayviews = readings.query('Holiday == 0')[['NAB', 'NAS', 'TimeOfDay','Weekday']].dropna().reset_index().groupby(['Weekday', 'Id', 'TimeOfDay']).mean()

Use the NAS profiles for non-holiday weekdays to find clusters of stations with similar activity patterns


In [27]:
# Keep only the NAS time-of-day profiles for the Weekday == 1 slice
data = dayviews.unstack().loc[1].NAS

Distance Metric


In [28]:
from cdtw import cdtw_sakoe_chiba

def dtw_sakoe_chiba(a, b):
    # DTW constrained by a Sakoe-Chiba band of 12 time-of-day steps, which
    # limits how far the warping path can stray from the diagonal
    return cdtw_sakoe_chiba(a, b, 12)
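
For readers without the cdtw extension, here is a plain-Python reference sketch of the same idea; it assumes equal-length series and absolute-difference point costs, which may differ from cdtw's exact cost definition.


In [ ]:
def dtw_sakoe_chiba_ref(a, b, band=12):
    # Dynamic-programming DTW restricted to a Sakoe-Chiba band:
    # cell (i, j) is reachable only when |i - j| <= band
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(max(1, i - band), min(m, i + band) + 1):
            cost = abs(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]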

Precompute Distances


In [29]:
from scipy.spatial.distance import pdist, squareform

# Pairwise DTW distances between all station profiles, computed once in
# condensed form and expanded to a square matrix for the clustering step
dist_condensed = pdist(data.values, dtw_sakoe_chiba)
dist_matrix = squareform(dist_condensed)
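
A quick spot check that the square matrix agrees with direct metric calls (indices 0 and 1 are arbitrary picks):


In [ ]:
print np.isclose(dist_matrix[0, 1], dtw_sakoe_chiba(data.values[0], data.values[1]))
print (dist_matrix == dist_matrix.T).all()  # squareform output is symmetric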

Cluster


In [262]:
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering over the precomputed DTW distances
agg_clustering = AgglomerativeClustering(n_clusters=6, affinity='precomputed', linkage='complete')
clusters = pd.Series(agg_clustering.fit_predict(dist_matrix), index=data.index)
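
The choice of six clusters can be judged against the full merge tree; a sketch using SciPy's hierarchy tools on the same condensed distances, with the cut height read off the dendrogram by eye:


In [ ]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Complete-linkage tree over the same condensed DTW distances
Z = linkage(dist_condensed, method='complete')
dendrogram(Z, no_labels=True)
plt.show()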

Analysis


In [263]:
cluster_counts = clusters.value_counts().rename('StationsInCluster').to_frame()

In [264]:
stations_df = stations.copy().set_index('Id')
# Stations with no recorded priority are treated as the lowest priority (3)
stations_df.Priority.fillna(3, inplace=True)

clusters_df = add_station_info(clusters.to_frame('Cluster'), stations_df, use_indexes=True)
clusters_df = clusters_df.merge(cluster_counts, how='inner', left_on='Cluster', right_index=True)

In [265]:
results = clusters_df.groupby(['Priority', 'Cluster']).agg({'Name': 'count', 'StationsInCluster': 'first'})
results['Percentage'] = (results.Name / results.StationsInCluster) * 100
# Summed percentage for clusters where Priority 1 stations make up between
# 40% and just under 100% of the cluster's stations
print results.loc["1"].query('Percentage < 99.99 & Percentage > 40').Percentage.sum()
results


56.25
Out[265]:
                  StationsInCluster  Name  Percentage
Priority Cluster
3        0                      442    13    2.941176
         3                      287    25    8.710801
1        0                      442    47   10.633484
         1                       32    18   56.250000
         2                        7     7  100.000000
         3                      287     2    0.696864
         4                        2     2  100.000000
         5                        1     1  100.000000
2        0                      442   382   86.425339
         1                       32    14   43.750000
         3                      287   260   90.592334

Find Priority 1


In [33]:
data = add_station_info(clusters.to_frame('Cluster'), stations.set_index('Id'), use_indexes=True)
data.query('Priority == "1"').Cluster.value_counts()


Out[33]:
1    47
0    25
4     2
3     2
2     1
Name: Cluster, dtype: int64

In [34]:
# One cluster label per station Id
clusters_df = clusters.to_frame('Cluster')
clusters_df.Cluster.nunique()


Out[34]:
5

In [ ]:
from sklearn import preprocessing

# `statistics` is the per-station summary frame built elsewhere in the notebook;
# min-max scale every numeric feature except the coordinates and the priority
min_max_scaler = preprocessing.MinMaxScaler()
for col in statistics.columns.difference(['Latitude', 'Longitude', 'Priority']):
    std_col = '%sS' % col
    statistics[std_col] = min_max_scaler.fit_transform(statistics[col].values.reshape(-1, 1)).ravel()
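
A toy illustration of the scaling (hypothetical values): each column is mapped linearly onto [0, 1].


In [ ]:
demo = np.array([2.0, 5.0, 8.0]).reshape(-1, 1)  # hypothetical feature column
print preprocessing.MinMaxScaler().fit_transform(demo).ravel()  # [ 0.   0.5  1. ]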

In [ ]:
from sklearn.cluster import KMeans

from sklearn.cluster import KMeans

statistics = statistics.sort_values(by=['Priority'])

# (n_clusters, priority) pairs: each priority band is clustered separately
priority_clusters = [(3, 1), (2, 2), (2, 3)]
cluster_cols = ['EmptyEveningPeakS', 'EmptyMorningPeakS', 'EmptyNonPeakS',
                'FullEveningPeakS', 'FullMorningPeakS', 'FullNonPeakS',
                'CountS']

clusters = []
offset = 0
for n_clusters, priority in priority_clusters:
    window = statistics[statistics.Priority == priority][cluster_cols]
    p_clusters = KMeans(n_clusters=n_clusters).fit_predict(window.values)
    # Offset the labels so cluster ids stay unique across priority bands
    clusters.extend(p_clusters + offset)
    offset += n_clusters

# Positional assignment relies on `statistics` being sorted by Priority above
statistics['Cluster'] = clusters
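
The per-priority cluster counts above are hand-picked; one way to sanity-check them is a silhouette score over a range of k. A sketch for the Priority 1 band, assuming the integer priority coding used in this cell:


In [ ]:
from sklearn.metrics import silhouette_score

# Hypothetical model-selection check for one priority band
window = statistics[statistics.Priority == 1][cluster_cols]
for k in range(2, 6):
    labels = KMeans(n_clusters=k).fit_predict(window.values)
    print k, silhouette_score(window.values, labels)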

In [ ]:
draw_stations_map(statistics, create_cluster_marker('Cluster'))