In [1]:
%matplotlib inline

import math
import pytz
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import cPickle as pickle

In [2]:
%run src/data/helper.py

In [3]:
%run src/data/visualization.py

In [4]:
start_time = time.time()

stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
readings = pickle.load(open('data/parsed/readings_clean.p', 'rb'))

end_time = time.time()
print 'Opening redistribution data took %s seconds' % (end_time - start_time)


Opening redistribution data took 307.107703924 seconds

DayViews


In [25]:
# NAB: number of available bikes as a fraction of the station's dock capacity
readings['NAB'] = readings.NbBikes / (readings.NbBikes + readings.NbEmptyDocks)
# NAS: absolute change in bikes since the previous reading, normalised by capacity
readings['NAS'] = (readings.NbBikesTMinus1 - readings.NbBikes).apply(math.fabs) / (readings.NbBikes + readings.NbEmptyDocks)
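
A quick, hypothetical sanity check of the two measures (made-up numbers, not from the dataset): a station with 6 bikes and 14 empty docks that held 9 bikes at the previous reading gets NAB = 6/20 = 0.3 and NAS = |9 - 6|/20 = 0.15.


In [ ]:
# Hypothetical single reading, for illustration only
nb_bikes, nb_empty, nb_bikes_tminus1 = 6.0, 14.0, 9.0
print nb_bikes / (nb_bikes + nb_empty)                                # NAB -> 0.3
print math.fabs(nb_bikes_tminus1 - nb_bikes) / (nb_bikes + nb_empty)  # NAS -> 0.15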

In [26]:
dayviews = readings.query('Holiday == 0')[['NAB', 'NAS', 'TimeOfDay','Weekday']].dropna().reset_index().groupby(['Weekday', 'Id', 'TimeOfDay']).mean()

Use the NAS profiles for non-holiday weekdays to find clusters of stations with similar activity patterns


In [27]:
# Keep only the NAS time-of-day profiles for the Weekday == 1 slice
data = dayviews.unstack().loc[1].NAS

Distance Metric


In [28]:
from cdtw import cdtw_sakoe_chiba

def dtw_sakoe_chiba(a, b):
    # DTW constrained by a Sakoe-Chiba band of 12 time-of-day steps, which
    # limits how far the warping path can stray from the diagonal
    return cdtw_sakoe_chiba(a, b, 12)
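
For readers without the cdtw extension, here is a plain-Python reference sketch of the same idea; it assumes equal-length series and absolute-difference point costs, which may differ from cdtw's exact cost definition.


In [ ]:
def dtw_sakoe_chiba_ref(a, b, band=12):
    # Dynamic-programming DTW restricted to a Sakoe-Chiba band:
    # cell (i, j) is reachable only when |i - j| <= band
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(max(1, i - band), min(m, i + band) + 1):
            cost = abs(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]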

Precompute Distances


In [29]:
from scipy.spatial.distance import pdist, squareform

# Pairwise DTW distances between all station profiles, computed once in
# condensed form and expanded to a square matrix for the clustering step
dist_condensed = pdist(data.values, dtw_sakoe_chiba)
dist_matrix = squareform(dist_condensed)
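
A quick spot check that the square matrix agrees with direct metric calls (indices 0 and 1 are arbitrary picks):


In [ ]:
print np.isclose(dist_matrix[0, 1], dtw_sakoe_chiba(data.values[0], data.values[1]))
print (dist_matrix == dist_matrix.T).all()  # squareform output is symmetric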

Cluster


In [262]:
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering over the precomputed DTW distances
agg_clustering = AgglomerativeClustering(n_clusters=6, affinity='precomputed', linkage='complete')
clusters = pd.Series(agg_clustering.fit_predict(dist_matrix), index=data.index)
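
The choice of six clusters can be judged against the full merge tree; a sketch using SciPy's hierarchy tools on the same condensed distances, with the cut height read off the dendrogram by eye:


In [ ]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Complete-linkage tree over the same condensed DTW distances
Z = linkage(dist_condensed, method='complete')
dendrogram(Z, no_labels=True)
plt.show()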

Analysis


In [263]:
cluster_counts = clusters.value_counts().rename('StationsInCluster').to_frame()

In [264]:
stations_df = stations.copy().set_index('Id')
# Stations with no recorded priority are treated as the lowest priority (3)
stations_df.Priority.fillna(3, inplace=True)

clusters_df = add_station_info(clusters.to_frame('Cluster'), stations_df, use_indexes=True)
clusters_df = clusters_df.merge(cluster_counts, how='inner', left_on='Cluster', right_index=True)

In [265]:
results = clusters_df.groupby(['Priority', 'Cluster']).agg({'Name': 'count', 'StationsInCluster': 'first'})
results['Percentage'] = (results.Name / results.StationsInCluster) * 100
# Summed percentage for clusters where Priority 1 stations make up between
# 40% and just under 100% of the cluster's stations
print results.loc["1"].query('Percentage < 99.99 & Percentage > 40').Percentage.sum()
results


56.25
Out[265]:
                  StationsInCluster  Name  Percentage
Priority Cluster
3        0                      442    13    2.941176
         3                      287    25    8.710801
1        0                      442    47   10.633484
         1                       32    18   56.250000
         2                        7     7  100.000000
         3                      287     2    0.696864
         4                        2     2  100.000000
         5                        1     1  100.000000
2        0                      442   382   86.425339
         1                       32    14   43.750000
         3                      287   260   90.592334

Find Priority 1


In [33]:
data = add_station_info(clusters.to_frame('Cluster'), stations.set_index('Id'), use_indexes=True)
data.query('Priority == "1"').Cluster.value_counts()


Out[33]:
1    47
0    25
4     2
3     2
2     1
Name: Cluster, dtype: int64

In [34]:
# One cluster label per station Id
clusters_df = clusters.to_frame('Cluster')
clusters_df.Cluster.nunique()


Out[34]:
5

In [ ]:
from sklearn import preprocessing

# `statistics` is the per-station summary frame built elsewhere in the notebook;
# min-max scale every numeric feature except the coordinates and the priority
min_max_scaler = preprocessing.MinMaxScaler()
for col in statistics.columns.difference(['Latitude', 'Longitude', 'Priority']):
    std_col = '%sS' % col
    statistics[std_col] = min_max_scaler.fit_transform(statistics[col].values.reshape(-1, 1)).ravel()
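
A toy illustration of the scaling (hypothetical values): each column is mapped linearly onto [0, 1].


In [ ]:
demo = np.array([2.0, 5.0, 8.0]).reshape(-1, 1)  # hypothetical feature column
print preprocessing.MinMaxScaler().fit_transform(demo).ravel()  # [ 0.   0.5  1. ]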

In [ ]:
from sklearn.cluster import KMeans

from sklearn.cluster import KMeans

statistics = statistics.sort_values(by=['Priority'])

# (n_clusters, priority) pairs: each priority band is clustered separately
priority_clusters = [(3, 1), (2, 2), (2, 3)]
cluster_cols = ['EmptyEveningPeakS', 'EmptyMorningPeakS', 'EmptyNonPeakS',
                'FullEveningPeakS', 'FullMorningPeakS', 'FullNonPeakS',
                'CountS']

clusters = []
offset = 0
for n_clusters, priority in priority_clusters:
    window = statistics[statistics.Priority == priority][cluster_cols]
    p_clusters = KMeans(n_clusters=n_clusters).fit_predict(window.values)
    # Offset the labels so cluster ids stay unique across priority bands
    clusters.extend(p_clusters + offset)
    offset += n_clusters

# Positional assignment relies on `statistics` being sorted by Priority above
statistics['Cluster'] = clusters
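
The per-priority cluster counts above are hand-picked; one way to sanity-check them is a silhouette score over a range of k. A sketch for the Priority 1 band, assuming the integer priority coding used in this cell:


In [ ]:
from sklearn.metrics import silhouette_score

# Hypothetical model-selection check for one priority band
window = statistics[statistics.Priority == 1][cluster_cols]
for k in range(2, 6):
    labels = KMeans(n_clusters=k).fit_predict(window.values)
    print k, silhouette_score(window.values, labels)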

In [ ]:
draw_stations_map(statistics, create_cluster_marker('Cluster'))