In [1]:
%matplotlib inline
import math
import pytz
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import cPickle as pickle
In [2]:
%run src/data/helper.py
In [3]:
%run src/data/visualization.py
In [4]:
start_time = time.time()
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
readings = pickle.load(open('data/parsed/readings_clean.p', "rb"))
end_time = time.time()
print 'Opening redistribution data took %s seconds' % (end_time - start_time)
In [25]:
readings['NAB'] = readings.NbBikes / (readings.NbBikes + readings.NbEmptyDocks)
readings['NAS'] = (readings.NbBikesTMinus1 - readings.NbBikes).apply(math.fabs) / (readings.NbBikes + readings.NbEmptyDocks)
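NAB is the fraction of a station's docks currently occupied by bikes; NAS is the absolute change in bikes since the previous reading, normalised by dock capacity. A quick worked example with hypothetical numbers:
In [ ]:
# 10 bikes, 14 empty docks, 13 bikes at the previous reading:
nb, empty, prev = 10.0, 14.0, 13.0
nb / (nb + empty), math.fabs(prev - nb) / (nb + empty)  # -> (0.4167, 0.125)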
In [26]:
dayviews = readings.query('Holiday == 0')[['NAB', 'NAS', 'TimeOfDay','Weekday']].dropna().reset_index().groupby(['Weekday', 'Id', 'TimeOfDay']).mean()
Use the NAS data for weekdays and non-holidays to find clusters of stations with similar activity profiles. The unstack().loc[1] below selects the Weekday == 1 slice, giving one NAS time-of-day profile per station.
In [27]:
data = dayviews.unstack().loc[1].NAS
In [28]:
from cdtw import cdtw_sakoe_chiba
def dtw_sakoe_chiba(a, b):
    # DTW constrained to a Sakoe-Chiba band 12 time-of-day steps wide
    return cdtw_sakoe_chiba(a, b, 12)
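cdtw is a compiled DTW extension; if it is unavailable, a minimal pure-NumPy sketch of the same band-constrained dynamic programme (a stand-in under that assumption, not the actual cdtw code) would look like:
In [ ]:
def dtw_band(a, b, window=12):
    # Classic DTW recurrence, restricted to a diagonal band of half-width `window`.
    n, m = len(a), len(b)
    w = max(window, abs(n - m))  # the band must at least span the length difference
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(max(1, i - w), min(m, i + w) + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j],      # insertion
                                 cost[i, j - 1],      # deletion
                                 cost[i - 1, j - 1])  # match
    return cost[n, m]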
In [29]:
from scipy.spatial.distance import pdist,squareform
dist_condensed = pdist(data.values, dtw_sakoe_chiba)
dist_matrix = squareform(dist_condensed)
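With a custom metric, pdist calls the DTW function once per station pair, so the cost grows quadratically with the number of stations. A quick shape check on the result:
In [ ]:
# dist_condensed holds one value per pair (n*(n-1)/2); squareform mirrors it
# into a symmetric n x n matrix with zeros on the diagonal.
n = data.shape[0]
dist_condensed.shape, dist_matrix.shape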
In [262]:
from sklearn.cluster import AgglomerativeClustering
agg_clustering = AgglomerativeClustering(n_clusters=6, affinity='precomputed', linkage='complete')
clusters = pd.Series(agg_clustering.fit_predict(dist_matrix), index=data.index)
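As an optional cross-check (not part of the original pipeline), scipy's hierarchy module can cut a complete-linkage tree built from the same condensed distances into six clusters; the partition should agree with sklearn's, though label numbering may differ:
In [ ]:
from scipy.cluster.hierarchy import linkage, fcluster
Z = linkage(dist_condensed, method='complete')
pd.Series(fcluster(Z, t=6, criterion='maxclust'), index=data.index).value_counts()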
In [263]:
cluster_counts = clusters.value_counts().rename('StationsInCluster').to_frame()
In [264]:
stations_df = stations.copy().set_index('Id')
stations_df.Priority.fillna('3', inplace=True)  # default missing priorities to '3'; Priority is stored as a string ("1", "2", ...)
clusters_df = add_station_info(clusters.to_frame('Cluster'), stations_df, use_indexes=True)
clusters_df = clusters_df.merge(cluster_counts, how='inner', left_on='Cluster', right_index=True)
In [265]:
# Percentage = share of each cluster made up of stations of a given priority;
# sum it for priority-1 stations over clusters where they account for 40-99.99%.
results = clusters_df.groupby(['Priority', 'Cluster']).agg({'Name': 'count', 'StationsInCluster': 'first'})
results['Percentage'] = (results.Name / results.StationsInCluster) * 100
print results.loc["1"].query('Percentage < 99.99 & Percentage > 40').Percentage.sum()
results
Out[265]:
In [33]:
data = add_station_info(clusters.to_frame('Cluster'), stations.set_index('Id'), use_indexes=True)
data.query('Priority == "1"').Cluster.value_counts()
Out[33]:
In [34]:
clusters_df = clusters.to_frame('Cluster')  # same index as the NAS profiles, one row per station
clusters_df.Cluster.nunique()
Out[34]:
In [ ]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
for col in statistics.columns.difference(['Latitude', 'Longitude', 'Priority']):
    std_col = '%sS' % col  # scaled copy of each statistic, suffixed with 'S'
    statistics[std_col] = min_max_scaler.fit_transform(statistics[col].values.reshape(-1, 1)).ravel()
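MinMaxScaler rescales each feature to [0, 1] via (x - min) / (max - min); a standalone illustration with made-up values:
In [ ]:
# (2-2)/8 = 0.0, (4-2)/8 = 0.25, (10-2)/8 = 1.0
preprocessing.MinMaxScaler().fit_transform(np.array([[2.0], [4.0], [10.0]])).ravel()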
In [ ]:
from sklearn.cluster import KMeans
statistics = statistics.sort_values(by=['Priority'])
priority_clusters = [(3, 1), (2, 2), (2, 3)]  # (n_clusters, priority) pairs
cluster_cols = ['EmptyEveningPeakS', 'EmptyMorningPeakS', 'EmptyNonPeakS',
'FullEveningPeakS', 'FullMorningPeakS', 'FullNonPeakS',
'CountS']
clusters = []
offset = 0
for cls_prior in priority_clusters:
    n_clusters, priority = cls_prior
    window = statistics[statistics.Priority == priority][cluster_cols]
    p_clusters = KMeans(n_clusters=n_clusters).fit_predict(window.values)
    clusters.extend(p_clusters + offset)  # offset keeps labels unique across priorities
    offset += n_clusters
statistics['Cluster'] = clusters  # relies on the sort by Priority above
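Since each priority's KMeans labels are shifted by the running offset, cluster labels are globally unique: priorities 1, 2 and 3 should contribute 3, 2 and 2 clusters respectively. A quick sanity check:
In [ ]:
statistics.groupby('Priority').Cluster.nunique()  # expect 3, 2, 2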
In [ ]:
draw_stations_map(statistics, create_cluster_marker('Cluster'))