In [ ]:
import networkx as nx
import pandas as pd
import igraph
from pandas import Timestamp
import numpy as np
In [ ]:
# Load the working trip data; the trip start/stop columns are parsed as datetimes.
df = pd.read_csv('Data/working_data.csv', parse_dates=["starttime", "stoptime"])
# Remove the 'Unnamed: 0' column (presumably a saved index left over from an
# earlier to_csv call -- TODO confirm).
df = df.drop('Unnamed: 0', axis=1)
In [ ]:
# WEEKDAY subset: subscriber trips with a non-zero distance (non-round trips)
# that started on or after 2013-08-01 and before 2013-11-01.
wkday_mask = (
    (df.usertype == 'Subscriber')
    & (df.dist != 0)
    & (df.weektype == 'Weekday')
    & (df.starttime >= Timestamp('2013-08-01'))
    & (df.starttime < Timestamp('2013-11-01'))
)
wkday_cols = ['starttime', 'stoptime', 'start station id', 'end station id',
              'dayofweek', 'dayofweek_name']
df_wkday = df[wkday_mask][wkday_cols]

# Attach the auxiliary station information to both ends of every trip.
wkday_aux_info = pd.read_csv('Data/station_aux_info.csv')
# Auxiliary columns whose new name is simply the endpoint prefix + old name.
wkday_plain_aux_cols = ['stationid_new', 'station_x', 'station_y',
                        'neigh_id', 'neigh_x', 'neigh_y',
                        'grid_x', 'grid_y', 'grid_id']
for i in ['start station id', 'end station id']:
    # 'start station id' -> 'start', 'end station id' -> 'end'
    prefix = i.partition(' ')[0]
    wkday_renames = {col: prefix + col for col in wkday_plain_aux_cols}
    # These two columns are renamed irregularly (note the underscore in '_boro').
    wkday_renames['borough'] = prefix + '_boro'
    wkday_renames['neighborhood'] = prefix + 'neigh'
    # pd.merge uses its default join type here, matching the trip's station id
    # against the 'station_id' column of the auxiliary table.
    wkday_joined = pd.merge(df_wkday, wkday_aux_info, left_on=i, right_on='station_id')
    # The join key from the auxiliary table is dropped after renaming.
    df_wkday_merge = wkday_joined.rename(columns=wkday_renames).drop('station_id', axis=1)
    df_wkday = df_wkday_merge
In [ ]:
# Origin-destination trip-count matrices for WEEKDAYS, one per start hour
# (07 through 23), built from the August-October subset without round trips.
# Each matrix is stored in the HDF5 file under the key 'matrix_<HH>'.
# The store is deliberately left open: the pajek export cell reads the
# matrices back from this same `store_1hr_allWkday_matrix` object.
store_1hr_allWkday_matrix = pd.HDFStore('store_1hr_allWkday_matrix.h5')
# Zero-padded hour labels '07' ... '23' to loop through
hrList = ['%02d' % hour for hour in range(7, 24)]
# Row/column labels of every matrix: station ids 0..331
# (presumably the total number of stations -- TODO confirm).
indexList = range(0, 332)
df_indexed = df_wkday.set_index('starttime')
# Hour of day of every trip start, computed once for all iterations.
start_hours = df_indexed.index.hour
for hr in hrList:
    # Select the full clock hour. The previous
    # between_time(hr + ':00:00', hr + ':59:00') call includes both end
    # points, so it silently dropped every trip that started after HH:59:00
    # (the last minute of each hour).
    subset = df_indexed[start_hours == int(hr)]
    # Number of trips per (start station, end station) pair.
    subset_agg = subset.groupby(['startstationid_new', 'endstationid_new']).size() \
                       .reset_index(name='count')
    subset_agg.columns = ['startstationid', 'endstationid', 'count']
    # Pairs without any trip become 0 instead of NaN.
    matrix = subset_agg.pivot(index='startstationid', columns='endstationid',
                              values='count').fillna(0)
    # Force the same square shape for every hour, adding all-zero rows/columns
    # for stations that had no trips in this hour.
    matrix_reindex = matrix.reindex(index=indexList, columns=indexList, fill_value=0)
    store_1hr_allWkday_matrix['matrix_' + hr] = matrix_reindex
In [ ]:
# Write every matrix held in the WEEKDAY store as a pajek file for Infomap.
# NOTE(review): the weekend export cell builds its output names the same way
# ('pajek/<key suffix>.pajek'), so files for hours present in both stores
# appear to share a path -- confirm the outputs are moved between runs.
for key in store_1hr_allWkday_matrix.keys():
    # key[8:] drops the first 8 characters of the store key
    # (assumes keys look like '/matrix_07' -- TODO confirm).
    hour_label = key[8:]
    # Plain nested list of the matrix values, used as edge weights.
    weights = np.asarray(store_1hr_allWkday_matrix[key]).tolist()
    graph = igraph.Graph.Weighted_Adjacency(weights, "DIRECTED", "weight", True)
    graph.write('pajek/' + hour_label + '.pajek', format='pajek')
In [ ]:
"""Initial test showed very little differences between the days. So this was excluded from the study"""
# # Matrix for MONDAY-FRIDAY at 1 hour intervals between August-October from 7am - 11:59pm; sans round-trips
# store_1hr_matrix = pd.HDFStore('store_1hr_matrix.h5')
# # Create list of hours to loop through
# hrList = ['07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']
# indexList = range(0, 332)
# for i in range(0,5):
# df_aug_oct = df_wkday_merge[df_wkday_merge['dayofweek'] == i]
# df_indexed = df_aug_oct.set_index('starttime')
# for hr in hrList:
# subset = df_indexed.between_time(hr + ':00:00', hr + ':59:00')
# subset_agg = subset.groupby(['startstationid_new', 'endstationid_new'], as_index=False).agg(len) \
# [['startstationid_new', 'endstationid_new', 'dayofweek']]
# subset_agg.columns = ['startstationid', 'endstationid', 'count']
# matrix = subset_agg.pivot(index = 'startstationid', columns = 'endstationid', values = 'count').fillna(0)
# matrix_reindex = matrix.reindex(index = indexList, columns = indexList, fill_value = 0)
# store_1hr_matrix['matrix_' + str(i) + '_' + hr] = matrix_reindex
# # Save MONDAY-FRIDAY matrices in pajek format for Infomap
# for i in store_1hr_matrix.keys():
# outname = 'pajek/' + i[8:] + '.pajek'
# f = store_1hr_matrix[i]
# iG_dir = igraph.Graph.Weighted_Adjacency(np.matrix(f).tolist(), "DIRECTED", "weight", True)
# iG_dir.write(outname, format='pajek')
In [ ]:
# WEEKEND subset: trips of all user types with a non-zero distance (non-round
# trips) that started on or after 2013-08-01 and before 2013-11-01.
wknd_mask = (
    (df.dist != 0)
    & (df.weektype == 'Weekend')
    & (df.starttime >= Timestamp('2013-08-01'))
    & (df.starttime < Timestamp('2013-11-01'))
)
wknd_cols = ['starttime', 'stoptime', 'start station id', 'end station id',
             'dayofweek', 'dayofweek_name']
df_wknd = df[wknd_mask][wknd_cols]

# Attach the auxiliary station information to both ends of every trip.
wknd_aux_info = pd.read_csv('Data/station_aux_info.csv')
# Auxiliary columns whose new name is simply the endpoint prefix + old name.
wknd_plain_aux_cols = ['stationid_new', 'station_x', 'station_y',
                       'neigh_id', 'neigh_x', 'neigh_y',
                       'grid_x', 'grid_y', 'grid_id']
for i in ['start station id', 'end station id']:
    # 'start station id' -> 'start', 'end station id' -> 'end'
    prefix = i.partition(' ')[0]
    wknd_renames = {col: prefix + col for col in wknd_plain_aux_cols}
    # These two columns are renamed irregularly (note the underscore in '_boro').
    wknd_renames['borough'] = prefix + '_boro'
    wknd_renames['neighborhood'] = prefix + 'neigh'
    # pd.merge uses its default join type here, matching the trip's station id
    # against the 'station_id' column of the auxiliary table.
    wknd_joined = pd.merge(df_wknd, wknd_aux_info, left_on=i, right_on='station_id')
    # The join key from the auxiliary table is dropped after renaming.
    df_wknd_merge = wknd_joined.rename(columns=wknd_renames).drop('station_id', axis=1)
    df_wknd = df_wknd_merge
In [ ]:
# Origin-destination trip-count matrices for WEEKENDS, one per start hour
# (08 through 23), built from the August-October subset without round trips.
# Each matrix is stored in the HDF5 file under the key 'matrix_<HH>'.
# The store is deliberately left open: the pajek export cell reads the
# matrices back from this same `store_1hr_allWknd_matrix` object.
store_1hr_allWknd_matrix = pd.HDFStore('store_1hr_allWknd_matrix.h5')
# Zero-padded hour labels '08' ... '23' to loop through
hrList = ['%02d' % hour for hour in range(8, 24)]
# Row/column labels of every matrix: station ids 0..331
# (presumably the total number of stations -- TODO confirm).
indexList = range(0, 332)
df_indexed = df_wknd.set_index('starttime')
# Hour of day of every trip start, computed once for all iterations.
start_hours = df_indexed.index.hour
for hr in hrList:
    # Select the full clock hour. The previous
    # between_time(hr + ':00:00', hr + ':59:00') call includes both end
    # points, so it silently dropped every trip that started after HH:59:00
    # (the last minute of each hour).
    subset = df_indexed[start_hours == int(hr)]
    # Number of trips per (start station, end station) pair.
    subset_agg = subset.groupby(['startstationid_new', 'endstationid_new']).size() \
                       .reset_index(name='count')
    subset_agg.columns = ['startstationid', 'endstationid', 'count']
    # Pairs without any trip become 0 instead of NaN.
    matrix = subset_agg.pivot(index='startstationid', columns='endstationid',
                              values='count').fillna(0)
    # Force the same square shape for every hour, adding all-zero rows/columns
    # for stations that had no trips in this hour.
    matrix_reindex = matrix.reindex(index=indexList, columns=indexList, fill_value=0)
    store_1hr_allWknd_matrix['matrix_' + hr] = matrix_reindex
In [ ]:
# Write every matrix held in the WEEKEND store as a pajek file for Infomap.
# NOTE(review): the weekday export cell builds its output names the same way
# ('pajek/<key suffix>.pajek'), so for hours present in both stores this cell
# appears to overwrite the weekday files -- confirm the outputs are moved
# between runs.
for key in store_1hr_allWknd_matrix.keys():
    # key[8:] drops the first 8 characters of the store key
    # (assumes keys look like '/matrix_08' -- TODO confirm).
    hour_label = key[8:]
    # Plain nested list of the matrix values, used as edge weights.
    weights = np.asarray(store_1hr_allWknd_matrix[key]).tolist()
    graph = igraph.Graph.Weighted_Adjacency(weights, "DIRECTED", "weight", True)
    graph.write('pajek/' + hour_label + '.pajek', format='pajek')
In [ ]:
"""Initial test showed very little differences between the days. So this was excluded from the study"""
# # Matrix for the Saturdays and Sundays at 1 hour intervals between August-October from 8am - 11:59pm; sans round-trips
# store_1hr_wknd_matrix = pd.HDFStore('store_1hr_wknd_matrix.h5')
# # Create list of hours to loop through
# hrList = ['08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']
# indexList = range(0, 332)
# for i in [5,6]:
# df_aug_oct = df_wknd_merge[df_wknd_merge['dayofweek'] == i]
# df_indexed = df_aug_oct.set_index('starttime')
# for hr in hrList:
# subset = df_indexed.between_time(hr + ':00:00', hr + ':59:00')
# subset_agg = subset.groupby(['startstationid_new', 'endstationid_new'], as_index=False).agg(len) \
# [['startstationid_new', 'endstationid_new', 'dayofweek']]
# subset_agg.columns = ['startstationid', 'endstationid', 'count']
# matrix = subset_agg.pivot(index = 'startstationid', columns = 'endstationid', values = 'count').fillna(0)
# matrix_reindex = matrix.reindex(index = indexList, columns = indexList, fill_value = 0)
# store_1hr_wknd_matrix['matrix_' + str(i) + '_' + hr] = matrix_reindex
# # Save matrices in pajek format for Infomap
# for i in store_1hr_wknd_matrix.keys():
# outname = 'pajek/' + i[8:] + '.pajek'
# f = store_1hr_wknd_matrix[i]
# iG_dir = igraph.Graph.Weighted_Adjacency(np.matrix(f).tolist(), "DIRECTED", "weight", True)
# iG_dir.write(outname, format='pajek')