In [83]:
import networkx as nx
import pandas as pd
from pandas import Timestamp
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [84]:
# Load the trip records; the start/stop columns are parsed as datetimes.
df = pd.read_csv('Data/working_data.csv', parse_dates=["starttime", "stoptime"])
# Discard the 'Unnamed: 0' column (presumably a saved row index -- TODO confirm).
df = df.drop('Unnamed: 0', axis=1)
In [85]:
"""DATA SUBSET A: AUG - OCT; WEEKDAYS; SUBSCRIBERS"""
# Row filter: trips starting on/after 2013-08-01 and before 2013-11-01,
# on weekdays, made by subscribers.
trip_cols = ['starttime','stoptime','start station id','end station id', 'dayofweek', 'dayofweek_name']
in_period = (df.starttime >= Timestamp('2013-08-01')) & (df.starttime < Timestamp('2013-11-01'))
is_weekday = df.weektype == 'Weekday'
is_subscriber = df.usertype == 'Subscriber'
df_augOct = df.loc[in_period & is_weekday & is_subscriber, trip_cols]

# Add auxiliary information
augOct_aux_info = pd.read_csv('Data/station_aux_info.csv')
# Auxiliary columns whose new name is simply <prefix> + <old name>.
prefixed_cols = ['stationid_new', 'station_x', 'station_y', 'neigh_id',
                 'neigh_x', 'neigh_y', 'grid_x', 'grid_y', 'grid_id']
for station_col in ['start station id', 'end station id']:
    # 'start station id' -> 'start', 'end station id' -> 'end'
    prefix = station_col.split(' ', 1)[0]
    new_names = dict((col, prefix + col) for col in prefixed_cols)
    # These two columns get a shortened name rather than the plain prefix.
    new_names['borough'] = prefix + '_boro'
    new_names['neighborhood'] = prefix + 'neigh'
    # Join the station attributes for this end of the trip, tag them with the
    # prefix, and drop the now-redundant join key.
    df_augOct_merge = pd.merge(df_augOct, augOct_aux_info, left_on=station_col, right_on='stationid')
    df_augOct_merge = df_augOct_merge.rename(columns=new_names)
    df_augOct_merge = df_augOct_merge.drop('stationid', axis=1)
    df_augOct = df_augOct_merge
In [120]:
"""DATA SUBSET B: AUG - OCT; WEEKEND; ALL USER TYPES"""
# Row filter: trips starting on/after 2013-08-01 and before 2013-11-01,
# on weekends, any user type.
# NOTE(review): this rebinds df_augOct, replacing the weekday/subscriber subset
# built in the previous cell, while the figures saved further down are all
# named 'Weekday_*' -- confirm which subset each saved figure should reflect.
trip_cols = ['starttime','stoptime','start station id','end station id', 'dayofweek', 'dayofweek_name']
in_period = (df.starttime >= Timestamp('2013-08-01')) & (df.starttime < Timestamp('2013-11-01'))
is_weekend = df.weektype == 'Weekend'
df_augOct = df.loc[in_period & is_weekend, trip_cols]

# Add auxiliary information
augOct_aux_info = pd.read_csv('Data/station_aux_info.csv')
# Auxiliary columns whose new name is simply <prefix> + <old name>.
prefixed_cols = ['stationid_new', 'station_x', 'station_y', 'neigh_id',
                 'neigh_x', 'neigh_y', 'grid_x', 'grid_y', 'grid_id']
for station_col in ['start station id', 'end station id']:
    # 'start station id' -> 'start', 'end station id' -> 'end'
    prefix = station_col.split(' ', 1)[0]
    new_names = dict((col, prefix + col) for col in prefixed_cols)
    # These two columns get a shortened name rather than the plain prefix.
    new_names['borough'] = prefix + '_boro'
    new_names['neighborhood'] = prefix + 'neigh'
    # Join the station attributes for this end of the trip, tag them with the
    # prefix, and drop the now-redundant join key.
    df_augOct_merge = pd.merge(df_augOct, augOct_aux_info, left_on=station_col, right_on='stationid')
    df_augOct_merge = df_augOct_merge.rename(columns=new_names)
    df_augOct_merge = df_augOct_merge.drop('stationid', axis=1)
    df_augOct = df_augOct_merge
In [121]:
# MATRIX BY STATIONS
# Square trip-count matrix: rows are start stations, columns are end stations.
# NOTE(review): 332 is presumably the number of re-numbered stations -- confirm.
indexList = range(332)
station_keys = ['startstationid_new', 'endstationid_new']
# agg(len) puts the group size in every remaining column; 'dayofweek' is the
# one kept and relabelled as the trip count.
pair_sizes = df_augOct.groupby(station_keys, as_index=False).agg(len)
augOct_agg = pair_sizes[station_keys + ['dayofweek']].rename(
    columns={'startstationid_new': 'startstationid',
             'endstationid_new': 'endstationid',
             'dayofweek': 'count'})
augOct_pivot = augOct_agg.pivot(index="startstationid", columns="endstationid", values="count")
augOct_pivot = augOct_pivot.fillna(0)
# Force the full id range on both axes; ids absent from the data become 0.
augOct_matrix = augOct_pivot.reindex(index=indexList, columns=indexList, fill_value=0)
In [122]:
# DIRECTED GRAPH
# Build a directed graph from the station trip-count matrix; the plotting cell
# later reads the resulting edge values back through the 'weight' attribute.
m = np.matrix(augOct_matrix)
g_dir = nx.from_numpy_matrix(m, create_using=nx.DiGraph())
In [88]:
# STATION COORDINATES
# Map each re-numbered station id to its (x, y) tuple for drawing.
station_ids = augOct_aux_info['stationid_new'].values
station_xy = augOct_aux_info[['station_x','station_y']].values
pos = {}
for station_id, xy in zip(station_ids, station_xy):
    pos[station_id] = tuple(xy)
In [89]:
# GRAPH PROPERTIES
# Report the number of nodes and edges of the station graph.
nodes = g_dir.number_of_nodes()
edges = g_dir.number_of_edges()
# Single-argument print(...) is valid on both Python 2 and Python 3; the two
# spaces after the colon reproduce what `print "Nodes: ", nodes` emitted.
print("Nodes:  {0}".format(nodes))
print("Edges:  {0}".format(edges))
In [123]:
# DEGREE DISTRIBUTION
# Side-by-side histogram of node in-degree and out-degree for the station graph.
in_degrees = g_dir.in_degree()    # dictionary node:degree
in_values = in_degrees.values()
out_degrees = g_dir.out_degree()  # dictionary node:degree
out_values = out_degrees.values()
# For each node, how many nodes share its degree. Not used by the histogram
# below; kept because the names were defined by this cell before.
in_hist = [in_values.count(x) for x in in_values]
out_hist = [out_values.count(x) for x in out_values]

# figure/show/close must be *called*; the bare attribute references used
# previously did nothing, and the trailing one was echoed as the cell output.
plt.figure()
plt.hist([in_values, out_values], bins=25, histtype='bar', color=['#006BB6', '#a2d9ff'])
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.legend(['In-degree', 'Out-degree'], loc='best')
# Save before show() so the written file contains the rendered figure.
# NOTE(review): the file name says 'Weekday', but df_augOct is rebound to the
# weekend subset by a later-numbered cell -- confirm the intended subset.
plt.savefig('Weekday_station_degree_distribution.png')
plt.show()
plt.close()
Out[123]:
In [ ]:
# DEGREE DISTRIBUTION
# in_degrees = g_dir.in_degree() # dictionary node:degree
# in_values = sorted(set(in_degrees.values()))
# in_hist = [in_degrees.values().count(x) for x in in_values]
# out_degrees = g_dir.out_degree() # dictionary node:degree
# out_values = sorted(set(out_degrees.values()))
# out_hist = [out_degrees.values().count(x) for x in out_values]
# plt.figure()
# plt.plot(in_values,in_hist,'#FF0000') # in-degree
# plt.plot(out_values,out_hist,'#006BB6') # out-degree
# plt.legend(['In-degree','Out-degree'])
# plt.xlabel('Degree')
# plt.ylabel('Number of nodes')
# plt.savefig('Weekday_neigh_degree_distribution.png')
# plt.show()
# plt.close()
In [104]:
# NODE CENTRALITIES
# Centralities are computed on the undirected version of the station graph,
# restricted to its largest connected component.
g_ud = g_dir.to_undirected()
# list(...) + max(..., key=len) works whether connected_component_subgraphs
# returns a list or a generator, and does not depend on the components being
# ordered by size (indexing [0] required both).
g_components = list(nx.connected_component_subgraphs(g_ud))
g_c = max(g_components, key=len)
# Betweenness centrality
bet_cen_sta = nx.betweenness_centrality(g_c)
# Closeness centrality
clo_cen_sta = nx.closeness_centrality(g_c)
# Eigenvector centrality
eig_cen_sta = nx.eigenvector_centrality(g_c)
In [105]:
# MOST CENTRAL NODES
def highest_centrality(cent_dict):
    """Return the (node, value) pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        ``(node, value)`` for the entry with the largest value. When several
        nodes share the largest value, the one with the largest node key wins.

    Raises
    ------
    IndexError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        # Same exception type as indexing the empty ranking raised before,
        # but with a message that says what went wrong.
        raise IndexError("highest_centrality() requires a non-empty centrality dictionary")
    # items() exists on Python 2 and 3 (iteritems() is Python 2 only). Ranking
    # by (value, node) matches sorting (value, node) pairs and taking the
    # last one, without sorting the whole dictionary.
    node, value = max(cent_dict.items(), key=lambda item: (item[1], item[0]))
    return (node, value)
In [107]:
# Station-graph betweenness centrality, summarised by highest_centrality.
bet_central_node = highest_centrality(bet_cen_sta)
# Single-argument print(...) runs on Python 2 and 3 and emits the same text
# as the former print statement (label line, then the value on the next line).
print('Highest betweenness centrality node:\n{0}'.format(bet_central_node))
In [ ]:
# Station-graph closeness centrality, summarised by highest_centrality.
clo_central_node = highest_centrality(clo_cen_sta)
# Single-argument print(...) runs on Python 2 and 3 and emits the same text
# as the former print statement (label line, then the value on the next line).
print('Highest closeness centrality node:\n{0}'.format(clo_central_node))
In [ ]:
# Station-graph eigenvector centrality, summarised by highest_centrality.
eig_central_node = highest_centrality(eig_cen_sta)
# Single-argument print(...) runs on Python 2 and 3 and emits the same text
# as the former print statement (label line, then the value on the next line).
print('Highest eigenvector centrality node:\n{0}'.format(eig_central_node))
In [114]:
# NETWORK VISUALIZATION
# Draw the station graph at the station coordinates; edge colour and width
# both scale with the natural log of the edge weight.
plt.figure(num=None, figsize=(20,20))
plt.axis('off')
edges, weights = zip(*nx.get_edge_attributes(g_dir,'weight').items())
weights = np.array(weights)
log_weights = np.log(weights)  # computed once, reused for colour and width
nx.draw_networkx_nodes(g_dir, pos, node_size=250, node_color='#A0CBE2', alpha=0.8, linewidths=0.2)
nx.draw_networkx_labels(g_dir, pos, font_size=10, font_color='#3d3d3d', font_weight='bold')
draw_edge = nx.draw_networkx_edges(g_dir, pos, edgelist=edges, edge_color=log_weights,
                                   width=log_weights/5, edge_cmap=plt.cm.Blues, alpha=.5, arrows=False)
# Take the three colorbar ticks from the plotted values (min, midpoint, max)
# instead of constants fitted to one particular data subset, so they always
# fall inside the range of the plotted edge values.
tick_low = log_weights.min()
tick_high = log_weights.max()
cbar = plt.colorbar(draw_edge, ticks=[tick_low, (tick_low + tick_high) / 2.0, tick_high], shrink=.5)
cbar.ax.set_yticklabels(['Low', 'Medium', 'High'])
cbar.solids.set_edgecolor("face")
# NOTE(review): the file name says 'Weekday', but df_augOct is rebound to the
# weekend subset by a later-numbered cell -- confirm the intended subset.
plt.savefig('Weekday_Stations.png')
plt.show()
In [124]:
# Matrix by neighborhoods (IDs start with 1)
# Square trip-count matrix: rows are start neighborhoods, columns are end
# neighborhoods, both indexed by ids 1..37.
indexList = range(1, 38)
neigh_keys = ['startneigh_id', 'endneigh_id']
# agg(len) puts the group size in every remaining column; 'dayofweek' is the
# one kept and relabelled as the trip count.
neigh_pair_sizes = df_augOct.groupby(neigh_keys, as_index=False).agg(len)
augOct_neigh_agg = neigh_pair_sizes[neigh_keys + ['dayofweek']].rename(columns={'dayofweek': 'count'})
augOct_neigh_pivot = augOct_neigh_agg.pivot(index="startneigh_id", columns="endneigh_id", values="count")
augOct_neigh_pivot = augOct_neigh_pivot.fillna(0)
# Force the full id range on both axes; ids absent from the data become 0.
augOct_neigh_matrix = augOct_neigh_pivot.reindex(index=indexList, columns=indexList, fill_value=0)
In [125]:
# DIRECTED GRAPH WITH NODES AS NEIGHBORHOODS
# Build a directed graph from the neighborhood trip-count matrix; the plotting
# cell later reads the resulting edge values back through the 'weight' attribute.
mn = np.matrix(augOct_neigh_matrix)
g_dir_neigh = nx.from_numpy_matrix(mn, create_using=nx.DiGraph())
In [128]:
# NEIGHBORHOOD COORDINATES
# Map (neigh_id - 1) to the neighborhood's (x, y) tuple for drawing. When a
# neigh_id occurs on several rows, the last row's coordinates are kept.
neigh_keys_zero_based = augOct_aux_info['neigh_id'].values - 1
neigh_xy = augOct_aux_info[['neigh_x','neigh_y']].values
pos_neigh = {}
for neigh_key, xy in zip(neigh_keys_zero_based, neigh_xy):
    pos_neigh[neigh_key] = tuple(xy)
In [ ]:
# GRAPH PROPERTIES
# Report the number of nodes and edges of the neighborhood graph.
nodes = g_dir_neigh.number_of_nodes()
edges = g_dir_neigh.number_of_edges()
# Single-argument print(...) is valid on both Python 2 and Python 3; the two
# spaces after the colon reproduce what `print "Nodes: ", nodes` emitted.
print("Nodes:  {0}".format(nodes))
print("Edges:  {0}".format(edges))
In [126]:
# DEGREE DISTRIBUTION
# Side-by-side histogram of node in-degree and out-degree for the
# neighborhood graph.
in_degrees = g_dir_neigh.in_degree()    # dictionary node:degree
in_values = in_degrees.values()
out_degrees = g_dir_neigh.out_degree()  # dictionary node:degree
out_values = out_degrees.values()
# For each node, how many nodes share its degree. Not used by the histogram
# below; kept because the names were defined by this cell before.
in_hist = [in_values.count(x) for x in in_values]
out_hist = [out_values.count(x) for x in out_values]

# figure/show/close must be *called*; the bare attribute references used
# previously did nothing, and the trailing one was echoed as the cell output.
plt.figure()
plt.hist([in_values, out_values], bins=20, histtype='bar', color=['#006BB6', '#a2d9ff'])
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.legend(['In-degree', 'Out-degree'], loc='best')
# Save before show() so the written file contains the rendered figure.
# NOTE(review): the file name says 'Weekday', but df_augOct was last bound to
# the weekend subset -- confirm the intended subset.
plt.savefig('Weekday_neigh_degree_distribution.png')
plt.show()
plt.close()
Out[126]:
In [ ]:
# # DEGREE DISTRIBUTION
# in_degrees = g_dir_neigh.in_degree() # dictionary node:degree
# in_values = sorted(set(in_degrees.values()))
# in_hist = [in_degrees.values().count(x) for x in in_values]
# out_degrees = g_dir_neigh.out_degree() # dictionary node:degree
# out_values = sorted(set(out_degrees.values()))
# out_hist = [out_degrees.values().count(x) for x in out_values]
# plt.figure()
# plt.plot(in_values,in_hist,'#FF0000') # in-degree
# plt.plot(out_values,out_hist,'#006BB6') # out-degree
# plt.legend(['In-degree','Out-degree'])
# plt.xlabel('Degree')
# plt.ylabel('Number of nodes')
# plt.savefig('Weekday_neigh_degree_distribution.png')
# plt.show()
# plt.close()
In [ ]:
# NODE CENTRALITIES
# Centralities are computed on the undirected version of the neighborhood
# graph, restricted to its largest connected component.
g_neigh_ud = g_dir_neigh.to_undirected()
# list(...) + max(..., key=len) works whether connected_component_subgraphs
# returns a list or a generator, and does not depend on the components being
# ordered by size (indexing [0] required both).
g_neigh_components = list(nx.connected_component_subgraphs(g_neigh_ud))
g_neigh_nc = max(g_neigh_components, key=len)
# Betweenness centrality
bet_cen_neigh = nx.betweenness_centrality(g_neigh_nc)
# Closeness centrality
clo_cen_neigh = nx.closeness_centrality(g_neigh_nc)
# Eigenvector centrality
eig_cen_neigh = nx.eigenvector_centrality(g_neigh_nc)
In [ ]:
# MOST CENTRAL NODES
def highest_centrality(cent_dict):
    """Return all (value, node) pairs of a centrality dictionary, highest first.

    NOTE(review): an earlier cell defines ``highest_centrality`` with a
    different return value (only the top entry, as ``(node, value)``). Once
    this cell runs, this definition replaces it.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        A tuple of ``(value, node)`` pairs sorted in descending order (by
        value, then by node). Empty when ``cent_dict`` is empty.
    """
    # items() exists on Python 2 and 3 (iteritems() is Python 2 only).
    ranked = sorted(((value, node) for node, value in cent_dict.items()), reverse=True)
    return tuple(ranked)
In [ ]:
# Neighborhood-graph betweenness centrality, summarised by highest_centrality.
# NOTE(review): the highest_centrality defined just above returns the whole
# descending ranking, so more than the single highest node is printed here.
bet_central_node = highest_centrality(bet_cen_neigh)
# Single-argument print(...) runs on Python 2 and 3 and emits the same text
# as the former print statement (label line, then the value on the next line).
print('Highest betweenness centrality node:\n{0}'.format(bet_central_node))
In [ ]:
# Neighborhood-graph closeness centrality, summarised by highest_centrality.
# NOTE(review): the highest_centrality defined just above returns the whole
# descending ranking, so more than the single highest node is printed here.
clo_central_node = highest_centrality(clo_cen_neigh)
# Single-argument print(...) runs on Python 2 and 3 and emits the same text
# as the former print statement (label line, then the value on the next line).
print('Highest closeness centrality node:\n{0}'.format(clo_central_node))
In [ ]:
# Neighborhood-graph eigenvector centrality, summarised by highest_centrality.
# NOTE(review): the highest_centrality defined just above returns the whole
# descending ranking, so more than the single highest node is printed here.
eig_central_node = highest_centrality(eig_cen_neigh)
# Single-argument print(...) runs on Python 2 and 3 and emits the same text
# as the former print statement (label line, then the value on the next line).
print('Highest eigenvector centrality node:\n{0}'.format(eig_central_node))
In [129]:
# NETWORK VISUALIZATION
# Draw the neighborhood graph at the neighborhood coordinates; edge colour and
# width both scale with the natural log of the edge weight.
plt.figure(num=None, figsize=(20,20))
plt.axis('off')
edges, weights = zip(*nx.get_edge_attributes(g_dir_neigh,'weight').items())
weights = np.array(weights)
log_weights = np.log(weights)  # computed once, reused for colour and width
nx.draw_networkx_nodes(g_dir_neigh, pos_neigh, node_size=420, node_color='#A0CBE2', alpha=0.8, linewidths=0.5)
nx.draw_networkx_labels(g_dir_neigh, pos_neigh, font_size=12, font_color='#3d3d3d', font_weight='bold')
draw_edge = nx.draw_networkx_edges(g_dir_neigh, pos_neigh, edgelist=edges, edge_color=log_weights,
                                   width=log_weights/3, edge_cmap=plt.cm.Blues, alpha=.6, arrows=False)
# Take the three colorbar ticks from the plotted values (min, midpoint, max)
# instead of constants fitted to one particular data subset, so they always
# fall inside the range of the plotted edge values.
tick_low = log_weights.min()
tick_high = log_weights.max()
cbar = plt.colorbar(draw_edge, ticks=[tick_low, (tick_low + tick_high) / 2.0, tick_high], shrink=.5)
cbar.ax.set_yticklabels(['Low', 'Medium', 'High'])
cbar.solids.set_edgecolor("face")
# NOTE(review): the file name says 'Weekday', but df_augOct was last bound to
# the weekend subset -- confirm the intended subset.
plt.savefig('Weekday_Neigh.png')
plt.show()