In [83]:
import networkx as nx
import pandas as pd 
from pandas import Timestamp
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Network Analysis: August - October 2013

Data


In [84]:
# Load the trip records, parsing the two trip timestamp columns as datetimes.
df = pd.read_csv('Data/working_data.csv', parse_dates=["starttime", "stoptime"])
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV -- TODO confirm).
df = df.drop('Unnamed: 0', axis=1)

Data set-up

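NOTE: Subsets A and B both write to `df_augOct`, so run only one of them at a time. Run subset A (weekdays) and work through the Network Analysis section, then repeat with subset B (weekends). The saved image file names are hard-coded with a `Weekday_` prefix, so remember to change them before the weekend run.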


In [85]:
"""DATA SUBSET A: AUG - OCT; WEEKDAYS; SUBSCRIBERS"""

# Row filter: trips that start in [2013-08-01, 2013-11-01), on weekdays, by subscribers.
in_window = (df.starttime >= Timestamp('2013-08-01')) & (df.starttime < Timestamp('2013-11-01'))
is_weekday = df.weektype == 'Weekday'
is_subscriber = df.usertype == 'Subscriber'
trip_columns = ['starttime', 'stoptime', 'start station id', 'end station id',
                'dayofweek', 'dayofweek_name']
df_augOct = df[in_window & is_weekday & is_subscriber][trip_columns]

# Add auxiliary station information: merge the station table in twice, once keyed
# on the start station and once on the end station, prefixing the added columns
# with 'start' / 'end' so the two copies do not collide.
augOct_aux_info = pd.read_csv('Data/station_aux_info.csv')
# Columns whose new name is simply prefix + original name.
prefixed_columns = ['stationid_new', 'station_x', 'station_y', 'neigh_id',
                    'neigh_x', 'neigh_y', 'grid_x', 'grid_y', 'grid_id']
for station_col in ['start station id', 'end station id']:
    prefix = station_col.split(' ', 1)[0]  # 'start' or 'end'
    rename_map = dict((col, prefix + col) for col in prefixed_columns)
    # These two columns are also shortened, not just prefixed.
    rename_map['borough'] = prefix + '_boro'
    rename_map['neighborhood'] = prefix + 'neigh'
    # pd.merge is called without `how`, so pandas' default join type applies.
    merged = pd.merge(df_augOct, augOct_aux_info, left_on=station_col, right_on='stationid')
    merged = merged.rename(columns=rename_map)
    # 'stationid' duplicates the key column already present in the trips frame.
    df_augOct = merged.drop('stationid', axis=1)

In [120]:
"""DATA SUBSET B: AUG - OCT; WEEKEND; ALL USER TYPES"""

# Row filter: trips that start in [2013-08-01, 2013-11-01), on weekends (any user type).
in_window = (df.starttime >= Timestamp('2013-08-01')) & (df.starttime < Timestamp('2013-11-01'))
is_weekend = df.weektype == 'Weekend'
trip_columns = ['starttime', 'stoptime', 'start station id', 'end station id',
                'dayofweek', 'dayofweek_name']
df_augOct = df[in_window & is_weekend][trip_columns]

# Add auxiliary station information: merge the station table in twice, once keyed
# on the start station and once on the end station, prefixing the added columns
# with 'start' / 'end' so the two copies do not collide.
augOct_aux_info = pd.read_csv('Data/station_aux_info.csv')
# Columns whose new name is simply prefix + original name.
prefixed_columns = ['stationid_new', 'station_x', 'station_y', 'neigh_id',
                    'neigh_x', 'neigh_y', 'grid_x', 'grid_y', 'grid_id']
for station_col in ['start station id', 'end station id']:
    prefix = station_col.split(' ', 1)[0]  # 'start' or 'end'
    rename_map = dict((col, prefix + col) for col in prefixed_columns)
    # These two columns are also shortened, not just prefixed.
    rename_map['borough'] = prefix + '_boro'
    rename_map['neighborhood'] = prefix + 'neigh'
    # pd.merge is called without `how`, so pandas' default join type applies.
    merged = pd.merge(df_augOct, augOct_aux_info, left_on=station_col, right_on='stationid')
    merged = merged.rename(columns=rename_map)
    # 'stationid' duplicates the key column already present in the trips frame.
    df_augOct = merged.drop('stationid', axis=1)

Network Analysis

Stations as nodes


In [121]:
# MATRIX BY STATIONS
# Square origin/destination trip-count matrix indexed by the renumbered station
# ids 0..331 (332 is hard-coded here; presumably the number of stations in
# station_aux_info.csv -- TODO confirm).
indexList = range(332)

# agg(len) puts each group's row count into every non-key column; 'dayofweek'
# is simply the column chosen to carry that count.
station_pair_keys = ['startstationid_new', 'endstationid_new']
pair_counts = df_augOct.groupby(station_pair_keys, as_index=False).agg(len)
augOct_agg = pair_counts[station_pair_keys + ['dayofweek']].rename(
    columns={'startstationid_new': 'startstationid',
             'endstationid_new': 'endstationid',
             'dayofweek': 'count'})

# Rows are start stations, columns are end stations; pairs with no trips become 0.
augOct_pivot = augOct_agg.pivot(index="startstationid", columns="endstationid",
                                values="count").fillna(0)
# Force the full square shape so stations without any trips still get a row/column.
augOct_matrix = augOct_pivot.reindex(index=indexList, columns=indexList, fill_value=0)

In [122]:
# DIRECTED GRAPH
# Build a weighted directed graph from the station trip-count matrix.
# `create_using` is passed by keyword on purpose: in newer networkx releases the
# second positional parameter of from_numpy_matrix is `parallel_edges`, so a
# positionally passed DiGraph would be ignored and an undirected graph built.
m = np.matrix(augOct_matrix)
g_dir = nx.from_numpy_matrix(m, create_using=nx.DiGraph())

In [88]:
# STATION COORDINATES
# Map each renumbered station id to its (x, y) position for drawing.
# If an id occurs in more than one row, the last row wins.
station_ids = augOct_aux_info['stationid_new'].values
station_xy = augOct_aux_info[['station_x', 'station_y']].values
pos = {sid: tuple(xy) for sid, xy in zip(station_ids, station_xy)}

In [89]:
# GRAPH PROPERTIES
#Access nodes and edges
nodes = g_dir.number_of_nodes()
edges = g_dir.number_of_edges()

print "Nodes: ", nodes
print "Edges: ", edges


Nodes:  332
Edges:  85344

In [123]:
# DEGREE DISTRIBUTION
# Side-by-side histogram of the in-degree and out-degree of every station node.
# dict(...) / list(...) keep this working whether the degree call returns a
# plain dict or a view of (node, degree) pairs.
in_degrees = dict(g_dir.in_degree())  # dictionary node:degree
in_values = list(in_degrees.values())

out_degrees = dict(g_dir.out_degree())  # dictionary node:degree
out_values = list(out_degrees.values())

# figure/show/close must be *called*; a bare `plt.figure` only references the
# function and does nothing.
plt.figure()
plt.hist([in_values, out_values], bins = 25, histtype='bar', color =['#006BB6','#a2d9ff'])
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.legend(['In-degree','Out-degree'], loc='best')
# Save before show(), since showing may release the figure.
plt.savefig('Weekday_station_degree_distribution.png')
plt.show()
plt.close()


Out[123]:
<function matplotlib.pyplot.close>

In [ ]:
# DEGREE DISTRIBUTION
# in_degrees = g_dir.in_degree() # dictionary node:degree
# in_values = sorted(set(in_degrees.values()))
# in_hist = [in_degrees.values().count(x) for x in in_values]

# out_degrees = g_dir.out_degree() # dictionary node:degree
# out_values = sorted(set(out_degrees.values()))
# out_hist = [out_degrees.values().count(x) for x in out_values]

# plt.figure()
# plt.plot(in_values,in_hist,'#FF0000') # in-degree
# plt.plot(out_values,out_hist,'#006BB6') # out-degree
# plt.legend(['In-degree','Out-degree'])
# plt.xlabel('Degree')
# plt.ylabel('Number of nodes')
# plt.savefig('Weekday_neigh_degree_distribution.png')
# plt.show()
# plt.close()

In [104]:
# NODE CENTRALITIES
# Centralities are computed on the largest connected component of the
# undirected version of the station graph.
g_ud = g_dir.to_undirected()
# Materialise the components and pick the largest explicitly, so this does not
# depend on the call returning a list, nor on that list being sorted by size.
g_components = list(nx.connected_component_subgraphs(g_ud))
g_c = max(g_components, key=len)

# Each call uses the library defaults (no explicit weight/normalisation arguments).
# Betweenness centrality
bet_cen_sta = nx.betweenness_centrality(g_c)
# Closeness centrality
clo_cen_sta = nx.closeness_centrality(g_c)
# Eigenvector centrality
eig_cen_sta = nx.eigenvector_centrality(g_c)

In [105]:
# MOST CENTRAL NODES 
def highest_centrality(cent_dict):
    """Return the (node, value) pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping node -> centrality value, as returned by the networkx
        centrality functions.

    Returns
    -------
    tuple
        (node, value) for the highest value. Ties on value are broken in
        favour of the largest node key.

    Raises
    ------
    IndexError
        If cent_dict is empty.
    """
    # Sort on (value, node) so the value decides and the node breaks ties.
    # .items() (rather than .iteritems()) works on both Python 2 and Python 3.
    ranked = sorted(((value, node) for node, value in cent_dict.items()), reverse=True)
    top_value, top_node = ranked[0]
    return (top_node, top_value)

In [107]:
# Report the top station by betweenness centrality. What is printed is whatever
# the currently defined highest_centrality() returns; the recorded output of
# this cell is a single (node, value) pair.
bet_central_node = highest_centrality(bet_cen_sta)
print 'Highest betweenness centrality node:\n', bet_central_node


Highest betweenness centrality node:
(111, 0.0010482254062884667)

In [ ]:
# Report the top station by closeness centrality, using whichever
# highest_centrality() definition is currently loaded in the kernel.
clo_central_node = highest_centrality(clo_cen_sta)
print 'Highest closeness centrality node:\n', clo_central_node

In [ ]:
# Report the top station by eigenvector centrality, using whichever
# highest_centrality() definition is currently loaded in the kernel.
eig_central_node = highest_centrality(eig_cen_sta)
print 'Highest eigenvector centrality node:\n', eig_central_node

In [114]:
# NETWORK VISUALIZATION
# Draw the station graph at the station coordinates; edge colour and width both
# scale with log(trip count).
plt.figure(num=None, figsize=(20,20))
plt.axis('off')

edges, weights = zip(*nx.get_edge_attributes(g_dir,'weight').items())

weights = np.array(weights)
# Computed once and reused for both colour and width.
log_weights = np.log(weights)
log_min, log_max = log_weights.min(), log_weights.max()

nx.draw_networkx_nodes(g_dir, pos, node_size=250, node_color='#A0CBE2', alpha=0.8, linewidths=0.2)
nx.draw_networkx_labels(g_dir, pos, font_size=10, font_color='#3d3d3d', font_weight='bold')
# Note: an edge with weight 1 has log(weight) == 0 and is therefore drawn with width 0.
draw_edge = nx.draw_networkx_edges(g_dir, pos, edgelist=edges, edge_color=log_weights,
        width=log_weights/5, edge_cmap=plt.cm.Blues, alpha=.5, arrows=False)
# Tick positions come from the data (min / midpoint / max of the colour scale)
# instead of constants tuned to one subset, so the three labels stay aligned
# when the notebook is re-run on a different subset.
cbar = plt.colorbar(draw_edge, ticks=[log_min, (log_min + log_max) / 2.0, log_max], shrink=.5)
cbar.ax.set_yticklabels(['Low', 'Medium', 'High'])
cbar.solids.set_edgecolor("face")

plt.savefig('Weekday_Stations.png')
plt.show()


Neighborhoods as nodes


In [124]:
# Matrix by neighborhoods (IDs start with 1)
# Square origin/destination trip-count matrix indexed by neighborhood ids 1..37.
indexList = range(1, 38)

# agg(len) puts each group's row count into every non-key column; 'dayofweek'
# is simply the column chosen to carry that count.
neigh_pair_keys = ['startneigh_id', 'endneigh_id']
neigh_pair_counts = df_augOct.groupby(neigh_pair_keys, as_index=False).agg(len)
augOct_neigh_agg = neigh_pair_counts[neigh_pair_keys + ['dayofweek']].rename(
    columns={'dayofweek': 'count'})

# Rows are start neighborhoods, columns are end neighborhoods; pairs with no trips become 0.
augOct_neigh_pivot = augOct_neigh_agg.pivot(index="startneigh_id", columns="endneigh_id",
                                            values="count").fillna(0)
# Force the full square shape so neighborhoods without any trips still get a row/column.
augOct_neigh_matrix = augOct_neigh_pivot.reindex(index=indexList, columns=indexList,
                                                 fill_value=0)

In [125]:
# DIRECTED GRAPH WITH NODES AS NEIGHBORHOODS
# Build a weighted directed graph from the neighborhood trip-count matrix.
# Nodes are numbered by matrix position starting at 0, while the matrix is
# indexed by neighborhood ids starting at 1, so node k is neighborhood id k + 1.
# `create_using` is passed by keyword on purpose: in newer networkx releases the
# second positional parameter of from_numpy_matrix is `parallel_edges`, so a
# positionally passed DiGraph would be ignored and an undirected graph built.
mn = np.matrix(augOct_neigh_matrix)
g_dir_neigh = nx.from_numpy_matrix(mn, create_using=nx.DiGraph())

In [128]:
# NEIGHBORHOOD COORDINATES
# Map each graph node to its neighborhood's (x, y) position. Neighborhood ids
# are shifted down by one to give the keys used for the graph nodes.
# The station table repeats a neighborhood on several rows; the last row wins.
neigh_node_ids = augOct_aux_info['neigh_id'].values - 1
neigh_xy = augOct_aux_info[['neigh_x', 'neigh_y']].values
pos_neigh = {nid: tuple(xy) for nid, xy in zip(neigh_node_ids, neigh_xy)}

In [ ]:
# GRAPH PROPERTIES
#Access nodes and edges
nodes = g_dir_neigh.number_of_nodes()
edges = g_dir_neigh.number_of_edges()

print "Nodes: ", nodes
print "Edges: ", edges

In [126]:
# DEGREE DISTRIBUTION
# Side-by-side histogram of the in-degree and out-degree of every neighborhood node.
# dict(...) / list(...) keep this working whether the degree call returns a
# plain dict or a view of (node, degree) pairs.
in_degrees = dict(g_dir_neigh.in_degree())  # dictionary node:degree
in_values = list(in_degrees.values())

out_degrees = dict(g_dir_neigh.out_degree())  # dictionary node:degree
out_values = list(out_degrees.values())

# figure/show/close must be *called*; a bare `plt.figure` only references the
# function and does nothing.
plt.figure()
plt.hist([in_values, out_values], bins = 20, histtype='bar', color =['#006BB6','#a2d9ff'])
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.legend(['In-degree','Out-degree'], loc='best')
# Save before show(), since showing may release the figure.
plt.savefig('Weekday_neigh_degree_distribution.png')
plt.show()
plt.close()


Out[126]:
<function matplotlib.pyplot.close>

In [ ]:
# # DEGREE DISTRIBUTION
# in_degrees = g_dir_neigh.in_degree() # dictionary node:degree
# in_values = sorted(set(in_degrees.values()))
# in_hist = [in_degrees.values().count(x) for x in in_values]

# out_degrees = g_dir_neigh.out_degree() # dictionary node:degree
# out_values = sorted(set(out_degrees.values()))
# out_hist = [out_degrees.values().count(x) for x in out_values]

# plt.figure()
# plt.plot(in_values,in_hist,'#FF0000') # in-degree
# plt.plot(out_values,out_hist,'#006BB6') # out-degree
# plt.legend(['In-degree','Out-degree'])
# plt.xlabel('Degree')
# plt.ylabel('Number of nodes')
# plt.savefig('Weekday_neigh_degree_distribution.png')
# plt.show()
# plt.close()

In [ ]:
# NODE CENTRALITIES
# Centralities are computed on the largest connected component of the
# undirected version of the neighborhood graph.
g_neigh_ud = g_dir_neigh.to_undirected()
# Materialise the components and pick the largest explicitly, so this does not
# depend on the call returning a list, nor on that list being sorted by size.
g_neigh_components = list(nx.connected_component_subgraphs(g_neigh_ud))
g_neigh_nc = max(g_neigh_components, key=len)

# Each call uses the library defaults (no explicit weight/normalisation arguments).
# Betweenness centrality
bet_cen_neigh = nx.betweenness_centrality(g_neigh_nc)
# Closeness centrality
clo_cen_neigh = nx.closeness_centrality(g_neigh_nc)
# Eigenvector centrality
eig_cen_neigh = nx.eigenvector_centrality(g_neigh_nc)

In [ ]:
# MOST CENTRAL NODES 
def highest_centrality(cent_dict):
    """Return all centrality entries ranked from highest to lowest value.

    NOTE(review): this redefinition replaces the earlier function of the same
    name in this notebook, which returned only the single top (node, value)
    pair. Here the whole ranking is returned and each entry is ordered
    (value, node).

    Parameters
    ----------
    cent_dict : dict
        Mapping node -> centrality value, as returned by the networkx
        centrality functions.

    Returns
    -------
    tuple of (value, node) tuples
        Sorted in descending order of value; ties on value are ordered by
        descending node key. Empty input gives an empty tuple.
    """
    # .items() (rather than .iteritems()) works on both Python 2 and Python 3.
    return tuple(sorted(((value, node) for node, value in cent_dict.items()), reverse=True))

In [ ]:
# Report neighborhood betweenness centrality. With the highest_centrality()
# defined in this section, the printed value is the full descending ranking of
# (value, node) pairs rather than a single node.
bet_central_node = highest_centrality(bet_cen_neigh)
print 'Highest betweenness centrality node:\n', bet_central_node

In [ ]:
# Report neighborhood closeness centrality. With the highest_centrality()
# defined in this section, the printed value is the full descending ranking of
# (value, node) pairs rather than a single node.
clo_central_node = highest_centrality(clo_cen_neigh)
print 'Highest closeness centrality node:\n', clo_central_node

In [ ]:
# Report neighborhood eigenvector centrality. With the highest_centrality()
# defined in this section, the printed value is the full descending ranking of
# (value, node) pairs rather than a single node.
eig_central_node = highest_centrality(eig_cen_neigh)
print 'Highest eigenvector centrality node:\n', eig_central_node

In [129]:
# NETWORK VISUALIZATION
# Draw the neighborhood graph at the neighborhood coordinates; edge colour and
# width both scale with log(trip count).
plt.figure(num=None, figsize=(20,20))
plt.axis('off')

edges, weights = zip(*nx.get_edge_attributes(g_dir_neigh,'weight').items())

weights = np.array(weights)
# Computed once and reused for both colour and width.
log_weights = np.log(weights)
log_min, log_max = log_weights.min(), log_weights.max()

nx.draw_networkx_nodes(g_dir_neigh, pos_neigh, node_size=420, node_color='#A0CBE2', alpha=0.8, linewidths=0.5)
nx.draw_networkx_labels(g_dir_neigh, pos_neigh, font_size=12, font_color='#3d3d3d', font_weight='bold')
# Note: an edge with weight 1 has log(weight) == 0 and is therefore drawn with width 0.
draw_edge = nx.draw_networkx_edges(g_dir_neigh, pos_neigh, edgelist=edges, edge_color=log_weights,
        width=log_weights/3, edge_cmap=plt.cm.Blues, alpha=.6, arrows=False)
# Tick positions come from the data (min / midpoint / max of the colour scale)
# instead of constants tuned to one subset, so the three labels stay aligned
# when the notebook is re-run on a different subset.
cbar = plt.colorbar(draw_edge, ticks=[log_min, (log_min + log_max) / 2.0, log_max], shrink=.5)
cbar.ax.set_yticklabels(['Low', 'Medium', 'High'])
cbar.solids.set_edgecolor("face")
plt.savefig('Weekday_Neigh.png')
plt.show()