In [1]:
import networkx as nx
import csv
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import gc
%matplotlib inline
Reading graph edges and nodes:
In [2]:
with open('../csv_files/metro_edges_no_duplicated_edges_no_cycles_speed_networkx.csv') as f:
f.readline()
# Source,Target,Weight,edge_name,edge_color,travel_seconds,longitude_Source,latitude_Source,longitude_Target,latitude_Target,distance_meters,speed_ms
g = nx.parse_edgelist(f, delimiter=',', nodetype=int, data=(('Weight', float), ('edge_name', str), ('edge_color', str), ('travel_seconds', float), ('longitude_Source', float), ('latitude_Source', float), ('longitude_Target', float), ('latitude_Target', float), ('distance_meters', float), ('speed_ms', float) ), create_using = nx.MultiGraph())
with open('../csv_files/metro_gephi_nodes_coordinates.csv') as f:
reader = csv.DictReader(f)
node_latitudes = {}
node_longitudes = {}
node_names = {}
for row in reader:
node_latitudes[ int(row['Id']) ] = float(row['latitude'])
node_longitudes[ int(row['Id']) ] = float(row['longitude'])
node_names[ int(row['Id']) ] = row['Label']
nx.set_node_attributes(g, name = 'latitude', values = node_latitudes)
nx.set_node_attributes(g, name = 'longitude', values = node_longitudes)
nx.set_node_attributes(g, name = 'name', values = node_names)
In [4]:
def top_n_stations_by_attribute(graph, attr_name, n, ascending = False):
return pd.DataFrame.from_records(map(lambda x: x[1], list(graph.nodes(data=True)) ))[['name', attr_name]].sort_values(attr_name, ascending = ascending)[:(n+1)].reset_index(drop=True).shift()[1:]
Top 35 stations with more neighbour stations
In [104]:
nx.set_node_attributes(g, name = 'degree', values = dict(g.degree))
stations_and_their_neighbours = top_n_stations_by_attribute(g, 'degree', len(g.edges()))
stations_and_their_neighbours[:35]
Out[104]:
Neighbours' count histogram
In [105]:
sns.plt.figure(figsize=(15, 10))
sns.distplot(stations_and_their_neighbours['degree'], kde=False, rug=False, bins = np.arange(8)-0.5, hist_kws=dict(edgecolor='k', linewidth=2))
sns.plt.show()
In [106]:
stations_and_their_neighbours.groupby('degree').count().rename(columns={'name': 'count'})
Out[106]:
Most of the stations are connected to two other stations.
There are 10 Metro stations which are connected to only 1 stations: these stations are the most vulnerable because they strongly depend on other stations to remain connected to the Madrid Metro network.
Calculating stations importance using Closeness Centrality: This metric indicates how long it will take for information (people) from a node (station) "u" will take to reach other nodes (stations) in the network. We can determine which is the closest node (station) to any other node (station) in the network.
In [107]:
nx.set_node_attributes(g, name = 'closeness_centrality', values = nx.closeness_centrality(g, distance = 'travel_seconds'))
Top 20 most important (according to Closeness Centrality algorithm) Metro stations are shown
In [108]:
top_n_stations_by_attribute(g, 'closeness_centrality', 20)
Out[108]:
Another metric to have in mind: Betweenness Centrality. This metric indicates how often a node (station) is found on a shortest path between two nodes (stations) in the network.
In [109]:
nx.set_node_attributes(g, name = 'betweenness_centrality', values = nx.betweenness_centrality(g, normalized = True, weight = 'Weight'))
Top 20 most important (according to Betweenness Centrality algorithm) Metro stations are shown
In [110]:
top_n_stations_by_attribute(g, 'betweenness_centrality', 20)
Out[110]:
It seems Gregorio Marañón is quite important again, and thanks to the Betweenness Centrality we can say Gregorio Marañón is the Metro station that controls the most the Metro network because more information (people) can pass through it than on any other node (station).
Now let's calculate the articulation points: a node (Metro Station) is an articulation point if the graph gets disconnected (some Metro Stations will stop being connected somehow to other Metro Stations) by removing that node (station).
In [111]:
dict_nodes = g.node_dict_factory(g.nodes)
In [112]:
articulation_stations = list(nx.articulation_points(nx.Graph(g)))
print('%i articulation stations out of %i stations (%0.3f %%) \n' % (len(articulation_stations), len(dict_nodes.keys()), (len(articulation_stations) * 100)/len(dict_nodes.keys()) ) )
aux_list = []
for i in articulation_stations:
aux_list.append(dict_nodes[i]['name'])
aux_list.sort()
aux_list
Out[112]:
As you can see that was a long list. However, there are Metro Stations which are more critical than others. For example, if Puerta del Sur would stop working, none of the Metro Sur (Line 12) stations could reach any Madrid Metro Station because Puerta del Sur is the only one station that connects the Line 12 with the rest of the Metro network.
So in order to know which articulation stations are the most critical, I'm gonna count the number of connected components (the sets of connected Metro Stations) and the number of nodes (stations) each component has.
In [ ]:
art_points_dict = {}
for i in articulation_stations:
g_copy = g.copy()
g_copy.remove_node(i)
art_points_dict[dict_nodes[i]['name']] = {'n_ccs': nx.number_connected_components(g_copy)}
# print(dict_nodes[i]['name'])
# print('')
# print('Connected components if that node (station) is removed: %i' % art_points_dict[dict_nodes[i]['name']]['n_ccs'])
cc_i = 1
n_nodes_ccs = []
for cc in nx.connected_components(g_copy):
art_points_dict[dict_nodes[i]['name']]['nodes_cc_' + str(cc_i)] = len(cc)
# print('Nodes (stations) in connected component %i: %i' % (cc_i, art_points_dict[dict_nodes[i]['name']]['nodes_cc_' + str(cc_i)]))
n_nodes_ccs.append(art_points_dict[dict_nodes[i]['name']]['nodes_cc_' + str(cc_i)])
cc_i += 1
n_nodes_ccs.sort(reverse = True)
art_points_dict[dict_nodes[i]['name']]['n_isolated_nodes'] = sum(n_nodes_ccs[1:])
# print('Number of isolated nodes (stations): %i' % art_points_dict[dict_nodes[i]['name']]['n_isolated_nodes'])
# print('-------------------------------------------------------------------------------------------')
del g_copy
gc.collect()
In [114]:
art_points_info_df = pd.DataFrame.from_dict(art_points_dict, orient='index')[['n_ccs', 'n_isolated_nodes', 'nodes_cc_1', 'nodes_cc_2', 'nodes_cc_3']]
Top 20 most critical articulation Metro stations:
In [115]:
art_points_info_df.sort_values('n_isolated_nodes', ascending = False)[:20]
Out[115]:
As you can see, Casa de Campo is the most critical Metro Station because removing it...
Of course, any station between Cuatro Vientos and Puerta del Sur (both included) are extremely critical and would had similar effects of those written above.
Pueblo Nuevo is another critical station to have in mind, since is the only one that connects 6 Line 5 stations (Ciudad Lineal, Suanzes, Torre Arias, Canillejas, El Capricho and Alameda Osuna) and 13 Line 7 stations (Ascao, García Noblejas, Simancas, San Blas, Las Musas, Estadio Metropolitano, Barrio del Puerto, La Rambla, San Fernando, Jarama, Henares and Hospital del Henares) to the rest of the Madrid Metro network. Line 7 connects Madrid with Coslada and San Fernando de Henares, so again cities would get disconnected from Madrid. Pueblo Nuevo case is special because it generates 3 connected componets.
Similar conclusions can be drawn in the cases of Sainz de Baranda and Chamartín: again there would be a disconnection between Madrid and other towns (not to mention many Madrid wards would get disconnected from the resto of Madrid Metro network too).
Though Plaza Elíptica doesn't appear on the Top 20 list, it's a critical station too because it's the only station that connects Line 11 with the rest of Madrid Metro network.
My final conclusion on this topic is that towns and cities (some Madrid wards too, specially those in the most outer Madrid areas) are very weakly connected to the rest of the Metro network having many critical unique points of failure.
Eigenvector centrality: a node (station) importance is based on how important its neighbours are.
In [223]:
nx.set_node_attributes(g, name = 'eigenvector_centrality', values = nx.eigenvector_centrality_numpy(g, weight = 'Weight', max_iter = 1000))
Top 20 most important Metro stations based on Eigenvector Centrality.
In [224]:
top_n_stations_by_attribute(g, 'eigenvector_centrality', 20)
Out[224]:
This looks interesting: as you can see, Gregorio Marañón is not the most important station according to this centrality measure. Avenida América it's important persé and it's connected to other important stations like Diego de León (2th position), Núñez de Balboa (3th position), Manuel Becerra (4th position) and Gregorio Marañón (6th position).
I can see there are two important stations in different areas: one is Avenida América and the other one is Alonso Martínez, many Metro lines pass through those stations (lines 6, 4, 7 and 9 pass through Avenida América and lines 10, 5 and 4 pass through Alonso Martinez). Both stations are connected by Gregorio Marañón.
Though the data doesn't take this into account, Noviciado and Plaza España are connected underground: the problem is they aren't directly connected by railways, you need to walk through the stations. So we could think about Plaza España and Noviciado as an important unique station that joins up to 3 lines (line 2, line 3 and line 10).
Let's get some info about edges speeds and distances:
In [4]:
aux_edges_dict = {}
i = 0
for edge in list(g.edges(data = True)):
edge[2]['Source'] = node_names[edge[0]]
edge[2]['Target'] = node_names[edge[1]]
aux_edges_dict[i] = edge[2]
i += 1
edges_pd = pd.DataFrame.from_dict(aux_edges_dict, orient='index')[['Source', 'Target', 'edge_name', 'distance_meters', 'travel_seconds', 'speed_ms', 'Weight', 'edge_color', 'longitude_Source', 'latitude_Source', 'longitude_Target', 'latitude_Target']]
Top 20 further edges (connections between stations)
In [38]:
edges_pd.sort_values('distance_meters', ascending = False)[:20]
Out[38]:
Top 20 closest edges (connections between stations)
In [39]:
edges_pd.sort_values('distance_meters')[:20]
Out[39]:
Exploring Metro lines distances and travel times. First let's take a look at distances.
In [79]:
edges_dist_grouped = edges_pd[['edge_name', 'distance_meters']].groupby(by = 'edge_name').describe().iloc[[0, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, 12],:]
edges_dist_grouped['distance_meters'].sort_values('mean', ascending = False)
Out[79]:
In [95]:
metro_colors = ['#2DBEF0', '#ED1C24', '#FFD000', '#B65518', '#8FD400', '#98989B', '#EE7518', '#EC82B1', '#A60084', '#005AA9', '#009B3A', '#A49800', '#FFFFFF']
In [98]:
metro_order = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'R']
In [100]:
plt.figure(figsize=(15, 10))
sns.boxplot(data=edges_pd, x = 'edge_name', y = 'distance_meters', order = metro_order, palette = metro_colors).set_title('Distance by Metro Line (geodesic, meters)')
plt.show()
Mean distance per edge (stations connections) per line
In [77]:
edges_dist_grouped = edges_dist_grouped.assign(dist_per_edge = edges_dist_grouped['distance_meters']['mean'] / (edges_dist_grouped['distance_meters']['count'] - 1) )
edges_dist_grouped = edges_dist_grouped.assign(dist_per_edge_std = edges_dist_grouped['distance_meters']['std'] / (edges_dist_grouped['distance_meters']['count'] - 1) )
edges_dist_grouped.sort_values('dist_per_edge', ascending = False).iloc[1:,:] # I removed the first line because it was useless (R line, only two stations)
Out[77]:
It's clear line 8 is the line with the highest mean distance between stations.
To have in mind the number of stations and the variability of the values, I plotted a box-plot and calculated the mean distance between stations by edge (a pair of stations connected).
This last part reveals that line 8 is again the line whose stations are further from each other. We can find line 11 in the second place, with a mean distance per edge (pair of connected stations) of 204.66 meters (+/- 96.31 meters).
Let's take a look now at travel times (in seconds):
In [80]:
edges_time_grouped = edges_pd[['edge_name', 'travel_seconds']].groupby(by = 'edge_name').describe().iloc[[0, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, 12],:]
edges_time_grouped['travel_seconds'].sort_values('mean', ascending = False)
Out[80]:
In [96]:
plt.figure(figsize=(15, 10))
sns.boxplot(data=edges_pd, x = 'edge_name', y = 'travel_seconds', order = metro_order, palette = metro_colors).set_title('Travel time by Metro Line (seconds)')
plt.show()
"Surprise": Line 12 is the line with the worst mean time travel between stations. It's even higher in the chart than line 8 (remember we said above line 8 is the line with the highest mean distance between stations).
It happens kind of the same with other lines. I suggest you to take the time to compare the positions of some Metro lines in the mean distances chart and in the travel times' chart.
Maybe claiming that line 12 is the worst one only by looking at this plot might not be the most rigorous thing to do. Let's do an hypothesis test.
Please correct me if I'm wrong, but I think the H0 would be this: Line 12 median travel time == the rest of lines median times. And H1 would be: Line 12 median travel time > the rest of lines median times.
In [10]:
# Checking if data is normally distributed with a significance level (alpha) of 0.05.
# H0: the data is normally distributed
import scipy
alpha = 0.05
statistic, pvalue = scipy.stats.normaltest(edges_pd[['travel_seconds']])
pvalue[0]
Out[10]:
Because p-value < alpha, the null hypothesis can be rejected: The data is not normally distributed.
Let's test if variances are equal:
In [96]:
for i in metro_lines_to_use:
_, pvalue = scipy.stats.levene(edges_pd[edges_pd.edge_name == '12'][['travel_seconds']],
edges_pd[edges_pd.edge_name == i][['travel_seconds']])
print('Line 12 and Line %s' % i)
print('p-value: %f' % pvalue)
print('p-value < %f ?: %r' % (alpha, pvalue[0] < alpha))
print('-----------------------------------------------')
That means we cannot perform some types of hypothesis tests because some of them assume the data is normally distributed (among other conditions, like the equality of variances among groups. We tested that on the previous chunk and for every pair of groups we cannot reject nor accept H0 (that variances are equal)).
Thus, we're gonna apply a non-parametric method that allows to perform hypothesis test with more relaxed conditions.
In this case we did a Mann-Whitney test to check if medians between groups are equal (H0) or Line 12 median is greater than other group median.
In [ ]:
metro_lines_to_use = edges_pd.edge_name.unique()[:-1]
metro_lines_to_use = np.delete(metro_lines_to_use, 8) # Removing 'R' line
In [94]:
for i in metro_lines_to_use:
_, pvalue = scipy.stats.mannwhitneyu(edges_pd[edges_pd.edge_name == '12'][['travel_seconds']],
edges_pd[edges_pd.edge_name == i][['travel_seconds']],
use_continuity=False, alternative='greater')
print('Line 12 and Line %s' % i)
print('p-value: %f' % pvalue)
print('p-value < %f ?: %r' % (alpha, pvalue < alpha))
print('-----------------------------------------------')
Take a look at the Line 12 and Line 8 case. The null hypothesis cannot be rejected in favor of the alternative hypothesis. If you see the box-plot, it's hard to say if the medians are equal or not, they are too close but we can't be sure. This test proves then that we can't say Line 12 has the worst travel times. However, I must say that, from my experience, using Metro Sur (Line 12) can be a real pain in the ass because it's too slow and I can't say the same about Line 8.
I'm gonna plot speed (meters / second) vs distance to check the linear correlation between them:
In [148]:
plt.figure(figsize=(15, 10))
sns.lmplot(x = 'distance_meters', y = 'speed_ms', col = 'edge_name', data = edges_pd, col_wrap = 4, size = 5,
hue = 'edge_name',
hue_order = metro_order,
palette = metro_colors,
col_order = metro_order)
plt.show()
In [143]:
plt.figure(figsize=(15, 10))
sns.regplot(data=edges_pd, x = 'distance_meters', y = 'speed_ms', fit_reg = True, scatter = True, scatter_kws={'facecolors': metro_colors}, line_kws={'color': 'black'})
plt.show()
It seems to me (and please correct me if I'm wrong) that if you look at every Metro Line individually, for almost every point, the following statement applies: the further two stations are the faster the Metro is (except in some cases for lines 6 and 7).
But if we plot all the observations, the previous statement doesn't apply, specially in those cases we can observe better on the plot (those outside the main cloud at the bottom left).
For non-compliant cases from lines 6 and 7 there's maybe a reason, but I'm not a train expert so if you are a Madrid Metro expert I'd be glad to hear your opinion.