This dataset contains data for 25 years[1995-2015] of flights between various US airports and metadata about these routes. Taken from Bureau of Transportation Statistics, United States Department of Transportation.
Let's see what can we make out of this!
In [ ]:
%matplotlib inline
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pass_air_data = pd.read_csv('datasets/passengers.csv')
In the pass_air_data
dataframe we have the information of number of people that fly every year on a particular route on the list of airlines that fly that route.
In [ ]:
pass_air_data.head()
In [ ]:
# Create a MultiDiGraph from this dataset
passenger_graph = nx.from_pandas_edgelist(pass_air_data, source='ORIGIN', target='DEST', edge_attr=['YEAR', 'PASSENGERS', 'UNIQUE_CARRIER_NAME'], create_using=nx.MultiDiGraph())
In [ ]:
passenger_graph['CLE']['ORD'][25]
In [ ]:
temp = [(i['YEAR'], i['PASSENGERS'])for i in dict(passenger_graph['CLE']['ORD']).values()]
x, y = zip(*temp)
plt.plot(x, y)
plt.show()
In [ ]:
temp = pass_air_data.groupby(['YEAR'])['PASSENGERS'].transform(max) == pass_air_data['PASSENGERS']
In [ ]:
pass_air_data[temp][pass_air_data.YEAR.isin([1990, 2015])]
In [ ]:
pass_air_data[(pass_air_data['ORIGIN'] == 'LAX') & (pass_air_data['DEST'] == 'HNL')].plot('YEAR', 'PASSENGERS')
In [ ]:
pass_air_data[(pass_air_data['ORIGIN'] == 'LAX') & (pass_air_data['DEST'] == 'SFO')].plot('YEAR', 'PASSENGERS')
So let's have a look at the important nodes in this network, i.e. important airports in this network. We'll use pagerank, betweenness centrality and degree centrality.
In [ ]:
# nx.pagerank(passenger_graph)
In [ ]:
def year_network(G, year):
temp_g = nx.DiGraph()
for i in G.edges(data=True):
if i[2]['YEAR'] == year:
temp_g.add_edge(i[0], i[1], weight=i[2]['PASSENGERS'])
return temp_g
In [ ]:
pass_2015 = year_network(passenger_graph, 2015)
In [ ]:
len(pass_2015)
In [ ]:
len(pass_2015.edges())
In [ ]:
# Load in the GPS coordinates of all the airports
lat_long = pd.read_csv('datasets/GlobalAirportDatabase.txt', delimiter=':', header=None)
In [ ]:
lat_long[lat_long[1].isin(list(pass_2015.nodes()))]
In [ ]:
pos_dict = {}
for airport in lat_long[lat_long[1].isin(list(pass_2015.nodes()))].iterrows():
pos_dict[airport[1][1]] = (airport[1][15], airport[1][14])
In [ ]:
pos_dict
Using the position dictionary pos_dict
create a plot of the airports, only the nodes not the edges.
nx.subgraph(Graph, iterable of nodes)
to create the subgraphnx.draw_networkx_nodes(G, pos)
to map the nodes. or
In [ ]:
plt.figure(figsize=(20, 9))
G = nx.subgraph(pass_2015, pos_dict.keys())
nx.draw_networkx_nodes(G, pos=pos_dict, node_size=10, alpha=0.6, node_color='b')
# nx.draw_networkx_edges(G, pos=pos_dict, width=0.1, arrows=False)
plt.show()
In [ ]:
plt.figure(figsize=(20, 9))
x = [i[0] for i in pos_dict.values()]
y = [i[1] for i in pos_dict.values()]
plt.scatter(x, y)
In [ ]:
plt.hist(list(nx.degree_centrality(pass_2015).values()))
plt.show()
Let's plot a log log plot to get a better overview of this.
In [ ]:
d = {}
for i, j in dict(nx.degree(pass_2015)).items():
if j in d:
d[j] += 1
else:
d[j] = 1
x = np.log2(list((d.keys())))
y = np.log2(list(d.values()))
plt.scatter(x, y, alpha=0.4)
plt.show()
In [ ]:
G = nx.DiGraph()
G.add_edge(1, 2, weight=1)
# print(G.edges())
# G[1][2]
# G[2][1]
# G.is_directed()
# type(G)
In [ ]:
G.add_edges_from([(1, 2), (3, 2), (4, 2), (5, 2), (6, 2), (7, 2)])
nx.draw_circular(G, with_labels=True)
In [ ]:
G.in_degree()
In [ ]:
nx.pagerank(G)
In [ ]:
G.add_edge(5, 6)
nx.draw_circular(G, with_labels=True)
In [ ]:
nx.pagerank(G)
In [ ]:
G.add_edge(2, 8)
nx.draw_circular(G, with_labels=True)
In [ ]:
nx.pagerank(G)
In [ ]:
sorted(nx.pagerank(pass_2015, weight=None).items(), key=lambda x:x[1], reverse=True)[:10]
In [ ]:
sorted(nx.betweenness_centrality(pass_2015).items(), key=lambda x:x[1], reverse=True)[0:10]
In [ ]:
sorted(nx.degree_centrality(pass_2015).items(), key=lambda x:x[1], reverse=True)[0:10]
'ANC' is the airport code of Anchorage airport, a place in Alaska, and according to pagerank and betweenness centrality it is the most important airport in this network Isn't that weird? Thoughts?
related blog post: https://toreopsahl.com/2011/08/12/why-anchorage-is-not-that-important-binary-ties-and-sample-selection/
Let's look at weighted version, i.e taking into account the number of people flying to these places.
In [ ]:
sorted(nx.betweenness_centrality(pass_2015, weight='weight').items(), key=lambda x:x[1], reverse=True)[0:10]
In [ ]:
sorted(nx.pagerank(pass_2015, weight='weight').items(), key=lambda x:x[1], reverse=True)[0:10]
In [ ]:
# nx.average_shortest_path_length(pass_2015)
Wait, What??? This network is not connected. That seems like a really stupid thing to do.
In [ ]:
list(nx.weakly_connected_components(pass_2015))
In [ ]:
pass_air_data[(pass_air_data['YEAR'] == 2015) & (pass_air_data['ORIGIN'] == 'AIK')]
In [ ]:
pass_2015.remove_nodes_from(['SPB', 'SSB', 'AIK'])
In [ ]:
nx.is_weakly_connected(pass_2015)
In [ ]:
nx.is_strongly_connected(pass_2015)
In [ ]:
G = nx.DiGraph()
G.add_edge(1, 2)
G.add_edge(2, 3)
G.add_edge(3, 1)
nx.draw(G)
In [ ]:
G.add_edge(3, 4)
nx.draw(G)
In [ ]:
nx.is_strongly_connected(G)
In [ ]:
list(nx.strongly_connected_components(pass_2015))
In [ ]:
pass_air_data[(pass_air_data['YEAR'] == 2015) & (pass_air_data['DEST'] == 'TSP')]
In [ ]:
pass_2015_strong = max(nx.strongly_connected_components_recursive(pass_2015), key=len)
In [ ]:
len(pass_2015_strong)
In [ ]:
nx.average_shortest_path_length(pass_2015_strong)
How can we decrease the avg shortest path length of this network?
Think of an effective way to add new edges to decrease the avg shortest path length. Let's see if we can come up with a nice way to do this, and the one who gets the highest decrease wins!!!
The rules are simple:
[10 mins]
In [ ]:
sort_degree = sorted(nx.degree_centrality(pass_2015_strong).items(), key=lambda x:x[1], reverse=True)
top_count = 0
for n, v in sort_degree:
count = 0
for node, val in sort_degree:
if node != n:
if node not in pass_2015_strong.adj[n]:
pass_2015_strong.add_edge(n, node)
count += 1
if count == 25:
break
top_count += 1
if top_count == 20:
break
In [ ]:
nx.average_shortest_path_length(pass_2015_strong)
In [ ]:
passenger_graph['JFK']['SFO'][25]
In [ ]:
def str_to_list(a):
return a[1:-1].split(', ')
In [ ]:
for i in str_to_list(passenger_graph['JFK']['SFO'][25]['UNIQUE_CARRIER_NAME']):
print(i)
In [ ]:
%%time
for origin, dest in passenger_graph.edges():
for key in passenger_graph[origin][dest]:
passenger_graph[origin][dest][key]['airlines'] = str_to_list(passenger_graph[origin][dest][key]['UNIQUE_CARRIER_NAME'])
Play around with United Airlines network.
passenger_graph
for the year 2015
In [ ]:
united_network = nx.DiGraph()
for origin, dest in passenger_graph.edges():
if 25 in passenger_graph[origin][dest]:
if "'United Air Lines Inc.'" in passenger_graph[origin][dest][25]['airlines']:
united_network.add_edge(origin, dest, weight=passenger_graph[origin][dest][25]['PASSENGERS'])
In [ ]:
len(united_network)
In [ ]:
len(united_network.edges())
In [ ]:
sorted(nx.pagerank(united_network, weight='weight').items(), key=lambda x:x[1], reverse=True)[0:10]
In [ ]:
sorted(nx.degree_centrality(united_network).items(), key=lambda x:x[1], reverse=True)[0:10]
In [ ]:
pass_air_data[(pass_air_data.ORIGIN == 'CLE') &
(pass_air_data.UNIQUE_CARRIER_NAME.str.contains('United Air Lines Inc.'))
].groupby('YEAR')['PASSENGERS'].sum().plot()