In [1]:
import geopy
from geopy.distance import distance #calculates distance based on coordinates
from geopy.geocoders import Nominatim
import operator
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import urllib3
from bs4 import BeautifulSoup
urllib3.disable_warnings()
from urllib.parse import quote
import unicodedata
import string
import re
import warnings
warnings.filterwarnings('ignore')
Pull up-to-date data from github
In [2]:
!bash download_data.sh
In [3]:
airports = pd.read_csv('airports.dat', header=None, names=
["AirportID","Name", "City", "Country", "IATA", "ICAO",
"Latitude", "Longitude", "Altitude", "Timezone", "DST", "TzDatabaseTimeZone",
"Type", "Source"],
na_values='\\N')
airlines = pd.read_csv('airlines.dat', header=None, names=
["AirlineID", "Name", "Alias", "IATA", "ICAO", "Callsign", "Country", "Active"]
,na_values='\\N')
routes = pd.read_csv('routes.dat', header=None, names=
['Airline', 'AirlineID', 'SourceAirport', 'SourceAirportID', 'DestinationAirport',
'DestinationAirportID', 'Codeshare', 'Stops', 'Equipment'],
na_values='\\N')
planes = pd.read_csv('planes.dat', header=None, names=['Name', 'IATA code', 'ICAO code'])
In [4]:
routes.head()
Out[4]:
In [5]:
airlines.head()
Out[5]:
Only keep airports that appear in the routes dataframe:
In [6]:
valid_airports = set(routes.SourceAirport).union(set(routes.DestinationAirport))
airports = airports[airports.IATA.isin(valid_airports)]
Only keep airlines that appear in both the airlines and the routes dataframes:
In [7]:
valid_airlines = set(routes.AirlineID)
In [8]:
airlines = airlines[airlines.AirlineID.isin(valid_airlines)]
routes = routes[routes.AirlineID.isin(valid_airlines)]
We check that for each airline we have exactly one edge between a given source and destination. This means that our graph will be unweighted.
In [9]:
routes_by_airline = routes[['SourceAirport', 'DestinationAirport', 'Airline']]
routes_by_airline.drop_duplicates().shape == routes_by_airline.shape
Out[9]:
Only take direct flights, only very few have more than that anyway
In [10]:
routes = routes[routes.Stops == 0]
Airports to fill in information for :
In [11]:
len(valid_airports - set(airports.IATA))
Out[11]:
Web crawling: get IATA code, airport name and address from the list
In [12]:
# Scrape the Wikipedia "List of airports by IATA code" pages (A-Z) and
# collect every table row, then keep only the IATA codes that are still
# missing from the airports frame.
all_data = []
to_find = valid_airports - set(airports.IATA)
for u in string.ascii_uppercase:
    url = u'https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:_' + quote(u)
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET', url)
    html = r.data
    tree = BeautifulSoup(html, "lxml")
    # First <table> on the page holds the airport listing.
    table_tag = tree.select("table")[0]
    tab_data = [[item.text.replace('\n', '') for item in row_data.select("th,td")]
                for row_data in table_tag.select("tr")]
    all_data = all_data + tab_data
# Row 0 supplies the column names; [2:] skips the first two rows —
# presumably both are header rows, TODO confirm against the live page.
df = pd.DataFrame(all_data[2:], columns=all_data[0])
df = df[df.IATA.isin(to_find)]
In [13]:
df.count()
Out[13]:
In [14]:
#makes calls to the API to get a location based on the address
geolocator = Nominatim(timeout=3, user_agent="my-application")
In [15]:
def get_info(address):
    """Geocode a free-text address and return (country, longitude, latitude).

    Returns (None, None, None) when the lookup fails or the response lacks
    the expected fields, so callers can fill NaNs instead of crashing.
    """
    try:
        location = geolocator.geocode(address, addressdetails=True, language='en')
        return location.raw['address']['country'], location.raw['lon'], location.raw['lat']
    except Exception:
        # Narrowed from a bare `except:`: geocode may return None
        # (AttributeError), the payload may miss keys (KeyError), or geopy
        # may raise on timeouts/service errors. A bare except would also
        # swallow KeyboardInterrupt during the long mapping loop.
        return None, None, None
Cleaning the dataframe:
In [16]:
details = df['Location\xa0served'].map(lambda x: get_info(x))
df['Country'] = details.map(lambda x: x[0])
df['Longitude'] = details.map(lambda x: x[1])
df['Latitude'] = details.map(lambda x: x[2])
df['Airport\xa0name'] = df['Airport\xa0name'].map(lambda x: re.sub('[\d\[\]]*|\([A-Z\ \:\d]*\)', '', x).strip())
df.rename(columns={'Airport\xa0name': 'Name', 'Location\xa0served': 'City'}, inplace=True)
df['Country'] = df.City.map(lambda x: x.split(',')[-1])
df['City'] = df.City.map(lambda x: x.split(',')[0])
del df['DST']
Manual additions to the dataset:
In [17]:
newMatrix = [['Altay Air Base', 'AAT', 47.7498856, 88.0858078, 'Altay', 'China', 'ZWAT'],
['Baise Youjiang Airport ', 'AEB', 23.7206001, 106.9599991, 'Baise', 'China', 'ZGBS'],
['Tasiilaq', 'AGM', 65.6122961, -37.6183355, 'Tasiilaq', 'Greenland', 'BGAM'],
['Atmautluak Airport', 'ATT', 60.8666992, -162.272995, 'Atmautluak', 'United States', ''],
['Branson Airport', 'BKG', 36.532082, -93.200544, 'Branson', 'United States', 'KBBG'],
['Baoshan Yunduan Airport', 'BSD', 25.0533009, 99.1682968, 'Baoshan', 'United States', 'ZPBS'],
['Laguindingan Airport', 'CGY', 8.612203, 124.456496, 'Cagayan de Oro City', 'Philippines', ''],
['Chuathbaluk Airport', 'CHU', 61.579102, -159.216003, 'Chuathbaluk', 'United States', 'PACH'],
['Crooked Creek Airport', 'CKD', 61.8679008, -158.1349945, 'Crooked Creek', 'United States', 'CJX'],
['Desierto De Atacama Airport', 'CPO', -27.2612, -70.7791977, 'Copiapo', 'Chile', 'SCAT'],
['Dandong Airport', 'DDG', 40.0247002, 124.2860031, 'Dandong', 'China', 'ZYDD'],
['Hamad International Airport', 'DOH', 25.2620449, 51.6130829, 'Doha', 'Qatar', 'OTHH'],
['Dongying Shengli Airport', 'DOY', 37.5085983, 118.788002, 'Dongying', 'China', 'ZSDY'],
['Saertu Airport', 'DQA', 46.7463889, 125.1405556, 'Daqing Shi', 'China', 'ZYDQ'],
['Førde Airport', 'FDE', 61.3911018, 5.7569399, 'Førde', 'Norway', 'ENBL'],
['FMt. Fuji Shizuoka Airport', 'FSZ', 34.7960435, 138.1877518, 'Makinohara', 'Japan', 'RJNS'],
['Foshan Shadi Airport', 'FUO', 23.0832996, 113.0699997, 'Foshan', 'China', 'ZGFS'],
['Goulimime Airport', 'GLN', 29.0266991, -10.0502996, 'Goulimime', 'Morocco', 'GMAG'],
['Gheshm Airport', 'GSM', 26.9487, 56.2687988, 'Gheshm', 'Iran', 'OIKQ'],
['Lianshui Airport', 'HIA', 33.7908333, 119.125, 'Huai\'an', 'China', 'ZSSH'],
['Iğdır Airport', 'IGD', 39.9766274, 43.876648, 'Iğdır', 'Turkey', 'LTCT'],
['Kingman Airport', 'IGM', 35.2594986, -113.9380035, 'Kingman', 'United States', 'KIGM'],
['Jiagedaqi Airport', 'JGD', 50.375, 124.1166667, 'Jiagedaqi', 'China', ''],
['Jinggangshan Airport', 'JGS', 26.8568993, 114.7369995, 'Ji\'an', 'China', 'ZSJA'],
['Jinchuan Airport', 'JIC', 38.5422222, 102.3483333, 'Jinchang', 'China', 'ZLJC'],
['Qianjiang Wulingshan Airport', 'JIQ', 29.5133333, 108.8311111, 'Qianjiang', 'China', 'ZUQJ'],
['Chizhou Jiuhuashan Airport', 'JUH', 30.7403, 117.6856, 'Chizhou', 'China', 'ZSJH'],
['Stagen Airport', 'KBU', -3.2947199, 116.1650009, 'Laut Island', 'Indonesia', 'WAOK'],
['Grayling Airport', 'KGX', 62.8945999, -160.0650024, 'Grayling', 'United States', 'PAGX'],
['Akiachak Airport', 'KKI', 60.9079018, -161.4349976, 'Akiachak', 'United States', 'KKI'],
['Kulusuk Airport', 'KUS', 65.5736008, -37.1236, 'Kulusuk', 'Greenland', 'BGKK'],
['Long Banga Airport', 'LBP', 3.202, 115.4018, 'Long Banga', 'Malaysia', ''],
['Lintsang Airfield', 'LNJ', 23.7381001, 100.0250015, 'Lincang', 'China', 'ZPLC'],
['Lublin Airport', 'LUZ', 51.240278, 22.713611, 'Lublin', 'Poland', 'EPLB'],
['Tampa Padang Airport', 'MJU', -2.583333, 119.033333, 'Mamuju-Celebes Island', 'Indonesia', 'WAWJ'],
['Basel Mulhouse Freiburg Airport', 'MLH', 47.5999, 7.53166, 'Basel Mulhouse Freiburg', 'France', 'LFSB'],
['Misratah Airport', 'MRA', 32.3250008, 15.0609999, 'Misratah', 'Libya', 'HLMS'],
['Enfidha - Hammamet International Airport', 'NBE', 36.075833, 10.438611, 'Enfidha', 'Tunisia', 'DTNH'],
['Changbaishan Airport', 'NBS', 42.0669444, 127.6022222, 'Baishan', 'China', 'ZYBS'],
['Nuussuaq Heliport', 'NSQ', 74.109722, -57.065, 'Nuussuaq', 'Greenland', 'BGNU'],
['Niaqornat Heliport', 'NIQ', 70.788889, -53.656111, 'Niaqornat', 'Greenland', 'BGNT'],
['Nantong Airport', 'NTG', 32.0708008, 120.9759979, 'Nantong', 'China', 'ZSNT'],
['Nunapitchuk Airport', 'NUP', 60.9057999, -162.4389954, 'Nunapitchuk', 'United States', 'PPIT'],
['Point Hope Airport', 'PHO', 68.3488007 , -166.798996, 'Point Hope', 'United States', 'PAPO'],
['Pilot Station Airport', 'PQS', 61.9346008 , -162.8999939, 'Pilot Station', 'United States', ''],
['Pomala Airport', 'PUM', -4.1810899 , 121.6179962, 'Kolaka', 'Indonesia', 'WAWP'],
['Bao\'anying Airport', 'PZI', 26.54 , 101.79852, 'Panzhihua', 'China', 'ZUZH'],
['Narsaq Kujalleq', 'QFN', 60.0046935, -44.6569347, 'Narsaq Kujalleq', 'Greenland', 'BGFD'],
['Igaliku', 'QFX', 60.9920065, -45.4323345, 'Igaliku', 'Greenland', 'BGIO'],
['Qassimiut Heliport', 'QJH', 60.779444, -47.1525, 'Qassimiut', 'Greenland', 'BGQT'],
['Aappilattoq', 'QUV', 60.1483571, -44.2869186, 'Nanortalik', 'Greenland', 'BGAQ'],
['Ammassivik', 'QUW', 60.5973758, -45.3824455, 'Ammassivik', 'Greenland', 'BGAS'],
['Red Devil Airport', 'RDV', 61.7881012, -157.3500061, 'Red Devil', 'United States', 'RDV'],
['Rio Grande Airport', 'RIG', -32.081699, -52.163299, 'Rio Grande', 'Brazil', 'SJRG'],
['Saattut Heliport', 'SAE', 70.808611, -51.626667, 'Saattut', 'Greenland', 'BGST'],
['Shire Inda Selassie Airport', 'SHC', 14.0781002, 38.2724991, 'Shire Indasilase', 'Ethiopia', ''],
['Stony River 2 Airport', 'SRV', 61.7896996, -156.5890045, 'Stony River', 'United States', 'SRV'],
['Sheldon Point Airport', 'SXP', 62.5205994, -164.8480072, 'Nunam Iqua', 'United States', 'SXP'],
['Semera Airport', 'SZE', 11.7875, 40.9915, 'Semera', 'Ethiopia', 'HASM'],
['Tuticorin Airport', 'TCR', 8.7241667, 78.0258333, 'Thoothukudi', 'India', 'VOTK'],
['Tanjung Manis Airport', 'TGC', 2.17784, 111.2020035, 'Tanjung Manis', 'Malaysia', 'WBGT'],
['Thọ Xuân Airport', 'THD', 19.901667, 105.467778, 'Sao Vàng', 'Vietnam', 'VVTX'],
['Tuluksak Airport', 'TLT', 61.0877292, -160.9233542, 'Tuluksak', 'United States', 'TLT'],
['Tununak Airport', 'TNK', 60.5755005, -165.2720032, 'Tununak', 'United States', '4KA'],
['Tasiusaq Heliport', 'TQA', 73.373055, -56.06028, 'Tasiusaq', 'Greenland', 'BGTA'],
['Tangshan Sannühe Airport', 'TVS', 39.7178001, 118.0026245, 'Tangshan', 'China', 'ZBTS'],
['Majeed Bin Abdulaziz Airport', 'ULH', 26.48, 38.1288889, 'Al Ula', 'Saudi Arabia', 'OEAO'],
['Upernavik Kujalleq', 'UPK', 72.1527425, -55.5309856, 'Upernavik Kujalleq', 'Greenland', 'BGKL'],
['Dong Hoi Airport', 'VDH', 17.515, 106.590556, 'Dong Hoi', 'Vietnam', 'VVDH'],
['Stebbins Airport', 'WBB', 63.5159988, -162.2779999, 'Stebbins', 'United States', 'WBB'],
['Wenshan Puzhehei Airport', 'WNH', 23.563611, 104.333611, 'Wenshan City', 'China', 'ZPWS'],
['Tuntutuliak Airport', 'WTL', 60.3516111, -162.6539722, 'Tuntutuliak', 'United States', 'A61'],
['Newtok Airport ', 'WWT', 60.9236984, -164.6560059, 'Newtok', 'United States', 'WWT'],
['Tasiusaq Heliport', 'XEQ', 60.193889, -44.811667, 'Tasiusaq', 'Greenland', 'BGTQ'],
['Zhangye Ganzhou Airport', 'YZY', 38.801899, 100.6750031, 'Zhangye', 'China', 'ZLZY'],
['La Araucanía Airport', 'ZCO', -38.9259, -72.6515, 'Temuco', 'Chile', 'SCQP'],
['Zhangjiakou Ningyuan Airport', 'ZQZ', 40.7386017, 114.9300003, 'Zhangjiakou', 'China', 'ZBZJ'],
['Aappilattoq Heliport', 'AOQ',72.886944,-55.596111, 'Aappilattoq', 'Greenland', 'BGAG'],
['Cewlik Bingöl Airport', 'BGG', 38.861111, 40.5925, 'Bingöl', 'Turkey', 'LTCU'],
['Ikerasak Heliport', 'IKE', 70.542368, -51.520606, 'Ikerasak', 'Greenland', ''],
['Isortoq Heliport', 'IOQ', 65.546667, -38.977778, 'Isortoq', 'Greenland', 'BGIS'],
['Illorsuit Heliport', 'IOT', 71.239722, -53.555556, 'Illorsuit', 'Greenland', 'BGLL'],
['Islamabad International Airport', 'ISB', 33.549083, 72.82565, 'Islamabad', 'Pakistan', 'OPIS'],
['Innaarsuit Heliport', 'IUI', 73.2025, -56.011111, 'Innaarsuit', 'Greenland', 'BGIN'],
['Nuugaatsiaq Heliport', 'JUU', 71.538611, -53.205, 'Nuugaatsiaq', 'Greenland', 'BGNQ'],
['Kangersuatsiaq Heliport', 'KGQ', 72.381111, -55.536667, 'Kangersuatsiaq', 'Greenland', 'BGKS'],
['Kullorsuaq Heliport', 'KHQ', 74.579444, -57.235556, 'Kullorsuaq', 'Greenland', 'BGKQ'],
['Simanggang Airport', 'SGG', 1.217, 111.4499969, 'Sri Aman', 'Malaysia', 'WBGY'],
['Sierra Leone Airport', 'SRK', 8.5, -13.5083333, 'Sierra Leone', 'Sierra Leone', ''],
['Svay Rieng Airport', 'SVR', 11.0833333, 105.8055556, 'Svay Rieng', 'Cambodia', ''],
['Biloela Airport', 'ZBL', -24.4, 150.5, 'Biloela', 'Australia', '']]
append_df = pd.DataFrame(newMatrix, columns=['Name', 'IATA', 'Latitude', 'Longitude', 'City', 'Country', 'ICAO'])
airports = airports.append(append_df, sort=False)
In [18]:
df = df[~df.IATA.isin(airports.IATA)]
Append missing to dataframe
In [19]:
airports = airports.append(df, sort=False)
We are only interessted in currently active airlines:
In [20]:
merged_routes = pd.merge(airlines[airlines.Active == 'Y'], routes, on='AirlineID')
In [21]:
merged_routes.head(1)
Out[21]:
Getting whether the flight is international or not:
In [22]:
Airport_to_country = airports.set_index('IATA').Country.to_dict()
In [23]:
def get_international(x):
    """Return 1 if route row `x` crosses a country border, 0 if domestic.

    Returns None (becoming NaN in the resulting column) when either airport
    code is missing from the IATA -> country mapping.
    """
    try:
        # Different countries on either end => international flight.
        if Airport_to_country[x.SourceAirport] != Airport_to_country[x.DestinationAirport]:
            return 1
        else:
            return 0
    except KeyError:
        # Narrowed from a bare `except:`, which would also have hidden real
        # bugs such as attribute typos on `x`.
        return None
In [24]:
merged_routes['International'] = merged_routes.apply(
lambda x: get_international(x), axis=1)
Reset frames to create mappings:
In [25]:
#only keep values we are interested in
airports_filtered = airports[['Name', 'Country', 'Longitude', 'Latitude', 'Timezone', 'IATA', 'City']].copy()
In [26]:
#IATA airport id -> longitude latitude
airports_filtered.set_index('IATA', inplace=True)
airports_filtered.Longitude.dropna().shape == airports_filtered.Longitude.shape
Out[26]:
In [27]:
#sanity check
print(airports.Longitude.shape)
print(airports_filtered.Longitude.shape)
In [28]:
location_mapping = airports_filtered.apply(lambda x: [x.Longitude, x.Latitude], axis=1).to_dict()
In [29]:
#Airline name -> airlineID
airline_name_to_number = merged_routes.Name.drop_duplicates().reset_index(drop=True).to_dict()
airline_name_to_number = {v: k for k, v in airline_name_to_number.items()}
In [30]:
merged_routes['AirlineNbr'] = merged_routes.Name.map(airline_name_to_number)
Fill in Nan values:
In [31]:
merged_routes['Codeshare'] = merged_routes.Codeshare.fillna('N')
Example of functionality:
In [32]:
element = airports_filtered.apply(lambda x: (x.Latitude, x.Longitude), axis=1)[0]
element2 = airports_filtered.apply(lambda x: (x.Latitude, x.Longitude), axis=1)[1]
In [33]:
distance(element, element2).km
Out[33]:
In [34]:
distance_mapping = airports_filtered.apply(lambda x: (x.Latitude, x.Longitude), axis=1).to_dict()
Adding it to merged_routes:
In [35]:
def get_distance(source, dest):
    """Great-circle distance in km between two IATA airport codes, or None.

    Looks both codes up in the module-level `distance_mapping`
    (IATA -> (lat, lon)) and measures with geopy's `distance`.
    """
    try:
        return distance(distance_mapping[source], distance_mapping[dest]).km
    except (KeyError, ValueError, TypeError):
        # KeyError: airport missing from the mapping; ValueError/TypeError:
        # geopy rejects NaN or malformed coordinates. The original bare
        # `except:` swallowed everything, including KeyboardInterrupt.
        return None
In [36]:
merged_routes['Distance'] = merged_routes.apply(lambda x: get_distance(x.SourceAirport, x.DestinationAirport), axis=1)
In [37]:
relevant_columns = ['Name', 'ICAO', 'Country', 'SourceAirport', 'DestinationAirport', 'Codeshare',
'Stops', 'Equipment', 'AirlineNbr', 'International', 'Distance']
In [38]:
merged_routes.head()
Out[38]:
In [39]:
merged_routes[relevant_columns].head()
Out[39]:
In [40]:
merged_routes = merged_routes[relevant_columns]
In [41]:
Delays_data = pd.read_csv('delays_Data.csv')
In [42]:
def convert_mixed_fractions(x):
    """Parse a numeric string that is either a percentage ('93%') or a plain
    number ('0.93'); always return the value as a fraction (float)."""
    is_percentage = '%' in x
    return float(x[:-1]) / 100 if is_percentage else float(x)
In [43]:
Delays_data['On-time (A14)'] = Delays_data['On-time (A14)'].map(convert_mixed_fractions)
Airline_list = pd.read_csv('airlines_Data.csv')
In [44]:
url =u'https://en.wikipedia.org/wiki/List_of_low-cost_airlines'
http_pool = urllib3.connection_from_url(url)
r = http_pool.urlopen('GET',url)
html = r.data
tree = BeautifulSoup(html,"lxml")
table_tag = tree.select("table")
tab_data = []
for table in table_tag:
tab_data.append([[item.text.replace('\n', '') for item in row_data.select("li")]
for row_data in table.select("tr")])
cheap_airlines = [ re.sub('\([\s\S]*\)|\[[\d]*\]', '', i).strip() for tab in tab_data for t in tab for i in t]
In [45]:
#add airlines that are for some reason missing/written differently
cheap_airlines = cheap_airlines[:cheap_airlines.index('v')]+['easyJet',
'IndiGo Airlines',
'Jetstar Airways',
'TUIfly',
'Shenzhen Airlines',
'JetBlue Airways']
What are the airlines with the most flight routes:
In [46]:
merged_routes.Name.value_counts().head(10)
Out[46]:
In [47]:
merged_routes.Name.value_counts().plot(kind='hist', log=True, bins=20)
plt.title("Log Histogram of connections per airline")
plt.show()
In [48]:
merged_routes.Name.value_counts().describe()
Out[48]:
What are the airlines with the most airports:
In [49]:
airport_count = merged_routes.groupby('Name').apply(
lambda x: x.SourceAirport.drop_duplicates().count()).sort_values(ascending=False)
airport_count.head(10)
Out[49]:
In [50]:
airport_count.plot(kind='hist', log=True, bins=20)
plt.title('Log Histogram: Number of airports served')
Out[50]:
In [51]:
airport_count.describe()
Out[51]:
Analyzing the metadata we will later use:
In [52]:
Delays_data['Avg. Delay'].plot(kind='box')
Out[52]:
In [53]:
Delays_data['Avg. Delay'].describe()
Out[53]:
In [54]:
merged_routes[merged_routes.Name.isin(cheap_airlines)].Name.drop_duplicates().count()
Out[54]:
Only keep large enough airlines:
In [55]:
reasonably_big_airlines = merged_routes.Name.value_counts()[merged_routes.Name.value_counts() > 130].index
In [56]:
merged_routes = merged_routes[merged_routes.Name.isin(reasonably_big_airlines)]
We look at a total of 118 airlines:
In [57]:
len(merged_routes.Name.unique())
Out[57]:
out of which we categorize _ as low cost airlines:
In [58]:
merged_routes[merged_routes.Name.isin(cheap_airlines)].Name.drop_duplicates().count()
Out[58]:
We build a function to create the airline network:
In [59]:
def create_airline_network(airline):
    """Build the undirected route graph of a single airline.

    Nodes are IATA airport codes, edges are served routes (carrying the
    airline's 'Country' as an edge attribute), and every node gets its
    [longitude, latitude] stored under the 'Location' attribute.
    """
    airline_routes = merged_routes[merged_routes['Name'] == airline]
    graph = nx.from_pandas_edgelist(airline_routes,
                                    source='SourceAirport',
                                    target='DestinationAirport',
                                    edge_attr=['Country'])
    nx.set_node_attributes(graph, location_mapping, 'Location')
    return graph
In [60]:
def draw_airline_network(Airline_Graph, airline):
    """Spring-layout plot of an airline graph; node size scales with
    betweenness centrality, title is the airline name."""
    plt.figure(figsize=(6, 6))
    betweenness = nx.betweenness_centrality(Airline_Graph)
    node_sizes = 600 * np.array(list(betweenness.values()))
    nx.draw_spring(Airline_Graph, node_size=node_sizes, width=0.1)
    plt.title(airline)
    plt.show()
def get_spectrum_figures(Airline_Graph):
    """Plot three spectral summaries of the graph: the normalized-Laplacian
    eigenvalues, the (unnormalized) Laplacian spectrum, and a boxplot of the
    degree centralities."""
    # Only the eigenvalues are needed; the original unpacked (and never
    # used) the eigenvector matrix as well.
    e = np.linalg.eigh(nx.normalized_laplacian_matrix(Airline_Graph).todense())[0]
    plt.plot(e)
    plt.show()
    plt.plot(nx.laplacian_spectrum(Airline_Graph))
    plt.show()
    # matplotlib expects a sequence; passing a raw dict view can fail on
    # some matplotlib versions, so materialize it.
    plt.boxplot(list(nx.degree_centrality(Airline_Graph).values()))
    plt.show()
Example of some low cost airlines:
In [61]:
Cheap = create_airline_network('Southwest Airlines')
draw_airline_network(Cheap, 'Southwest Airlines')
Plotting the graph on map
In [62]:
from mpl_toolkits.basemap import Basemap
#run export PROJ_LIB=$CONDA_PREFIX/share/proj if this gives a key error
In [63]:
airport_location = airports.set_index('IATA').apply(lambda x: [x.Longitude, x.Latitude], axis=1)
In [64]:
plt.figure(figsize=(15, 20))
m=Basemap(llcrnrlon=-140,urcrnrlon=-30,llcrnrlat=10,urcrnrlat=70)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
def draw_local_network(Airline_Graph, edge_color='black', node_color='red', alpha=1):
    """Overlay the graph on the current Basemap axes, positioning nodes with
    the module-level `airport_location` mapping; node size scales with
    degree centrality. Note: `alpha` applies to the nodes only — edge alpha
    is fixed at 0.4."""
    centrality = nx.degree_centrality(Airline_Graph)
    node_size = 500 * np.array(list(centrality.values()))
    nx.draw_networkx_edges(Airline_Graph, pos=airport_location,
                           edge_color=edge_color, alpha=0.4)
    nx.draw_networkx_nodes(Airline_Graph, pos=airport_location,
                           node_size=node_size, node_color=node_color, alpha=alpha)
draw_local_network(Cheap)
Here we show how to create an interactive graph. As it is quite heavy, we only use it when we are curious about connections.
In [65]:
from plotly import tools
from plotly.offline import iplot, init_notebook_mode
import plotly.plotly as py
from plotly.graph_objs import *
init_notebook_mode()
In [66]:
def draw_map_graph(Airline_Graph):
    """Render the airline graph as an interactive plotly world map: one
    scattergeo line trace per route plus a marker trace for the airports.

    Uses the module-level `airport_location` (IATA -> [lon, lat]) for edge
    endpoints and the global `airports` frame for marker positions and
    hover text.
    """
    # Edge list -> one row per route, with [lon, lat] endpoints.
    df_paths = pd.DataFrame(list(Airline_Graph.edges), columns=['source', 'dest'])
    df_paths['start'] = df_paths.source.map(airport_location)
    df_paths['end'] = df_paths.dest.map(airport_location)
    # NOTE(review): these imports duplicate the notebook-level plotly cell,
    # but are load-bearing here — `go` is only bound by this import.
    import plotly.plotly as py
    import plotly.graph_objs as go
    from plotly import tools
    from plotly.offline import iplot, init_notebook_mode
    init_notebook_mode()
    # Airport markers; size scales with degree centrality.
    # NOTE(review): lat/lon/text follow the `airports` frame order while the
    # sizes follow the graph's node order — possible mismatch, verify.
    markers = [
        {'lat': airports[airports.IATA.isin(Airline_Graph.nodes)].Latitude.tolist(),
         'lon': airports[airports.IATA.isin(Airline_Graph.nodes)].Longitude.tolist(),
         'marker': {'color': 'rgb(40,40,40)',
                    'line': {'color': 'rgb(40,40,40)', 'width': 0.5},
                    'size': np.array(list(nx.degree_centrality(Airline_Graph).values()))*10,
                    'sizemode': 'diameter'},
         'text': airports[airports.IATA.isin(Airline_Graph.nodes)].Name.tolist(),
         'type': 'scattergeo'},
    ]
    # One grey, semi-transparent line trace per route.
    paths = []
    for i in range( len( df_paths ) ):
        paths.append(
            dict(
                type = 'scattergeo',
                lon = [ df_paths['start'][i][0], df_paths['end'][i][0] ],
                lat = [ df_paths['start'][i][1], df_paths['end'][i][1]],
                mode = 'lines',
                line = dict(
                    width = 1,
                    color = 'grey',
                ),
                opacity = 0.5,
            )
        )
    # World-scope layout; natural-earth projection, light grey land.
    layout = go.Layout(
        title = '',
        width=1000,
        height=700,
        showlegend = False,
        geo = dict(
            scope='world',
            projection=dict( type = 'natural earth'),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),)
    fig = go.Figure(layout=layout, data=paths+markers)
    iplot( fig, validate=False)
In [67]:
draw_map_graph(Cheap)
We know that the graph of all airlines is scale-free, but is the graph formed by individual airlines scalefree?
In [68]:
def get_scalefree_network_stats(airline):
    """Degree and path statistics for one airline's route graph.

    If the graph is disconnected, the statistics are taken on the largest
    connected component (diameter and average path length are undefined on
    a disconnected graph).

    Returns np.array([mean degree, std degree, max degree, density,
    diameter, average shortest path length]).
    """
    Airline_Graph = create_airline_network(airline)
    # Restrict to the largest connected component when needed.
    if nx.number_connected_components(Airline_Graph) > 1:
        components = sorted(nx.connected_components(Airline_Graph), key=len, reverse=True)
        Airline_Graph = Airline_Graph.subgraph(components[0])
    degrees = pd.Series(list(dict(nx.degree(Airline_Graph)).values()))
    # Density is computed after the component restriction; the original's
    # earlier whole-graph density (and `component_ratio = 0`) were dead
    # stores and have been removed.
    density = nx.density(Airline_Graph)
    diameter = nx.diameter(Airline_Graph)
    pathlength = nx.average_shortest_path_length(Airline_Graph)
    return np.array([degrees.mean(), degrees.std(), degrees.max(),
                     density, diameter, pathlength])
In [69]:
airlines = merged_routes.Name.drop_duplicates()
stats = np.array(airlines.map(get_scalefree_network_stats).tolist())
In [70]:
plt.figure(figsize=(7, 6))
plt.scatter(stats[:, 0], stats[:, 1])
plt.plot(np.linspace(0, 16), np.sqrt(np.linspace(0, 16)))
plt.xlabel('<k>')
plt.ylabel('$\sigma_k$')
plt.title('Network average degree vs std')
plt.show()
And here we calculate statistical measures of different graph properties:
In [71]:
pd.DataFrame(stats, columns=['Average Degree',
'Standard Dev Degree',
"Maxium degree",
'Density',
'Diameter',
'Average path lenth']
).describe()
Out[71]:
Clustering the networks based on stats:
Our first model only uses the Laplacian to quantify the structure of the graph:
In [72]:
from networkx.algorithms.approximation.clique import large_clique_size
from networkx.algorithms.community import greedy_modularity_communities
In [73]:
def get_network_stats(airline):
    """Spectral and degree features of one airline's route graph.

    Works on the largest connected component when the graph is disconnected
    (printing the airline and its component count as a diagnostic).

    Returns np.array([
        second-smallest normalized-Laplacian eigenvalue (algebraic connectivity),
        largest eigenvalue,
        fraction of eigenvalues equal to 1 (after rounding to 4 decimals),
        fraction of degree-1 ("dead end") airports,
        fraction of airports with degree >= 5,
    ])
    """
    Airline_Graph = create_airline_network(airline)
    components = nx.number_connected_components(Airline_Graph)
    if components > 1:
        print(airline, ' ', components)
        c = sorted(nx.connected_components(Airline_Graph), key=len, reverse=True)
        Airline_Graph = Airline_Graph.subgraph(c[0])
    # Connectivity pattern of the airline.
    hist = nx.degree_histogram(Airline_Graph)
    total = sum(hist)
    per_large_degree = sum(hist[5:]) / total
    deadend = hist[1] / total
    # Spectrum of the normalized Laplacian. np.linalg.eigh already returns
    # eigenvalues in ascending order; the explicit sort is kept for safety.
    spectrum = nx.normalized_laplacian_matrix(Airline_Graph).todense()
    eig, _ = np.linalg.eigh(spectrum)
    eig = np.array(sorted(eig))
    # The original also called the expensive nx.algebraic_connectivity and
    # computed density/component_ratio, all dead stores (eig[1] below is the
    # algebraic connectivity); those calls have been removed.
    rounded_counts = pd.Series(eig.round(decimals=4)).value_counts()
    return np.array([eig[1],
                     eig[-1],
                     rounded_counts[1] / rounded_counts.sum(),
                     deadend, per_large_degree
                     ])
Collecting the data:
In [74]:
# Collect the per-airline feature vectors.
Stats = []
airlines = merged_routes.Name.unique()
for name in airlines:
    stat = get_network_stats(name)
    # Bug fix: the original `if type(stat) != None:` was always True (a
    # type object is never None). Compare the value itself.
    if stat is not None:
        Stats.append(stat)
Put everything into a matrix, do PCA for dimensionality reduction & cluster:
In [75]:
network_stats = np.array(Stats)
#scale features so that each feature has same distribution
x = StandardScaler().fit_transform(network_stats)
In [76]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x[:, [0, 2]])
print(pca.explained_variance_ratio_)
In [77]:
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=9, affinity='euclidean', linkage='ward')
_ = cluster.fit_predict(principalComponents)
In [78]:
plt.scatter(x[:, 0], x[:,2], c=cluster.labels_, cmap=plt.cm.tab20)
plt.show()
plt.scatter(principalComponents[:,0], principalComponents[:,1], c=cluster.labels_, cmap=plt.cm.tab20)
Out[78]:
In [79]:
plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= cluster.labels_, cmap=plt.cm.tab20)
for i, name in enumerate(airlines):
plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]),
xytext = (0, 0), textcoords = 'offset points')
In [80]:
for d in ['Saudi Arabian Airlines', 'Emirates', 'TUIfly', 'Era Alaska']:
p = create_airline_network(d)
draw_airline_network(p, d)
In [81]:
palette = sns.color_palette(palette='Set1', n_colors=3).as_hex()
In [82]:
Best_airlines = ["Air New Zealand",
"Qantas",
"Singapore Airlines",
"Virgin Atlantic Airlines",
"Etihad Airways",
"Cathay Pacific",
"Japan Airlines",
"Qatar Airways",
"Emirates"
]
In [83]:
def return_color(c):
    """Map an airline name to a palette color: low-cost airlines get
    palette[0], top-rated airlines palette[1], everything else palette[2]."""
    if c in cheap_airlines:
        return palette[0]
    if c in Best_airlines:
        return palette[1]
    return palette[2]
In [84]:
c = [return_color(a) for a in airlines]
In [85]:
plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= c, s=250)
for i, name in enumerate(airlines):
plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]),
xytext = (0, 0), textcoords = 'offset points')
plt.xticks([])
plt.yticks([])
Out[85]:
We identify nordic airlines from this graph:
In [86]:
mask = (principalComponents[:, 1] > 2)
Components = principalComponents[mask]
In [87]:
for nordic in airlines[mask]:
Nordic = create_airline_network(nordic)
draw_airline_network(Nordic, nordic)
In [88]:
plt.figure(figsize=(15, 20))
palette = sns.color_palette(palette='Set1', n_colors=2).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=40,llcrnrlat=50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
for i, nordic in enumerate(airlines[mask]):
Nordic = create_airline_network(nordic)
draw_local_network(Nordic, node_color=palette[i])
In [89]:
plt.figure(figsize=(5, 5))
palette = sns.color_palette(palette='Set1', n_colors=2).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=-140,llcrnrlat=50,urcrnrlat=80)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
Nordic = create_airline_network(nordic)
draw_local_network(Nordic, node_color=palette[1])
In [90]:
plt.figure(figsize=(5, 5))
Nordic = create_airline_network(airlines[mask][0])
palette = sns.color_palette(palette='Set1', n_colors=2).as_hex()
m=Basemap(llcrnrlon=-10,urcrnrlon=40,llcrnrlat=50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
draw_local_network(Nordic, node_color=palette[1])
Cheap airlines are all located in a corner of the cluster:
In [91]:
mask = (principalComponents[:, 0]-0.5 > principalComponents[:, 1] )
Components = principalComponents[mask]
In [92]:
sum([1 if a in cheap_airlines else 0 for a in airlines[mask]])/sum(mask)
Out[92]:
In [93]:
sum([1 if a in cheap_airlines else 0 for a in airlines[mask]]
)/sum([1 if a in cheap_airlines else 0 for a in airlines])
Out[93]:
In [94]:
palette = sns.color_palette(palette='Set1', n_colors=3).as_hex()
In [95]:
plt.figure(figsize=(10, 10))
plt.scatter(Components[:,0], Components[:,1], cmap=plt.cm.tab20, c = [return_color(a) for a in airlines[mask]])
for i, name in enumerate(airlines[mask]):
plt.annotate(name, xy = (Components[i, 0], Components[i, 1]),
xytext = (0, 0), textcoords = 'offset points')
However, we now expand the features, in hopes of getting an even better partition:
But from the graphs of these airlines we can clearly recognize a strong pattern of large vs prestigious airlines: prestigious airlines have a much more star-like structure. Hence, a better method would need to cluster star-like graphs closer together.
In [96]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x[:, [0, 2, 3, 4]])
print(pca.explained_variance_ratio_)
cluster = AgglomerativeClustering(n_clusters=10, affinity='euclidean', linkage='ward')
_ = cluster.fit_predict(principalComponents)
plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= cluster.labels_, cmap=plt.cm.tab20)
for i, name in enumerate(airlines):
plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]),
xytext = (0, 0), textcoords = 'offset points')
In [97]:
plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c=c, cmap=plt.cm.tab20, s=250)
plt.xticks([])
plt.yticks([])
for i, name in enumerate(airlines):
plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]),
xytext = (0, 0), textcoords = 'offset points')
Validating that this is indeed a better representation:
In [98]:
mask = (principalComponents[:, 0]-0.5 < principalComponents[:, 1] )
Components = principalComponents[mask]
sum([1 if a in cheap_airlines else 0 for a in airlines[mask]])/sum(mask)
Out[98]:
In [99]:
sum([1 if a in cheap_airlines else 0 for a in airlines[mask]]
)/sum([1 if a in cheap_airlines else 0 for a in airlines])
Out[99]:
In [100]:
mask = (principalComponents[:, 0]-0.5 >= principalComponents[:, 1] )
In [101]:
sum([1 if a in cheap_airlines else 0 for a in airlines[mask]])/sum(mask)
Out[101]:
In [102]:
sum([1 if a in cheap_airlines else 0 for a in airlines[mask]]
)/sum([1 if a in cheap_airlines else 0 for a in airlines])
Out[102]:
Now we see that the best airlines are all clustered together, and the cheapest airlines tend to be clustered together!
In [103]:
plt.figure(figsize=(15, 20))
mask = (principalComponents[:, 0]-3 >= principalComponents[:, 1] )
palette = sns.color_palette(palette='tab20', n_colors=50).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=180,llcrnrlat=-50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
legend = []
for i, cheap in enumerate(airlines[mask]):
Cheap = create_airline_network(cheap)
draw_local_network(Cheap, edge_color=palette[i], node_color=palette[i], alpha=0.5)
legend.append(mpatches.Patch(color=palette[i], label=cheap))
plt.legend(handles=legend)
Out[103]:
Plot shape of networks:
In [104]:
mask = ((principalComponents[:, 0]-2 >= principalComponents[:, 1] ) & (principalComponents[:, 1] > -2.2))
for cheap in airlines[mask]:
Cheap = create_airline_network(cheap)
draw_airline_network(Cheap, cheap)
In [105]:
plt.figure(figsize=(15, 20))
palette = sns.color_palette(palette='tab20c', n_colors=30).as_hex()
m=Basemap(llcrnrlon=-160,urcrnrlon=180,llcrnrlat=-70,urcrnrlat=70)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
legend = []
for i, expensive in enumerate(Best_airlines):
Expensive = create_airline_network(expensive)
draw_local_network(Expensive, node_color=palette[i], alpha=0.9, edge_color=palette[i])
legend.append(mpatches.Patch(color=palette[i], label=expensive))
plt.legend(handles=legend)
Out[105]:
Now, what is the underlying confounding factor that could lead to such a distribution?
Is the clustering due to international vs local flights?
International airlines are much more likely to be world renowned, while cheap airlines may only provide intracontinental or even just national travel. Here we check whether there is an indication of this in the graphs:
In [106]:
d = airports.set_index('IATA').Country.to_dict()
In [107]:
country_json= pd.read_json('country-by-continent.json').set_index('country').continent.to_dict()
In [108]:
merged_routes['Intercontinental'] = merged_routes.apply(
lambda x: country_json[d[x.SourceAirport]] != country_json[d[x.DestinationAirport]] , axis=1)
In [109]:
color_international = merged_routes.groupby('Name').apply(lambda x: x.Intercontinental.sum()).tolist()
In [110]:
np.corrcoef(principalComponents[:, 0], color_international)
Out[110]:
In [111]:
np.corrcoef(principalComponents[:, 1], color_international)
Out[111]:
In [112]:
plt.figure(figsize=(5, 5))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= color_international, cmap=plt.cm.PuBu )
plt.colorbar()
Out[112]:
In [113]:
color_international = merged_routes.groupby('Name').apply(lambda x: x.International.sum()).tolist()
In [114]:
np.corrcoef(principalComponents[:, 0], color_international)
Out[114]:
In [115]:
np.corrcoef(principalComponents[:, 1], color_international)
Out[115]:
In [116]:
plt.figure(figsize=(5,5))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= color_international, cmap=plt.cm.PuBu )
plt.colorbar()
Out[116]:
In [117]:
color_international = merged_routes.groupby('Name').apply(
lambda x: x.Distance.sum()).tolist()
In [118]:
np.corrcoef(principalComponents[:, 0], color_international)
Out[118]:
In [119]:
np.corrcoef(principalComponents[:, 1], color_international)
Out[119]:
In [120]:
plt.figure(figsize=(5, 5))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= color_international, cmap=plt.cm.PuBu )
plt.colorbar()
Out[120]:
Analyzing if there is correlation between countries and structure:
In [121]:
most_common = merged_routes[~merged_routes.Name.duplicated()].Country.value_counts().head(3).index
most_common
Out[121]:
In [122]:
countries = list(most_common) + list(merged_routes.Country.unique())
airline_to_countryid = merged_routes.set_index('Name').Country.to_dict()
country_coloring = [countries.index(airline_to_countryid[i]
) if airline_to_countryid[i] in most_common else -1 for i in merged_routes.Name.unique() ]
In [123]:
to_keep = [i for i, name in enumerate(airlines) if airline_to_countryid[name] in most_common]
In [124]:
plt.figure(figsize=(10, 10))
plt.scatter(principalComponents[:,0],
principalComponents[:,1],
c= np.array(country_coloring),
cmap=plt.cm.Set1_r)
for i in to_keep:
plt.annotate(airlines[i], xy = (principalComponents[i, 0], principalComponents[i, 1]),
xytext = (0, 0), textcoords = 'offset points')
We see a lot of dispersion and no uniform cluster
Analyzing inference from eigenvalues to delays
In [125]:
scatter_data = pd.DataFrame(principalComponents)
scatter_data['name'] = airlines
scatter_data['labels'] = cluster.labels_
scatter_data['country'] = scatter_data.name.map(airline_to_countryid)
scatter_data['ontime'] = scatter_data.name.map(Delays_data.set_index('On-time')['On-time (A14)'].to_dict())
scatter_data['delay'] = scatter_data.name.map(Delays_data.set_index('On-time')['Avg. Delay'].to_dict())
scatter = scatter_data[scatter_data.delay.notna()].copy()
As we can see in the graph, there is no clear dependence on PCA features.
In [126]:
plt.figure(figsize=(15, 10))
plt.scatter(scatter[0],
scatter[1],
c= scatter['delay'],
cmap=plt.cm.Spectral_r)
for i in scatter.index:
plt.annotate(scatter['name'][i], xy = (scatter[0][i], scatter[1][i]),
xytext = (0, 0), textcoords = 'offset points')
plt.colorbar()
Out[126]:
In [127]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
In [128]:
# Feature matrix restricted to airlines that have delay data, standardized.
X = np.array(Stats)[list(scatter.index)]
X = StandardScaler().fit_transform(X)
regr = linear_model.LinearRegression()
# Column subset for this fit: every feature except column 1.
# (Named once instead of duplicating the magic selector in fit and score.)
feature_cols = [0] + list(range(2, 5))
regr.fit(X[:, feature_cols], scatter['delay'])
# R^2 (coefficient of determination) on the training data.
# (The original comment said "residual sum of squares", which is what
# LinearRegression minimizes, not what .score() returns.)
regr.score(X[:, feature_cols], scatter['delay'])
Out[128]:
In [129]:
# Refit using features 1..end (i.e. drop feature 0 instead of feature 1).
regr.fit(X[:, 1:], scatter['delay'])
# R^2 (coefficient of determination) on the training data — not the
# residual sum of squares.
regr.score(X[:, 1:], scatter['delay'])
Out[129]:
In [130]:
scatter['pred'] = regr.predict(X[:, 1:])
In [131]:
# Same PCA scatter, now colored by the regression's predicted delay.
plt.figure(figsize=(20, 10))
plt.scatter(scatter[0], scatter[1], c=scatter['pred'], cmap=plt.cm.Spectral_r)
for row_idx in scatter.index:
    plt.annotate(scatter['name'][row_idx],
                 xy=(scatter[0][row_idx], scatter[1][row_idx]),
                 xytext=(0, 0), textcoords='offset points')
plt.colorbar()
Out[131]:
Showing the weight of features, to see how important each feature is:
In [132]:
# Bar chart of regression coefficients (features were standardized, so
# magnitude is a rough importance indicator); the bare list on the last
# line makes the cell display the raw coefficient values too.
plt.bar(range(len(regr.coef_)), regr.coef_,)
[regr.coef_]
Out[132]:
Adding robustness as a predictor:
In [133]:
def get_network_stats_robustness(airline):
    """Compute betweenness-based robustness statistics for an airline's network.

    Builds the airline's route graph, restricts it to the largest connected
    component when the graph is disconnected (printing a notice), and returns
    the 75th and 50th percentiles of the normalized node and edge betweenness
    centralities.

    Parameters
    ----------
    airline : str
        Airline name understood by `create_airline_network`.

    Returns
    -------
    np.ndarray
        [upper node betweenness, median node betweenness,
         upper edge betweenness, median edge betweenness]

    Note: the original version also computed density, diameter, degree
    assortativity, algebraic connectivity, node connectivity and the 0.27
    quantiles — all expensive and never returned — so they were removed.
    """
    Airline_Graph = create_airline_network(airline)
    components = nx.number_connected_components(Airline_Graph)
    if components > 1 :
        # Restrict to the largest component so centralities are well defined.
        print(airline,' ' ,components)
        c = sorted(nx.connected_components(Airline_Graph), key = len, reverse=True)
        Airline_Graph = Airline_Graph.subgraph(c[0])
    # Node betweenness quantiles: how concentrated traffic is on hub airports.
    nb = nx.betweenness_centrality(Airline_Graph, normalized=True)
    node_betweenness = np.array(list(nb.values()))
    upper_betweenness = np.quantile(node_betweenness, 0.75)
    median_betweenness = np.quantile(node_betweenness, 0.5)
    # Edge betweenness quantiles: how critical individual routes are.
    eb = nx.edge_betweenness_centrality(Airline_Graph, normalized=True)
    edge_betweenness = np.array(list(eb.values()))
    e_upper_betweenness = np.quantile(edge_betweenness, 0.75)
    e_median_betweenness = np.quantile(edge_betweenness, 0.5)
    return np.array([upper_betweenness, median_betweenness,
                     e_upper_betweenness, e_median_betweenness,
                     ])
In [134]:
# Collect robustness statistics for every airline with delay data.
Stats_between = []
airlines = scatter.name.unique()
for name in airlines:
    stat = get_network_stats_robustness(name)
    # Bug fix: the original guard was `type(stat) != None`, which is always
    # True (type() never returns None); compare the value itself.
    if stat is not None:
        Stats_between.append(stat)
network_stats_between = np.array(Stats_between)
x_between = StandardScaler().fit_transform(network_stats_between)
In [135]:
scatter_extended = pd.concat([pd.DataFrame(x_between), pd.DataFrame(X)], ignore_index=True, axis=1)
In [136]:
# Regress delay on the four robustness (betweenness-quantile) features alone.
X_ = scatter_extended.iloc[:, :4]
X_ = StandardScaler().fit_transform(X_)
regr = linear_model.LinearRegression()
regr.fit(X_, scatter['delay'])
# R^2 (coefficient of determination) on the training data — not the
# residual sum of squares.
regr.score(X_, scatter['delay'])
Out[136]:
In [137]:
# Regress delay on the robustness features (columns 0-3) plus the
# structural features except column 4.
X_ = scatter_extended.iloc[:, list(range(0, 4)) + list(range(5, 8))]
X_ = StandardScaler().fit_transform(X_)
regr = linear_model.LinearRegression()
regr.fit(X_, scatter['delay'])
# R^2 (coefficient of determination) on the training data — not the
# residual sum of squares.
regr.score(X_, scatter['delay'])
Out[137]:
Again looking at the weights to get indication of importance of features:
In [138]:
# Coefficients of the extended regression (standardized features, so
# magnitude indicates rough importance); the bare list displays the values.
plt.bar(range(len(regr.coef_)), regr.coef_,)
[regr.coef_]
Out[138]:
In [142]:
# Drop routes touching airports in `to_find`.
# NOTE(review): `to_find` comes from an earlier cell — presumably the set of
# airport codes missing from the coordinate/country lookup; confirm.
merged_routes = merged_routes[~merged_routes['SourceAirport'].isin(to_find)]
merged_routes = merged_routes[~merged_routes['DestinationAirport'].isin(to_find)]
In [143]:
# Attach source/destination countries. A plain dict lookup per row does not
# need a row-wise `apply(axis=1)` — `Series.map` is vectorized and produces
# the identical result (the previous cell removed airports absent from
# Airport_to_country, so no keys are missing).
merged_routes['SourceCountry'] = merged_routes['SourceAirport'].map(Airport_to_country)
merged_routes['DestinationCountry'] = merged_routes['DestinationAirport'].map(Airport_to_country)
merged_routes.head(5)
Out[143]:
In [144]:
def overlaps(route_a, route_b):
    """Return True when two routes are near-duplicates: the source airports
    and the destination airports are each within 100 km of one another."""
    source_dist = distance(distance_mapping[route_a['SourceAirport']],
                           distance_mapping[route_b['SourceAirport']]).km
    dest_dist = distance(distance_mapping[route_a['DestinationAirport']],
                         distance_mapping[route_b['DestinationAirport']]).km
    return source_dist <= 100 and dest_dist <= 100

def compute_overlap_score(airline_x, airline_y, absolute = False):
    """Compute the overlap (competition) score between two airlines.

    It takes not only nodes into consideration but also the edges.
    So, if Airline A goes from Bucharest to Zurich and Airline B
    from Geneva to Zurich, based on this fact only, they are not
    competitors.

    Returns the number of airline X's routes that overlap some route of
    airline Y (absolute=True), or that count scaled by airline X's number
    of routes (absolute=False). An airline compared with itself scores
    len(routes) / 1 respectively.
    """
    routes_x = merged_routes[merged_routes.Name == airline_x]
    if airline_x == airline_y:
        return len(routes_x) if absolute else 1
    routes_y = merged_routes[merged_routes.Name == airline_y]
    score = 0
    for i, row_x in routes_x.iterrows():
        # Pre-filter candidates by country pair to avoid the expensive
        # geodesic distance check for clearly unrelated routes.
        candidates = routes_y[(routes_y.SourceCountry == row_x['SourceCountry']) &
                              (routes_y.DestinationCountry == row_x['DestinationCountry'])]
        # Perf fix: stop at the first overlapping route (`any` short-circuits)
        # instead of scanning every remaining candidate.
        if any(overlaps(row_x, row_y) for _, row_y in candidates.iterrows()):
            score += 1
    if absolute:
        return score
    # Robustness: an airline with no routes would previously raise
    # ZeroDivisionError; report zero overlap instead.
    return score / len(routes_x) if len(routes_x) else 0
compute_overlap_score("Ryanair", "Wizz Air", True)
Out[144]:
In [145]:
def compute_overlap_scores(airline_list, absolute = False):
    """Compute pairwise overlap (competition) scores between given airlines.

    It takes not only nodes into consideration but also the edges.
    So, if Airline A goes from Bucharest to Zurich and Airline B
    from Geneva to Zurich, based on this fact only, they are not
    competitors.

    Returns a long-format DataFrame with columns AirlineA, AirlineB, Score.
    (The description above was previously a free-floating string literal
    before the def; it is now a proper docstring.)
    """
    scores = [[airline_a, airline_b, compute_overlap_score(airline_a, airline_b, absolute)]
              for airline_a in airline_list
              for airline_b in airline_list]
    return pd.DataFrame(scores, columns = ['AirlineA', 'AirlineB', 'Score'])
In [146]:
overlap_scores = compute_overlap_scores(["Ryanair", "Wizz Air"], True)
# `DataFrame.pivot` positional arguments were deprecated and removed in
# pandas 2.0; use explicit keywords.
sns.heatmap(overlap_scores.pivot(index="AirlineA", columns="AirlineB", values="Score"), cmap="YlGnBu")
Out[146]:
In [147]:
Low_cost = ['Ryanair', 'easyJet', 'Wizz Air', 'TUIfly', 'Jet2.com']
overlap_scores = compute_overlap_scores(Low_cost, True)
# `DataFrame.pivot` positional arguments were removed in pandas 2.0;
# use explicit keywords. vmax caps the color scale so smaller scores stay visible.
sns.heatmap(overlap_scores.pivot(index="AirlineA", columns="AirlineB", values="Score"), cmap="YlGnBu", vmax=1500)
Out[147]:
In [148]:
# Draw all low-cost carriers' networks on a European map, one color each.
plt.figure(figsize=(15, 20))
palette = sns.color_palette(palette='Set1', n_colors=30).as_hex()
m = Basemap(llcrnrlon=-30, urcrnrlon=50, llcrnrlat=10, urcrnrlat=70)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
legend = []
for idx, carrier in enumerate(Low_cost):
    carrier_network = create_airline_network(carrier)
    draw_local_network(carrier_network, edge_color=palette[idx], node_color=palette[idx], alpha=0.5)
    legend.append(mpatches.Patch(color=palette[idx], label=carrier))
plt.legend(handles=legend)
Out[148]:
In [149]:
Big_airlines = ['Lufthansa', 'KLM Royal Dutch Airlines', 'Air France', 'British Airways']
overlap_scores = compute_overlap_scores(Big_airlines, True)
# `DataFrame.pivot` positional arguments were removed in pandas 2.0;
# use explicit keywords.
sns.heatmap(overlap_scores.pivot(index="AirlineA", columns="AirlineB", values="Score"), cmap="YlGnBu")
Out[149]:
In [150]:
# Draw the major carriers' networks on a world map, one color each.
plt.figure(figsize=(15, 20))
palette = sns.color_palette(palette='Set1', n_colors=30).as_hex()
m = Basemap(llcrnrlon=-180, urcrnrlon=180, llcrnrlat=-50, urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
legend = []
for idx, carrier in enumerate(Big_airlines):
    carrier_network = create_airline_network(carrier)
    draw_local_network(carrier_network, edge_color=palette[idx], node_color=palette[idx], alpha=0.5)
    legend.append(mpatches.Patch(color=palette[idx], label=carrier))
plt.legend(handles=legend)
Out[150]:
In [151]:
overlap_scores = compute_overlap_scores(["Ryanair", "Jet2.com", "easyJet", "Lufthansa", "KLM Royal Dutch Airlines", "Air France", "British Airways"], True)
# `DataFrame.pivot` positional arguments were removed in pandas 2.0;
# use explicit keywords.
sns.heatmap(overlap_scores.pivot(index="AirlineA", columns="AirlineB", values="Score"), cmap="YlGnBu", vmax=1000)
Out[151]:
In [152]:
def compute_completion_score(airline_a, airline_b):
    """How well airline B "completes" airline A.

    A traveler flying airline A from airport X to airport Y can continue on
    airline B from Y to some airport Z, where Z is not served by airline A at
    all. The score is the total number of such completing route pairs. We do
    not scale it here (the networks compared have similar structure); to
    compare airlines of different sizes, divide by the airline's number of
    destinations.

    NOTE(review): this counts route *pairs*, so a new destination reachable
    through several of A's endpoints is counted once per pair — confirm that
    is the intended semantics.
    (This text was previously a free-floating string literal before the def.)
    """
    routes_a = merged_routes[merged_routes['Name'] == airline_a]
    destinations_a = routes_a.DestinationAirport
    routes_b = merged_routes[merged_routes['Name'] == airline_b]
    # Route pairs where B departs from one of A's destinations.
    completion_routes = (routes_a.merge(routes_b, left_on='DestinationAirport', right_on='SourceAirport'))
    # Keep only continuations whose endpoint airline A does not serve.
    strict_completion_routes = completion_routes[~completion_routes['DestinationAirport_y'].isin(destinations_a)]
    return strict_completion_routes['DestinationAirport_y'].size

def compute_completion_scores(airline_list):
    """Pairwise completion scores for `airline_list` as a long-format
    DataFrame with columns AirlineA, AirlineB, Score."""
    scores = []
    for airline_a in airline_list:
        for airline_b in airline_list:
            scores.append([airline_a, airline_b, compute_completion_score(airline_a, airline_b)])
    return pd.DataFrame(scores, columns = ['AirlineA', 'AirlineB', 'Score'])

# (Removed a dead `compute_completion_score(...)` call whose result was
# discarded — only the last expression of a cell is displayed.)
compute_completion_scores(["Ryanair", "Wizz Air"])
Out[152]:
We can see the potential cooperation between top airlines, before scaling, in the following graph. Based on the graph, we highly recommend cooperation between Qantas and Emirates. We find that, indeed, these two companies have already formed a partnership.
In [153]:
completion_scores = compute_completion_scores(Best_airlines)
# `DataFrame.pivot` positional arguments were removed in pandas 2.0;
# use explicit keywords.
sns.heatmap(completion_scores.pivot(index="AirlineA", columns="AirlineB", values="Score"), cmap="YlGnBu")
Out[153]:
After scaling, we find more interesting results: for example, Qatar Airways could also potentially collaborate with Singapore Airlines.
In [154]:
completion_scores = compute_completion_scores(Best_airlines)
# `DataFrame.pivot` positional arguments were removed in pandas 2.0;
# use explicit keywords. vmax caps the color scale to reveal mid-range scores.
sns.heatmap(completion_scores.pivot(index="AirlineA", columns="AirlineB", values="Score"), cmap="YlGnBu", vmax=600)
Out[154]: