Airline Analysis


In [1]:
import  geopy
from  geopy.distance import distance #calculates distance based on coordinates
from geopy.geocoders import Nominatim

import operator
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import urllib3
from bs4 import BeautifulSoup
urllib3.disable_warnings()
from urllib.parse import quote

import unicodedata
import string
import re


import warnings
warnings.filterwarnings('ignore')

Data processing

Loading the dataset

Pull up-to-date data from github


In [2]:
!bash download_data.sh


Getting airports data
Getting airlines data
Getting routes data
Getting planes data

In [3]:
airports = pd.read_csv('airports.dat', header=None, names=
                      ["AirportID","Name", "City", "Country", "IATA", "ICAO",
                       "Latitude", "Longitude", "Altitude", "Timezone", "DST", "TzDatabaseTimeZone",
                       "Type", "Source"],
                      na_values='\\N')
airlines = pd.read_csv('airlines.dat', header=None, names=
                       ["AirlineID", "Name", "Alias", "IATA", "ICAO", "Callsign", "Country", "Active"]
                       ,na_values='\\N')
routes = pd.read_csv('routes.dat', header=None, names=
                     ['Airline', 'AirlineID', 'SourceAirport', 'SourceAirportID', 'DestinationAirport',
                      'DestinationAirportID', 'Codeshare', 'Stops', 'Equipment'],
                    na_values='\\N')
planes = pd.read_csv('planes.dat', header=None, names=['Name', 'IATA code', 'ICAO code'])

In [4]:
routes.head()


Out[4]:
Airline AirlineID SourceAirport SourceAirportID DestinationAirport DestinationAirportID Codeshare Stops Equipment
0 2B 410.0 AER 2965.0 KZN 2990.0 NaN 0 CR2
1 2B 410.0 ASF 2966.0 KZN 2990.0 NaN 0 CR2
2 2B 410.0 ASF 2966.0 MRV 2962.0 NaN 0 CR2
3 2B 410.0 CEK 2968.0 KZN 2990.0 NaN 0 CR2
4 2B 410.0 CEK 2968.0 OVB 4078.0 NaN 0 CR2

In [5]:
airlines.head()


Out[5]:
AirlineID Name Alias IATA ICAO Callsign Country Active
0 -1 Unknown NaN - NaN NaN NaN Y
1 1 Private flight NaN - NaN NaN NaN Y
2 2 135 Airways NaN NaN GNL GENERAL United States N
3 3 1Time Airline NaN 1T RNX NEXTIME South Africa Y
4 4 2 Sqn No 1 Elementary Flying Training School NaN NaN WYT NaN United Kingdom N

Only keep airports in that are both in routes dataframe:


In [6]:
valid_airports = set(routes.SourceAirport).union(set(routes.DestinationAirport)) 
airports = airports[airports.IATA.isin(valid_airports)]

Only keep airlines in intersection of that are both in the airline and in the routes dataframe :


In [7]:
valid_airlines = set(routes.AirlineID)

In [8]:
airlines = airlines[airlines.AirlineID.isin(valid_airlines)]
routes = routes[routes.AirlineID.isin(valid_airlines)]

We check that for each airline we have exactly one edge between a given source and destination none. This means that our graph will be unweighted.


In [9]:
routes_by_airline = routes[['SourceAirport', 'DestinationAirport', 'Airline']]
routes_by_airline.drop_duplicates().shape == routes_by_airline.shape


Out[9]:
True

Only take direct flights, only very few have more than that anyway


In [10]:
routes = routes[routes.Stops == 0]

Adding Missing Airports

Airports to fill in information for :


In [11]:
len(valid_airports - set(airports.IATA))


Out[11]:
204

Web crawling: get IATA, aiportname and address from list


In [12]:
all_data = []
to_find = valid_airports - set(airports.IATA)

for u in string.ascii_uppercase:
    url =u'https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:_'+quote(u)
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET',url)

    html = r.data

    tree = BeautifulSoup(html,"lxml")
    table_tag = tree.select("table")[0]
    tab_data = [[item.text.replace('\n', '') for item in row_data.select("th,td")]
                    for row_data in table_tag.select("tr")]
    all_data = all_data + tab_data

df = pd.DataFrame(all_data[2:], columns=all_data[0])
df = df[df.IATA.isin(to_find)]

In [13]:
df.count()


Out[13]:
IATA               176
ICAO               176
Airport name       176
Location served    176
Time               105
DST                105
dtype: int64

In [14]:
#makes calls to API to get location based on adress
geolocator = Nominatim(timeout=3, user_agent="my-application")

In [15]:
def get_info(address):
    try:
        location = geolocator.geocode(address, addressdetails=True, language='en')
        return location.raw['address']['country'], location.raw['lon'], location.raw['lat']
    except:
        return None, None, None

Cleaning the dataframe:


In [16]:
details = df['Location\xa0served'].map(lambda x: get_info(x))

df['Country'] = details.map(lambda x: x[0])
df['Longitude'] = details.map(lambda x: x[1])
df['Latitude'] = details.map(lambda x: x[2])

df['Airport\xa0name'] = df['Airport\xa0name'].map(lambda x: re.sub('[\d\[\]]*|\([A-Z\ \:\d]*\)', '', x).strip())
df.rename(columns={'Airport\xa0name': 'Name', 'Location\xa0served': 'City'}, inplace=True)

df['Country'] = df.City.map(lambda x: x.split(',')[-1])
df['City'] = df.City.map(lambda x: x.split(',')[0])

del df['DST']

Manual additions to the dataset:


In [17]:
newMatrix = [['Altay Air Base', 'AAT', 47.7498856, 88.0858078, 'Altay', 'China', 'ZWAT'],
             ['Baise Youjiang Airport ', 'AEB', 23.7206001, 106.9599991, 'Baise', 'China', 'ZGBS'],
             ['Tasiilaq', 'AGM', 65.6122961, -37.6183355, 'Tasiilaq', 'Greenland', 'BGAM'],
             ['Atmautluak Airport', 'ATT', 60.8666992, -162.272995, 'Atmautluak', 'United States', ''],
             ['Branson Airport', 'BKG', 36.532082, -93.200544, 'Branson', 'United States', 'KBBG'],
             ['Baoshan Yunduan Airport', 'BSD', 25.0533009, 99.1682968, 'Baoshan', 'United States', 'ZPBS'],
             ['Laguindingan Airport', 'CGY', 8.612203, 124.456496, 'Cagayan de Oro City', 'Philippines', ''],
             ['Chuathbaluk Airport', 'CHU', 61.579102, -159.216003, 'Chuathbaluk', 'United States', 'PACH'],
             ['Crooked Creek Airport', 'CKD', 61.8679008, -158.1349945, 'Crooked Creek', 'United States', 'CJX'],
             ['Desierto De Atacama Airport', 'CPO', -27.2612, -70.7791977, 'Copiapo', 'Chile', 'SCAT'],
             ['Dandong Airport', 'DDG', 40.0247002, 124.2860031, 'Dandong', 'China', 'ZYDD'],
             ['Hamad International Airport', 'DOH', 25.2620449, 51.6130829, 'Doha', 'Qatar', 'OTHH'],
             ['Dongying Shengli Airport', 'DOY', 37.5085983, 118.788002, 'Dongying', 'China', 'ZSDY'],
             ['Saertu Airport', 'DQA', 46.7463889, 125.1405556, 'Daqing Shi', 'China', 'ZYDQ'],
             ['Førde Airport', 'FDE', 61.3911018, 5.7569399, 'Førde', 'Norway', 'ENBL'],
             ['FMt. Fuji Shizuoka Airport', 'FSZ', 34.7960435, 138.1877518, 'Makinohara', 'Japan', 'RJNS'],
             ['Foshan Shadi Airport', 'FUO', 23.0832996, 113.0699997, 'Foshan', 'China', 'ZGFS'],
             ['Goulimime Airport', 'GLN', 29.0266991, -10.0502996, 'Goulimime', 'Morocco', 'GMAG'],
             ['Gheshm Airport', 'GSM', 26.9487, 56.2687988, 'Gheshm', 'Iran', 'OIKQ'],
             ['Lianshui Airport', 'HIA', 33.7908333, 119.125, 'Huai\'an', 'China', 'ZSSH'],
             ['Iğdır Airport', 'IGD', 39.9766274, 43.876648, 'Iğdır', 'Turkey', 'LTCT'],            
             ['Kingman Airport', 'IGM', 35.2594986, -113.9380035, 'Kingman', 'United States', 'KIGM'],
             ['Jiagedaqi Airport', 'JGD', 50.375, 124.1166667, 'Jiagedaqi', 'China', ''],
             ['Jinggangshan Airport', 'JGS', 26.8568993, 114.7369995, 'Ji\'an', 'China', 'ZSJA'],
             ['Jinchuan Airport', 'JIC', 38.5422222, 102.3483333, 'Jinchang', 'China', 'ZLJC'],
             ['Qianjiang Wulingshan Airport', 'JIQ', 29.5133333, 108.8311111, 'Qianjiang', 'China', 'ZUQJ'],            
             ['Chizhou Jiuhuashan Airport', 'JUH', 30.7403, 117.6856, 'Chizhou', 'China', 'ZSJH'],
             ['Stagen Airport', 'KBU', -3.2947199, 116.1650009, 'Laut Island', 'Indonesia', 'WAOK'],
             ['Grayling Airport', 'KGX', 62.8945999, -160.0650024, 'Grayling', 'United States', 'PAGX'],
             ['Akiachak Airport', 'KKI', 60.9079018, -161.4349976, 'Akiachak', 'United States', 'KKI'],
             ['Kulusuk Airport', 'KUS', 65.5736008, -37.1236, 'Kulusuk', 'Greenland', 'BGKK'],
             ['Long Banga Airport', 'LBP', 3.202, 115.4018, 'Long Banga', 'Malaysia', ''],
             ['Lintsang Airfield', 'LNJ', 23.7381001, 100.0250015, 'Lincang', 'China', 'ZPLC'],
             ['Lublin Airport', 'LUZ', 51.240278, 22.713611, 'Lublin', 'Poland', 'EPLB'],
             ['Tampa Padang Airport', 'MJU', -2.583333, 119.033333, 'Mamuju-Celebes Island', 'Indonesia', 'WAWJ'],
             ['Basel Mulhouse Freiburg Airport', 'MLH',  47.5999, 7.53166, 'Basel Mulhouse Freiburg', 'France', 'LFSB'],
             ['Misratah Airport', 'MRA',  32.3250008, 15.0609999, 'Misratah', 'Libya', 'HLMS'],
             ['Enfidha - Hammamet International Airport', 'NBE',  36.075833, 10.438611, 'Enfidha', 'Tunisia', 'DTNH'],
             ['Changbaishan Airport', 'NBS',  42.0669444, 127.6022222, 'Baishan', 'China', 'ZYBS'],
             ['Nuussuaq Heliport', 'NSQ',  74.109722, -57.065, 'Nuussuaq', 'Greenland', 'BGNU'],
             ['Niaqornat Heliport', 'NIQ',  70.788889, -53.656111, 'Niaqornat', 'Greenland', 'BGNT'],
             ['Nantong Airport', 'NTG',  32.0708008, 120.9759979, 'Nantong', 'China', 'ZSNT'],
             ['Nunapitchuk Airport', 'NUP',  60.9057999, -162.4389954, 'Nunapitchuk', 'United States', 'PPIT'],
             ['Point Hope Airport', 'PHO', 68.3488007 , -166.798996, 'Point Hope', 'United States', 'PAPO'],
             ['Pilot Station Airport', 'PQS', 61.9346008 , -162.8999939, 'Pilot Station', 'United States', ''],
             ['Pomala Airport', 'PUM', -4.1810899 , 121.6179962, 'Kolaka', 'Indonesia', 'WAWP'],
             ['Bao\'anying Airport', 'PZI', 26.54 , 101.79852, 'Panzhihua', 'China', 'ZUZH'],
             ['Narsaq Kujalleq', 'QFN', 60.0046935, -44.6569347, 'Narsaq Kujalleq', 'Greenland', 'BGFD'],
             ['Igaliku', 'QFX', 60.9920065, -45.4323345, 'Igaliku', 'Greenland', 'BGIO'],
             ['Qassimiut Heliport', 'QJH', 60.779444,  -47.1525, 'Qassimiut', 'Greenland', 'BGQT'],
             ['Aappilattoq', 'QUV', 60.1483571,  -44.2869186, 'Nanortalik', 'Greenland', 'BGAQ'],
             ['Ammassivik', 'QUW', 60.5973758,  -45.3824455, 'Ammassivik', 'Greenland', 'BGAS'],
             ['Red Devil Airport', 'RDV', 61.7881012,  -157.3500061, 'Red Devil', 'United States', 'RDV'],
             ['Rio Grande Airport', 'RIG', -32.081699,  -52.163299, 'Rio Grande', 'Brazil', 'SJRG'],
             ['Saattut Heliport', 'SAE', 70.808611,  -51.626667, 'Saattut', 'Greenland', 'BGST'],
             ['Shire Inda Selassie Airport', 'SHC', 14.0781002,  38.2724991, 'Shire Indasilase', 'Ethiopia', ''],
             ['Stony River 2 Airport', 'SRV', 61.7896996,  -156.5890045, 'Stony River', 'United States', 'SRV'],
             ['Sheldon Point Airport', 'SXP', 62.5205994,  -164.8480072, 'Nunam Iqua', 'United States', 'SXP'],
             ['Semera Airport', 'SZE', 11.7875,  40.9915, 'Semera', 'Ethiopia', 'HASM'],
             ['Tuticorin Airport', 'TCR', 8.7241667,  78.0258333, 'Thoothukudi', 'India', 'VOTK'],
             ['Tanjung Manis Airport', 'TGC', 2.17784,  111.2020035, 'Tanjung Manis', 'Malaysia', 'WBGT'],
             ['Thọ Xuân Airport', 'THD', 19.901667,  105.467778, 'Sao Vàng', 'Vietnam', 'VVTX'],
             ['Tuluksak Airport', 'TLT', 61.0877292,  -160.9233542, 'Tuluksak', 'United States', 'TLT'],
             ['Tununak Airport', 'TNK', 60.5755005,  -165.2720032, 'Tununak', 'United States', '4KA'],
             ['Tasiusaq Heliport', 'TQA', 73.373055,  -56.06028, 'Tasiusaq', 'Greenland', 'BGTA'],
             ['Tangshan Sannühe Airport', 'TVS', 39.7178001,  118.0026245, 'Tangshan', 'China', 'ZBTS'],
             ['Majeed Bin Abdulaziz Airport', 'ULH', 26.48,  38.1288889, 'Al Ula', 'Saudi Arabia', 'OEAO'],
             ['Upernavik Kujalleq', 'UPK', 72.1527425,  -55.5309856, 'Upernavik Kujalleq', 'Greenland', 'BGKL'],
             ['Dong Hoi Airport', 'VDH', 17.515,  106.590556, 'Dong Hoi', 'Vietnam', 'VVDH'],
             ['Stebbins Airport', 'WBB', 63.5159988,  -162.2779999, 'Stebbins', 'United States', 'WBB'],
             ['Wenshan Puzhehei Airport', 'WNH', 23.563611,  104.333611, 'Wenshan City', 'China', 'ZPWS'],
             ['Tuntutuliak Airport', 'WTL', 60.3516111,  -162.6539722, 'Tuntutuliak', 'United States', 'A61'],
             ['Newtok Airport ', 'WWT', 60.9236984,  -164.6560059, 'Newtok', 'United States', 'WWT'],
             ['Tasiusaq Heliport', 'XEQ', 60.193889,   -44.811667, 'Tasiusaq', 'Greenland', 'BGTQ'],
             ['Zhangye Ganzhou Airport', 'YZY', 38.801899,   100.6750031, 'Zhangye', 'China', 'ZLZY'],
             ['La Araucanía Airport', 'ZCO', -38.9259, -72.6515, 'Temuco', 'Chile', 'SCQP'],
             ['Zhangjiakou Ningyuan Airport', 'ZQZ', 40.7386017, 114.9300003, 'Zhangjiakou', 'China', 'ZBZJ'],
             ['Aappilattoq Heliport', 'AOQ',72.886944,-55.596111, 'Aappilattoq', 'Greenland', 'BGAG'],
             ['Cewlik Bingöl Airport', 'BGG', 38.861111, 40.5925, 'Bingöl', 'Turkey', 'LTCU'],
             ['Ikerasak Heliport', 'IKE', 70.542368,  -51.520606, 'Ikerasak', 'Greenland', ''],
             ['Isortoq Heliport', 'IOQ', 65.546667,   -38.977778, 'Isortoq', 'Greenland', 'BGIS'],
             ['Illorsuit Heliport', 'IOT', 71.239722, -53.555556, 'Illorsuit', 'Greenland', 'BGLL'],
             ['Islamabad International Airport', 'ISB', 33.549083, 72.82565, 'Islamabad', 'Pakistan', 'OPIS'],
             ['Innaarsuit Heliport', 'IUI', 73.2025, -56.011111, 'Innaarsuit', 'Greenland', 'BGIN'],
             ['Nuugaatsiaq Heliport', 'JUU', 71.538611, -53.205, 'Nuugaatsiaq', 'Greenland', 'BGNQ'],
             ['Kangersuatsiaq Heliport', 'KGQ', 72.381111, -55.536667, 'Kangersuatsiaq', 'Greenland', 'BGKS'],
             ['Kullorsuaq Heliport', 'KHQ', 74.579444, -57.235556, 'Kullorsuaq', 'Greenland', 'BGKQ'],
             ['Simanggang Airport', 'SGG', 1.217, 111.4499969, 'Sri Aman', 'Malaysia', 'WBGY'],
             ['Sierra Leone Airport', 'SRK', 8.5, -13.5083333, 'Sierra Leone', 'Sierra Leone', ''],
             ['Svay Rieng Airport', 'SVR', 11.0833333, 105.8055556, 'Svay Rieng', 'Cambodia', ''],
             ['Biloela Airport', 'ZBL', -24.4, 150.5, 'Biloela', 'Australia', '']]
             
             
append_df = pd.DataFrame(newMatrix, columns=['Name', 'IATA', 'Latitude', 'Longitude', 'City', 'Country', 'ICAO']) 
airports = airports.append(append_df, sort=False)

In [18]:
df = df[~df.IATA.isin(airports.IATA)]

Append missing to dataframe


In [19]:
airports = airports.append(df, sort=False)

Merging Routes with Airlines:

We are only interessted in currently active airlines:


In [20]:
merged_routes = pd.merge(airlines[airlines.Active == 'Y'], routes, on='AirlineID')

In [21]:
merged_routes.head(1)


Out[21]:
AirlineID Name Alias IATA ICAO Callsign Country Active Airline SourceAirport SourceAirportID DestinationAirport DestinationAirportID Codeshare Stops Equipment
0 10 40-Mile Air NaN Q5 MLA MILE-AIR United States Y Q5 CKX NaN TKJ 7235.0 NaN 0 CNA

Getting whether the flight is international or not:


In [22]:
Airport_to_country = airports.set_index('IATA').Country.to_dict()

In [23]:
def get_international(x):
    try:
        if Airport_to_country[x.SourceAirport] != Airport_to_country[x.DestinationAirport]:
            return 1
        else:
            return 0
    except:
        return None

In [24]:
merged_routes['International'] = merged_routes.apply(
    lambda x: get_international(x), axis=1)

Reset frames to create mappings:


In [25]:
#only keep values we are interested in
airports_filtered = airports[['Name', 'Country', 'Longitude', 'Latitude', 'Timezone', 'IATA', 'City']].copy()

In [26]:
#IATA airport id -> longitude latitude
airports_filtered.set_index('IATA', inplace=True)
airports_filtered.Longitude.dropna().shape == airports_filtered.Longitude.shape


Out[26]:
False

In [27]:
#sanity check
print(airports.Longitude.shape)
print(airports_filtered.Longitude.shape)


(3420,)
(3420,)

In [28]:
location_mapping = airports_filtered.apply(lambda x: [x.Longitude, x.Latitude], axis=1).to_dict()

In [29]:
#Airline name -> airlineID
airline_name_to_number = merged_routes.Name.drop_duplicates().reset_index(drop=True).to_dict()
airline_name_to_number = {v: k for k, v in airline_name_to_number.items()}

In [30]:
merged_routes['AirlineNbr'] = merged_routes.Name.map(airline_name_to_number)

Fill in Nan values:


In [31]:
merged_routes['Codeshare'] = merged_routes.Codeshare.fillna('N')

Getting the distance between two airports:

Example of functionality:


In [32]:
element = airports_filtered.apply(lambda x: (x.Latitude, x.Longitude), axis=1)[0]
element2 = airports_filtered.apply(lambda x: (x.Latitude, x.Longitude), axis=1)[1]

In [33]:
distance(element, element2).km


Out[33]:
106.2489585209369

In [34]:
distance_mapping = airports_filtered.apply(lambda x: (x.Latitude, x.Longitude), axis=1).to_dict()

Additing it to merged_routes:


In [35]:
def get_distance(source, dest):
    try:
        dist = distance(distance_mapping[source], distance_mapping[dest]).km
        return dist
    except:
        return None

In [36]:
merged_routes['Distance'] = merged_routes.apply(lambda x: get_distance(x.SourceAirport, x.DestinationAirport), axis=1)

In [37]:
relevant_columns = ['Name', 'ICAO', 'Country', 'SourceAirport', 'DestinationAirport', 'Codeshare',
                    'Stops', 'Equipment', 'AirlineNbr', 'International', 'Distance']

In [38]:
merged_routes.head()


Out[38]:
AirlineID Name Alias IATA ICAO Callsign Country Active Airline SourceAirport SourceAirportID DestinationAirport DestinationAirportID Codeshare Stops Equipment International AirlineNbr Distance
0 10 40-Mile Air NaN Q5 MLA MILE-AIR United States Y Q5 CKX NaN TKJ 7235.0 N 0 CNA 1.0 0 99.698791
1 10 40-Mile Air NaN Q5 MLA MILE-AIR United States Y Q5 FAI 3832.0 HKB 7242.0 N 0 CNA 1.0 0 177.559277
2 10 40-Mile Air NaN Q5 MLA MILE-AIR United States Y Q5 HKB 7242.0 FAI 3832.0 N 0 CNA 1.0 0 177.559277
3 10 40-Mile Air NaN Q5 MLA MILE-AIR United States Y Q5 TKJ 7235.0 CKX NaN N 0 CNA 1.0 0 99.698791
4 21 Aigle Azur NaN ZI AAF AIGLE AZUR France Y ZI AAE 220.0 MRS 1353.0 N 0 319 1.0 1 767.018752

In [39]:
merged_routes[relevant_columns].head()


Out[39]:
Name ICAO Country SourceAirport DestinationAirport Codeshare Stops Equipment AirlineNbr International Distance
0 40-Mile Air MLA United States CKX TKJ N 0 CNA 0 1.0 99.698791
1 40-Mile Air MLA United States FAI HKB N 0 CNA 0 1.0 177.559277
2 40-Mile Air MLA United States HKB FAI N 0 CNA 0 1.0 177.559277
3 40-Mile Air MLA United States TKJ CKX N 0 CNA 0 1.0 99.698791
4 Aigle Azur AAF France AAE MRS N 0 319 1 1.0 767.018752

In [40]:
merged_routes = merged_routes[relevant_columns]

Adding in external information: for selected airlines


In [41]:
Delays_data = pd.read_csv('delays_Data.csv')

In [42]:
def convert_mixed_fractions(x):
    if '%' in x:
        return float(x[:-1])/100
    else:
        return float(x)

In [43]:
Delays_data['On-time (A14)'] = Delays_data['On-time (A14)'].map(convert_mixed_fractions)
Airline_list = pd.read_csv('airlines_Data.csv')

Getting low cost carrier association


In [44]:
url =u'https://en.wikipedia.org/wiki/List_of_low-cost_airlines'
http_pool = urllib3.connection_from_url(url)
r = http_pool.urlopen('GET',url)

html = r.data

tree = BeautifulSoup(html,"lxml")
table_tag = tree.select("table")
tab_data = []
for table in table_tag:
    tab_data.append([[item.text.replace('\n', '') for item in row_data.select("li")]
                for row_data in table.select("tr")])
    
cheap_airlines = [ re.sub('\([\s\S]*\)|\[[\d]*\]', '', i).strip() for tab in tab_data for t in tab for i in t]

In [45]:
#add airlines that are for some reason missing/written differently
cheap_airlines = cheap_airlines[:cheap_airlines.index('v')]+['easyJet', 
                                                             'IndiGo Airlines', 
                                                             'Jetstar Airways', 
                                                             'TUIfly',
                                                            'Shenzhen Airlines',
                                                            'JetBlue Airways']

Preliminary analysis of airlines:

What are the airlines with the most flight routes:


In [46]:
merged_routes.Name.value_counts().head(10)


Out[46]:
Ryanair                    2484
American Airlines          2354
United Airlines            2180
Delta Air Lines            1981
US Airways                 1960
China Southern Airlines    1454
China Eastern Airlines     1263
Air China                  1260
Southwest Airlines         1143
easyJet                    1130
Name: Name, dtype: int64

In [47]:
merged_routes.Name.value_counts().plot(kind='hist', log=True, bins=20)
plt.title("Log Histogram of connections per airline")
plt.show()



In [48]:
merged_routes.Name.value_counts().describe()


Out[48]:
count     522.000000
mean      127.394636
std       277.346000
min         1.000000
25%        14.000000
50%        34.000000
75%       111.000000
max      2484.000000
Name: Name, dtype: float64

What are the airlines with the most airports:


In [49]:
airport_count = merged_routes.groupby('Name').apply(
    lambda x: x.SourceAirport.drop_duplicates().count()).sort_values(ascending=False)
airport_count.head(10)


Out[49]:
Name
American Airlines           429
United Airlines             426
Air France                  378
KLM Royal Dutch Airlines    360
US Airways                  348
Delta Air Lines             347
Alitalia                    269
Turkish Airlines            258
Lufthansa                   243
China Eastern Airlines      222
dtype: int64

In [50]:
airport_count.plot(kind='hist', log=True, bins=20)
plt.title('Log Histogram: Number of airports served')


Out[50]:
Text(0.5, 1.0, 'Log Histogram: Number of airports served')

In [51]:
airport_count.describe()


Out[51]:
count    522.000000
mean      35.954023
std       54.773603
min        1.000000
25%        8.000000
50%       16.000000
75%       41.000000
max      429.000000
dtype: float64

Analyzing the metadata we will later use:


In [52]:
Delays_data['Avg. Delay'].plot(kind='box')


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e9e2748>

In [53]:
Delays_data['Avg. Delay'].describe()


Out[53]:
count    117.000000
mean      43.047009
std       10.196353
min       24.600000
25%       35.400000
50%       43.000000
75%       48.000000
max       76.900000
Name: Avg. Delay, dtype: float64

In [54]:
merged_routes[merged_routes.Name.isin(cheap_airlines)].Name.drop_duplicates().count()


Out[54]:
63

Only keep large enough airlines:


In [55]:
reasonably_big_airlines = merged_routes.Name.value_counts()[merged_routes.Name.value_counts() > 130].index

In [56]:
merged_routes = merged_routes[merged_routes.Name.isin(reasonably_big_airlines)]

We look at a total of 118 airlines:


In [57]:
len(merged_routes.Name.unique())


Out[57]:
118

out of which we categoryze _ as low cost airlines:


In [58]:
merged_routes[merged_routes.Name.isin(cheap_airlines)].Name.drop_duplicates().count()


Out[58]:
25

Creating and plotting at individual airline networks:

We build a function to create the airline network:


In [59]:
def create_airline_network(airline):
    df = merged_routes[merged_routes['Name'] == airline]
    Airline_Graph = nx.from_pandas_edgelist(df, 
                                      source='SourceAirport', target='DestinationAirport', edge_attr=['Country'])
    nx.set_node_attributes(Airline_Graph, location_mapping, 'Location')
    return Airline_Graph

In [60]:
def draw_airline_network(Airline_Graph, airline):
    plt.figure(figsize=(6, 6))
    centrality = nx.betweenness_centrality(Airline_Graph)
    size = np.array(list(centrality.values()))*600
    nx.draw_spring(Airline_Graph, node_size=size, width=0.1)
    plt.title(airline)
    plt.show()
    
def get_spectrum_figures(Airline_Graph):
    e, U = np.linalg.eigh(nx.normalized_laplacian_matrix(Airline_Graph).todense())
    plt.plot(e)
    plt.show()
    plt.plot(nx.laplacian_spectrum(Airline_Graph))
    plt.show()
    plt.boxplot(nx.degree_centrality(Airline_Graph).values())
    plt.show()

Example of some low cost airlines:


In [61]:
Cheap = create_airline_network('Southwest Airlines')
draw_airline_network(Cheap, 'Southwest Airlines')


Plotting the graph on map


In [62]:
from mpl_toolkits.basemap import Basemap
#run export PROJ_LIB=$CONDA_PREFIX/share/proj if this gives a key error

In [63]:
airport_location = airports.set_index('IATA').apply(lambda x: [x.Longitude, x.Latitude], axis=1)

In [64]:
plt.figure(figsize=(15, 20))
m=Basemap(llcrnrlon=-140,urcrnrlon=-30,llcrnrlat=10,urcrnrlat=70)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")


def draw_local_network(Airline_Graph, edge_color='black', node_color='red', alpha=1):
    node_size= np.array(list(nx.degree_centrality(Airline_Graph).values()))*500
    nx.draw_networkx_edges(Airline_Graph, pos=airport_location, edge_color=edge_color, alpha=0.4)
    nx.draw_networkx_nodes(Airline_Graph, pos=airport_location, node_size=node_size, node_color=node_color, alpha=alpha)
    
    
draw_local_network(Cheap)


Here we show how to create an interactive graph. As it is quite heavy, we only use it when we are curious about connections.


In [65]:
from plotly import tools
from plotly.offline import iplot, init_notebook_mode

import plotly.plotly as py
from plotly.graph_objs import *

init_notebook_mode()



In [66]:
def draw_map_graph(Airline_Graph):
    df_paths = pd.DataFrame(list(Airline_Graph.edges), columns=['source', 'dest'])

    df_paths['start'] = df_paths.source.map(airport_location)
    df_paths['end'] = df_paths.dest.map(airport_location)

    import plotly.plotly as py
    import plotly.graph_objs as go
    from plotly import tools
    from plotly.offline import iplot, init_notebook_mode
    init_notebook_mode()



    markers = [
        {'lat': airports[airports.IATA.isin(Airline_Graph.nodes)].Latitude.tolist(),
      'lon': airports[airports.IATA.isin(Airline_Graph.nodes)].Longitude.tolist(),
      'marker': {'color': 'rgb(40,40,40)',
       'line': {'color': 'rgb(40,40,40)', 'width': 0.5},
       'size': np.array(list(nx.degree_centrality(Airline_Graph).values()))*10,
       'sizemode': 'diameter'},
      'text': airports[airports.IATA.isin(Airline_Graph.nodes)].Name.tolist(),
      'type': 'scattergeo'},
    ]


    paths = []
    for i in range( len( df_paths ) ):
        paths.append(
            dict(
                type = 'scattergeo',
                lon = [ df_paths['start'][i][0], df_paths['end'][i][0] ],
                lat = [ df_paths['start'][i][1], df_paths['end'][i][1]],
                mode = 'lines',
                line = dict(
                    width = 1,
                    color = 'grey',
                ),
                opacity = 0.5,
            )
        )


    layout = go.Layout(
        title = '',
        width=1000,
        height=700,
        showlegend = False,
        geo = dict(
                scope='world',
                projection=dict( type = 'natural earth'),
                showland = True,
                landcolor = 'rgb(217, 217, 217)',
                subunitwidth=1,
                countrywidth=1,
                subunitcolor="rgb(255, 255, 255)",
                countrycolor="rgb(255, 255, 255)"
            ),)

    fig =  go.Figure(layout=layout, data=paths+markers)
    iplot( fig, validate=False)

In [67]:
draw_map_graph(Cheap)


Preliminary analysis of network graphs

We know that the graph of all airlines is scale-free, but is the graph formed by individual airlines scalefree?


In [68]:
def get_scalefree_network_stats(airline):
    Airline_Graph = create_airline_network(airline)
    
    components = nx.number_connected_components(Airline_Graph)
    component_ratio = 0
    density = nx.density(Airline_Graph)
    
    if components > 1 :
        c = sorted(nx.connected_components(Airline_Graph), key = len, reverse=True)
        Airline_Graph = Airline_Graph.subgraph(c[0])

    degrees = pd.Series(list((dict(nx.degree(Airline_Graph)).values())))
    
    density = nx.density(Airline_Graph)
    diameter = nx.diameter(Airline_Graph)
    pathlength = nx.average_shortest_path_length(Airline_Graph)

    return np.array([ degrees.mean(), degrees.std(), degrees.max(), density, diameter, pathlength])

In [69]:
airlines = merged_routes.Name.drop_duplicates()
stats = np.array(airlines.map(get_scalefree_network_stats).tolist())

In [70]:
plt.figure(figsize=(7, 6))
plt.scatter(stats[:, 0], stats[:, 1])
plt.plot(np.linspace(0, 16), np.sqrt(np.linspace(0, 16)))
plt.xlabel('<k>')
plt.ylabel('$\sigma_k$')
plt.title('Network average degree vs std')
plt.show()


And here we calculate statistical measures of different graph properties:


In [71]:
pd.DataFrame(stats, columns=['Average Degree', 
                             'Standard Dev Degree', 
                              "Maxium degree",
                             'Density', 
                             'Diameter', 
                             'Average path lenth']
            ).describe()


Out[71]:
Average Degree Standard Dev Degree Maxium degree Density Diameter Average path lenth
count 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000
mean 4.084370 8.215156 68.347458 0.053010 4.847458 2.421107
std 1.885788 2.970817 42.382140 0.038636 1.216883 0.375045
min 2.104478 2.831972 16.000000 0.006723 2.000000 1.937634
25% 2.850649 6.271919 40.000000 0.028092 4.000000 2.178099
50% 3.602309 7.626316 61.000000 0.042197 5.000000 2.362481
75% 4.598352 9.424423 82.000000 0.068677 5.000000 2.615793
max 14.113636 17.216606 226.000000 0.264516 11.000000 5.227186

Network Structure

Clustering the networks based on stats:

Our first model only uses the laplacian the quantify the structure of the graph:


In [72]:
from networkx.algorithms.approximation.clique import large_clique_size
from networkx.algorithms.community import greedy_modularity_communities

In [73]:
def get_network_stats(airline):
    Airline_Graph = create_airline_network(airline)
    
    
    components = nx.number_connected_components(Airline_Graph)
    component_ratio = 0
    density = nx.density(Airline_Graph)
    
    if components > 1 :
        print(airline,'  ' ,components)
        
        c = sorted(nx.connected_components(Airline_Graph), key = len, reverse=True)
        component_ratio = len([i for t in c[1:] for i in t])/ len(c[0])
        Airline_Graph = Airline_Graph.subgraph(c[0])

    #connectivity pattern of airline
    hist = nx.degree_histogram(Airline_Graph)
    total = sum(hist)
    per_large_degree = sum(hist[5:])/total
    deadend = hist[1]/total
    
    #network pattern
    algebraic_connectivity = nx.algebraic_connectivity(Airline_Graph)
    spectrum = nx.normalized_laplacian_matrix(Airline_Graph).todense()
    eig, _ = np.linalg.eigh(spectrum)
    eig = np.array(sorted(eig))
    

    return np.array([eig[1], #algebraic_connectivity
                     eig[-1], 
                     pd.Series(eig.round(decimals=4)).value_counts()[1]/pd.Series(eig.round(decimals=4)).value_counts().sum(),
                     deadend, per_large_degree 
                    ])

Collecting the data:


In [74]:
Stats = []
airlines = merged_routes.Name.unique()
for name in airlines:
    stat = get_network_stats(name)
    if type(stat) != None:
        Stats.append(stat)


Aegean Airlines    2
Cathay Pacific    2
Delta Air Lines    2
Flybe    3
Lufthansa    2
Meridiana    2

Put everything into a matrix, do PCA for dimensionality reduction & cluster:


In [75]:
network_stats = np.array(Stats)

#scale features so that each feature has same distribution
x = StandardScaler().fit_transform(network_stats)

In [76]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x[:, [0, 2]])
print(pca.explained_variance_ratio_)


[0.57115974 0.42884026]

In [77]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=9, affinity='euclidean', linkage='ward')  
_ = cluster.fit_predict(principalComponents)

In [78]:
plt.scatter(x[:, 0], x[:,2], c=cluster.labels_, cmap=plt.cm.tab20)  
plt.show()
plt.scatter(principalComponents[:,0], principalComponents[:,1], c=cluster.labels_, cmap=plt.cm.tab20)


Out[78]:
<matplotlib.collections.PathCollection at 0x1a27c8de10>

In [79]:
plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= cluster.labels_, cmap=plt.cm.tab20)  
for i, name in enumerate(airlines):
    plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]), 
             xytext = (0, 0), textcoords = 'offset points')



In [80]:
for d in ['Saudi Arabian Airlines', 'Emirates', 'TUIfly', 'Era Alaska']:
    p = create_airline_network(d)
    draw_airline_network(p, d)



In [81]:
palette = sns.color_palette(palette='Set1', n_colors=3).as_hex()

In [82]:
Best_airlines = ["Air New Zealand",
"Qantas",
"Singapore Airlines",
"Virgin Atlantic Airlines",
"Etihad Airways",
"Cathay Pacific",
"Japan Airlines",
"Qatar Airways",
"Emirates"
]

In [83]:
def return_color(c):
    if c in cheap_airlines:
        return palette[0]
    elif c in Best_airlines:
        return palette[1] 
    return palette[2]

In [84]:
c = [return_color(a) for a in airlines]

In [85]:
plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= c, s=250)  
for i, name in enumerate(airlines):
    plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]), 
             xytext = (0, 0), textcoords = 'offset points')
    
plt.xticks([])
plt.yticks([])


Out[85]:
([], <a list of 0 Text yticklabel objects>)

We identify nordic airlines from this graph:


In [86]:
mask = (principalComponents[:, 1] > 2)
Components = principalComponents[mask]

In [87]:
for nordic in airlines[mask]:
    Nordic = create_airline_network(nordic)
    draw_airline_network(Nordic, nordic)



In [88]:
plt.figure(figsize=(15, 20))

palette = sns.color_palette(palette='Set1', n_colors=2).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=40,llcrnrlat=50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

for i, nordic in enumerate(airlines[mask]):
    Nordic = create_airline_network(nordic)
    draw_local_network(Nordic, node_color=palette[i])



In [89]:
plt.figure(figsize=(5, 5))

palette = sns.color_palette(palette='Set1', n_colors=2).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=-140,llcrnrlat=50,urcrnrlat=80)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

Nordic = create_airline_network(nordic)
draw_local_network(Nordic, node_color=palette[1])



In [90]:
plt.figure(figsize=(5, 5))
Nordic = create_airline_network(airlines[mask][0])

palette = sns.color_palette(palette='Set1', n_colors=2).as_hex()
m=Basemap(llcrnrlon=-10,urcrnrlon=40,llcrnrlat=50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

draw_local_network(Nordic, node_color=palette[1])


Cheap airlines are all located in a corner of the cluster:


In [91]:
mask = (principalComponents[:, 0]-0.5 > principalComponents[:, 1] )
Components = principalComponents[mask]

In [92]:
sum([1 if a in cheap_airlines else 0  for a in airlines[mask]])/sum(mask)


Out[92]:
0.5

In [93]:
sum([1 if a in cheap_airlines else 0  for a in airlines[mask]]
   )/sum([1 if a in cheap_airlines else 0  for a in airlines])


Out[93]:
0.76

In [94]:
palette = sns.color_palette(palette='Set1', n_colors=3).as_hex()

In [95]:
plt.figure(figsize=(10, 10))
plt.scatter(Components[:,0], Components[:,1], cmap=plt.cm.tab20, c = [return_color(a) for a in airlines[mask]])  

for i, name in enumerate(airlines[mask]):
    plt.annotate(name, xy = (Components[i, 0], Components[i, 1]), 
             xytext = (0, 0), textcoords = 'offset points')


However, we now expand the features, in hopes of getting an even better partition:

But from the graphs of these airlines we can clearly recognize a strong pattern of large vs prestious airlines: prestigous airlines have a much more starlike structure. Hence, a better method would need to cluster starlike graphs closer together


In [96]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x[:, [0, 2, 3, 4]])
print(pca.explained_variance_ratio_) 

cluster = AgglomerativeClustering(n_clusters=10, affinity='euclidean', linkage='ward')  
_ = cluster.fit_predict(principalComponents)  

plt.figure(figsize=(20, 20))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= cluster.labels_, cmap=plt.cm.tab20)  
for i, name in enumerate(airlines):
    plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]), 
             xytext = (0, 0), textcoords = 'offset points')


[0.67181607 0.21889062]

In [97]:
plt.figure(figsize=(20, 20))

plt.scatter(principalComponents[:,0], principalComponents[:,1], c=c, cmap=plt.cm.tab20, s=250)  

plt.xticks([])
plt.yticks([])
for i, name in enumerate(airlines):
    plt.annotate(name, xy = (principalComponents[i, 0], principalComponents[i, 1]), 
             xytext = (0, 0), textcoords = 'offset points')


Validating that this is indeed a better representation:


In [98]:
mask = (principalComponents[:, 0]-0.5 < principalComponents[:, 1] )
Components = principalComponents[mask]
sum([1 if a in cheap_airlines else 0  for a in airlines[mask]])/sum(mask)


Out[98]:
0.06172839506172839

In [99]:
sum([1 if a in cheap_airlines else 0  for a in airlines[mask]]
   )/sum([1 if a in cheap_airlines else 0  for a in airlines])


Out[99]:
0.2

In [100]:
mask = (principalComponents[:, 0]-0.5 >= principalComponents[:, 1] )

In [101]:
sum([1 if a in cheap_airlines else 0  for a in airlines[mask]])/sum(mask)


Out[101]:
0.5405405405405406

In [102]:
sum([1 if a in cheap_airlines else 0  for a in airlines[mask]]
   )/sum([1 if a in cheap_airlines else 0  for a in airlines])


Out[102]:
0.8

Now we see that the best airlines are all clustered together, and the cheapest airlines tend to be clustered together!


In [103]:
plt.figure(figsize=(15, 20))
mask = (principalComponents[:, 0]-3 >= principalComponents[:, 1] )
palette = sns.color_palette(palette='tab20', n_colors=50).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=180,llcrnrlat=-50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

legend = []
for i, cheap in enumerate(airlines[mask]):
    Cheap = create_airline_network(cheap)
    draw_local_network(Cheap, edge_color=palette[i], node_color=palette[i], alpha=0.5)
    legend.append(mpatches.Patch(color=palette[i], label=cheap))
plt.legend(handles=legend)


Out[103]:
<matplotlib.legend.Legend at 0x1a25a19518>

Plot shape of networks:


In [104]:
mask = ((principalComponents[:, 0]-2 >= principalComponents[:, 1] ) & (principalComponents[:, 1] > -2.2))
for cheap in airlines[mask]:
    Cheap = create_airline_network(cheap)
    draw_airline_network(Cheap, cheap)



In [105]:
plt.figure(figsize=(15, 20))

palette = sns.color_palette(palette='tab20c', n_colors=30).as_hex()
m=Basemap(llcrnrlon=-160,urcrnrlon=180,llcrnrlat=-70,urcrnrlat=70)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

legend = []
for i, expensive in enumerate(Best_airlines):
    Expensive = create_airline_network(expensive)
    draw_local_network(Expensive, node_color=palette[i], alpha=0.9, edge_color=palette[i])
    legend.append(mpatches.Patch(color=palette[i], label=expensive))
plt.legend(handles=legend)


Out[105]:
<matplotlib.legend.Legend at 0x1086b6be0>

Now, what is the underlying confounding factor that could lead to such a distribution?

Is the clustering due to international vs local flights?

International airlines are much more likely to be world renowed, while cheap airlines may only provide inertcontinetal or even just naional travel. Here we check whether there is an indication of this in the graphs:


In [106]:
d = airports.set_index('IATA').Country.to_dict()

In [107]:
country_json= pd.read_json('country-by-continent.json').set_index('country').continent.to_dict()

In [108]:
merged_routes['Intercontinental'] = merged_routes.apply(
    lambda x: country_json[d[x.SourceAirport]] != country_json[d[x.DestinationAirport]] , axis=1)

In [109]:
color_international = merged_routes.groupby('Name').apply(lambda x: x.Intercontinental.sum()).tolist()

In [110]:
np.corrcoef(principalComponents[:, 0], color_international)


Out[110]:
array([[ 1.        , -0.17340179],
       [-0.17340179,  1.        ]])

In [111]:
np.corrcoef(principalComponents[:, 1], color_international)


Out[111]:
array([[ 1.        , -0.08988271],
       [-0.08988271,  1.        ]])

In [112]:
plt.figure(figsize=(5, 5))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= color_international, cmap=plt.cm.PuBu )  
plt.colorbar()


Out[112]:
<matplotlib.colorbar.Colorbar at 0x1a27e190b8>

In [113]:
color_international = merged_routes.groupby('Name').apply(lambda x: x.International.sum()).tolist()

In [114]:
np.corrcoef(principalComponents[:, 0], color_international)


Out[114]:
array([[ 1.       , -0.1599437],
       [-0.1599437,  1.       ]])

In [115]:
np.corrcoef(principalComponents[:, 1], color_international)


Out[115]:
array([[ 1.        , -0.09334974],
       [-0.09334974,  1.        ]])

In [116]:
plt.figure(figsize=(5,5))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= color_international, cmap=plt.cm.PuBu )  
plt.colorbar()


Out[116]:
<matplotlib.colorbar.Colorbar at 0x1a27be7f60>

In [117]:
color_international = merged_routes.groupby('Name').apply(
    lambda x: x.Distance.sum()).tolist()

In [118]:
np.corrcoef(principalComponents[:, 0], color_international)


Out[118]:
array([[ 1.        , -0.11797496],
       [-0.11797496,  1.        ]])

In [119]:
np.corrcoef(principalComponents[:, 1], color_international)


Out[119]:
array([[ 1.        , -0.09422998],
       [-0.09422998,  1.        ]])

In [120]:
plt.figure(figsize=(5, 5))
plt.scatter(principalComponents[:,0], principalComponents[:,1], c= color_international, cmap=plt.cm.PuBu )  
plt.colorbar()


Out[120]:
<matplotlib.colorbar.Colorbar at 0x1a1fb4db00>

Analyzing if there is corrleation between countries and structure:


In [121]:
most_common = merged_routes[~merged_routes.Name.duplicated()].Country.value_counts().head(3).index
most_common


Out[121]:
Index(['China', 'United States', 'United Kingdom'], dtype='object')

In [122]:
countries = list(most_common) + list(merged_routes.Country.unique())
airline_to_countryid = merged_routes.set_index('Name').Country.to_dict()
country_coloring = [countries.index(airline_to_countryid[i]
                                   ) if airline_to_countryid[i] in most_common else -1 for i in merged_routes.Name.unique() ]

In [123]:
to_keep = [i for i, name in enumerate(airlines) if airline_to_countryid[name] in most_common]

In [124]:
plt.figure(figsize=(10, 10))
plt.scatter(principalComponents[:,0], 
            principalComponents[:,1], 
            c= np.array(country_coloring), 
            cmap=plt.cm.Set1_r)  
for i in to_keep:
    plt.annotate(airlines[i], xy = (principalComponents[i, 0], principalComponents[i, 1]), 
             xytext = (0, 0), textcoords = 'offset points')


We see a lot of dispresion and no uniform cluster

Analyzing inference from eignevalues to Delays


In [125]:
scatter_data = pd.DataFrame(principalComponents)

scatter_data['name'] = airlines
scatter_data['labels'] = cluster.labels_
scatter_data['country'] = scatter_data.name.map(airline_to_countryid)

scatter_data['ontime'] = scatter_data.name.map(Delays_data.set_index('On-time')['On-time (A14)'].to_dict())
scatter_data['delay'] = scatter_data.name.map(Delays_data.set_index('On-time')['Avg. Delay'].to_dict())

scatter = scatter_data[scatter_data.delay.notna()].copy()

As we can see in the graph, there is no clear dependence on PCA features.


In [126]:
plt.figure(figsize=(15, 10))
plt.scatter(scatter[0], 
            scatter[1], 
            c= scatter['delay'], 
            cmap=plt.cm.Spectral_r)  

for i in scatter.index:
    plt.annotate(scatter['name'][i], xy = (scatter[0][i], scatter[1][i]), 
             xytext = (0, 0), textcoords = 'offset points')
plt.colorbar()


Out[126]:
<matplotlib.colorbar.Colorbar at 0x1a1d1fc630>

In [127]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [128]:
X = np.array(Stats)[list(scatter.index)]

X = StandardScaler().fit_transform(X)

regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X[:,[0]+list(range(2,5))], scatter['delay'])

#resudual sum of squares
regr.score(X[:, [0]+list(range(2,5))], scatter['delay'])


Out[128]:
0.2763329711534741

In [129]:
# Train the model using the training sets
regr.fit(X[:, 1:], scatter['delay'])

#resudual sum of squares
regr.score(X[:, 1:], scatter['delay'])


Out[129]:
0.2941907382087534

In [130]:
scatter['pred'] = regr.predict(X[:, 1:])

In [131]:
plt.figure(figsize=(20, 10))
plt.scatter(scatter[0], 
            scatter[1], 
            c= scatter['pred'], 
            cmap=plt.cm.Spectral_r)  

for i in scatter.index:
    plt.annotate(scatter['name'][i], xy = (scatter[0][i], scatter[1][i]), 
             xytext = (0, 0), textcoords = 'offset points')
plt.colorbar()


Out[131]:
<matplotlib.colorbar.Colorbar at 0x1a2796f160>

Showing the weight of features, to see how important each feature is:


In [132]:
plt.bar(range(len(regr.coef_)), regr.coef_,)
[regr.coef_]


Out[132]:
[array([-3.33306939,  4.08089346, -5.2288589 , -1.53293705])]

Adding robustness as a predictor:


In [133]:
def get_network_stats_robustness(airline):
    Airline_Graph = create_airline_network(airline)
    
    
    components = nx.number_connected_components(Airline_Graph)
    component_ratio = 0
    density = nx.density(Airline_Graph)
    
    if components > 1 :
        print(airline,'  ' ,components)
        
        c = sorted(nx.connected_components(Airline_Graph), key = len, reverse=True)
        component_ratio = len([i for t in c[1:] for i in t])/ len(c[0])
        Airline_Graph = Airline_Graph.subgraph(c[0])

    #connectivity pattern of airline
    diameter = nx.diameter(Airline_Graph)
    degree_assortativity = nx.degree_assortativity_coefficient(Airline_Graph)
    
    #network pattern
    algebraic_connectivity = nx.algebraic_connectivity(Airline_Graph)
    
    
    #robustness of network
    nb = nx.betweenness_centrality(Airline_Graph, normalized=True)
    betweenness = np.array(list(nb.values()))
    upper_betweenness = np.quantile(betweenness, 0.75)
    median_betweenness = np.quantile(betweenness, 0.5)
    lower_betweenness = np.quantile(betweenness, 0.27)
    
    
    eb = nx.edge_betweenness_centrality(Airline_Graph, normalized=True)
    betweenness = np.array(list(eb.values()))
    e_upper_betweenness = np.quantile(betweenness, 0.75)
    e_median_betweenness = np.quantile(betweenness, 0.5)
    e_lower_betweenness = np.quantile(betweenness, 0.27)
    
    node_connectivity = nx.node_connectivity(Airline_Graph)
    
    return np.array([upper_betweenness, median_betweenness, 
                     e_upper_betweenness, e_median_betweenness,
                    ])

In [134]:
Stats_between = []
airlines = scatter.name.unique()
for name in airlines:
    stat = get_network_stats_robustness(name)
    if type(stat) != None:
        Stats_between.append(stat)
network_stats_between = np.array(Stats_between)
x_between = StandardScaler().fit_transform(network_stats_between)


Cathay Pacific    2
Delta Air Lines    2
Flybe    3
Lufthansa    2

In [135]:
scatter_extended = pd.concat([pd.DataFrame(x_between), pd.DataFrame(X)], ignore_index=True, axis=1)

In [136]:
X_ = scatter_extended.iloc[:, :4]
X_ = StandardScaler().fit_transform(X_)

regr = linear_model.LinearRegression()
regr.fit(X_, scatter['delay'])
#resudual sum of squares
regr.score(X_, scatter['delay'])


Out[136]:
0.3912075013111889

In [137]:
X_ = scatter_extended.iloc[:, list(range(0, 4)) + list(range(5, 8))]
X_ = StandardScaler().fit_transform(X_)

regr = linear_model.LinearRegression()
regr.fit(X_, scatter['delay'])
#resudual sum of squares
regr.score(X_, scatter['delay'])


Out[137]:
0.6127163194120251

Again looking at the weights to get indication of importance of features:


In [138]:
plt.bar(range(len(regr.coef_)), regr.coef_,)
[regr.coef_]


Out[138]:
[array([ 9.70945545, -8.19795152,  4.85651575, -5.81753607, -2.53328114,
         6.16753841, -3.99622909])]

Competition Analysis


In [142]:
merged_routes = merged_routes[~merged_routes['SourceAirport'].isin(to_find)]
merged_routes = merged_routes[~merged_routes['DestinationAirport'].isin(to_find)]

In [143]:
merged_routes['SourceCountry'] = merged_routes.apply(lambda x: 
                Airport_to_country[x.SourceAirport], axis=1)
merged_routes['DestinationCountry'] = merged_routes.apply(lambda x: 
                Airport_to_country[x.DestinationAirport], axis=1)
merged_routes.head(5)


Out[143]:
Name ICAO Country SourceAirport DestinationAirport Codeshare Stops Equipment AirlineNbr International Distance Intercontinental SourceCountry DestinationCountry
76 American Airlines AAL United States ABE CLT Y 0 CR9 CR7 CRJ 2 0.0 773.147838 False United States United States
77 American Airlines AAL United States ABE PHL N 0 DH3 2 0.0 88.283143 False United States United States
78 American Airlines AAL United States ABI DFW Y 0 ERD ER4 CRJ 2 0.0 253.808916 False United States United States
79 American Airlines AAL United States ABQ DFW N 0 M83 M80 2 0.0 915.506940 False United States United States
80 American Airlines AAL United States ABQ LAX Y 0 CR7 CRJ 2 0.0 1089.899312 False United States United States

In [144]:
def overlaps(route_a, route_b):
    source_dist = distance(distance_mapping[route_a['SourceAirport']],
                           distance_mapping[route_b['SourceAirport']]).km
    dest_dist = distance(distance_mapping[route_a['DestinationAirport']],
                         distance_mapping[route_b['DestinationAirport']]).km
    return source_dist <= 100 and dest_dist <= 100

"""
Computed the overlap (competition) score between two airlines.
It takes not only nodes into consideration but also the edges.
So, if Airline A goes from Bucharest to Zurich and Airline B
from Geneva to Zurich, based on this fact only, they are not
competitors.
"""
def compute_overlap_score(airline_x, airline_y, absolute = False):
    routes_x = merged_routes[merged_routes.Name == airline_x]
    if airline_x == airline_y:
        if absolute:
            return len(routes_x)
        else:
            return 1
    routes_y = merged_routes[merged_routes.Name == airline_y]
    
    score = 0
    for i, row_x in routes_x.iterrows():
        does_overlap = False
        joined_routes = routes_y[(routes_y.SourceCountry == row_x['SourceCountry']) &
                                 (routes_y.DestinationCountry == row_x['DestinationCountry'])]
        for j, row_y in joined_routes.iterrows():
            if overlaps(row_x, row_y):
                does_overlap = True
        if does_overlap:
            score += 1
    if absolute:
        return score
    else:
        return score / len(routes_x)

compute_overlap_score("Ryanair", "Wizz Air", True)


Out[144]:
114

In [145]:
"""
Computed the overlap (competition) score between given airlines.
It takes not only nodes into consideration but also the edges.
So, if Airline A goes from Bucharest to Zurich and Airline B
from Geneva to Zurich, based on this fact only, they are not
competitors.
"""
def compute_overlap_scores(airline_list, absolute = False):
    scores = []
    for airline_a in airline_list:
        for airline_b in airline_list:
            scores.append([airline_a, airline_b, compute_overlap_score(airline_a, airline_b, absolute)])
    return pd.DataFrame(scores, columns = ['AirlineA', 'AirlineB', 'Score'])

In [146]:
overlap_scores = compute_overlap_scores(["Ryanair", "Wizz Air"], True)
sns.heatmap(overlap_scores.pivot("AirlineA", "AirlineB", "Score"), cmap="YlGnBu")


Out[146]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ff51048>

In [147]:
Low_cost = ['Ryanair', 'easyJet', 'Wizz Air', 'TUIfly', 'Jet2.com']
overlap_scores = compute_overlap_scores(Low_cost, True)
sns.heatmap(overlap_scores.pivot("AirlineA", "AirlineB", "Score"), cmap="YlGnBu", vmax=1500)


Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a27bef860>

In [148]:
plt.figure(figsize=(15, 20))

palette = sns.color_palette(palette='Set1', n_colors=30).as_hex()
m=Basemap(llcrnrlon=-30,urcrnrlon=50,llcrnrlat=10,urcrnrlat=70)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

legend = []
for i, cheap in enumerate(Low_cost):
    Cheap = create_airline_network(cheap)
    draw_local_network(Cheap, edge_color=palette[i], node_color=palette[i], alpha=0.5)
    legend.append(mpatches.Patch(color=palette[i], label=cheap))
plt.legend(handles=legend)


Out[148]:
<matplotlib.legend.Legend at 0x1a27e28f98>

In [149]:
Big_airlines = ['Lufthansa', 'KLM Royal Dutch Airlines', 'Air France', 'British Airways']
overlap_scores = compute_overlap_scores(Big_airlines, True)
sns.heatmap(overlap_scores.pivot("AirlineA", "AirlineB", "Score"), cmap="YlGnBu")


Out[149]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a26960b70>

In [150]:
plt.figure(figsize=(15, 20))

palette = sns.color_palette(palette='Set1', n_colors=30).as_hex()
m=Basemap(llcrnrlon=-180,urcrnrlon=180,llcrnrlat=-50,urcrnrlat=90)
m.drawmapboundary(linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

legend = []
for i, cheap in enumerate(Big_airlines):
    Cheap = create_airline_network(cheap)
    draw_local_network(Cheap, edge_color=palette[i], node_color=palette[i], alpha=0.5)
    legend.append(mpatches.Patch(color=palette[i], label=cheap))
plt.legend(handles=legend)


Out[150]:
<matplotlib.legend.Legend at 0x1a2639f940>

In [151]:
overlap_scores = compute_overlap_scores(["Ryanair", "Jet2.com", "easyJet", "Lufthansa", "KLM Royal Dutch Airlines", "Air France", "British Airways"], True)
sns.heatmap(overlap_scores.pivot("AirlineA", "AirlineB", "Score"), cmap="YlGnBu", vmax=1000)


Out[151]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a273a9cc0>

In [152]:
"""
Completion score of airline A and airline B is defined as how well the second airline
completes the first one, in the sense that one person taking airline A from
airport X to airport Y can take airline B form Y to airport Z, but cannot take airline
A.

Since we will be focusing on networks with similar structure, we won't scale this score,
but rather, the score will be: the total number of new destiation a traveler can get to
starting with airline A and then taking airline A (in the sense defined above, that is,
so that the second route does not exist for airline A). One can scale that number to the
total number of destination of the airline, so that we get comparable, scaled scores.
"""
def compute_completion_score(airline_a, airline_b):
    routes_a = merged_routes[merged_routes['Name'] == airline_a]
    destinations_a = routes_a.DestinationAirport
    routes_b = merged_routes[merged_routes['Name'] == airline_b]
    # Routes where B completes A
    completion_routes =(routes_a.merge(routes_b, left_on='DestinationAirport', right_on='SourceAirport'))
    # Filter the routes not accesible by airline A
    strict_completion_routes = completion_routes[~completion_routes['DestinationAirport_y'].isin(destinations_a)]
    return strict_completion_routes['DestinationAirport_y'].size

def compute_completion_scores(airline_list):
    scores = []
    for airline_a in airline_list:
        for airline_b in airline_list:
            scores.append([airline_a, airline_b, compute_completion_score(airline_a, airline_b)])
    return pd.DataFrame(scores, columns = ['AirlineA', 'AirlineB', 'Score'])
    
compute_completion_score("Ryanair", "Wizz Air")
compute_completion_scores(["Ryanair", "Wizz Air"])


Out[152]:
AirlineA AirlineB Score
0 Ryanair Ryanair 0
1 Ryanair Wizz Air 3020
2 Wizz Air Ryanair 4076
3 Wizz Air Wizz Air 0

We can see the protention corporation bewteen top airlines before scalling in the following graph. Based on the graph, we highly recommand the corporation betwwen Qantas and Emirates. We find that indeed, these two compaines have already become partnerships.


In [153]:
completion_scores = compute_completion_scores(Best_airlines)
sns.heatmap(completion_scores.pivot("AirlineA", "AirlineB", "Score"), cmap="YlGnBu")


Out[153]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2750cf98>

After scalling, we find more interesting results. Like Qatar Airways could also protentially collaborate Singapore Airlines.


In [154]:
completion_scores = compute_completion_scores(Best_airlines)
sns.heatmap(completion_scores.pivot("AirlineA", "AirlineB", "Score"), cmap="YlGnBu", vmax=600)


Out[154]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a276dbc18>