In [1]:
# date lib
from datetime import datetime
# Pandas and NumPy
import pandas as pd
import numpy as np
In [2]:
# 1. Flight delay: any flight with (real_departure - planned_departure >= 15 minutes)
# 2. The Brazilian Federal Agency for Civil Aviation (ANAC) does not define exactly what is a "flight delay" (in minutes)
# 3. Anyway, the ANAC has a resolution for this subject: https://goo.gl/YBwbMy (last access: nov, 15th, 2017)
# ---
# DELAY, for this analysis, is defined as greater than 15 minutes (local flights only)
DELAY = 15
A note about date columns on this dataset
The regex:
In [3]:
#[flights] dataset_01 => all "Active Regular Flights" from 2017, from january to september
#source: http://www.anac.gov.br/assuntos/dados-e-estatisticas/historico-de-voos
#Last access this website: nov, 14th, 2017
flights = pd.read_csv('data/arf2017ISO.csv', sep = ';', parse_dates = True)
In [59]:
flights.head()
Out[59]:
In [5]:
# Airports four-letters-code from object to string
flights['airportA'] = flights['airportA'].astype('str')
flights['airportB'] = flights['airportB'].astype('str')
In [6]:
flights.info()
Source: https://goo.gl/mNFuPt (a XLS spreadsheet in portuguese; last access on nov, 15th, 2017)
In [7]:
# Airports dataset: all brazilian public airports (updated until october, 2017)
airports = pd.read_csv('data/brazilianPublicAirports-out2017.csv', sep = ';')
In [34]:
airports.info()
In [60]:
airports.head()
Out[60]:
I have found two lists that define two-letter codes used by the aircraft crew to justify the delay of the flights: a short and a long one.
Source: https://goo.gl/vUC8BX (last access: nov, 15th, 2017)
In [9]:
# ------------------------------------------------------------------
# List of codes (two letters) used to justify a delay on the flight
# - delayCodesShortlist.csv: list with YYY codes
# - delayCodesLongList.csv: list with XXX codes
# ------------------------------------------------------------------
delaycodes = pd.read_csv('data/delayCodesShortlist.csv', sep = ';')
delaycodesLongList = pd.read_csv('data/delayCodesLonglist.csv', sep = ';')
In [10]:
delaycodes.head()
Out[10]:
In [11]:
# Transform columns with date/time from 'object' to date format
flights['departureest'] = pd.to_datetime(flights['departureest'],errors='ignore')
flights['departurereal'] = pd.to_datetime(flights['departurereal'],errors='ignore')
flights['arrivalest'] = pd.to_datetime(flights['arrivalest'],errors='ignore')
flights['arrivalreal'] = pd.to_datetime(flights['arrivalreal'],errors='ignore')
In [61]:
# Merge "flights" dataset with "airports" in order to identify
# local flights (origin and destination are in Brazil)
flights = pd.merge(flights, airports, left_on="airportA", right_on="airport", how='left')
In [63]:
flights = pd.merge(flights, airports, left_on="airportB", right_on="airport", how='left')
In [66]:
flights.head()
Out[66]:
In [22]:
dfFlights = flights[['airportA']]
In [23]:
dfAirports = airports[['airport']]
In [27]:
dfFlights.info()
In [28]:
dfAirports.info()
In [25]:
display('dfFlights', 'dfAirports', pd.merge(dfFlights, dfAirports, left_on="airportA", right_on="airport"))
In [35]:
# Not working:
# pd.merge('dfFlights', 'dfAirports', pd.merge(dfFlights, dfAirports, left_on='airportA', right_on='airport'))
# ------------------------
In [40]:
# A more easy attempt
df1 = pd.DataFrame({'airportA': ['SBBH', 'SBCG', 'SBKP', 'KMIA', 'HONO'],
'airportB': ['SBBR', 'SBCT', 'SBAM', 'GERM', 'ENGL']},
columns=['airportA', 'airportB'])
df2 = pd.DataFrame({'airport': ['SBBH', 'SBCG', 'SBKP', 'SBBR', 'SBCT', 'SBAM'],
'state': ['MG', 'SP', 'SP', 'DF', 'PR', 'AM']},
columns=['airport', 'state'])
In [41]:
df1
Out[41]:
In [42]:
df2
Out[42]:
In [45]:
display('df1', 'df2', pd.merge(df1, df2, left_on="airportA", right_on="airport", how='left'))
In [46]:
df3 = pd.merge(df1, df2, left_on="airportA", right_on="airport", how='left')
In [47]:
df3
Out[47]:
In [48]:
display('df3', 'df2', pd.merge(df3, df2, left_on="airportB", right_on="airport", how='left'))
In [49]:
sampleFlights = pd.read_csv('data/sampleFlights.csv', sep = ';', parse_dates = True)
sampleAirports = pd.read_csv('data/sampleAirports.csv', sep = ';', parse_dates = True)
In [50]:
sampleFlights.head()
Out[50]:
In [51]:
sampleAirports.head()
Out[51]:
In [52]:
display('sampleFlights', 'sampleAirports',
pd.merge(sampleFlights, sampleAirports, left_on="airportA", right_on="airport", how='left'))
In [53]:
sampleMergeTemp = pd.merge(sampleFlights, sampleAirports, left_on="airportA", right_on="airport", how='left')
In [54]:
sampleMergeTemp
Out[54]:
In [56]:
display('sampleMergeTemp', 'sampleAirports',
pd.merge(sampleMergeTemp, sampleAirports, left_on="airportB", right_on="airport", how='left'))
In [57]:
sampleMergeFinal = pd.merge(sampleMergeTemp, sampleAirports, left_on="airportB", right_on="airport", how='left')
In [58]:
sampleMergeFinal
Out[58]:
In [ ]: