In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
In [3]:
# Display and plotting configuration.
# Use the fully qualified option names: the abbreviated forms
# ('max_columns', 'max_rows') are deprecated and removed in recent pandas.
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 50)
mpl.rcParams['figure.figsize'] = 10, 5
plt.style.use('ggplot')
In [4]:
# Load the NTSB events table ('|'-separated export; low_memory=False avoids
# chunked dtype guessing on this mixed-type file).
events = pd.read_csv("./data/events.csv", sep='|', low_memory=False)
events.head()
Out[4]:
In [5]:
# Parse the event date and time into a single datetime column.
date_format = "%m/%d/%Y %I:%M:%S %p"
date = events['ev_date'].dropna()
date = pd.to_datetime(date, format=date_format)

# ev_time is stored as HHMM (sometimes as a float string, e.g. '930.0');
# normalize to a zero-padded 4-digit string before parsing.
time = events['ev_time'].astype(str).str.split('.').str[0].str.zfill(4)
time = pd.to_datetime(time, format="%H%M", errors='coerce')
# BUG FIX: the original filled NaT with the *string* '0000', which converts
# the series to object dtype and breaks the .dt accessor below.  Fill with a
# real timestamp so unparseable times become midnight (00:00:00).
time = time.fillna(pd.Timestamp('1900-01-01 00:00:00'))

str_date = date.dt.date.astype(str)
str_time = time.dt.time.astype(str)
date_time = pd.to_datetime(str_date + ' ' + str_time,
                           format="%Y-%m-%d %H:%M:%S")
events['ev_date'] = date_time
What we don't really need...
In [6]:
events.drop(['ev_dow', 'ev_time', 'ev_tmzn', 'ev_year', 'ev_month'], axis=1, inplace=True)
Dropping events previous to 1982 (they shouldn't be here!)
In [7]:
# The database starts in 1982; earlier dates are spurious records.
cond = events['ev_date'].dt.year < 1982
print('Dates before 1982: ', cond.sum())
events = events[~cond]
In [8]:
# Group events by type and list the distinct ev_type codes present.
gby_ev_type = events.groupby('ev_type')
gby_ev_type.groups.keys()
Out[8]:
In [9]:
# Keep only accidents ('ACC') and count how many remain.
events = gby_ev_type.get_group('ACC')
events['ev_id'].count()
Out[9]:
In [10]:
# Aircraft table: ';'-separated, Latin-1 encoded.
aircraft = pd.read_csv("./data/aircraft.csv", sep=';', encoding="ISO-8859-1")
aircraft.head(2)
Out[10]:
Aircraft are classified according to different criteria (model, engine model, far_part, etc.) in the NTSB database.
For this talk we are going to focus on revenue commercial Air Transport. In order to do this, we are going to filter the data according to the FAA (Federal Aviation Administration) criteria:
Code of Federal Regulations (CFR) title 14:
Part 121: Large transport aircraft engaged in revenue operations involving the transport of both passengers and cargo.
Non US and commercial
Part 135: Both scheduled (primarily passenger service) carriers flying aircraft with fewer than 10 passenger seats and on-demand passenger or cargo services using either fixed-wing airplanes or helicopters. On-demand passenger services include air taxi, air medical, and certain air tour operations.
In [11]:
# Keep only aircraft records belonging to the selected accident events.
is_accident = aircraft['ev_id'].isin(events['ev_id'])
aircraft = aircraft[is_accident]
In [12]:
# Distinct FAR parts (regulatory categories) among accident aircraft.
gby_far_part = aircraft.groupby('far_part')
gby_far_part.groups.keys()
Out[12]:
In [13]:
# FAR parts of interest: revenue commercial air transport.
desired_far_parts = ['NUSC', # Non-U.S. Commercial
'121'] # Air Carrier
In [14]:
# Restrict aircraft to the desired FAR parts and inspect the categories left.
in_desired = aircraft['far_part'].isin(desired_far_parts)
aircraft = aircraft[in_desired]
aircraft['acft_category'].value_counts()
Out[14]:
In [15]:
# Keep fixed-wing airplanes only ('AIR' category).
is_airplane = aircraft['acft_category'] == 'AIR'
aircraft = aircraft[is_airplane]
aircraft.head(2)
Out[15]:
In [16]:
# Unique event ids that survived the aircraft-level filters; restrict the
# events table to those.
ev_ids_for_desired_criteria = aircraft['ev_id'].drop_duplicates()
print('Number of events: ', ev_ids_for_desired_criteria.count())
mask = events['ev_id'].isin(ev_ids_for_desired_criteria)
events = events[mask]
In [17]:
events.info(memory_usage='deep', max_cols=0)
In [18]:
aircraft.info(memory_usage='deep', max_cols=0)
In [19]:
aircraft['far_part'].value_counts()
Out[19]:
From now on, events
and aircraft
only contain accident data from flights corresponding to FAR parts in desired_far_parts.
In [20]:
events[['ev_city', 'ev_state', 'ev_country', 'latitude', 'longitude']].head(10)
Out[20]:
In [21]:
def _dms_to_decimal(degs, mins, secs, sign):
    """Combine degrees/minutes/seconds into signed decimal degrees."""
    return sign * (degs + mins / 60 + secs / 3600)


def convert_lat(string):
    """Convert a latitude string 'DDMMSS[NS]' to signed decimal degrees.

    Southern latitudes are negative.  Raises ValueError if the hemisphere
    letter (7th character) is not 'N'/'S' (case-insensitive).
    """
    degs = float(string[0:2])
    mins = float(string[2:4])
    secs = float(string[4:6])
    last = string[6].lower()
    if last == 's':
        factor = -1.0
    elif last == 'n':
        factor = 1.0
    else:
        raise ValueError("invalid hemisphere")
    return _dms_to_decimal(degs, mins, secs, factor)


def convert_lon(string):
    """Convert a longitude string 'DDDMMSS[EW]' to signed decimal degrees.

    Western longitudes are negative.  Raises ValueError if the direction
    letter (8th character) is not 'E'/'W' (case-insensitive).
    """
    degs = float(string[0:3])
    mins = float(string[3:5])
    secs = float(string[5:7])
    last = string[7].lower()
    if last == 'w':
        factor = -1.0
    elif last == 'e':
        factor = 1.0
    else:
        raise ValueError("invalid direction")
    return _dms_to_decimal(degs, mins, secs, factor)
In [22]:
# Parse latitude ('DDMMSS[NS]') and longitude ('DDDMMSS[EW]') strings into
# numeric degrees; blank strings are treated as missing.
events['latitude'] = events['latitude'].replace(' ', np.nan)
# FIX: the original called dropna(inplace=True) on a column view of `events`,
# which triggers SettingWithCopyWarning; assign the dropped result instead.
lat = events['latitude'].dropna()
well_formed = lat.str.contains(r'^[0-9]{6}[NnSs]$')
events['latitude_num'] = lat[well_formed].apply(convert_lat)

events['longitude'] = events['longitude'].replace(' ', np.nan)
lon = events['longitude'].dropna()
well_formed = lon.str.contains(r'^[0-9]{7}[EeWw]$')
events['longitude_num'] = lon[well_formed].apply(convert_lon)
events[['longitude_num', 'latitude_num']].head()
Out[22]:
Due to the limitation on the daily number of requests, we will load data that we have requested previously.
In [23]:
# Events still lacking numeric coordinates after parsing.
cond = events['longitude_num'].isna() | events['latitude_num'].isna()
print("No lat lon:", events['ev_id'][cond].count())
In [24]:
# Slice the rows that still need geocoding, keeping only the location fields.
location_cols = ['ev_city', 'ev_country', 'ev_state', 'latitude_num', 'longitude_num']
need_location = events.loc[cond, location_cols]
need_location.head()
Out[24]:
In [25]:
need_location.count()
Out[25]:
We use need_location.csv
and generate ---> have_location_part1.csv
& have_location_part2.csv
In [26]:
# Previously geocoded locations (split in two files due to API rate limits).
# index_col=0 is more robust than naming the column 'Unnamed: 0'.
have_location1 = pd.read_csv('./data/have_location_part1.csv', index_col=0)
have_location2 = pd.read_csv('./data/have_location_part2.csv', index_col=0)
have_location = pd.concat([have_location1, have_location2], axis=0)
# Fill the missing coordinates, aligning on the original event index.
events.loc[need_location.index, ['latitude_num']] = have_location['latitude']
events.loc[need_location.index, ['longitude_num']] = have_location['longitude']
events.loc[need_location.index, ['ev_city', 'ev_country', 'ev_state', 'latitude_num', 'longitude_num']].head()
Out[26]:
In [27]:
# Re-check how many events still lack coordinates after the merge.
cond = events['longitude_num'].isna() | events['latitude_num'].isna()
print("No lat lon:", events['ev_id'][cond].count())
In [28]:
# Plain numpy arrays of the coordinates for the Basemap scatter plots below.
lon_ = events['longitude_num'].values
lat_ = events['latitude_num'].values
In [29]:
# World map of accident locations (Miller cylindrical projection).
from mpl_toolkits.basemap import Basemap
fig=plt.figure(figsize=(15, 15))
m1 = Basemap('mill',lon_0=0, lat_0=0)
m1.bluemarble()
m1.drawcoastlines()
# latlon=True: the inputs are lon/lat degrees and Basemap projects them.
m1.scatter(lon_, lat_, latlon=True, marker='.', color='r')
Out[29]:
In [30]:
# OpenFlights airports database: keep name, city, country and coordinates.
header = ['name', 'city', 'country', 'lat', 'lon']
airports = pd.read_csv('./data/openflights/airports.dat',
                       usecols=(1, 2, 3, 6, 7), names=header)
airports.head()
Out[30]:
In [31]:
# Side-by-side comparison: airport locations vs accident locations.
fig, ax = plt.subplots(2, 1, sharex=True, sharey=True)
fig.set_size_inches(15, 15)

ax[0].set_title('airports')
m1 = Basemap('mill', lon_0=0, lat_0=0, ax=ax[0])
m1.drawcoastlines()
m1.bluemarble()
# Project lon/lat degrees into map (projection) coordinates for this panel.
x, y = m1(airports['lon'], airports['lat'])
# TODO: tweak marker size/colour so the airports stand out better.
m1.scatter(x, y, latlon=False, marker='.', color='g')

ax[1].set_title('accidents')
m2 = Basemap('mill', lon_0=0, lat_0=0, ax=ax[1])
m2.drawcoastlines()
m2.bluemarble()
# BUG FIX: the original passed raw lon/lat degrees with latlon=False, so the
# accident points were drawn in projection units (clustered near the origin)
# instead of being projected.  Let Basemap project them, as in the first map.
m2.scatter(lon_, lat_, latlon=True, marker='.', color='r')
Out[31]:
In [32]:
# Map detailed phase-of-flight codes (5xx) onto their parent category
# ((code // 10) * 10, e.g. 561..569 -> 560 APPROACH).
# NOTE(review): 567 appears twice in this list -- harmless for replace(),
# but probably a typo.
detailed_phase = [501,502,503,504,505,511,512,513,514,
521,522,523,531,541,551,552,553,
561,562,563,564,565,566,567,567,568,
569,571,572,573,574,575,576,581,582,583,
591,592, 542]
# Every code except the last (542) maps to (code // 10) * 10; 542 is then
# explicitly mapped to 580.  NOTE(review): that sends 542 to MANEUVERING
# rather than 540 CRUISE -- confirm against the NTSB coding manual.
general_phase = [(ii//10)*10 for ii in detailed_phase[:-1]]
general_phase.append(580)
aircraft['phase_gen'] = aircraft['phase_flt_spec'].replace(to_replace=detailed_phase,
value=general_phase)
occurrences_series = aircraft['phase_gen'].value_counts()
# Only the ten most common
occurrences_series.iloc[0:10]
Out[32]:
In [33]:
# Human-readable names for the general phase-of-flight codes.
phases_dict = {500:'STANDING',
               510:'TAXI',
               520:'TAKEOFF',
               530:'CLIMB',
               540:'CRUISE',
               550:'DESCENT',
               560:'APPROACH',
               570:'LANDING',
               580:'MANEUVERING',
               600:'OTHER',
               610:'UNKNOWN'}
# Relabel the numeric index with phase names.  Series.rename(mapping) is the
# supported way to do this; rename_axis(mapping) was deprecated and later
# removed for relabeling use.
occurrences_series = occurrences_series.rename(phases_dict)
occurrences_series.plot.barh(stacked=True)
plt.xlabel('number of accidents')
Out[33]:
In [34]:
aircraft.damage.value_counts()
Out[34]:
In [35]:
# Build an (initially empty) table: rows = phase of flight, columns =
# damage levels plus injury totals; filled in by the loop in the next cell.
group = aircraft.groupby(['phase_gen', 'damage'])
phases_list = phases_dict.keys()
damage_list = ['NONE', 'SUBS', 'DEST', 'MINR', 'UNK']
injured = ['injured', 'fatalities']
phases = pd.DataFrame(columns=damage_list+injured)
phases
Out[35]:
In [36]:
# For every phase of flight, count aircraft per damage level and accumulate
# total injured / fatalities over the matching events.
for phase in phases_list:
    sum_inj = 0
    sum_fat = 0
    for dam in damage_list:
        try:
            gg = group.get_group((phase, dam))
            phases.loc[phase, [dam]] = gg['ev_id'].count()
            # Events with at least one aircraft in this (phase, damage) group.
            mask = events['ev_id'].isin(gg['ev_id'])
            inj_m = events[mask]['inj_tot_m'].sum()  # minor injuries
            inj_s = events[mask]['inj_tot_s'].sum()  # serious injuries
            fat = events[mask]['inj_tot_f'].sum()    # fatalities
            # Guard: .sum() can yield NaN on all-NaN slices in older pandas.
            if not np.isnan(inj_m):
                sum_inj += inj_m
            if not np.isnan(inj_s):
                sum_inj += inj_s
            if not np.isnan(fat):
                sum_fat += fat
        except KeyError:
            # No aircraft for this (phase, damage) combination.
            pass
    phases.loc[phase, ['injured']] = sum_inj
    phases.loc[phase, ['fatalities']] = sum_fat
# Relabel the numeric phase codes with their names.  DataFrame.rename(index=...)
# replaces the deprecated/removed rename_axis(mapping) usage.
phases = phases.rename(index=phases_dict)
phases
Out[36]:
In [37]:
# Total accidents per phase (sum over damage levels), sorted descending.
damage_cols = ['NONE', 'MINR', 'SUBS', 'DEST']
phases['TOTAL'] = phases[damage_cols].sum(axis=1)
phases = phases.sort_values(by='TOTAL', ascending=False)
phases[damage_cols].plot.barh(stacked=True)
In [38]:
phases[['injured', 'fatalities']].plot.barh()
Out[38]:
In [39]:
# Same chart, but ranked by fatalities instead of accident count.
phases = phases.sort_values(by='fatalities', ascending=False)
phases[['injured', 'fatalities']].plot.barh()
Out[39]:
In [40]:
# Occurrence records, restricted to the selected accident events.
occurrences = pd.read_csv("./data/ocurrences.csv", sep=';')
in_events = occurrences['ev_id'].isin(events['ev_id'])
occurrences = occurrences[in_events]
In [41]:
# Example: all occurrence records for a single accident.
# ocurrences_filtered['ev_id'].value_counts()
occurrences[occurrences['ev_id']=='20001214X36685']
Out[41]:
In [42]:
events[events['ev_id']=='20001214X36685']
Out[42]:
In [43]:
# Embed the official NTSB accident brief for this event (external iframe;
# needs network access to render).
from IPython.display import HTML
HTML('<iframe src="http://www.ntsb.gov/_layouts/ntsb.aviation/brief.aspx?ev_id=20001214X36685&key=2&queryId=9811677a-e5d5-43d4-8d8e-89599d9d47a4&pgno=3&pgsize=50"\
width="700" height="400"></iframe>')
Out[43]:
In [44]:
# Collapse detailed occurrence codes into parent categories.
# NOTE(review): 351 appears twice in to_replace (paired first with 520, then
# with 350), and 351/352 map to 520 ('DESCENT') while 353/354 map to 350
# ('LOSS OF ENGINE POWER') -- this looks inconsistent; confirm against the
# NTSB occurrence-code manual.
occurrences['Occurrence_Code'] = occurrences['Occurrence_Code'].replace(
to_replace=[131,171,172,191,192,193,195,232,271,351,352, 353,351,354],
value=[130,170,170,190,190,190,190,230,270,520,520,350,350,350])
occurrences_series_2 = occurrences['Occurrence_Code'].value_counts()
# Keep only the 13 most frequent occurrence categories.
occurrences_series_2 = occurrences_series_2[0:13]
In [45]:
# Human-readable names for the collapsed occurrence codes.
occurrences_dict = {130:'AIRFRAME/COMPONENT/SYSTEM FAILURE/MALFUNCTION',
                    170:'FIRE/EXPLOSION',
                    180:'FORCED LANDING',
                    190:'GEAR COLLAPSED',
                    200:'HARD LANDING',
                    220:'IN FLIGHT COLLISION WITH OBJECT',
                    230:'IN FLIGHT COLLISION WITH TERRAIN/WATER',
                    240:'IN FLIGHT ENCOUNTER WITH WEATHER',
                    250:'LOSS OF CONTROL - IN FLIGHT',
                    260:'LOSS OF CONTROL - ON GROUND/WATER',
                    270:'MIDAIR COLLISION',
                    280:'NEAR COLLISION BETWEEN AIRCRAFT',
                    300:'NOSE OVER',
                    310:'ON GROUND/WATER COLLISION WITH OBJECT',
                    320:'ON GROUND/WATER ENCOUNTER WITH TERRAIN/WATER',
                    340:'OVERRUN',
                    350:'LOSS OF ENGINE POWER',
                    430:'MISCELLANEOUS/OTHER',
                    520:'DESCENT'}
# Relabel the numeric index with category names.  Series.rename(mapping) is
# the supported way; rename_axis(mapping) was deprecated/removed for this.
occurrences_series_2 = occurrences_series_2.rename(occurrences_dict)
In [46]:
# Most frequent occurrence categories among the selected accidents.
occurrences_series_2.plot.barh(stacked=True)
plt.xlabel('number of accidents')
Out[46]:
In [47]:
# Flight-crew records (one row per crew member per event).
flight_crew = pd.read_csv('./data/flight_crew.csv')
# Keep only crew involved in the selected events.
flight_crew = flight_crew[flight_crew['ev_id'].isin(events['ev_id'])]
flight_crew.head(5)
Out[47]:
In [48]:
# crew_category comes padded with spaces; normalize before comparing.
flight_crew['crew_category'] = flight_crew['crew_category'].str.strip()
# Restrict to pilots and co-pilots.
crew_cat = ['PLT', 'CPLT']
is_plt_cplt = flight_crew['crew_category'].isin(crew_cat)
flight_crew_pc = flight_crew[is_plt_cplt]
flight_crew_pc['crew_age'].describe()
Out[48]:
In [49]:
# Box plot of pilot/co-pilot ages.
flight_crew_pc['crew_age'].plot.box()
plt.xticks([1],['crew age'])
Out[49]:
In [50]:
# Age distribution in 5-year bins (bins reused for the rate plot below).
bins = np.arange(14, 85, 5)
flight_crew_pc['crew_age'].hist(bins=bins)
plt.xlabel('crew age')
Out[50]:
In [51]:
# Airline-pilot license counts per 5-year age bin (same bins as above).
# NOTE(review): the source of these counts is not given in the notebook --
# presumably FAA airmen statistics; confirm.
licenses_by_age_airline = np.array([0, 572, 5199, 12003, 15507,
18337, 23058, 25882, 24220, 16824,
10184, 4284, 1766, 787])
age_group = pd.cut(flight_crew_pc['crew_age'], bins)
gby_age = flight_crew_pc['ev_id'].groupby(age_group).count()
# Accidents per license in each age bin.
# NOTE(review): the first bin has 0 licenses, so this division yields
# inf/NaN there -- confirm that is acceptable for the plot.
accident_rate_age = gby_age / licenses_by_age_airline
accident_rate_age.plot.bar()
plt.ylabel('accidents / licenses')
plt.xlabel('crew age')
It seems that some people have investigated this issue before:
[...] the accident rate of airline transport rated (ATR) pilots aged 55–59 (3.78/1,000) is approximately one-third of that of pilots with the same rating who are aged 20–24 (11.71/10,000). John A. Wise, V. David Hopkin, Daniel J. Garland, **Handbook of Aviation Human Factors**, Second Edition.
In [52]:
# Flight-hours records for the selected events.
flight_time = pd.read_csv('./data/flight_time.csv', sep=';', usecols=(0,1,2,3,4,5))
cond = flight_time['ev_id'].isin(events['ev_id'])
flight_time = flight_time[cond]
# Keep only the first word of each label (raw values carry trailing text).
flight_time['flight_type'] = flight_time['flight_type'].str.split().str[0]
flight_time['flight_craft'] = flight_time['flight_craft'].str.split().str[0]
In [53]:
# Distinct flight types present (groupby keys are sorted, NaN excluded).
flight_type = list(flight_time.groupby('flight_type').groups)
print(flight_type)
In [54]:
# Distinct aircraft classes present (groupby keys are sorted, NaN excluded).
flight_craft = list(flight_time.groupby('flight_craft').groups)
print(flight_craft)
In [55]:
gby_flight_time = flight_time.groupby(['flight_craft', 'flight_type'])
In [56]:
from ipywidgets import interact
In [57]:
def my_widget_function(fcraft, ftype, factor):
    """Show flight-hour statistics for one (aircraft class, flight type) group.

    Parameters
    ----------
    fcraft, ftype : keys into ``gby_flight_time``.
    factor : float in [0, 1]; only hours below ``factor * max`` are plotted,
        letting the user clip the long right tail interactively.
    """
    flight_h = gby_flight_time.get_group((fcraft, ftype))['flight_hours']
    print(flight_h.describe())
    # Clip the distribution at a fraction of its maximum for readability.
    # (The original also assigned an unused `max_v = flight_h`; removed.)
    cond = flight_h < factor * flight_h.max()
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    flight_h[cond].hist(ax=ax[0])
    # FIX: flight hours are on the histogram's x-axis (y is a count); the
    # original labeled the y-axis.
    ax[0].set_xlabel('flight hours')
    flight_h[cond].plot.box(ax=ax[1])
In [58]:
# Interactive exploration; the factor slider clips the upper tail.
# NOTE: widget output is not reproducible from a static re-run; no analysis
# result should depend on the slider state.
interact(my_widget_function,
fcraft=flight_craft,
ftype=flight_type,
factor=(0., 1., 0.05))
Is there a limit on flight hours per year?
In [60]:
# Per-year counts of events with recorded fatal/serious/minor injury data
# (2016 excluded: the year is incomplete).
ev_before_2016 = events[events['ev_date'].dt.year < 2016]
# FIX: group the filtered frame by its own year.  The original grouped the
# full `events` frame by the filtered year series; index alignment made the
# result identical, but only by accident -- this form is explicit.
gby_year = ev_before_2016.groupby(ev_before_2016['ev_date'].dt.year)
# NOTE(review): .count() counts non-null entries (events with data), not the
# summed number of injured people -- confirm against the plot labels below.
injured_per_year = gby_year[['inj_tot_f', 'inj_tot_s', 'inj_tot_m']].count()
injured_per_year.tail()
Out[60]:
In [61]:
# Stacked area of the yearly injury-data counts.
# NOTE(review): injured_per_year holds non-null counts (.count()), not summed
# injuries, yet the y-label says 'Number of passengers' -- confirm intent.
injured_per_year.plot.area(alpha=0.8, figsize=(11,5))
plt.xlabel('Year')
plt.ylabel('Number of passengers')
Out[61]:
In [62]:
# World Bank annual passengers carried: one row, one column per year.
passengers = pd.read_csv('./data/annual_passengers_carried/data.csv',
                         nrows=1, usecols=range(4, 60))
passengers = passengers.transpose()
# Single column of passenger counts.
passengers.columns = ['passengers']
# Column labels start with the year; parse it out.
passengers.index = pd.to_datetime(passengers.index.str[:4])
# Express the counts in millions of passengers.
passengers['passengers'] = pd.to_numeric(passengers['passengers'], errors='coerce') / 1e6
passengers.index = passengers.index.year
In [63]:
# World Bank annual worldwide departures: one row, one column per year.
flights = pd.read_csv('./data/annual_worldwide_departures/data.csv',
                      nrows=1, usecols=range(4, 59))
flights = flights.transpose()
# Single column of flight counts.  (A stray bare `flights` display mid-cell
# in the original did nothing; removed.)
flights.columns = ['flights']
# Column labels start with the year; parse it out.
flights.index = pd.to_datetime(flights.index.str[:4])
# Express the counts in millions of flights.
flights['flights'] = pd.to_numeric(flights['flights'], errors='coerce') / 1e6
# Use the parsed years directly (consistent with the passengers cell) rather
# than the original hard-coded np.arange(1960, 2015), which silently breaks
# if the file gains or loses a column.
flights.index = flights.index.year
In [64]:
# Two stacked panels: passengers carried and flights flown, in millions.
fig, (ax_pass, ax_fl) = plt.subplots(2, 1, figsize=(12, 6))
ax_pass.set_title('Millions of passengers transported')
passengers['passengers'].plot.area(ax=ax_pass, alpha=0.6, color="#0072B2")
ax_pass.set_xlim(1975, 2014)
ax_fl.set_title('Millions of flights')
flights['flights'].plot.area(ax=ax_fl, alpha=0.6)
ax_fl.set_xlim(1975, 2014)
plt.tight_layout()
In [65]:
# Fatality-data counts relative to passengers carried (millions), per year.
# NOTE(review): 'letality' is a misspelling of 'lethality' (kept, local name
# only), and the numerator is a count of events with fatality data, not the
# number of fatalities -- see the injured_per_year cell; confirm intent.
letality_rate = injured_per_year['inj_tot_f'] / passengers['passengers']
ax = letality_rate.plot.area(alpha=0.8, figsize=(11,5))
ax.set_xlim(1982, 2015)
ax.set_ylabel('fatalities rate')
Out[65]:
In [66]:
# Number of accidents per calendar year.
accidents_gby_year = events['ev_id'].groupby(events.ev_date.dt.year).count()
ax = accidents_gby_year.plot.area(alpha=0.8, figsize=(11,5))
ax.set_xlim(1982, 2015)
ax.set_ylabel('Number of accidents')
Out[66]:
In [67]:
# Accidents per million worldwide departures (indices align on year).
accident_rate = accidents_gby_year / flights['flights']
accident_rate.tail()
Out[67]:
In [68]:
# Accident rate over the full 1982-2015 period.
ax = accident_rate.plot.area(alpha=0.8, figsize=(11,5))
ax.set_xlim(1982, 2015)
ax.set_ylabel('accident rate')
In [69]:
# Zoom into 2000-2009 with point markers.
ax = accident_rate.plot(alpha=0.8, figsize=(11,5), marker='o')
ax.set_xlim(2000, 2009)
ax.set_ylabel('accident rate')
Out[69]:
In [1]:
# Notebook style: apply the custom CSS.
from IPython.core.display import HTML
css_file = './static/style.css'
# FIX: use a context manager so the file handle is closed deterministically
# (the original left the handle from open() dangling).
with open(css_file, "r") as f:
    styles = f.read()
HTML(styles)
Out[1]: