This exercise is trying to make some more sense out of the data collected at http://PlaneCrashInfo.com, which contains way too few charts for this topic. Using this notebook you can explore the data and add your own charts.
N.B. This is a work in progress… If you have any suggestions, please feel free to raise an issue or submit a pull request in the repo on GitHub!
In [1]:
%matplotlib inline
In [2]:
import pandas as pd
# Load the CSV at data/data.csv (path relative to the notebook's working
# directory) into the DataFrame `df` used by the cells below.
df = pd.read_csv('data/data.csv')
In [3]:
# Preview the first rows of the frame exactly as read from the CSV.
df.head()
Out[3]:
In [5]:
import planecrashinfo_light as pci
# Replace `df` with the result of the project helper clean_database().
# Later cells read df.index.year/.month/.day, so the returned frame is
# presumably indexed by accident date -- TODO confirm in planecrashinfo_light.
# NOTE(review): this rebinds `df` to its own cleaned version, so re-running
# the cell feeds already-cleaned data back in; confirm that is harmless.
df = pci.clean_database(df)
In [6]:
# Preview the first rows again, now after clean_database() has been applied.
df.head()
Out[6]:
In [7]:
# Number of rows (accidents) per calendar year of the index, as a line chart.
s = (
    df
    .groupby(df.index.year)
    .size()
)
s.plot(kind='line', title='#Accidents (total, by year)')
Out[7]:
In [8]:
# Running total of the yearly accident counts.
s = (
    df
    .groupby(df.index.year)
    .size()
    .cumsum()
)
s.plot(kind='line', title='#Accidents (cumulated, by year)')
Out[8]:
In [9]:
# Accident counts pooled over all years, grouped by month number of the index.
s = (
    df
    .groupby(df.index.month)
    .size()
)
s.plot(kind='bar', title='#Accidents (total, by month)')
Out[9]:
In [10]:
# Accident counts grouped by the day-of-month component of the index
# (index.day), not by weekday.
s = (
    df
    .groupby(df.index.day)
    .size()
)
s.plot(kind='bar', title='#Accidents (total, by day)')
Out[10]:
In [11]:
# Accident counts per decade: floor-dividing the year by 10 and multiplying
# back maps e.g. 1987 -> 1980.
s = (
    df
    .groupby(df.index.year // 10 * 10)
    .size()
)
s.plot(kind='bar', title='#Accidents (total, by decade)')
Out[11]:
In [12]:
# Accident counts pooled over all years, grouped by quarter of the index.
s = (
    df
    .groupby(df.index.quarter)
    .size()
)
s.plot(kind='bar', title='#Accidents (total, by quarter)')
Out[12]:
In [13]:
# Yearly sum of the 'Fatalities total' column, shown as a table.
df.groupby(df.index.year)['Fatalities total'].sum()
Out[13]:
In [14]:
# Same yearly 'Fatalities total' sum as above, drawn as a line chart.
(
    df
    .groupby(df.index.year)['Fatalities total']
    .sum()
    .plot(kind='line', title='#Fatalities (total)')
)
Out[14]:
In [15]:
# Yearly sum of the 'Ground' column, drawn with the default (line) plot.
(
    df
    .groupby(df.index.year)['Ground']
    .sum()
    .plot(title='#Ground fatalities (total)')
)
Out[15]:
In [16]:
# Yearly sums of every numeric column, drawn together in one line chart.
# The frame also carries text columns ('AC Type', 'Operator', 'Origin', ...).
# numeric_only=True keeps them out of the aggregation explicitly: older pandas
# dropped them silently, pandas >= 2.0 tries to "sum" the strings instead.
(
    df
    .groupby(df.index.year)
    .sum(numeric_only=True)
    .plot(title='#Fatalities')
)
Out[16]:
In [17]:
# The 20 aircraft types with the most rows; sorted ascending so the largest
# bar ends up at the top of the horizontal bar chart.
s = (
    df
    .groupby('AC Type')
    .size()
    .sort_values()
    .iloc[-20:]
)
s.plot(kind='barh', title='Top 20 aircraft with highest #accidents')
Out[17]:
In [18]:
def _ac_types_with_prefix(ac_types, prefix):
    """Return the entries of `ac_types` that are strings starting with `prefix`.

    Non-string entries (e.g. NaN for a missing 'AC Type') are skipped.
    """
    return [t for t in ac_types if isinstance(t, str) and t.startswith(prefix)]


# Distinct 'AC Type' values, computed once and shared by all four lookups.
_unique_ac_types = df['AC Type'].unique()

# Per-manufacturer lists of the distinct 'AC Type' strings.
doug = _ac_types_with_prefix(_unique_ac_types, 'Douglas')
anto = _ac_types_with_prefix(_unique_ac_types, 'Antonov')
airb = _ac_types_with_prefix(_unique_ac_types, 'Airbus')
boei = _ac_types_with_prefix(_unique_ac_types, 'Boeing')
In [19]:
# Display the distinct 'AC Type' strings that start with 'Airbus'.
airb
Out[19]:
In [20]:
# Accident counts per exact 'AC Type' string, restricted to the Airbus types.
s = (
    df
    .loc[df['AC Type'].isin(airb)]
    .groupby('AC Type')
    .size()
)
s.plot(kind='bar', title='#Accidents for Airbus Aircraft')
Out[20]:
In [21]:
import re
# Raw string so that \d and \- reach the regex engine untouched; the same
# pattern written as a plain string relies on invalid escape sequences, which
# current Python versions warn about.
# Matches 'Airbus', an optional space, then 'A' + optional '.'/'-' + 3 digits.
_AIRBUS_TYPE_RE = re.compile(r'Airbus ?(A[.\-]?\d{3})')


def extract_airbus_type(ac_type):
    """Extract main Airbus type from 'AC Type', e.g. 'A300' from 'Airbus A-300-605R'.

    Parameters
    ----------
    ac_type : object
        Value of an 'AC Type' cell. Anything that is not a string
        (e.g. NaN or None for a missing value) is treated as "no match".

    Returns
    -------
    str
        The model name with '-' and '.' removed (e.g. 'A320'), or '' when
        `ac_type` is not a string or contains no Airbus model.
    """
    if not isinstance(ac_type, str):
        return ''
    m = _AIRBUS_TYPE_RE.search(ac_type)
    if m is None:
        return ''
    # Normalise 'A-300' / 'A.300' to 'A300'.
    return m.group(1).replace('-', '').replace('.', '')
In [22]:
# Collapse the raw Airbus 'AC Type' strings to their main model names.
# extract_airbus_type() returns '' for strings it cannot parse; that sentinel
# is dropped here. The 'Airbus Model' column built in a later cell also uses
# '' for every row without a recognisable Airbus type, so keeping '' in this
# set would make an isin(airbus_models) filter match all of those rows.
airbus_models = {
    model
    for model in map(extract_airbus_type, airb)
    if model
}
print(airbus_models)
In [23]:
# Add an 'Airbus Model' column: the main model parsed from each 'AC Type'
# value, or '' where extract_airbus_type() finds none.
df['Airbus Model'] = df['AC Type'].map(extract_airbus_type)
In [24]:
# Accident counts per main Airbus model (rows whose model is in airbus_models).
s = (
    df
    .loc[df['Airbus Model'].isin(airbus_models)]
    .groupby('Airbus Model')
    .size()
)
s.plot(kind='bar', title='#Accidents for Airbus Aircraft by Model')
Out[24]:
In [25]:
# Airbus rows only, then accidents per year as a bar chart.
# s2 is reused further down for the manufacturer comparison.
df2 = df.loc[df['AC Type'].isin(airb)]
s2 = (
    df2
    .groupby(df2.index.year)
    .size()
)
s2.plot(kind='bar', title='#Accidents for Airbus Aircraft')
Out[25]:
In [26]:
# Douglas rows only, then accidents per year as a line chart.
# s1 is reused further down for the manufacturer comparison.
df1 = df.loc[df['AC Type'].isin(doug)]
s1 = (
    df1
    .groupby(df1.index.year)
    .size()
)
s1.plot(kind='line', title='#Accidents for Douglas Aircraft')
Out[26]:
In [27]:
# Antonov rows only, then accidents per year as a line chart.
# s3 is reused further down for the manufacturer comparison.
df3 = df.loc[df['AC Type'].isin(anto)]
s3 = (
    df3
    .groupby(df3.index.year)
    .size()
)
s3.plot(kind='line', title='#Accidents for Antonov Aircraft')
Out[27]:
In [28]:
# Boeing rows only, then accidents per year as a line chart.
# s4 is reused further down for the manufacturer comparison.
df4 = df.loc[df['AC Type'].isin(boei)]
s4 = (
    df4
    .groupby(df4.index.year)
    .size()
)
s4.plot(kind='line', title='#Accidents for Boeing Aircraft')
Out[28]:
In [29]:
# Put the four per-year series side by side (one column per manufacturer)
# and draw them in a single line chart.
dfab = pd.DataFrame({
    'Douglas': s1,
    'Airbus': s2,
    'Antonov': s3,
    'Boeing': s4,
})
dfab.plot(kind='line')
Out[29]:
In [30]:
# The 20 operators with the most rows; sorted ascending so the largest bar
# ends up at the top of the horizontal bar chart.
s = (
    df
    .groupby('Operator')
    .size()
    .sort_values()
    .iloc[-20:]
)
s.plot(kind='barh', title='Top 20 airlines with highest #accidents')
Out[30]:
In [31]:
# Alphabetically sorted list of the distinct 'Operator' strings containing
# 'Military'; non-string entries are skipped.
mil_ops = [
    op
    for op in df['Operator'].unique()
    if type(op) is str and 'Military' in op
]
mil_ops.sort()
In [32]:
# Rows whose operator is one of the military operators, counted per year.
# Dedicated names are used instead of df4/s4: those hold the Boeing frame and
# series that feed the 'Boeing' column of the manufacturer comparison, and
# overwriting them here would corrupt that chart if its cell is re-run.
df_mil = df[df['Operator'].isin(mil_ops)]
s_mil = df_mil.groupby(df_mil.index.year).size()
s_mil.plot.line(title='#Accidents for Military Operators (total, by year)')
Out[32]:
In [33]:
# All accidents per year once more, for comparison with the military-only
# chart directly above.
s = (
    df
    .groupby(df.index.year)
    .size()
)
s.plot(kind='line', title='#Accidents (total, by year)')
Out[33]:
In [34]:
# Row counts per 'Origin', ascending; the last 20 entries are the most
# frequent ones. len(s) is the number of distinct origins overall.
s = df.groupby('Origin').size().sort_values(ascending=True)
# Title typo fixed ("frequest" -> "frequent") to match the destinations chart.
s[-20:].plot.barh(title='Top 20 most frequent origins (of %d)' % len(s))
Out[34]:
In [35]:
# Row counts per 'Destination', ascending; the chart shows the last 20 and
# the title reports how many distinct destinations there are in total.
s = (
    df
    .groupby('Destination')
    .size()
    .sort_values()
)
s.iloc[-20:].plot(
    kind='barh',
    title='Top 20 most frequent destinations (of %d)' % len(s),
)
Out[35]:
In [36]:
# The 20 'Location Country' values with the most rows, largest bar on top.
s = (
    df
    .groupby('Location Country')
    .size()
    .sort_values()
    .iloc[-20:]
)
s.plot(kind='barh', title='Top 20 countries with highest #accidents')
Out[36]:
For the corresponding maps, see below.
In [37]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
def show_map(positions, title='', proj='mill', lat_0=0, lon_0=0):
    """Draw a world map and mark every given position with a small red dot.

    positions -- iterable of mappings; each one is read via its 'lon' and
                 'lat' keys
    title     -- text used as the figure title
    proj      -- projection name handed to Basemap
    lat_0     -- passed through to Basemap unchanged
    lon_0     -- passed through to Basemap unchanged
    """
    plt.figure(num=1, figsize=(20, 10))
    plt.title(title)

    # Named `world` rather than `map` so the builtin map() is not shadowed.
    world = Basemap(projection=proj, lat_0=lat_0, lon_0=lon_0, resolution='c')
    world.drawcoastlines()
    world.drawcountries()

    # Graticule: parallels every 30 degrees, meridians every 60 degrees
    # spanning the map's own longitude range.
    world.drawparallels(
        np.arange(-90, 90, 30),
        labels=[1, 0, 0, 0],
    )
    world.drawmeridians(
        np.arange(world.lonmin, world.lonmax + 30, 60),
        labels=[0, 0, 0, 1],
    )

    # Light blue for sea and lakes, light grey for land.
    world.drawmapboundary(fill_color='#aaddff')
    world.fillcontinents(color='#dddddd', lake_color='#aaddff')

    # One marker per position; latlon=True lets Basemap convert the
    # geographic coordinates to map coordinates.
    for spot in positions:
        world.plot(spot['lon'], spot['lat'], 'ro', markersize=3, latlon=True)

    plt.show()
In [38]:
import json
# Read the location lookup from data/geolocs.json. The context manager
# closes the file handle deterministically instead of leaving it to the
# garbage collector.
with open('data/geolocs.json', encoding='utf-8') as geolocs_file:
    locs = json.load(geolocs_file)

# Keep only the truthy values, i.e. drop locations stored without a position.
positions = [pos for pos in locs.values() if pos]

print('#locations found: %d' % len(locs))
print('#coordinates found: %d' % len(positions))
In [39]:
# World map in the 'mill' projection; the title reports how many of the
# known locations have coordinates.
title = 'Approx. Plane Crash Locations (%d out of %d)' % (
    len(positions),
    len(locs),
)
show_map(positions, title, 'mill')
In [40]:
# Same data in the 'ortho' projection, with lat_0=30 and lon_0=-40.
title = 'Approx. Plane Crash Locations (%d out of %d)' % (
    len(positions),
    len(locs),
)
show_map(positions, title, 'ortho', lat_0=30, lon_0=-40)