This exercise is trying to make some more sense out of the data collected at http://PlaneCrashInfo.com, which contains way too few charts for this topic. Using this notebook you can explore the data and add your own charts.
N.B. This is work in progress… If you have any suggestion, please feel free to raise an issue or provide a pull request in the repo on GitHub!
In [1]:
import pandas as pd
import re
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from mpl_toolkits.basemap import Basemap
import planecrashinfo_light as pci
%matplotlib inline
In [2]:
df = pd.read_csv('data/data.csv')
In [3]:
df.head()
Out[3]:
In [4]:
df = pci.clean_database(df)
In [5]:
df.head()
Out[5]:
In [6]:
s = df.groupby(df.index.year).size()
s.plot(title='#Accidents (total, by year)');
In [7]:
s = df.groupby(df.index.year).size().cumsum()
s.plot(title='#Accidents (cumulated, by year)');
In [8]:
s = df.groupby(df.index.month).size()
s.plot.bar(title='#Accidents (total, by month)');
In [9]:
s = df.groupby(df.index.weekday).size()
s.plot.bar(title='#Accidents (total, by day of week)')
plt.xticks(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']);
In [10]:
s = df.groupby(df.index.day).size()
s.plot.bar(title='#Accidents (total, by day)');
In [11]:
s = df.groupby((df.index.year // 10) * 10).size()
s.plot.bar(title='#Accidents (total, by decade)');
In [12]:
s = df.groupby((df.index.quarter)).size()
s.plot.bar(title='#Accidents (total, by quarter)')
plt.xticks(range(5), ['spring', 'summer', 'autumn', 'winter'], rotation='horizontal');
In [13]:
df.Fatalities_total.groupby(df.index.year).sum().plot(title='#Fatalities (total)');
In [14]:
s = df.Fatalities_total.groupby((df.index.year // 10) * 10).sum()
plt.bar(list(s.keys()), s.values, width=5)
plt.title('#Fatalities (total, by decad)');
In [15]:
s = df.groupby(by=['AC_Type']).Fatalities_total.sum().sort_values(ascending=True, na_position='first')[-20:]
s.plot.barh(title='Biggest 20 CA types by the number of #fatalities');
In [16]:
s = df.groupby(by=['Origin']).Fatalities_total.sum().sort_values(ascending=True, na_position='first')[-20:]
s.plot.barh(title='Top 20 origins with max number of #fatalities');
In [17]:
s = df.groupby(by=['Destination']).Fatalities_total.sum().sort_values(ascending=True, na_position='first')[-20:]
s.plot.barh(title='Top 20 destinations with max number of #fatalities');
In [18]:
df.Ground.groupby(df.index.year).sum().plot(title='#Ground fatalities (total)');
In [19]:
df.groupby(df.index.year).sum().plot(title='#Fatalities');
In [20]:
s = df.groupby('AC_Type').size().sort_values(ascending=True)[-20:]
s.plot.barh(title='Top 20 aircraft with highest #accidents');
In [22]:
plane_brands = df.AC_Type.unique()
def get_plane_brand(brands, brand_name):
return [op for op in brands if type(op) == str and op.startswith(brand_name)]
doug = get_plane_brand(plane_brands, 'Douglas')
anto = get_plane_brand(plane_brands, 'Antonov')
airb = get_plane_brand(plane_brands, 'Airbus')
boei = get_plane_brand(plane_brands, 'Boeing')
In [23]:
print('A number of the Boeing models %s' % len(boei))
print('A number of the Douglas models %s' % len(doug))
print('A number of the Antonov models %s' % len(anto))
print('A number of the Airbus models %s' % len(airb))
In [24]:
s = df[df.AC_Type.isin(airb)].groupby('AC_Type').size()
s.plot.bar(title='#Accidents for Airbus Aircraft', figsize=(12, 6));
In [27]:
def extract_airbus_type(ac_type):
"Extract main Airbus type from 'AC Type', e.g. 'A300' from 'Airbus A-300-605R'."
if type(ac_type) != str:
return ''
pat = 'Airbus ?(A[\.\-]?\d{3,3})'
m = re.search(pat, ac_type)
return m.groups()[0].replace('-', '').replace('.', '') if m else ''
In [28]:
airbus_models = set([extract_airbus_type(a) for a in airb])
print(airbus_models)
In [29]:
df['Airbus_Model'] = df.AC_Type.apply(extract_airbus_type)
In [30]:
s = df[df.Airbus_Model.isin(airbus_models)].groupby('Airbus_Model').size()
s.plot.bar(title='#Accidents for Airbus Aircraft by Model');
In [31]:
df2 = df[df.AC_Type.isin(airb)]
s2 = df2.groupby(df2.index.year).size()
s2.plot(title='#Accidents for Airbus Aircraft');
In [32]:
df1 = df[df.AC_Type.isin(doug)]
s1 = df1.groupby(df1.index.year).size()
s1.plot(title='#Accidents for Douglas Aircraft');
In [34]:
df3 = df[df.AC_Type.isin(anto)]
s3 = df3.groupby(df3.index.year).size()
s3.plot.line(title='#Accidents for Antonov Aircraft');
In [35]:
df4 = df[df.AC_Type.isin(boei)]
s4 = df4.groupby(df4.index.year).size()
s4.plot(title='#Accidents for Boeing Aircraft');
In [32]:
dfab = pd.DataFrame(data={'Douglas': s1, 'Airbus': s2, 'Antonov': s3, 'Boeing': s4})
plt.figure(figsize=(10, 6))
plt.plot(dfab)
plt.show()
In [36]:
s = df.groupby('Operator').size().sort_values(ascending=True)[-20:]
s.plot.barh(title='Top 20 airlines with highest #accidents');
In [38]:
mil_ops = sorted([op for op in df.Operator.unique() if type(op) == str and 'Military' in op])
In [39]:
df4 = df[df.Operator.isin(mil_ops)]
s4 = df4.groupby(df4.index.year).size()
s4.plot.line(title='#Accidents for Military Operators (total, by year)')
s = df.groupby(df.index.year).size()
s.plot.line(title='#Accidents (total, by year)');
In [40]:
s = df.groupby('Origin').size().sort_values(ascending=True)
s[-20:].plot.barh(title='Top 20 most frequest origins (of %d)' % len(s), figsize=(10, 6));
In [41]:
s = df.groupby('Destination').size().sort_values(ascending=True)
s[-20:].plot.barh(title='Top 20 most frequent destinations (of %d)' % len(s), figsize=(10, 6));
In [42]:
s = df.groupby('Location_Country').size().sort_values(ascending=True)[-20:]
s.plot.barh(title='Top 20 countries with highest #accidents', figsize=(10, 6));
In [43]:
s = df.groupby('Location').size().sort_values(ascending=True)[-20:]
s.plot.barh(title='Top 20 location of the #accidents', figsize=(10, 6));
In [44]:
labels = ['unknown', 'take-off', 'landing', 'others']
ac_type_frequency = df.Accident_type.value_counts()
ac_type_frequency.plot.bar()
plt.title('Number of the #accidents during take-off, landing and in the way')
plt.xticks(range(4), [labels[key] for key in (ac_type_frequency.keys())]);
In [7]:
s = df.groupby((df.index.year // 10) * 10)['Accident_type'].agg('value_counts')
years_range = np.arange(1920, 2020, 10)
unknowns = []
takeoffs = []
landings = []
others = []
all_types = [unknowns, takeoffs, landings, others]
for year in years_range:
for i in range(4):
all_types[i].append(s[year][i])
In [9]:
ax1 = plt.plot(years_range, unknowns, label='unknowns')
ax2 = plt.plot(years_range, takeoffs, label='takeoffs')
ax3 = plt.plot(years_range, landings, label='landings')
ax4 = plt.plot(years_range, others, label='others')
plt.title('#Accidents types by decade')
plt.legend()
plt.show()
For respective maps see below...
In [6]:
def show_map(positions, title='', proj='mill', lat_0=0, lon_0=0):
"Show a world map."
plt.figure(num=1, figsize=(20,10))
plt.title(title)
map = Basemap(projection=proj, lat_0=lat_0, lon_0=lon_0, resolution='c')
map.drawcoastlines()
map.drawcountries()
map.drawparallels(np.arange(-90, 90, 30), labels=[1, 0, 0, 0])
map.drawmeridians(np.arange(map.lonmin, map.lonmax + 30, 60), labels=[0, 0, 0, 1])
map.drawmapboundary(fill_color='#aaddff')
map.fillcontinents(color='#dddddd', lake_color='#aaddff')
for pos in positions:
lon, lat = pos['lon'], pos['lat']
map.plot(lon, lat, 'ro', markersize=3, latlon=True)
plt.show()
In [7]:
locs = json.load(open('data/geolocs.json'))
positions = list(filter(None, locs.values()))
print('#locations found: %d' % len(locs))
print('#coordinates found: %d' % len(positions))
In [8]:
title = 'Approx. Plane Crash Locations (%d out of %d)'% (len(positions), len(locs))
show_map(positions, title=title, proj='mill')
In [38]:
title = 'Approx. Plane Crash Locations (%d out of %d)'% (len(positions), len(locs))
show_map(positions, title=title, proj='ortho', lat_0=30, lon_0=-40)
In [ ]: