Getting things set up
In [1]:
import pandas as pd
import numpy as np
import seaborn
import matplotlib as mpl
from bokeh import palettes
from bokeh.charts import Bar
import math
from IPython.display import Image
In [2]:
from bokeh.io import output_notebook, show
output_notebook()
In [3]:
%matplotlib inline
The below file is a sample day of data stored as an HDF5 file. It can be read as a DataFrame.
In [16]:
file = r'D:\ADSB\working\h5\2016-12-02.h5'
In [17]:
store = pd.HDFStore(file)
In [18]:
store
Out[18]:
How many rows are in the file?
In [19]:
store.get_storer('data').nrows
Out[19]:
In [21]:
day_of_data_df = store['data']
And how many unique aircraft reports?
In [23]:
len(day_of_data_df['Icao'].unique())
Out[23]:
And how many of the rows contain lat/long/alt?
In [24]:
day_of_data_df['Lat'].count(), day_of_data_df['Long'].count(), day_of_data_df['Alt'].count()
Out[24]:
What about Manufacturer or Aircraft Type?
In [25]:
day_of_data_df['Man'].count(),day_of_data_df['Type'].count()
Out[25]:
Below is a Pivot Table showing the number of unique aircraft and the number of unique aircraft manufacturers by Airaft Country (where it is registered).
In [26]:
pd.pivot_table(day_of_data_df,values=['Id','Man'],index='Cou',
aggfunc=lambda x: len(x.unique())).sort_values(by='Id',ascending=False)[:30]
Out[26]:
In [27]:
aircraft_df = day_of_data_df[['Id','Cou','Year','Man','Type']].dropna(how='all')
aircraft_df = aircraft_df.drop_duplicates(subset='Id')
Here are the top aircraft manufacturers.
In [28]:
mfr = aircraft_df.groupby('Man').count()['Id'].sort_values(ascending=False)[:10]
aircraft_df_by_mfr = aircraft_df[aircraft_df['Man'].isin(list(mfr.index))].groupby(['Man']).count()
aircraft_df_by_mfr = aircraft_df_by_mfr.sort_values(by='Id', ascending=False)
print('Total number of manufactuers: %i' % len(aircraft_df.groupby('Man').count()['Id']))
aircraft_df_by_mfr['Id']
Out[28]:
In [29]:
p = Bar(aircraft_df_by_mfr, label='index', values='Id',
xlabel='Aircraft Manufacturer', ylabel='Number of Aircraft',
title="Major Aircraft Manufactures", width=800, legend=None,
tools='pan,wheel_zoom,box_zoom,reset',color='wheat')
p.title.text_font_size = "12pt"
p.xaxis.axis_label_text_font_size = "12pt"
p.yaxis.axis_label_text_font_size = "12pt"
p.xaxis.major_label_text_font = "10pt"
p.yaxis.major_label_text_font = "10pt"
show(p)
Grabed a csv that provides airports (by ICAO name) to county.
In [30]:
airport_codes = pd.read_csv('https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat',
index_col=0,header=None)
airport_codes.columns = ['Name', 'City', 'Country', 'IATA', 'ICAO', 'Latitude',
'Longitude', 'Altitude', 'Timezone', 'DST', 'Tz', 'Type', 'Source']
airport_codes.set_index('ICAO',drop=True,inplace=True)
airport_codes.head(3)
Out[30]:
Construct a dataframe listing the number (and percentage) of each arrival/departure.
In [31]:
def country_lu(s):
if s in airport_codes.index:
return airport_codes.loc[airport_codes.index == s, 'Country'].item()
else:
return ''
def city_lu(s):
if s in airport_codes.index:
return airport_codes.loc[airport_codes.index == s, 'City'].item()
else:
return ''
In [36]:
us_airports = airport_codes[airport_codes['Country'] == 'United States'].index.values
usa_from = day_of_data_df[day_of_data_df['From'].isin(us_airports)]
usa_to = day_of_data_df[day_of_data_df['To'].isin(us_airports)]
us_airports_df = pd.concat((usa_from, usa_to)).drop_duplicates()
In [37]:
airports = pd.DataFrame()
airports['From'] = day_of_data_df.groupby(["To"]).Id.nunique().sort_values(ascending=False).sort_values(ascending=True)
airports['To'] = day_of_data_df.groupby(["From"]).Id.nunique().sort_values(ascending=False).sort_values(ascending=True)
airports['From'] = airports['From'].fillna(0).map(np.int64)
airports['To'] = airports['To'].fillna(0).map(np.int64)
airports['From (%)'] = airports['From']/airports['From'].sum() * 100
airports['To (%)'] = airports['To']/airports['To'].sum() * 100
airports['From'] = airports['From'].fillna(0)
airports.sort_values(by='To',ascending=False).head(20)
Out[37]:
And here it is grouped by country.
In [542]:
airport_country = airports.groupby(airports['Country'])
airport_country.sum().sort_values(by='From',ascending=False).head(20)
Out[542]:
Below is a plot of the top 30 international departure airports.
In [47]:
mpl.rcParams['figure.figsize'] = 10, 8
mpl.rcParams['figure.dpi'] = 100
d = airports.sort_values(by='From',ascending=False)[:30].sort_values(by='From',ascending=True)
color = np.where(d.index.isin(us_airports),palettes.d3['Category20c'][5][0],palettes.d3['Category20c'][5][-1])
ax = d['From (%)'].plot(kind='barh',color=color, title='Top 30 Airports Internationally (Departures)')
city = [city_lu(code) for code in d.index.values]
country = [country_lu(code) for code in d.index.values]
city_countr = [', '.join(s) for s in zip(city,country)]
ax.set_xbound(upper=3.5)
rects = ax.patches
for rect, label in zip(rects, city_countr):
width = rect.get_width()
height = rect.get_height()
y = rect.get_y()
ax.text(width + .10,y + height * .95, label, color='black',fontsize=12, ha='left', va='top', weight='heavy')
Below is a plot of the top 30 international arrival airports.
In [50]:
mpl.rcParams['figure.figsize'] = 10, 8
mpl.rcParams['figure.dpi'] = 100
d = airports.sort_values(by='To',ascending=False)[:30].sort_values(by='To',ascending=True)
color = np.where(d.index.isin(us_airports),palettes.d3['Category20c'][5][0],palettes.d3['Category20c'][5][-1])
ax = d['To (%)'].plot(kind='barh', color=color, title='Top 30 Airports Internationally (Arrivals)')
city = [city_lu(code) for code in d.index.values]
country = [country_lu(code) for code in d.index.values]
city_countr = [', '.join(s) for s in zip(city,country)]
ax.set_xbound(upper=3)
rects = ax.patches
for rect, label in zip(rects, city_countr):
width = rect.get_width()
height = rect.get_height()
y = rect.get_y()
ax.text(width + .1,y + height * .95, label, color='black',fontsize=12, ha='left', va='top', weight='heavy')
In [51]:
usa_airports = pd.DataFrame()
usa_airports['From'] = day_of_data_df.groupby(["From"]).Id.nunique()
usa_airports['To'] = day_of_data_df.groupby(["To"]).Id.nunique()
usa_airports['From'] = usa_airports['From'].fillna(0).map(np.int64)
usa_airports['To'] = usa_airports['To'].fillna(0).map(np.int64)
usa_airports['From (%)'] = usa_airports['From']/usa_airports['From'].sum() * 100
usa_airports['To (%)'] = usa_airports['To']/usa_airports['To'].sum() * 100
usa_airports['Country'] = [country_lu(code) for code in usa_airports.index.values]
usa_airports['Domestic'] = usa_airports.index.isin(us_airports)
usa_airports.index.name = 'Airport'
usa_airports.sort_values(by='To',ascending=False).head(5)
Out[51]:
In [52]:
usa_airports.groupby(usa_airports['Domestic']).sum()
Out[52]:
The top 30 airports where the destination is in the US.
In [54]:
mpl.rcParams['figure.figsize'] = 10, 8
mpl.rcParams['figure.dpi'] = 100
d = usa_airports[usa_airports.To.index.isin(us_airports)].sort_values(by='From',ascending=False)[:30].sort_values(by='From',ascending=True)
color = np.where(d.index.isin(us_airports),
palettes.d3['Category20c'][5][0],palettes.d3['Category20c'][5][-1])
ax = d['From (%)'].plot(kind='barh', color=color, title='Top 30 Departure Airports with US Destination')
city = [city_lu(code) for code in d.index.values]
country = [country_lu(code) for code in d.index.values]
ax.set_xbound(upper=2.5)
rects = ax.patches
for rect, label in zip(rects, city):
width = rect.get_width()
height = rect.get_height()
y = rect.get_y()
ax.text(width + .1,y + height * .95, label, color='black',fontsize=10, ha='left', va='top', weight='heavy')
In [56]:
mpl.rcParams['figure.figsize'] = 10, 8
mpl.rcParams['figure.dpi'] = 100
d = usa_airports[usa_airports.From.index.isin(us_airports)].sort_values(by='To',ascending=False)[:30].sort_values(by='To',ascending=True)
color = np.where(d.index.isin(us_airports), palettes.d3['Category20c'][5][0],palettes.d3['Category20c'][5][-1])
ax = d['To (%)'].plot(kind='barh', color=color, title='Top 30 Arrival Airports with US Departure')
city = [city_lu(code) for code in d.index.values]
country = [country_lu(code) for code in d.index.values]
ax.set_xbound(upper=2.5)
rects = ax.patches
for rect, label in zip(rects, city):
width = rect.get_width()
height = rect.get_height()
y = rect.get_y()
ax.text(width + .1,y + height * .95, label, color='black',fontsize=10, ha='left', va='top', weight='heavy')
In [57]:
international = usa_airports[~usa_airports.From.index.isin(us_airports)]
international = international[~international.To.index.isin(us_airports)]
international.sort_values(by='From',ascending=False).head(10)
Out[57]:
In [58]:
international_country = international.groupby(international['Country'])
print('Main contries traveled to:\n',', '.join(list(international_country.sum().sort_values(by='To',
ascending=False).head(20).index)))
print('Main countries traveled from:\n', ', '.join(list(international_country.sum().sort_values(by='From',
ascending=False).head(20).index)))
In [62]:
mpl.rcParams['figure.figsize'] = 10, 6
mpl.rcParams['figure.dpi'] = 100
from_ = international_country.sum().sort_values(by='From',ascending=False).head(15)['From']
from_ = from_ / from_.sum() * 100
to_ = international_country.sum().sort_values(by='To',ascending=False).head(15)['To']
to_ = to_ / to_.sum() * 100
ax = pd.concat([from_, to_], axis=1).sort_values(by='To',ascending=False).plot(kind='barh')
ax.set_xlabel('Percentage of Flights Originating or Ending in USA')
ax.set_ylabel('Origin/Destination')
Out[62]:
And Atlanta...
In [288]:
Image("http://wallpapercave.com/wp/E48LtWF.jpg")
Out[288]:
In [65]:
atl_ = store.select('data',where='From = "KATL" | To = "KATL"').drop_duplicates()
atl = pd.DataFrame()
atl['From'] = atl_.groupby(["To"]).Id.nunique().sort_values(ascending=False).sort_values(ascending=True)
atl['To'] = atl_.groupby(["From"]).Id.nunique().sort_values(ascending=False).sort_values(ascending=True)
atl['From'] = atl['From'].fillna(0).map(np.int64)
atl['To'] = atl['To'].fillna(0).map(np.int64)
atl['From (%)'] = atl['From']/atl['From'].sum() * 100
atl['To (%)'] = atl['To']/atl['To'].sum() * 100
atl['Country'] = [country_lu(code) for code in atl.index.values]
atl.drop('KATL', inplace=True)
atl.sort_values(by='To',ascending=False).head(20)
Out[65]:
In [ ]: