In [1]:
import csv
import hashlib
import os

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [2]:
# Load only the first two columns of the on-time CSV. Later cells access the
# columns 'CARRIER' and 'ORIGIN_STATE_ABR', so those are presumably the two
# columns selected here -- verify against the file header.
data = pd.read_csv(
    'july_2016_ontime.csv',
    usecols=[0, 1],
)
data.describe()
Out[2]:
In [3]:
# Get the 9 most common origin states, ordered from most to least frequent.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
state_counts = data['ORIGIN_STATE_ABR'].value_counts().head(9)
states = state_counts.index.tolist()
states
Out[3]:
In [4]:
# Among flights leaving the most common origin states (the `states` list
# built from the top-9 selection), find the 7 most common airlines.
data_most_common_states = data[data['ORIGIN_STATE_ABR'].isin(states)]
carrier_counts = data_most_common_states['CARRIER'].value_counts().head(7)
carriers = carrier_counts.index.tolist()
carriers
Out[4]:
In [5]:
# Display names for the two-character carrier codes; used for the
# y-axis tick labels of the per-state bar charts.
CARRIER_NAMES = {
    'WN': 'Southwest',
    'AA': 'American',
    'DL': 'Delta',
    'UA': 'United',
    'OO': 'SkyWest',
    'EV': 'ExpressJet',
    'B6': 'JetBlue',
    'AS': 'Alaska',
    'NK': 'Spirit',
    'F9': 'Frontier',
    'VX': 'Virgin',
    'HA': 'Hawaiian',
}
# Full names for the state abbreviations; used in the title of the
# per-state bar charts.
STATE_NAMES = {
    'CA': 'California',
    'TX': 'Texas',
    'FL': 'Florida',
    'GA': 'Georgia',
    'IL': 'Illinois',
    'CO': 'Colorado',
    'NY': 'New York',
    'AZ': 'Arizona',
    'NV': 'Nevada',
    'WA': 'Washington',
}
In [6]:
# Keep only the rows whose origin state is in `states` AND whose
# carrier is in `carriers`.
data_filtered = data.loc[
    data['ORIGIN_STATE_ABR'].isin(states)
    & data['CARRIER'].isin(carriers)
]
data_filtered
Out[6]:
In [7]:
# Count the rows for every (carrier, origin state) pair, then pivot the
# states into columns so the result is a carrier x state table.
flights_per_pair = data_filtered.groupby(['CARRIER', 'ORIGIN_STATE_ABR']).size()
aggregated = flights_per_pair.unstack()
sns.heatmap(aggregated, square=True)
Out[7]:
In [8]:
# One bar color per state; `plot` picks the entry matching the state's index.
colors = sns.color_palette("Set1", n_colors=9, desat=.8)


def plot(hist, state, state_index, frac, seq):
    """Draw a horizontal bar chart of flights per airline and save it as a PNG.

    The figure is written to ``images/{state}_{frac}_{seq}.png`` and then
    closed; nothing is displayed inline and nothing is returned.

    Parameters
    ----------
    hist : pandas.Series
        Flight counts indexed by carrier code. It is sorted ascending
        before plotting; the caller's Series is not modified.
    state : str
        Origin state abbreviation. Looked up in ``STATE_NAMES`` for the
        title (the abbreviation itself is used if it is not listed) and
        used verbatim in the output file name.
    state_index : int
        Index of the state; selects the bar color from ``colors``
        (wrapping around if there are more states than colors).
    frac, seq
        Only used to build the output file name.

    Notes
    -----
    The title uses ``\\textbf``, so LaTeX text rendering is switched on
    (``text.usetex``). The setting is scoped to this call via
    ``rc_context`` so it does not leak into figures drawn elsewhere.
    """
    hist = hist.sort_values(ascending=True)
    state_name = STATE_NAMES.get(state, state)
    bar_color = colors[state_index % len(colors)]

    # usetex must be active both when the text is created and when the
    # figure is rendered by savefig, so the whole body runs in the context.
    with matplotlib.rc_context({'text.usetex': True}):
        fig = plt.figure(figsize=(7, hist.size * 0.7))
        try:
            ax = fig.add_subplot(111)
            hist.plot.barh(ax=ax, edgecolor='none', width=0.6, color=bar_color)

            # title
            ax.set_title(
                label=r"Number of Flights out of the State of \textbf{" + state_name + "}",
                fontsize=18,
            )
            ax.title.set_position([.5, 1.05])

            # x axis: at most 7 intervals, tick values with thousands separators
            ax.xaxis.set_major_locator(mticker.MaxNLocator(nbins=7))
            ax.set_xlabel('Number of Flights', fontsize=14)
            ax.xaxis.set_major_formatter(
                mticker.FuncFormatter(lambda value, pos: format(int(value), ','))
            )

            # y axis: show airline names instead of carrier codes, falling
            # back to the raw code for carriers missing from CARRIER_NAMES
            ax.set_ylabel('Airline', fontsize=14)
            ax.set_yticklabels([CARRIER_NAMES.get(code, code) for code in hist.index])

            fig.savefig(
                'images/{}_{}_{}.png'.format(state, frac, seq),
                bbox_inches='tight',
                dpi=200,
            )
        finally:
            # Always release the figure, even if rendering/saving fails;
            # this function is called hundreds of times in a loop.
            plt.close(fig)
In [9]:
# Sampling denominators: each approximate histogram is computed from a
# 1/frac sample of a state's flights and then scaled back up by frac.
FRACS = [100, 200, 500, 1000]
# maximum number of carriers per state
LIMIT = 7
# number of independent samples drawn for every (state, frac) pair;
# sequence numbers run from 1 to SAMPLES_PER_FRAC (0 is the exact histogram)
SAMPLES_PER_FRAC = 20

all_carriers = list(data['CARRIER'].unique())

# Both the CSV below and the PNGs written by plot() go into images/.
os.makedirs('images', exist_ok=True)

# newline='' is what the csv module requires for files handed to a writer;
# without it, extra blank rows can appear on platforms that translate '\n'.
with open('images/_data.csv', 'w', newline='') as csvfile:
    fieldnames = ['state', 'frac', 'seq'] + all_carriers
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for state_index, state in enumerate(states):
        state_data = data.loc[data['ORIGIN_STATE_ABR'] == state, 'CARRIER']

        # precise, limited: exact counts for the LIMIT most common carriers
        hist = state_data.value_counts().head(LIMIT)
        plot(hist, state, state_index, 1, 0)
        row = {'state': state, 'frac': 1, 'seq': 0}
        row.update(hist.to_dict())
        # carriers absent from `hist` are written as empty cells
        writer.writerow(row)

        # restrict the sampling population to the state's top airlines
        top_carriers = hist.index.tolist()
        state_data = state_data[state_data.isin(top_carriers)]

        for frac in FRACS:
            for seq in range(1, SAMPLES_PER_FRAC + 1):
                # Derive a reproducible seed from the (state, frac, seq)
                # triple so every image can be regenerated independently.
                key = '{}_{}_{}'.format(state, frac, seq).encode('utf-8')
                rand = int(hashlib.md5(key).hexdigest(), 16) % 4294967295

                # approx: sample 1/frac of the rows (with replacement) and
                # scale the counts by frac to estimate the full totals
                sample = state_data.sample(frac=1.0 / frac, replace=True, random_state=rand)
                hist = sample.value_counts() * frac
                plot(hist, state, state_index, frac, seq)
                row = {'state': state, 'frac': frac, 'seq': seq}
                row.update(hist.to_dict())
                writer.writerow(row)

        # push this state's rows to disk before moving on
        csvfile.flush()
        print("Done with {}".format(state))
In [ ]: