In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import csv
import hashlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [2]:
# Load only the first three columns of the on-time CSV; later cells read the
# 'CARRIER' and 'ORIGIN_STATE_ABR' columns from this frame.
# NOTE(review): usecols is positional, so this presumably relies on the file's
# column order -- confirm against the CSV header.
data = pd.read_csv('july_2016_ontime.csv', usecols=[0,1,2])
# Summary statistics of the loaded frame, shown as the cell output.
data.describe()
Out[2]:
In [3]:
# Flights per origin state, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
x = data['ORIGIN_STATE_ABR'].value_counts()
# State abbreviations in descending order of flight count.
states = x.index.tolist()
In [4]:
# Flights per carrier, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
x = data['CARRIER'].value_counts()
# Carrier codes in descending order of flight count.
carriers = x.index.tolist()
In [5]:
# Two-character carrier code -> display name used for the airline axis labels.
CARRIER_NAMES = {
    'WN': 'Southwest',
    'AA': 'American',
    'DL': 'Delta',
    'UA': 'United',
    'OO': 'SkyWest',
    'EV': 'ExpressJet',
    'B6': 'JetBlue',
    'AS': 'Alaska',
    'NK': 'Spirit',
    'F9': 'Frontier',
    'VX': 'Virgin',
    'HA': 'Hawaiian',
}
# Two-letter state/territory abbreviation -> display name used for the state
# axis labels. Besides the states it lists DC, several territories, and a
# catch-all 'NA' -> 'National' entry.
STATE_NAMES = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}
In [6]:
# Bar colours shared by plot_states and plot_airlines.
# `blue` is the entry at index 1 of seaborn's 8-colour "Set1" palette.
blue = sns.color_palette("Set1", n_colors=8)[1]
# `orange` is a fixed hex colour, used for approximate-only bars and for the
# approximate-value markers drawn over the precise bars.
orange = '#fd6a28'
def plot_states(hist, frac, seq, precise, approx):
return
hist = hist.sort_values(ascending=True)
fig = plt.figure(figsize=(7,hist.size*0.7))
ax = fig.add_subplot(111)
hist.plot.barh(ax=ax, edgecolor='none', width=0.6, color=blue if precise or approx is not None else orange)
if approx is not None:
ax.set_xlim([0, max(max(hist), max(approx) + 50)])
for k, y in approx.items():
i = np.nonzero(hist.index==k)[0][0]
ax.plot((y, y), (i -0.3, i + 0.3), 'k-', color=orange, solid_capstyle=None)
# title
ax.set_title(label=r"Number of Flights per State", fontsize=18)
ax.title.set_position([.5, 1.05])
# x
import matplotlib.ticker as plticker
loc = plticker.MaxNLocator(nbins=7)
ax.xaxis.set_major_locator(loc)
ax.set_xlabel('Number of Flights', fontsize=14)
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# y
ax.set_ylabel('State', fontsize=14)
ax.set_yticklabels(hist.keys().map(lambda x: STATE_NAMES[x]))
fig.savefig('images/states_{}_{}{}.png'.format(frac, seq, '_diff' if approx is not None else ''), bbox_inches='tight', dpi=200)
plt.close(fig)
def plot_airlines(hist, frac, seq, precise, approx):
return
hist = hist.sort_values(ascending=True)
fig = plt.figure(figsize=(7,hist.size*0.7))
ax = fig.add_subplot(111)
hist.plot.barh(ax=ax, edgecolor='none', width=0.6, color=blue if precise or approx is not None else orange)
if approx is not None:
ax.set_xlim([0, max(max(hist), max(approx) + 50)])
for k, y in approx.items():
i = np.nonzero(hist.index==k)[0][0]
ax.plot((y, y), (i -0.3, i + 0.3), 'k-', color=orange, solid_capstyle=None)
# title
ax.set_title(label=r"Number of Flights per Airline", fontsize=18)
ax.title.set_position([.5, 1.05])
# x
import matplotlib.ticker as plticker
loc = plticker.MaxNLocator(nbins=7)
ax.xaxis.set_major_locator(loc)
ax.set_xlabel('Number of Flights', fontsize=14)
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# y
ax.set_ylabel('Airline', fontsize=14)
ax.set_yticklabels(hist.keys().map(lambda x: CARRIER_NAMES[x]))
fig.savefig('images/airline_{}_{}{}.png'.format(frac, seq, '_diff' if approx is not None else ''), bbox_inches='tight', dpi=200)
plt.close(fig)
In [7]:
# Sampling denominators: each approximate histogram comes from a 1/frac sample.
FRACS = [10, 100, 200, 500, 1000]
# Number of most frequent groups kept in the output.
LIMIT = 7
# Number of independent samples drawn per fraction.
SEQS = 20

# Origin states of every flight operated by carrier 'B6'.
filtered_data = data[data['CARRIER'] == 'B6']
state_data = filtered_data['ORIGIN_STATE_ABR']
# Top LIMIT groups plus the runner-up; the runner-up only sizes the trim below.
hist = state_data.value_counts().head(LIMIT + 1)

# remove rows from each group corresponding to how many rows are the (LIMIT+1)th group
top_states = hist.index.tolist()
# .iloc: positional access through [] on a label-indexed Series is deprecated.
to_remove = int(0.8 * hist.iloc[-1])
# pd.concat replaces Series.append, which was removed in pandas 2.0.
filtered_data = pd.concat(
    [state_data[state_data == state].iloc[to_remove:] for state in top_states[:LIMIT]]
)
hist = filtered_data.value_counts()

# newline='' is what the csv module asks for; without it, platforms that
# translate '\n' on write end up with blank lines between rows.
with open('images/_states.csv', 'w', newline='') as csvfile:
    fieldnames = ['frac', 'seq'] + top_states[:LIMIT]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # precise
    plot_states(hist, 1, 0, True, None)
    d = {
        'frac': 1,
        'seq': 0
    }
    d.update(hist.to_dict())
    writer.writerow(d)

    # fake approx
    plot_states(hist, 1, 1, False, None)
    plot_states(hist, 1, 1, True, hist)
    d = {
        'frac': 1,
        'seq': 1
    }
    d.update(hist.to_dict())
    writer.writerow(d)

    for frac in FRACS:
        for seq in range(2, 2 + SEQS):
            # Deterministic per-(frac, seq) seed derived from an MD5 digest.
            rand = int(hashlib.md5('{}_{}'.format(frac, seq).encode('utf-8')).hexdigest(), 16) % 4294967295

            # approx: sample 1/frac of the rows and scale the counts back up
            sample = filtered_data.sample(frac=1.0 / frac, replace=True, random_state=rand)
            approx_hist = sample.value_counts() * frac
            plot_states(approx_hist, frac, seq, False, None)
            plot_states(hist, frac, seq, True, approx_hist)
            d = {
                'frac': frac,
                'seq': seq
            }
            # Groups missing from the sample are left out and written as
            # empty cells by DictWriter.
            d.update(approx_hist.to_dict())
            writer.writerow(d)
In [8]:
# Sampling denominators: each approximate histogram comes from a 1/frac sample.
FRACS = [10, 100, 200, 500, 1000]
# Number of most frequent groups kept in the output.
LIMIT = 7
# Number of independent samples drawn per fraction. Defined here as well so
# this cell does not silently depend on the previous cell having run.
SEQS = 20

# Carriers of every flight departing from state 'NY'.
# NOTE: the names state_data / top_states / state are kept from the states
# cell, but in this cell they hold carrier codes.
filtered_data = data[data['ORIGIN_STATE_ABR'] == 'NY']
state_data = filtered_data['CARRIER']
# Top LIMIT groups plus the runner-up; the runner-up only sizes the trim below.
hist = state_data.value_counts().head(LIMIT + 1)

# remove rows from each group corresponding to how many rows are the (LIMIT+1)th group
top_states = hist.index.tolist()
# .iloc: positional access through [] on a label-indexed Series is deprecated.
to_remove = int(0.8 * hist.iloc[-1])
# pd.concat replaces Series.append, which was removed in pandas 2.0.
filtered_data = pd.concat(
    [state_data[state_data == state].iloc[to_remove:] for state in top_states[:LIMIT]]
)
hist = filtered_data.value_counts()

# newline='' is what the csv module asks for; without it, platforms that
# translate '\n' on write end up with blank lines between rows.
with open('images/_airlines.csv', 'w', newline='') as csvfile:
    fieldnames = ['frac', 'seq'] + top_states[:LIMIT]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # precise
    plot_airlines(hist, 1, 0, True, None)
    d = {
        'frac': 1,
        'seq': 0
    }
    d.update(hist.to_dict())
    writer.writerow(d)

    # fake approx
    plot_airlines(hist, 1, 1, False, None)
    plot_airlines(hist, 1, 1, True, hist)
    d = {
        'frac': 1,
        'seq': 1
    }
    d.update(hist.to_dict())
    writer.writerow(d)

    for frac in FRACS:
        for seq in range(2, 2 + SEQS):
            # Deterministic per-(frac, seq) seed derived from an MD5 digest.
            rand = int(hashlib.md5('{}_{}'.format(frac, seq).encode('utf-8')).hexdigest(), 16) % 4294967295

            # approx: sample 1/frac of the rows and scale the counts back up
            sample = filtered_data.sample(frac=1.0 / frac, replace=True, random_state=rand)
            approx_hist = sample.value_counts() * frac
            plot_airlines(approx_hist, frac, seq, False, None)
            plot_airlines(hist, frac, seq, True, approx_hist)
            d = {
                'frac': frac,
                'seq': seq
            }
            # Groups missing from the sample are left out and written as
            # empty cells by DictWriter.
            d.update(approx_hist.to_dict())
            writer.writerow(d)
In [32]:
def to_dict(file_path):
    """Read a histogram CSV into a nested dict keyed by '<frac>_<seq>'.

    Each row must have 'frac' and 'seq' columns; they form the outer key and
    are left out of the inner mapping. Every remaining non-empty cell is
    converted with int(); empty cells are dropped.

    Returns a dict mapping '<frac>_<seq>' to {column name: int count}.
    """
    result = {}
    with open(file_path) as handle:
        for row in csv.DictReader(handle):
            key = '{}_{}'.format(row.pop('frac'), row.pop('seq'))
            result[key] = {name: int(raw) for name, raw in row.items() if raw}
    return result
In [33]:
import json

# Convert each histogram CSV into a JSON file with the same stem.
for csv_path, json_path in (
    ('images/_states.csv', 'images/_states.json'),
    ('images/_airlines.csv', 'images/_airlines.json'),
):
    with open(json_path, 'w') as f:
        f.write(json.dumps(to_dict(csv_path)))
In [ ]: