In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import csv
import hashlib

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
data = pd.read_csv('july_2016_ontime.csv', usecols=[0,1])
data.describe()


Out[2]:
CARRIER ORIGIN_STATE_ABR
count 502457 502457
unique 12 52
top WN CA
freq 115208 65622

In [3]:
# get the 9 most common origin states

x = pd.value_counts(data['ORIGIN_STATE_ABR'])[:9]
states = [state for state, count in x.items()]
states


Out[3]:
['CA', 'TX', 'FL', 'GA', 'IL', 'CO', 'NY', 'AZ', 'NV']

In [4]:
# for the 10 most common states, what are the most common airlines

data_most_common_states = data[data['ORIGIN_STATE_ABR'].isin(states)]
x = pd.value_counts(data_most_common_states['CARRIER'])[:7]
carriers = [carrier for carrier, count in x.items()]
carriers


Out[4]:
['WN', 'AA', 'DL', 'UA', 'OO', 'EV', 'B6']

In [5]:
CARRIER_NAMES = {
    'WN': 'Southwest',
    'AA': 'American',
    'DL': 'Delta',
    'UA': 'United',
    'OO': 'SkyWest',
    'EV': 'ExpressJet',
    'B6': 'JetBlue',
    'AS': 'Alaska',
    'NK': 'Spirit',
    'F9': 'Frontier',
    'VX': 'Virgin',
    'HA': 'Hawaiian'
}

STATE_NAMES = {
    'CA': 'California',
    'TX': 'Texas',
    'FL': 'Florida',
    'GA': 'Georgia',
    'IL': 'Illinois',
    'CO': 'Colorado',
    'NY': 'New York',
    'AZ': 'Arizona',
    'NV': 'Nevada',
    'WA': 'Washington'
}

In [6]:
data_filtered = data[data['ORIGIN_STATE_ABR'].isin(states) & data['CARRIER'].isin(carriers)]
data_filtered


Out[6]:
CARRIER ORIGIN_STATE_ABR
26 AA NY
27 AA NY
28 AA NY
29 AA NY
30 AA NY
31 AA NY
32 AA NY
33 AA NY
34 AA NY
35 AA NY
36 AA NY
37 AA NY
38 AA NY
39 AA NY
40 AA NY
41 AA NY
42 AA NY
43 AA NY
44 AA NY
45 AA NY
46 AA NY
47 AA NY
48 AA NY
49 AA NY
50 AA NY
51 AA NY
52 AA NY
53 AA NY
54 AA NY
55 AA NY
... ... ...
502366 WN TX
502367 WN TX
502368 WN TX
502369 WN TX
502370 WN TX
502371 WN TX
502372 WN TX
502373 WN TX
502374 WN TX
502375 WN TX
502376 WN TX
502377 WN TX
502378 WN TX
502379 WN TX
502380 WN TX
502381 WN TX
502382 WN TX
502383 WN TX
502384 WN TX
502385 WN TX
502386 WN TX
502387 WN TX
502388 WN TX
502389 WN TX
502390 WN TX
502391 WN TX
502392 WN TX
502393 WN TX
502394 WN TX
502395 WN TX

270230 rows × 2 columns


In [7]:
aggregated = data_filtered.groupby(['CARRIER', 'ORIGIN_STATE_ABR']).size().unstack()
sns.heatmap(aggregated,square=True)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1069983c8>

In [8]:
colors = sns.color_palette("Set1", n_colors=9, desat=.8)

def plot(hist, state, state_index, frac, seq):
    hist = hist.sort_values(ascending=True)
    
    matplotlib.rc('text', usetex=True)
    fig = plt.figure(figsize=(7,hist.size*0.7))
    ax = fig.add_subplot(111)
    hist.plot.barh(ax=ax, edgecolor='none', width=0.6, color=colors[state_index])
    
    # title
    ax.set_title(label=r"Number of Flights out of the State of \textbf{" + STATE_NAMES[state] + "}", fontsize=18)
    ax.title.set_position([.5, 1.05])
    
    # x
    import matplotlib.ticker as plticker

    loc = plticker.MaxNLocator(nbins=7)
    ax.xaxis.set_major_locator(loc)
    ax.set_xlabel('Number of Flights', fontsize=14)
    ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
    
    # y
    ax.set_ylabel('Airline', fontsize=14)
    ax.set_yticklabels(hist.keys().map(lambda x: CARRIER_NAMES[x]))
    
    fig.savefig('images/{}_{}_{}.png'.format(state, frac, seq), bbox_inches='tight', dpi=200)
    plt.close(fig)

In [9]:
# what fraction of the data to use for sample
FRACS = [100, 200, 500, 1000]

# maximum number of carriers per state
LIMIT = 7

all_carriers = list(data['CARRIER'].unique())

with open('images/_data.csv', 'w') as csvfile:
    fieldnames = ['state', 'frac', 'seq'] + all_carriers
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for state_index, state in enumerate(states):
        state_data = data[data['ORIGIN_STATE_ABR'] == state]['CARRIER']

        # precise, limited
        hist = pd.value_counts(state_data)[:LIMIT]
        plot(hist, state, state_index, 1, 0)
        d = {
            'state': state,
            'frac': 1,
            'seq': 0
        }
        d.update(dict(hist))
        writer.writerow(d)

        # top airlines from the state
        top_carriers = [carrier for carrier, count in hist.items()]
        
        state_data = state_data[state_data.isin(top_carriers)]
        
        for frac in FRACS:
            for seq in range(1, 21):
                rand = int(hashlib.md5(r'{}_{}_{}'.format(state, frac, seq).encode('utf-8')).hexdigest(), 16) % 4294967295

                # approx
                sample = state_data.sample(frac=1.0/frac, replace=True, random_state=rand)
                hist = pd.value_counts(sample) * frac
                plot(hist, state, state_index, frac, seq)
                
                d = {
                    'state': state,
                    'frac': frac,
                    'seq': seq
                }
                d.update(dict(hist))
                writer.writerow(d)

        csvfile.flush()
        print("Done with {}".format(state))


Done with CA
Done with TX
Done with FL
Done with GA
Done with IL
Done with CO
Done with NY
Done with AZ
Done with NV

In [ ]: