In [1]:
import csv
import hashlib
import json
import os

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load only the first three columns of the on-time export.  Later cells
# address two of them by name ('CARRIER' and 'ORIGIN_STATE_ABR').
data = pd.read_csv(
    'july_2016_ontime.csv',
    usecols=[0, 1, 2],
)

# Summary statistics of the numeric columns, shown as the cell output.
data.describe()


Out[2]:
DAY_OF_MONTH
count 502457.000000
mean 16.160416
std 8.858628
min 1.000000
25% 8.000000
50% 16.000000
75% 24.000000
max 31.000000

In [3]:
# Origin-state codes present in the data, in the order value_counts() returns
# them.  Uses the Series method because the top-level pd.value_counts()
# function is deprecated in recent pandas releases.
x = data['ORIGIN_STATE_ABR'].value_counts()
states = list(x.index)

In [4]:
# Carrier codes present in the data, in the order value_counts() returns
# them.  Uses the Series method because the top-level pd.value_counts()
# function is deprecated in recent pandas releases.
x = data['CARRIER'].value_counts()
carriers = list(x.index)

In [5]:
# Carrier code -> display name; used for the y-axis tick labels of the
# per-airline chart.
CARRIER_NAMES = dict(
    WN='Southwest',
    AA='American',
    DL='Delta',
    UA='United',
    OO='SkyWest',
    EV='ExpressJet',
    B6='JetBlue',
    AS='Alaska',
    NK='Spirit',
    F9='Frontier',
    VX='Virgin',
    HA='Hawaiian',
)

# Two-letter origin code -> display name; used for the y-axis tick labels of
# the per-state chart.  Pairs are listed in alphabetical order of the code.
STATE_NAMES = dict([
    ('AK', 'Alaska'),
    ('AL', 'Alabama'),
    ('AR', 'Arkansas'),
    ('AS', 'American Samoa'),
    ('AZ', 'Arizona'),
    ('CA', 'California'),
    ('CO', 'Colorado'),
    ('CT', 'Connecticut'),
    ('DC', 'District of Columbia'),
    ('DE', 'Delaware'),
    ('FL', 'Florida'),
    ('GA', 'Georgia'),
    ('GU', 'Guam'),
    ('HI', 'Hawaii'),
    ('IA', 'Iowa'),
    ('ID', 'Idaho'),
    ('IL', 'Illinois'),
    ('IN', 'Indiana'),
    ('KS', 'Kansas'),
    ('KY', 'Kentucky'),
    ('LA', 'Louisiana'),
    ('MA', 'Massachusetts'),
    ('MD', 'Maryland'),
    ('ME', 'Maine'),
    ('MI', 'Michigan'),
    ('MN', 'Minnesota'),
    ('MO', 'Missouri'),
    ('MP', 'Northern Mariana Islands'),
    ('MS', 'Mississippi'),
    ('MT', 'Montana'),
    ('NA', 'National'),
    ('NC', 'North Carolina'),
    ('ND', 'North Dakota'),
    ('NE', 'Nebraska'),
    ('NH', 'New Hampshire'),
    ('NJ', 'New Jersey'),
    ('NM', 'New Mexico'),
    ('NV', 'Nevada'),
    ('NY', 'New York'),
    ('OH', 'Ohio'),
    ('OK', 'Oklahoma'),
    ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'),
    ('PR', 'Puerto Rico'),
    ('RI', 'Rhode Island'),
    ('SC', 'South Carolina'),
    ('SD', 'South Dakota'),
    ('TN', 'Tennessee'),
    ('TX', 'Texas'),
    ('UT', 'Utah'),
    ('VA', 'Virginia'),
    ('VI', 'Virgin Islands'),
    ('VT', 'Vermont'),
    ('WA', 'Washington'),
    ('WI', 'Wisconsin'),
    ('WV', 'West Virginia'),
    ('WY', 'Wyoming'),
])

In [6]:
# Colours shared by both charts: `blue` is the second entry of seaborn's
# eight-colour "Set1" palette, `orange` is a fixed hex value.
set1_palette = sns.color_palette("Set1", n_colors=8)
blue = set1_palette[1]
orange = '#fd6a28'

# Master switch for figure generation.  Both plotting functions used to begin
# with an unconditional ``return`` that left their whole body unreachable.
# The flag keeps that default (nothing is drawn or written) while letting the
# code run again by setting it to True.
GENERATE_PLOTS = False


def _plot_counts(hist, frac, seq, precise, approx, title, ylabel, names, prefix):
    """Draw a horizontal bar chart of ``hist`` and save it as a PNG.

    Parameters
    ----------
    hist : pandas.Series
        Counts indexed by code; drawn as bars, smallest at the bottom.
    frac, seq
        Only used to build the output file name.
    precise : bool
        Bars are blue when this is true or when ``approx`` is given,
        orange otherwise.
    approx : pandas.Series or None
        Optional counts drawn as an orange vertical tick on the bar of the
        same code.  Every code in ``approx`` must also be in ``hist``.
    title, ylabel : str
        Chart title and y-axis label.
    names : dict
        Code -> display name for the y tick labels; a code missing from the
        mapping is shown as-is.
    prefix : str
        File name prefix; the figure is written to
        ``images/<prefix>_<frac>_<seq>[_diff].png``, where ``_diff`` is
        appended when ``approx`` is given.
    """
    hist = hist.sort_values(ascending=True)

    fig = plt.figure(figsize=(7, hist.size * 0.7))
    try:
        ax = fig.add_subplot(111)
        bar_color = blue if precise or approx is not None else orange
        hist.plot.barh(ax=ax, edgecolor='none', width=0.6, color=bar_color)

        if approx is not None:
            ax.set_xlim([0, max(max(hist), max(approx) + 50)])
            for k, y in approx.items():
                # vertical position of the bar that belongs to code k
                i = np.nonzero(hist.index == k)[0][0]
                # Plain '-' instead of 'k-': the colour comes from the
                # keyword only, so it is not defined twice.
                ax.plot((y, y), (i - 0.3, i + 0.3), '-', color=orange)

        # title
        ax.set_title(label=title, fontsize=18)
        ax.title.set_position([.5, 1.05])

        # x
        ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(nbins=7))
        ax.set_xlabel('Number of Flights', fontsize=14)
        ax.xaxis.set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

        # y
        ax.set_ylabel(ylabel, fontsize=14)
        ax.set_yticklabels([names.get(code, code) for code in hist.index])

        os.makedirs('images', exist_ok=True)
        suffix = '_diff' if approx is not None else ''
        fig.savefig('images/{}_{}_{}{}.png'.format(prefix, frac, seq, suffix),
                    bbox_inches='tight', dpi=200)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)


def plot_states(hist, frac, seq, precise, approx):
    """Save the "Number of Flights per State" chart for one (frac, seq) run.

    Does nothing unless ``GENERATE_PLOTS`` is true.  Otherwise the chart is
    written to ``images/states_<frac>_<seq>[_diff].png``; see
    ``_plot_counts`` for the meaning of the arguments.  Returns ``None``.
    """
    if not GENERATE_PLOTS:
        return
    _plot_counts(hist, frac, seq, precise, approx,
                 title=r"Number of Flights per State", ylabel='State',
                 names=STATE_NAMES, prefix='states')


def plot_airlines(hist, frac, seq, precise, approx):
    """Save the "Number of Flights per Airline" chart for one (frac, seq) run.

    Does nothing unless ``GENERATE_PLOTS`` is true.  Otherwise the chart is
    written to ``images/airline_<frac>_<seq>[_diff].png``; see
    ``_plot_counts`` for the meaning of the arguments.  Returns ``None``.
    """
    if not GENERATE_PLOTS:
        return
    _plot_counts(hist, frac, seq, precise, approx,
                 title=r"Number of Flights per Airline", ylabel='Airline',
                 names=CARRIER_NAMES, prefix='airline')

In [7]:
FRACS = [10, 100, 200, 500, 1000]  # sampling divisors: each sample is 1/frac of the rows
LIMIT = 7                          # number of groups kept
SEQS = 20                          # samples drawn per divisor

# The CSV is written below images/; create the directory so the cell also
# runs on a fresh checkout.
os.makedirs('images', exist_ok=True)

# newline='' is what the csv module asks for on files handed to a writer.
with open('images/_states.csv', 'w', newline='') as csvfile:
    filtered_data = data[data['CARRIER'] == 'B6']

    state_data = filtered_data['ORIGIN_STATE_ABR']

    # LIMIT+1 most frequent groups; the last one only sets the trim size below
    hist = state_data.value_counts().iloc[:LIMIT+1]

    # remove rows from each group corresponding to how many rows are the (LIMIT+1)th group
    top_states = list(hist.index)
    # .iloc: the index holds labels, so the last entry is taken by position
    to_remove = int(0.8*hist.iloc[-1])
    # one concat over all groups (Series.append no longer exists in pandas 2)
    filtered_data = pd.concat(
        [state_data[state_data == state].iloc[to_remove:] for state in top_states[:LIMIT]]
    )

    hist = filtered_data.value_counts()

    fieldnames = ['frac', 'seq'] + top_states[:LIMIT]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # precise
    plot_states(hist, 1, 0, True, None)
    d = {
        'frac': 1,
        'seq': 0
    }
    d.update(dict(hist))
    writer.writerow(d)

    # fake approx
    plot_states(hist, 1, 1, False, None)
    plot_states(hist, 1, 1, True, hist)
    d = {
        'frac': 1,
        'seq': 1
    }
    d.update(dict(hist))
    writer.writerow(d)

    for frac in FRACS:
        for seq in range(2, 2 + SEQS):
            # seed derived from (frac, seq) so every sample is reproducible
            rand = int(hashlib.md5(r'{}_{}'.format(frac, seq).encode('utf-8')).hexdigest(), 16) % 4294967295

            # approx
            sample = filtered_data.sample(frac=1.0/frac, replace=True, random_state=rand)
            # scale the sampled counts back up by the sampling divisor
            approx_hist = sample.value_counts() * frac
            plot_states(approx_hist, frac, seq, False, None)
            plot_states(hist, frac, seq, True, approx_hist)

            d = {
                'frac': frac,
                'seq': seq
            }
            d.update(dict(approx_hist))
            writer.writerow(d)

In [8]:
FRACS = [10, 100, 200, 500, 1000]  # sampling divisors: each sample is 1/frac of the rows
LIMIT = 7                          # number of groups kept
SEQS = 20                          # samples drawn per divisor (defined here so the cell runs on its own)

# The CSV is written below images/; create the directory so the cell also
# runs on a fresh checkout.
os.makedirs('images', exist_ok=True)

# newline='' is what the csv module asks for on files handed to a writer.
with open('images/_airlines.csv', 'w', newline='') as csvfile:
    filtered_data = data[data['ORIGIN_STATE_ABR'] == 'NY']

    # NOTE: despite their names, state_data / top_states hold carrier codes
    # in this cell.
    state_data = filtered_data['CARRIER']

    # LIMIT+1 most frequent groups; the last one only sets the trim size below
    hist = state_data.value_counts().iloc[:LIMIT+1]

    # remove rows from each group corresponding to how many rows are the (LIMIT+1)th group
    top_states = list(hist.index)
    # .iloc: the index holds labels, so the last entry is taken by position
    to_remove = int(0.8*hist.iloc[-1])
    # one concat over all groups (Series.append no longer exists in pandas 2)
    filtered_data = pd.concat(
        [state_data[state_data == state].iloc[to_remove:] for state in top_states[:LIMIT]]
    )

    hist = filtered_data.value_counts()

    fieldnames = ['frac', 'seq'] + top_states[:LIMIT]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # precise
    plot_airlines(hist, 1, 0, True, None)
    d = {
        'frac': 1,
        'seq': 0
    }
    d.update(dict(hist))
    writer.writerow(d)

    # same counts again as seq 1, plotted as if they were an approximation
    plot_airlines(hist, 1, 1, False, None)
    plot_airlines(hist, 1, 1, True, hist)
    d = {
        'frac': 1,
        'seq': 1
    }
    d.update(dict(hist))
    writer.writerow(d)

    for frac in FRACS:
        for seq in range(2, 2 + SEQS):
            # seed derived from (frac, seq) so every sample is reproducible
            rand = int(hashlib.md5(r'{}_{}'.format(frac, seq).encode('utf-8')).hexdigest(), 16) % 4294967295

            # approx
            sample = filtered_data.sample(frac=1.0/frac, replace=True, random_state=rand)
            # scale the sampled counts back up by the sampling divisor
            approx_hist = sample.value_counts() * frac
            plot_airlines(approx_hist, frac, seq, False, None)
            plot_airlines(hist, frac, seq, True, approx_hist)

            d = {
                'frac': frac,
                'seq': seq
            }
            d.update(dict(approx_hist))
            writer.writerow(d)

In [32]:
def to_dict(file_path):
    """Load a counts CSV into a nested dict keyed by ``'<frac>_<seq>'``.

    The file must have a header row containing ``frac`` and ``seq`` columns;
    every other column is treated as a count.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        ``{'<frac>_<seq>': {column: int(value), ...}, ...}``.  Columns whose
        cell is empty or missing in a row are left out of that row's dict.
        If two rows share the same frac/seq pair, the later one wins.

    Raises
    ------
    KeyError
        If a row has no ``frac`` or ``seq`` field.
    ValueError
        If a non-empty count cell is not an integer literal.
    """
    table = {}
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation asks for files handed to a reader.
    with open(file_path, newline='') as f:
        for row in csv.DictReader(f):
            key = '{}_{}'.format(row.pop('frac'), row.pop('seq'))
            # keep only the cells that actually hold a value
            table[key] = {name: int(value) for name, value in row.items() if value}
    return table

In [33]:
# Convert each counts CSV into a JSON file with the same base name.
# `json` is imported with the other modules in the first cell.
for name in ('_states', '_airlines'):
    with open('images/{}.json'.format(name), 'w') as f:
        json.dump(to_dict('images/{}.csv'.format(name)), f)

In [ ]: