notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import csv
import hashlib

%matplotlib inline
%config InlineBackend.figure_format = 'retina'



In [2]:

    
data = pd.read_csv('july_2016_ontime.csv', usecols=[0,1])
data.describe()









    Out[2]:






  
    
      
      CARRIER
      ORIGIN_STATE_ABR
    
  
  
    
      count
      502457
      502457
    
    
      unique
      12
      52
    
    
      top
      WN
      CA
    
    
      freq
      115208
      65622



In [3]:

    
# get the 9 most common origin states

x = pd.value_counts(data['ORIGIN_STATE_ABR'])[:9]
states = [state for state, count in x.items()]
states









    Out[3]:





['CA', 'TX', 'FL', 'GA', 'IL', 'CO', 'NY', 'AZ', 'NV']



In [4]:

    
# for the 10 most common states, what are the most common airlines

data_most_common_states = data[data['ORIGIN_STATE_ABR'].isin(states)]
x = pd.value_counts(data_most_common_states['CARRIER'])[:7]
carriers = [carrier for carrier, count in x.items()]
carriers









    Out[4]:





['WN', 'AA', 'DL', 'UA', 'OO', 'EV', 'B6']



In [5]:

    
CARRIER_NAMES = {
    'WN': 'Southwest',
    'AA': 'American',
    'DL': 'Delta',
    'UA': 'United',
    'OO': 'SkyWest',
    'EV': 'ExpressJet',
    'B6': 'JetBlue',
    'AS': 'Alaska',
    'NK': 'Spirit',
    'F9': 'Frontier',
    'VX': 'Virgin',
    'HA': 'Hawaiian'
}

STATE_NAMES = {
    'CA': 'California',
    'TX': 'Texas',
    'FL': 'Florida',
    'GA': 'Georgia',
    'IL': 'Illinois',
    'CO': 'Colorado',
    'NY': 'New York',
    'AZ': 'Arizona',
    'NV': 'Nevada',
    'WA': 'Washington'
}



In [6]:

    
data_filtered = data[data['ORIGIN_STATE_ABR'].isin(states) & data['CARRIER'].isin(carriers)]
data_filtered









    Out[6]:






  
    
      
      CARRIER
      ORIGIN_STATE_ABR
    
  
  
    
      26
      AA
      NY
    
    
      27
      AA
      NY
    
    
      28
      AA
      NY
    
    
      29
      AA
      NY
    
    
      30
      AA
      NY
    
    
      31
      AA
      NY
    
    
      32
      AA
      NY
    
    
      33
      AA
      NY
    
    
      34
      AA
      NY
    
    
      35
      AA
      NY
    
    
      36
      AA
      NY
    
    
      37
      AA
      NY
    
    
      38
      AA
      NY
    
    
      39
      AA
      NY
    
    
      40
      AA
      NY
    
    
      41
      AA
      NY
    
    
      42
      AA
      NY
    
    
      43
      AA
      NY
    
    
      44
      AA
      NY
    
    
      45
      AA
      NY
    
    
      46
      AA
      NY
    
    
      47
      AA
      NY
    
    
      48
      AA
      NY
    
    
      49
      AA
      NY
    
    
      50
      AA
      NY
    
    
      51
      AA
      NY
    
    
      52
      AA
      NY
    
    
      53
      AA
      NY
    
    
      54
      AA
      NY
    
    
      55
      AA
      NY
    
    
      ...
      ...
      ...
    
    
      502366
      WN
      TX
    
    
      502367
      WN
      TX
    
    
      502368
      WN
      TX
    
    
      502369
      WN
      TX
    
    
      502370
      WN
      TX
    
    
      502371
      WN
      TX
    
    
      502372
      WN
      TX
    
    
      502373
      WN
      TX
    
    
      502374
      WN
      TX
    
    
      502375
      WN
      TX
    
    
      502376
      WN
      TX
    
    
      502377
      WN
      TX
    
    
      502378
      WN
      TX
    
    
      502379
      WN
      TX
    
    
      502380
      WN
      TX
    
    
      502381
      WN
      TX
    
    
      502382
      WN
      TX
    
    
      502383
      WN
      TX
    
    
      502384
      WN
      TX
    
    
      502385
      WN
      TX
    
    
      502386
      WN
      TX
    
    
      502387
      WN
      TX
    
    
      502388
      WN
      TX
    
    
      502389
      WN
      TX
    
    
      502390
      WN
      TX
    
    
      502391
      WN
      TX
    
    
      502392
      WN
      TX
    
    
      502393
      WN
      TX
    
    
      502394
      WN
      TX
    
    
      502395
      WN
      TX
    
  

270230 rows × 2 columns



In [7]:

    
aggregated = data_filtered.groupby(['CARRIER', 'ORIGIN_STATE_ABR']).size().unstack()
sns.heatmap(aggregated,square=True)









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x1069983c8>



In [8]:

    
colors = sns.color_palette("Set1", n_colors=9, desat=.8)

def plot(hist, state, state_index, frac, seq):
    hist = hist.sort_values(ascending=True)
    
    matplotlib.rc('text', usetex=True)
    fig = plt.figure(figsize=(7,hist.size*0.7))
    ax = fig.add_subplot(111)
    hist.plot.barh(ax=ax, edgecolor='none', width=0.6, color=colors[state_index])
    
    # title
    ax.set_title(label=r"Number of Flights out of the State of \textbf{" + STATE_NAMES[state] + "}", fontsize=18)
    ax.title.set_position([.5, 1.05])
    
    # x
    import matplotlib.ticker as plticker

    loc = plticker.MaxNLocator(nbins=7)
    ax.xaxis.set_major_locator(loc)
    ax.set_xlabel('Number of Flights', fontsize=14)
    ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
    
    # y
    ax.set_ylabel('Airline', fontsize=14)
    ax.set_yticklabels(hist.keys().map(lambda x: CARRIER_NAMES[x]))
    
    fig.savefig('images/{}_{}_{}.png'.format(state, frac, seq), bbox_inches='tight', dpi=200)
    plt.close(fig)



In [9]:

    
# what fraction of the data to use for sample
FRACS = [100, 200, 500, 1000]

# maximum number of carriers per state
LIMIT = 7

all_carriers = list(data['CARRIER'].unique())

with open('images/_data.csv', 'w') as csvfile:
    fieldnames = ['state', 'frac', 'seq'] + all_carriers
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for state_index, state in enumerate(states):
        state_data = data[data['ORIGIN_STATE_ABR'] == state]['CARRIER']

        # precise, limited
        hist = pd.value_counts(state_data)[:LIMIT]
        plot(hist, state, state_index, 1, 0)
        d = {
            'state': state,
            'frac': 1,
            'seq': 0
        }
        d.update(dict(hist))
        writer.writerow(d)

        # top airlines from the state
        top_carriers = [carrier for carrier, count in hist.items()]
        
        state_data = state_data[state_data.isin(top_carriers)]
        
        for frac in FRACS:
            for seq in range(1, 21):
                rand = int(hashlib.md5(r'{}_{}_{}'.format(state, frac, seq).encode('utf-8')).hexdigest(), 16) % 4294967295

                # approx
                sample = state_data.sample(frac=1.0/frac, replace=True, random_state=rand)
                hist = pd.value_counts(sample) * frac
                plot(hist, state, state_index, frac, seq)
                
                d = {
                    'state': state,
                    'frac': frac,
                    'seq': seq
                }
                d.update(dict(hist))
                writer.writerow(d)

        csvfile.flush()
        print("Done with {}".format(state))









    



Done with CA
Done with TX
Done with FL
Done with GA
Done with IL
Done with CO
Done with NY
Done with AZ
Done with NV



In [ ]:

	CARRIER	ORIGIN_STATE_ABR
26	AA	NY
27	AA	NY
28	AA	NY
29	AA	NY
30	AA	NY
31	AA	NY
32	AA	NY
33	AA	NY
34	AA	NY
35	AA	NY
36	AA	NY
37	AA	NY
38	AA	NY
39	AA	NY
40	AA	NY
41	AA	NY
42	AA	NY
43	AA	NY
44	AA	NY
45	AA	NY
46	AA	NY
47	AA	NY
48	AA	NY
49	AA	NY
50	AA	NY
51	AA	NY
52	AA	NY
53	AA	NY
54	AA	NY
55	AA	NY
...	...	...
502366	WN	TX
502367	WN	TX
502368	WN	TX
502369	WN	TX
502370	WN	TX
502371	WN	TX
502372	WN	TX
502373	WN	TX
502374	WN	TX
502375	WN	TX
502376	WN	TX
502377	WN	TX
502378	WN	TX
502379	WN	TX
502380	WN	TX
502381	WN	TX
502382	WN	TX
502383	WN	TX
502384	WN	TX
502385	WN	TX
502386	WN	TX
502387	WN	TX
502388	WN	TX
502389	WN	TX
502390	WN	TX
502391	WN	TX
502392	WN	TX
502393	WN	TX
502394	WN	TX
502395	WN	TX