In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Default figure size for every matplotlib figure created later in the notebook.
plt.rcParams.update({'figure.figsize': (12, 8)})

In [2]:
# Load the flights table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="flights.csv", index_col=0)

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
df.head(n=5)


Out[3]:
year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight origin dest air_time distance hour minute
1 2013 1 1 517 2 830 11 UA N14228 1545 EWR IAH 227 1400 5 17
2 2013 1 1 533 4 850 20 UA N24211 1714 LGA IAH 227 1416 5 33
3 2013 1 1 542 2 923 33 AA N619AA 1141 JFK MIA 160 1089 5 42
4 2013 1 1 544 -1 1004 -18 B6 N804JB 725 JFK BQN 183 1576 5 44
5 2013 1 1 554 -6 812 -25 DL N668DN 461 LGA ATL 116 762 5 54

In [16]:
# Build a real datetime column from the separate year / month / day columns.
# pd.to_datetime can assemble dates directly from a frame whose columns are named
# `year`, `month` and `day`, which avoids creating and re-parsing one string per row.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
df.head()


Out[16]:
year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight origin dest air_time distance hour minute date
1 2013 1 1 517 2 830 11 UA N14228 1545 EWR IAH 227 1400 5 17 2013-01-01
2 2013 1 1 533 4 850 20 UA N24211 1714 LGA IAH 227 1416 5 33 2013-01-01
3 2013 1 1 542 2 923 33 AA N619AA 1141 JFK MIA 160 1089 5 42 2013-01-01
4 2013 1 1 544 -1 1004 -18 B6 N804JB 725 JFK BQN 183 1576 5 44 2013-01-01
5 2013 1 1 554 -6 812 -25 DL N668DN 461 LGA ATL 116 762 5 54 2013-01-01

In [6]:
# Mean departure delay across every flight in the table.
avg_delay = df['dep_delay'].mean()
print("Average departure delay: {:.2f} minutes".format(avg_delay))


Average departure delay: 12.64 minutes

Which airline is the worst?

Group the flights by carrier and compute the mean dep_delay, sorted from highest to lowest.


In [30]:
# Mean departure delay per carrier, largest mean delay first.
# Series.order() has been removed from pandas; sort_values() is its replacement.
df.groupby('carrier')['dep_delay'].mean().sort_values(ascending=False)


Out[30]:
carrier
F9         20.215543
EV         19.955390
YV         18.996330
FL         18.726075
WN         17.711744
9E         16.725769
B6         13.022522
VX         12.869421
OO         12.586207
UA         12.106073
MQ         10.552041
DL          9.264505
AA          8.586016
AS          5.804775
HA          4.900585
US          3.782418
Name: dep_delay, dtype: float64

In [22]:
fig, ax = plt.subplots(figsize=(16, 6))
# dep_time is a clock reading encoded as HHMM (e.g. 517 goes with hour 5, minute 17),
# so dividing it by 60 does not yield hours. Build fractional hours from the `hour`
# and `minute` columns instead, dropping flights with no recorded departure.
# The Series is named so the x-axis can be labelled from it.
dep_hour = (df.hour + df.minute / 60.0).dropna().rename('Departure time (hour of day)')
# `fill=` is the current spelling of the deprecated `shade=` argument.
sns.kdeplot(dep_hour, fill=True, ax=ax)


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x109c74ed0>

In [39]:
# Scatter of dep_delay against dep_time, one point per row of the table.
df.plot.scatter(x='dep_time', y='dep_delay')


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x109409990>

In [20]:
# Summary statistics for the numeric columns, with the quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])


Out[20]:
dep_time dep_delay arr_time arr_delay flight air_time distance hour minute
count 328521.000000 328521.000000 328063.000000 327346.000000 336776.000000 327346.000000 336776.000000 328521.000000 328521.000000
mean 1349.109947 12.639070 1502.054999 6.895377 1971.923620 150.686460 1039.912604 13.173544 31.755501
std 488.281791 40.210061 533.264132 44.633292 1632.471938 93.688305 733.233033 4.894426 18.230997
min 1.000000 -43.000000 1.000000 -86.000000 1.000000 20.000000 17.000000 0.000000 0.000000
25% 907.000000 -5.000000 1104.000000 -17.000000 553.000000 82.000000 502.000000 9.000000 16.000000
50% 1401.000000 -2.000000 1535.000000 -5.000000 1496.000000 129.000000 872.000000 14.000000 31.000000
75% 1744.000000 11.000000 1940.000000 14.000000 3465.000000 192.000000 1389.000000 17.000000 49.000000
max 2400.000000 1301.000000 2400.000000 1272.000000 8500.000000 695.000000 4983.000000 24.000000 59.000000

In [9]:
# Mean departure delay per carrier, largest mean delay first
# (this cell repeats the per-carrier ranking computed earlier in the notebook).
# Series.order() has been removed from pandas; sort_values() is its replacement.
df.groupby('carrier')['dep_delay'].mean().sort_values(ascending=False)


Out[9]:
carrier
F9         20.215543
EV         19.955390
YV         18.996330
FL         18.726075
WN         17.711744
9E         16.725769
B6         13.022522
VX         12.869421
OO         12.586207
UA         12.106073
MQ         10.552041
DL          9.264505
AA          8.586016
AS          5.804775
HA          4.900585
US          3.782418
Name: dep_delay, dtype: float64

In [15]:
# Number of flights on each date, drawn as a line over time.
count_ts = df.groupby(by='date')['flight'].count()
count_ts.plot(kind='line')


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x108f44210>

In [19]:
# 7-observation rolling mean of the daily flight counts, to smooth day-to-day variation.
# pd.rolling_mean() has been removed from pandas; Series.rolling(...).mean() replaces it.
count_ts.rolling(window=7).mean().plot()


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ce3fb10>

In [25]:
# Numeric columns compared pairwise in the grid plot that follows.
datacols = [
    'dep_time', 'dep_delay',
    'arr_time', 'arr_delay',
    'air_time', 'distance',
]

In [28]:
# Pairwise relationships between the numeric columns, coloured by carrier.
# `height` is the current name of PairGrid's per-facet size argument (formerly `size`).
gr = sns.PairGrid(df, hue='carrier', vars=datacols, height=6)
# Density of each variable on the diagonal.
gr.map_diag(sns.kdeplot)
# The upper and lower triangles use the same scatter, so map both in one call.
gr.map_offdiag(plt.scatter)



In [8]:
# Show the first five rows of the flights table again.
df.head(n=5)


Out[8]:
year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight origin dest air_time distance hour minute
1 2013 1 1 517 2 830 11 UA N14228 1545 EWR IAH 227 1400 5 17
2 2013 1 1 533 4 850 20 UA N24211 1714 LGA IAH 227 1416 5 33
3 2013 1 1 542 2 923 33 AA N619AA 1141 JFK MIA 160 1089 5 42
4 2013 1 1 544 -1 1004 -18 B6 N804JB 725 JFK BQN 183 1576 5 44
5 2013 1 1 554 -6 812 -25 DL N668DN 461 LGA ATL 116 762 5 54

In [13]:
# NOTE(review): `delay` is only assigned in a later cell of the visible notebook
# (`delay = planes.agg(...)`), and the saved output below shows a (flight, tailnum)
# index that the later definition would not produce. This cell presumably relies on
# kernel state left by a cell that no longer exists; on a fresh top-to-bottom run it
# is expected to raise NameError. TODO: confirm, then move or remove this cell.
delay.head()


Out[13]:
flight distance dep_delay
flight tailnum
1 N15710 1 719 1
N209WN 1 725 9
N216JB 1 1069 -1
N232WN 1 725 4
N234WN 1 725 45

In [27]:
# Per-aircraft summary: number of flights, mean distance and mean departure delay.
planes = df.groupby('tailnum')
delay = planes.agg({'flight': 'count', 'distance': 'mean', 'dep_delay': 'mean'})
# Keep aircraft with more than 20 flights whose mean distance is below 2000.
delay = delay[(delay['flight'] > 20) & (delay['distance'] < 2000)]
delay.head()


Out[27]:
flight distance dep_delay
tailnum
N0EGMQ 371 676.188679 8.491525
N10156 153 757.947712 17.815068
N102UW 48 535.875000 8.000000
N103US 46 535.195652 -3.195652
N104UW 47 535.255319 9.936170

In [45]:
fig, ax = plt.subplots(figsize=(10, 10))
# Marker size grows with each plane's flight count. The sizes must come from `delay`
# (one row per plotted plane); `df.flight` holds one flight number per flight and has
# a different length, so it cannot be paired with these points.
ax = delay.plot(kind='scatter', x='distance', y='dep_delay',
                color='k', alpha=.5, s=(5 + delay.flight.values / 100),
                ax=ax)
# `lowess` is never imported under its own name in this notebook; reach it through
# the statsmodels namespace that is already imported as `sm`.
xy = sm.nonparametric.lowess(delay.dep_delay, delay.distance, frac=1/3)
# Column 0 holds the x values, column 1 the smoothed y values.
ax.plot(xy[:, 0], xy[:, 1], linewidth=4)


Out[45]:
[<matplotlib.lines.Line2D at 0x10c7dd128>]

In [46]:
# NOTE(review): `IPython.html.widgets` is a legacy import path; newer installations
# are expected to provide `interact` from the separate `ipywidgets` package instead.
# TODO: confirm against the target environment and move this import to the top cell.
from IPython.html.widgets import interact

In [49]:
def f(frac=1/3):
    """Plot per-plane mean departure delay against mean distance with a LOWESS trend.

    Reads the notebook-level ``delay`` frame (one row per plane, with ``flight``,
    ``distance`` and ``dep_delay`` columns) and draws a fresh 10x10 figure.

    Parameters
    ----------
    frac : float, optional
        Smoothing fraction passed straight to ``lowess``. Defaults to 1/3.

    Returns
    -------
    The matplotlib axes holding the scatter and the smoothed line.
    """
    _, ax = plt.subplots(figsize=(10, 10))
    # Marker size grows with each plane's flight count. The sizes must come from
    # `delay` (one row per plotted plane); `df.flight` holds one flight number per
    # flight and has a different length, so it cannot be paired with these points.
    ax = delay.plot(kind='scatter', x='distance', y='dep_delay',
                    color='k', alpha=.5, s=(5 + delay.flight.values / 100),
                    ax=ax)
    # `lowess` is never imported under its own name in this notebook; reach it
    # through the statsmodels namespace that is already imported as `sm`.
    xy = sm.nonparametric.lowess(delay.dep_delay, delay.distance, frac=frac)
    # Column 0 holds the x values, column 1 the smoothed y values.
    ax.plot(xy[:, 0], xy[:, 1], linewidth=4)
    return ax

In [50]:
# Pass `f` to `interact` so the plot can be redrawn interactively; presumably a
# control for `frac` is generated from the function's default value — confirm.
interact(f)



In [63]:
# Use seaborn's "talk" plotting context for the remaining figures.
sns.set_context("talk")
# Figure size is not one of seaborn's context parameters, and current seaborn only
# applies context keys from the `rc` argument of set_context, so a figsize passed
# there has no effect. Set it on matplotlib directly.
plt.rcParams["figure.figsize"] = (12, 9)
# Example dataset with a three-level column header; the first column is the index.
networks = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)
# Pairwise correlations between the first 30 columns.
corrmap = networks.iloc[:, :30].corr()

In [71]:
# Builds the colormap passed to the heatmap cell that follows.
# NOTE(review): `choose_diverging_palette` is presumably an interactive picker, so
# the resulting colors would depend on widget state rather than on code alone —
# confirm, and consider a fixed palette if the output must be reproducible.
cmap = sns.choose_diverging_palette(as_cmap=True)



In [72]:
# Correlation heatmap with square cells separated by thin lines; the trailing
# semicolon keeps the returned object from being echoed as cell output.
sns.heatmap(data=corrmap, cmap=cmap, linewidths=1, square=True);



In [ ]: