In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime, timedelta

# We will explore statistics looking at these packages
import pandas as pd
import seaborn as sn
import statsmodels.api as sm

Datasets

We will look at two datasets. First the Goddard Institute for Space Studies surface temperature analysis dataset from NASA. There are many intersting products here; we will look at zonal mean temperature anomalies. The 'base' period that defines the mean is 1951 through 1980.


In [4]:
url = 'https://data.giss.nasa.gov/gistemp/tabledata_v3/ZonAnn.Ts+dSST.csv'
df_zonal = pd.read_csv(url, index_col=0, parse_dates=True)

# The `describe` method is a good way to get an idea of the statistical
# properties of a variable or dataset.
df_zonal['Glob'].describe()


Out[4]:
count    137.000000
mean       0.025985
std        0.324288
min       -0.470000
25%       -0.200000
50%       -0.070000
75%        0.190000
max        0.980000
Name: Glob, dtype: float64

In [3]:
df_zonal


Out[3]:
Glob NHem SHem 24N-90N 24S-24N 90S-24S 64N-90N 44N-64N 24N-44N EQU-24N 24S-EQU 44S-24S 64S-44S 90S-64S
Year
1880-01-01 -0.20 -0.33 -0.06 -0.41 -0.14 -0.06 -0.95 -0.62 -0.21 -0.22 -0.07 -0.02 -0.08 0.34
1881-01-01 -0.11 -0.19 -0.03 -0.30 0.00 -0.06 -0.67 -0.47 -0.13 -0.01 0.01 -0.06 -0.02 0.32
1882-01-01 -0.09 -0.17 -0.02 -0.23 -0.09 0.03 -1.37 -0.26 -0.01 -0.08 -0.09 0.03 0.08 0.37
1883-01-01 -0.20 -0.30 -0.09 -0.36 -0.21 -0.02 -0.34 -0.63 -0.18 -0.21 -0.20 -0.01 0.00 0.32
1884-01-01 -0.27 -0.41 -0.13 -0.56 -0.17 -0.12 -1.31 -0.62 -0.38 -0.18 -0.15 -0.14 -0.05 0.36
1885-01-01 -0.31 -0.41 -0.22 -0.59 -0.17 -0.20 -1.16 -0.71 -0.40 -0.11 -0.24 -0.27 -0.07 0.33
1886-01-01 -0.30 -0.39 -0.22 -0.48 -0.24 -0.20 -1.21 -0.46 -0.36 -0.23 -0.25 -0.17 -0.21 0.23
1887-01-01 -0.33 -0.36 -0.29 -0.46 -0.27 -0.27 -1.54 -0.56 -0.19 -0.21 -0.33 -0.24 -0.29 0.16
1888-01-01 -0.20 -0.22 -0.17 -0.42 0.08 -0.34 -1.40 -0.47 -0.20 0.09 0.06 -0.29 -0.38 0.11
1889-01-01 -0.11 -0.15 -0.07 -0.26 0.05 -0.17 -0.84 -0.18 -0.19 0.01 0.09 -0.15 -0.17 0.14
1890-01-01 -0.36 -0.39 -0.34 -0.45 -0.37 -0.26 -1.30 -0.46 -0.28 -0.31 -0.44 -0.16 -0.40 -0.01
1891-01-01 -0.24 -0.24 -0.25 -0.37 -0.15 -0.24 -1.30 -0.25 -0.25 -0.04 -0.26 -0.20 -0.27 0.04
1892-01-01 -0.27 -0.31 -0.22 -0.32 -0.30 -0.16 -1.29 -0.35 -0.11 -0.30 -0.30 -0.03 -0.33 0.00
1893-01-01 -0.30 -0.40 -0.20 -0.38 -0.40 -0.07 -0.84 -0.39 -0.28 -0.42 -0.38 0.04 -0.21 -0.03
1894-01-01 -0.30 -0.36 -0.24 -0.27 -0.38 -0.23 -1.28 -0.20 -0.07 -0.51 -0.25 -0.15 -0.31 -0.16
1895-01-01 -0.21 -0.26 -0.15 -0.33 -0.13 -0.19 -0.89 -0.38 -0.15 -0.16 -0.09 -0.20 -0.13 -0.06
1896-01-01 -0.14 -0.20 -0.08 -0.33 0.03 -0.19 -1.22 -0.36 -0.08 0.01 0.06 -0.15 -0.21 -0.01
1897-01-01 -0.10 -0.11 -0.09 -0.29 0.17 -0.28 -0.76 -0.32 -0.15 0.17 0.17 -0.20 -0.37 0.00
1898-01-01 -0.28 -0.26 -0.30 -0.27 -0.27 -0.29 -1.22 -0.03 -0.18 -0.24 -0.30 -0.26 -0.30 0.03
1899-01-01 -0.15 -0.16 -0.15 -0.17 -0.12 -0.18 -1.05 0.03 -0.09 -0.12 -0.12 -0.13 -0.22 0.08
1900-01-01 -0.08 -0.03 -0.13 -0.12 0.14 -0.35 -0.57 -0.05 -0.04 0.12 0.17 -0.31 -0.38 0.07
1901-01-01 -0.14 -0.07 -0.21 -0.11 -0.05 -0.30 -0.53 -0.02 -0.05 -0.01 -0.08 -0.30 -0.26 -0.04
1902-01-01 -0.28 -0.31 -0.24 -0.51 -0.06 -0.34 -1.61 -0.42 -0.27 -0.01 -0.10 -0.27 -0.42 0.01
1903-01-01 -0.36 -0.35 -0.37 -0.42 -0.31 -0.35 -0.51 -0.31 -0.46 -0.24 -0.39 -0.37 -0.25 -0.59
1904-01-01 -0.44 -0.45 -0.44 -0.47 -0.45 -0.40 -0.35 -0.55 -0.46 -0.41 -0.50 -0.37 -0.38 -1.26
1905-01-01 -0.28 -0.28 -0.27 -0.38 -0.13 -0.38 -0.22 -0.18 -0.54 -0.13 -0.13 -0.38 -0.34 -0.37
1906-01-01 -0.22 -0.20 -0.25 -0.21 -0.21 -0.25 -0.40 0.01 -0.30 -0.19 -0.24 -0.24 -0.22 -0.66
1907-01-01 -0.40 -0.46 -0.33 -0.57 -0.34 -0.30 -0.80 -0.72 -0.42 -0.29 -0.38 -0.25 -0.31 -1.18
1908-01-01 -0.43 -0.46 -0.41 -0.47 -0.47 -0.34 -0.49 -0.53 -0.43 -0.44 -0.51 -0.31 -0.38 0.65
1909-01-01 -0.47 -0.47 -0.48 -0.48 -0.51 -0.41 -0.84 -0.52 -0.36 -0.45 -0.57 -0.38 -0.41 -0.49
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1987-01-01 0.33 0.26 0.40 0.07 0.58 0.25 -0.28 0.13 0.14 0.55 0.61 0.33 0.21 0.09
1988-01-01 0.40 0.37 0.43 0.41 0.37 0.44 0.81 0.50 0.23 0.31 0.42 0.39 0.18 1.14
1989-01-01 0.28 0.29 0.28 0.43 0.14 0.33 0.44 0.67 0.27 0.09 0.20 0.37 0.25 0.34
1990-01-01 0.44 0.51 0.36 0.62 0.36 0.36 0.64 0.84 0.48 0.35 0.37 0.38 0.33 0.35
1991-01-01 0.42 0.41 0.43 0.47 0.38 0.44 0.79 0.60 0.28 0.33 0.42 0.34 0.34 1.02
1992-01-01 0.23 0.13 0.32 0.08 0.29 0.28 -0.14 0.38 -0.02 0.19 0.39 0.22 0.34 0.41
1993-01-01 0.24 0.20 0.28 0.14 0.32 0.23 0.67 0.24 -0.08 0.28 0.35 0.26 0.35 -0.15
1994-01-01 0.31 0.37 0.26 0.44 0.30 0.21 0.39 0.47 0.43 0.26 0.35 0.28 0.25 -0.13
1995-01-01 0.45 0.58 0.33 0.70 0.43 0.23 1.38 0.95 0.33 0.40 0.46 0.33 0.16 0.06
1996-01-01 0.35 0.29 0.40 0.27 0.32 0.45 0.86 0.20 0.13 0.31 0.34 0.35 0.31 1.09
1997-01-01 0.48 0.54 0.42 0.56 0.51 0.35 0.81 0.88 0.27 0.52 0.51 0.44 0.38 -0.01
1998-01-01 0.63 0.74 0.53 0.82 0.70 0.36 0.99 0.93 0.69 0.63 0.78 0.44 0.32 0.17
1999-01-01 0.41 0.52 0.31 0.74 0.24 0.32 0.47 0.83 0.78 0.19 0.29 0.50 0.16 0.02
2000-01-01 0.42 0.52 0.32 0.72 0.26 0.33 1.12 0.77 0.57 0.23 0.29 0.47 0.10 0.34
2001-01-01 0.54 0.65 0.43 0.81 0.42 0.43 1.06 0.82 0.72 0.41 0.43 0.58 0.20 0.43
2002-01-01 0.63 0.72 0.54 0.84 0.58 0.48 1.36 0.97 0.59 0.55 0.62 0.49 0.29 0.85
2003-01-01 0.62 0.74 0.50 0.83 0.61 0.41 1.55 0.95 0.53 0.60 0.63 0.47 0.23 0.53
2004-01-01 0.54 0.68 0.41 0.76 0.55 0.32 0.62 0.93 0.69 0.57 0.53 0.50 0.21 -0.07
2005-01-01 0.69 0.84 0.53 1.00 0.63 0.46 2.01 1.17 0.57 0.62 0.63 0.52 0.21 0.79
2006-01-01 0.63 0.81 0.46 0.97 0.55 0.40 1.70 1.06 0.68 0.56 0.55 0.54 0.19 0.33
2007-01-01 0.66 0.84 0.47 1.10 0.46 0.48 1.95 1.30 0.70 0.46 0.47 0.54 0.06 1.14
2008-01-01 0.54 0.67 0.40 0.89 0.38 0.39 1.44 1.03 0.62 0.34 0.41 0.56 0.10 0.44
2009-01-01 0.64 0.72 0.58 0.75 0.67 0.50 1.25 0.59 0.69 0.67 0.67 0.61 0.18 0.81
2010-01-01 0.71 0.89 0.54 1.00 0.68 0.48 1.99 0.87 0.75 0.73 0.62 0.66 0.23 0.35
2011-01-01 0.60 0.72 0.48 0.94 0.36 0.58 2.11 0.91 0.59 0.39 0.33 0.68 0.26 0.92
2012-01-01 0.63 0.78 0.48 0.99 0.51 0.45 1.89 0.91 0.74 0.48 0.53 0.60 0.25 0.32
2013-01-01 0.65 0.76 0.54 0.89 0.56 0.54 1.18 1.04 0.70 0.58 0.55 0.64 0.30 0.68
2014-01-01 0.74 0.92 0.57 1.06 0.66 0.55 1.80 1.14 0.77 0.71 0.61 0.75 0.23 0.49
2015-01-01 0.86 1.13 0.60 1.26 0.91 0.41 1.64 1.46 1.02 0.94 0.87 0.75 0.19 -0.31
2016-01-01 0.98 1.25 0.71 1.48 0.96 0.51 2.85 1.41 1.08 0.93 0.99 0.70 0.28 0.34

137 rows × 14 columns


In [ ]:


In [5]:
df_zonal.describe()


Out[5]:
Glob NHem SHem 24N-90N 24S-24N 90S-24S 64N-90N 44N-64N 24N-44N EQU-24N 24S-EQU 44S-24S 64S-44S 90S-64S
count 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000 137.000000
mean 0.025985 0.052701 0.000073 0.069708 0.031971 -0.025182 0.075985 0.102628 0.033869 0.029854 0.034672 0.018029 -0.070803 -0.117518
std 0.324288 0.374762 0.295171 0.443739 0.319204 0.291414 0.881053 0.504323 0.332215 0.313287 0.333107 0.310445 0.260751 0.707656
min -0.470000 -0.500000 -0.480000 -0.590000 -0.590000 -0.490000 -1.610000 -0.730000 -0.540000 -0.650000 -0.570000 -0.430000 -0.620000 -2.550000
25% -0.200000 -0.210000 -0.220000 -0.260000 -0.210000 -0.260000 -0.530000 -0.270000 -0.180000 -0.210000 -0.220000 -0.230000 -0.270000 -0.480000
50% -0.070000 -0.010000 -0.080000 0.010000 -0.030000 -0.110000 0.010000 0.010000 -0.050000 -0.010000 -0.030000 -0.090000 -0.090000 0.000000
75% 0.190000 0.170000 0.250000 0.230000 0.240000 0.230000 0.610000 0.360000 0.140000 0.210000 0.290000 0.260000 0.160000 0.340000
max 0.980000 1.250000 0.710000 1.480000 0.960000 0.580000 2.850000 1.460000 1.080000 0.940000 0.990000 0.750000 0.390000 1.270000

The second dataset will be monthly mean temperature data for states published by NOAA. The README file for this dataset describes the mangled time, location, and category code in the first column. They also have a wide range of data available.


In [7]:
# Region IDs according to the README file. A new first value 'null' has been
# added so that the index of the list will corresond to the code (i.e., 'Alabama' is 001)
# We will only deal with the contiguous 48, so other regions are ignored.
Region_ID = ['null', 'Alabama', 'Arizona', 'Arkansas', 'California', 
             'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Idaho', 
             'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 
             'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 
             'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 
             'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 
             'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 
             'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 
             'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

In [32]:
import urllib.request # use this package to read html files as text

url = 'https://www1.ncdc.noaa.gov/pub/data/cirs/climdiv/climdiv-tmpcst-v1.0.0-20170404'
with urllib.request.urlopen(url) as f:
   html = f.read()  # the entire file as a single string

lines = html.split(b'\n')  # split into lines. The 'b' is needed since the 'string' is actually
                           # defined as a 'bytes' object, that could contain special characters

In [33]:
times = {state:[] for state in Region_ID}
monthly_temp = {state:[] for state in Region_ID}

for line in lines:
    data = line.split()
    if not data: 
        continue
    
    # First parse the first element into state, division, element, and year according 
    # to the README file
    state_code = int(data[0][:3])
    if state_code > 48:  
        continue # ignore regions outside the contiguous 48.
    state = Region_ID[state_code]
    division = int(data[0][3])  # Zero for area-averaged element. We won't use this
    element = int(data[0][4:6]) # Should all be 02 (average temperature) for this dataset
    year = int(data[0][6:])
    
    monthly_temp[state] += [float(temp) for temp in data[1:]]
    times[state] += [datetime(year, month, 15) for month in range(1, 13)]

dfs = [pd.DataFrame(monthly_temp[state], index=times[state], columns=[state])
       for state in Region_ID]

df_states = pd.concat(dfs[1:], axis=1) # concatinate and remove the 'null' state
df_states[df_states == -99.9] = np.nan

# Remove 1951 - 1980 mean to get anomalies similar to GISS data.
df_states -= df_states['1951':'1980'].mean()

In [34]:
df_states


Out[34]:
Alabama Arizona Arkansas California Colorado Connecticut Delaware Florida Georgia Idaho ... South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia Wisconsin Wyoming
1895-01-15 -19.417222 -18.876111 -23.9925 -16.913333 -23.001944 -25.381111 -23.037222 -12.364722 -18.031389 -22.516944 ... -37.505556 -23.211944 -19.858889 -24.768611 -26.4425 -23.437222 -19.864167 -24.651944 -34.793889 -26.373333
1895-02-15 -25.117222 -15.776111 -27.0925 -11.413333 -25.401944 -29.481111 -29.937222 -19.264722 -24.431389 -18.916944 ... -32.305556 -27.911944 -25.958889 -22.768611 -29.8425 -29.037222 -11.364167 -29.651944 -33.193889 -26.273333
1895-03-15 -8.017222 -9.176111 -8.8925 -9.613333 -12.901944 -16.681111 -14.637222 -6.364722 -7.931389 -12.016944 ... -15.405556 -10.211944 -8.058889 -12.868611 -19.1425 -11.937222 -8.364167 -12.251944 -16.393889 -15.473333
1895-04-15 0.882778 -0.976111 2.1075 -3.713333 -0.101944 -3.681111 -3.037222 -2.064722 -0.331389 0.883056 ... 7.194444 1.088056 0.341111 -0.568611 -1.9425 -1.337222 -0.464167 0.348056 4.206111 1.226667
1895-05-15 6.982778 6.523889 8.2075 3.886667 6.998056 9.618889 6.562778 4.835278 6.468611 6.083056 ... 11.094444 7.488056 5.241111 6.931389 14.1575 6.962778 4.335833 8.848056 14.106111 6.026667
1895-06-15 14.982778 12.323889 16.6075 11.186667 13.898056 19.318889 18.062778 9.435278 15.068611 11.983056 ... 17.694444 18.288056 13.541111 13.731389 23.4575 18.062778 11.735833 19.348056 24.106111 12.426667
1895-07-15 16.682778 19.023889 18.6075 14.986667 18.698056 18.218889 17.762778 10.835278 15.868611 20.283056 ... 25.394444 18.088056 15.841111 21.231389 21.5575 17.662778 15.935833 18.048056 24.606111 20.426667
1895-08-15 16.982778 18.523889 18.7075 14.886667 20.298056 20.518889 21.462778 11.035278 16.468611 21.183056 ... 24.594444 18.788056 16.941111 21.831389 20.5575 19.962778 16.235833 20.848056 25.106111 21.526667
1895-09-15 15.282778 12.223889 16.3075 8.286667 14.198056 15.218889 16.362778 9.435278 13.968611 8.483056 ... 16.894444 16.388056 12.641111 12.031389 15.2575 16.862778 4.935833 17.348056 19.906111 11.726667
1895-10-15 -2.817222 1.923889 -4.3925 2.586667 0.498056 -3.181111 -3.137222 0.435278 -1.431389 3.183056 ... -0.905556 -4.711944 -2.758889 0.431389 -2.0425 -3.537222 1.535833 -4.251944 -2.293889 0.726667
1895-11-15 -9.317222 -13.276111 -12.4925 -9.213333 -15.401944 -6.981111 -7.937222 -6.364722 -8.831389 -11.516944 ... -17.305556 -9.311944 -14.458889 -15.968611 -7.8425 -8.537222 -10.564167 -7.351944 -13.193889 -15.873333
1895-12-15 -17.617222 -21.776111 -18.4925 -16.413333 -23.701944 -16.481111 -15.637222 -14.564722 -17.731389 -20.416944 ... -23.105556 -17.311944 -19.058889 -27.068611 -18.2425 -17.037222 -15.064167 -16.251944 -21.893889 -24.973333
1896-01-15 -19.017222 -14.676111 -19.8925 -11.913333 -17.201944 -27.081111 -22.737222 -15.864722 -19.031389 -15.616944 ... -27.505556 -19.811944 -19.258889 -18.568611 -27.9425 -21.137222 -15.864167 -20.651944 -26.693889 -19.673333
1896-02-15 -14.817222 -13.976111 -16.1925 -10.313333 -17.501944 -22.381111 -19.137222 -12.864722 -14.531389 -12.216944 ... -20.305556 -17.011944 -15.058889 -17.568611 -25.0425 -17.737222 -9.564167 -18.651944 -23.993889 -16.773333
1896-03-15 -10.017222 -7.976111 -12.3925 -7.713333 -13.001944 -19.381111 -17.037222 -7.864722 -10.131389 -11.516944 ... -22.605556 -12.711944 -10.658889 -10.768611 -20.6425 -14.437222 -9.664167 -16.251944 -18.893889 -16.473333
1896-04-15 5.582778 -4.076111 7.3075 -8.413333 -1.001944 -0.381111 0.162778 0.335278 4.668611 -4.516944 ... 0.294444 6.588056 2.841111 -4.268611 0.4575 3.162778 -4.164167 6.348056 4.306111 -3.773333
1896-05-15 13.382778 6.523889 14.1075 1.486667 8.498056 12.018889 12.462778 6.735278 13.168611 1.183056 ... 14.494444 14.788056 12.941111 3.931389 14.4575 14.562778 2.335833 16.548056 19.406111 5.926667
1896-06-15 14.882778 18.623889 17.2075 13.486667 19.198056 15.118889 15.062778 10.035278 15.068611 16.683056 ... 22.194444 15.888056 17.541111 20.031389 18.5575 16.062778 11.635833 17.648056 23.906111 19.226667
1896-07-15 18.682778 19.223889 23.2075 18.386667 22.398056 22.318889 21.562778 11.335278 17.668611 24.983056 ... 25.794444 20.388056 17.541111 23.031389 25.3575 20.762778 20.435833 21.248056 26.506111 23.326667
1896-08-15 19.682778 18.023889 22.1075 15.586667 21.298056 20.918889 20.362778 12.335278 18.368611 21.083056 ... 24.894444 20.488056 18.441111 20.831389 22.2575 19.762778 16.835833 20.548056 25.406111 21.426667
1896-09-15 13.382778 12.223889 12.8075 8.986667 11.898056 11.518889 12.062778 9.135278 13.168611 10.583056 ... 11.094444 12.388056 10.841111 11.431389 13.1575 12.262778 8.035833 13.048056 12.006111 9.926667
1896-10-15 0.682778 1.523889 -0.4925 2.586667 0.098056 -1.381111 -1.337222 1.135278 0.568611 0.983056 ... -1.405556 -0.511944 -1.058889 0.631389 0.4575 -1.137222 0.735833 -1.451944 -0.193889 0.426667
1896-11-15 -5.217222 -10.976111 -8.4925 -9.713333 -15.601944 -5.181111 -3.937222 -1.264722 -4.231389 -15.716944 ... -30.005556 -6.411944 -10.658889 -15.468611 -5.7425 -4.037222 -16.564167 -3.751944 -17.193889 -21.373333
1896-12-15 -16.117222 -15.976111 -15.4925 -11.813333 -15.801944 -22.181111 -20.537222 -13.564722 -16.731389 -12.816944 ... -19.405556 -17.411944 -15.958889 -17.668611 -23.8425 -18.837222 -12.064167 -17.451944 -20.393889 -13.473333
1897-01-15 -20.717222 -18.076111 -22.3925 -15.413333 -23.101944 -23.681111 -23.837222 -16.164722 -20.031389 -20.416944 ... -33.205556 -22.911944 -23.658889 -22.668611 -25.5425 -22.937222 -18.064167 -24.151944 -29.393889 -24.273333
1897-02-15 -11.417222 -17.376111 -14.2925 -15.213333 -19.901944 -21.881111 -19.337222 -7.564722 -11.631389 -17.216944 ... -29.305556 -13.711944 -13.158889 -21.168611 -23.8425 -16.237222 -13.564167 -16.051944 -22.893889 -21.373333
1897-03-15 -2.317222 -14.176111 -4.9925 -14.313333 -14.401944 -13.181111 -10.337222 -0.864722 -4.031389 -17.416944 ... -23.305556 -4.711944 -5.358889 -17.768611 -15.2425 -7.437222 -14.364167 -6.551944 -17.493889 -19.173333
1897-04-15 -0.117222 -1.876111 0.4075 -1.413333 -2.301944 -1.681111 -2.737222 -0.964722 -0.031389 -0.416944 ... -0.505556 0.288056 0.041111 -2.268611 -0.5425 -0.837222 1.235833 -0.451944 0.306111 -3.173333
1897-05-15 6.482778 8.823889 7.6075 6.386667 10.898056 8.418889 6.562778 3.335278 6.668611 13.583056 ... 12.894444 5.888056 7.041111 11.731389 10.8575 6.862778 9.835833 6.648056 10.706111 12.226667
1897-06-15 18.682778 13.423889 17.6075 10.586667 16.798056 13.418889 13.262778 11.935278 18.268611 13.983056 ... 20.094444 17.888056 15.241111 15.331389 16.2575 16.762778 11.035833 17.148056 19.906111 17.226667
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2015-07-15 19.082778 19.323889 21.8075 17.586667 21.998056 24.018889 22.762778 12.835278 18.568611 22.583056 ... 27.694444 21.088056 18.741111 22.531389 24.7575 20.762778 22.535833 20.948056 26.306111 23.126667
2015-08-15 16.482778 21.323889 18.0075 19.086667 22.298056 23.818889 20.962778 12.435278 16.568611 23.283056 ... 25.494444 16.988056 18.841111 23.431389 25.1575 18.662778 20.535833 18.748056 24.006111 23.526667
2015-09-15 11.882778 15.723889 14.7075 14.786667 17.898056 19.618889 17.862778 10.035278 10.968611 14.683056 ... 21.494444 13.688056 14.841111 17.931389 20.9575 14.762778 10.135833 15.948056 22.606111 18.626667
2015-10-15 2.482778 5.223889 3.4075 7.586667 6.798056 3.218889 2.362778 4.435278 1.868611 8.183056 ... 7.194444 2.488056 4.741111 6.831389 2.8575 1.762778 6.335833 2.848056 6.406111 7.926667
2015-11-15 -3.317222 -10.976111 -5.8925 -9.813333 -10.001944 -2.081111 -2.537222 2.235278 -3.231389 -12.616944 ... -7.905556 -3.711944 -7.158889 -13.068611 -2.2425 -3.437222 -10.064167 -2.851944 -2.693889 -11.573333
2015-12-15 -5.617222 -18.376111 -10.0925 -15.113333 -18.801944 -4.681111 -3.337222 -0.364722 -4.031389 -18.216944 ... -20.105556 -5.911944 -12.958889 -20.968611 -6.3425 -4.937222 -13.964167 -4.851944 -10.293889 -19.473333
2016-01-15 -19.517222 -17.376111 -20.6925 -12.813333 -19.001944 -18.681111 -19.837222 -12.964722 -18.431389 -16.716944 ... -24.605556 -22.411944 -18.058889 -20.868611 -20.5425 -21.437222 -13.064167 -23.551944 -25.893889 -18.073333
2016-02-15 -12.817222 -8.776111 -13.1925 -5.613333 -12.301944 -16.381111 -15.737222 -10.564722 -12.931389 -10.616944 ... -13.205556 -14.811944 -9.958889 -13.768611 -17.5425 -16.137222 -6.864167 -16.251944 -19.393889 -10.773333
2016-03-15 -2.917222 -4.376111 -4.5925 -4.813333 -6.601944 -5.181111 -4.437222 -1.164722 -1.931389 -5.516944 ... -4.305556 -3.211944 -3.158889 -5.468611 -7.9425 -2.737222 -4.264167 -2.551944 -5.293889 -6.273333
2016-04-15 1.082778 -0.776111 2.4075 0.586667 -0.901944 -1.581111 -1.037222 1.135278 1.068611 4.483056 ... 1.894444 2.488056 1.541111 0.231389 -3.2425 0.662778 5.535833 1.748056 0.806111 1.026667
2016-05-15 7.682778 5.423889 7.0075 4.786667 6.198056 9.918889 7.362778 6.135278 7.868611 8.183056 ... 12.294444 8.088056 7.141111 6.831389 12.5575 7.362778 9.135833 8.348056 14.006111 7.026667
2016-06-15 16.682778 21.023889 18.8075 16.086667 21.798056 19.018889 17.962778 11.935278 16.868611 18.683056 ... 26.194444 19.188056 15.641111 23.231389 20.5575 17.362778 13.835833 18.348056 23.806111 23.026667
2016-07-15 19.282778 23.123889 21.9075 19.386667 24.598056 25.918889 25.262778 14.035278 19.868611 22.383056 ... 28.494444 22.588056 20.941111 26.131389 25.8575 22.862778 17.635833 22.548056 27.806111 25.926667
2016-08-15 18.982778 18.323889 19.8075 18.786667 19.898056 25.518889 24.362778 13.135278 18.668611 22.283056 ... 26.494444 21.688056 17.141111 22.631389 26.0575 22.562778 19.835833 23.948056 27.406111 22.626667
2016-09-15 15.982778 12.523889 15.6075 12.086667 15.098056 17.918889 17.762778 11.235278 14.668611 12.183056 ... 17.894444 16.888056 13.141111 13.531389 18.4575 17.462778 10.435833 18.248056 20.506111 14.526667
2016-10-15 5.882778 6.823889 6.8075 2.986667 7.198056 5.218889 5.962778 4.835278 4.968611 3.283056 ... 7.194444 7.388056 6.441111 5.631389 6.3575 5.262778 1.335833 6.248056 8.306111 5.626667
2016-11-15 -4.817222 -6.076111 -4.4925 -4.613333 -4.401944 -4.681111 -5.337222 -3.264722 -5.031389 -3.716944 ... -2.405556 -4.511944 -3.358889 -5.868611 -4.6425 -6.137222 -3.064167 -5.551944 -0.493889 -3.773333
2016-12-15 -12.517222 -14.476111 -17.5925 -13.913333 -19.301944 -15.681111 -15.237222 -3.964722 -10.731389 -23.316944 ... -29.005556 -16.311944 -14.858889 -20.168611 -17.5425 -15.437222 -19.864167 -15.551944 -22.193889 -24.073333
2017-01-15 -10.217222 -16.776111 -14.8925 -15.213333 -19.201944 -15.481111 -15.037222 -7.264722 -9.931389 -23.816944 ... -27.905556 -12.211944 -13.658889 -21.668611 -16.7425 -14.537222 -21.264167 -13.551944 -22.593889 -24.273333
2017-02-15 -7.017222 -9.876111 -7.5925 -10.113333 -9.101944 -13.781111 -10.037222 -4.464722 -6.531389 -12.916944 ... -15.005556 -8.411944 -5.958889 -11.168611 -15.7425 -8.637222 -14.964167 -8.951944 -14.293889 -13.073333
2017-03-15 -4.017222 -2.476111 -4.3925 -4.013333 -2.101944 -15.081111 -10.837222 -3.964722 -5.231389 -4.016944 ... -8.905556 -6.111944 -0.758889 -2.368611 -17.7425 -8.637222 -6.864167 -8.951944 -11.793889 -2.473333
2017-04-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-05-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-06-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-07-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-08-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-09-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-11-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2017-12-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1476 rows × 48 columns


In [35]:
df_states['Texas']['1951':'1980'].plot()


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce60d0fc88>

In [36]:
df_states.resample('AS').mean()['Texas'].plot()


Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcec41c0780>

In [37]:
df = pd.concat([df_zonal, df_states.resample('AS').mean()['Texas']], axis=1).dropna()

In [15]:
df.describe()


Out[15]:
Glob NHem SHem 24N-90N 24S-24N 90S-24S 64N-90N 44N-64N 24N-44N EQU-24N 24S-EQU 44S-24S 64S-44S 90S-64S Texas
count 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000 122.000000
mean 0.058607 0.097131 0.020984 0.126311 0.058525 -0.008852 0.223033 0.169590 0.064590 0.055902 0.061885 0.035164 -0.057295 -0.152623 0.323010
std 0.327970 0.372418 0.304602 0.436466 0.324694 0.302849 0.813104 0.491238 0.337492 0.317742 0.339100 0.322947 0.268590 0.740347 1.056474
min -0.470000 -0.500000 -0.480000 -0.570000 -0.590000 -0.490000 -1.610000 -0.730000 -0.540000 -0.650000 -0.570000 -0.430000 -0.620000 -2.550000 -2.133889
25% -0.187500 -0.167500 -0.220000 -0.170000 -0.200000 -0.277500 -0.335000 -0.157500 -0.130000 -0.190000 -0.190000 -0.237500 -0.245000 -0.567500 -0.492222
50% -0.020000 0.035000 -0.065000 0.065000 0.020000 -0.095000 0.125000 0.065000 -0.020000 0.035000 0.010000 -0.080000 -0.075000 -0.010000 0.320278
75% 0.265000 0.250000 0.277500 0.317500 0.282500 0.250000 0.667500 0.387500 0.167500 0.252500 0.312500 0.290000 0.190000 0.340000 0.936944
max 0.980000 1.250000 0.710000 1.480000 0.960000 0.580000 2.850000 1.460000 1.080000 0.940000 0.990000 0.750000 0.390000 1.270000 3.324444

In [16]:
# We can quickly calculate a correlation matrix between all the columns
df.corr()


Out[16]:
Glob NHem SHem 24N-90N 24S-24N 90S-24S 64N-90N 44N-64N 24N-44N EQU-24N 24S-EQU 44S-24S 64S-44S 90S-64S Texas
Glob 1.000000 0.975517 0.963644 0.935436 0.947599 0.912153 0.824861 0.890983 0.917213 0.934765 0.939024 0.944705 0.782213 0.491374 0.366355
NHem 0.975517 1.000000 0.881598 0.980091 0.908790 0.815613 0.877802 0.934521 0.950878 0.915667 0.882181 0.875729 0.658797 0.402197 0.428655
SHem 0.963644 0.881598 1.000000 0.818097 0.935815 0.966161 0.704238 0.777761 0.815276 0.899446 0.950137 0.965006 0.876082 0.562960 0.262310
24N-90N 0.935436 0.980091 0.818097 1.000000 0.813556 0.779740 0.911440 0.959961 0.953353 0.817847 0.790858 0.845237 0.608663 0.377788 0.486877
24S-24N 0.947599 0.908790 0.935815 0.813556 1.000000 0.821486 0.693170 0.770717 0.818276 0.988717 0.989776 0.857811 0.720218 0.418141 0.235556
90S-24S 0.912153 0.815613 0.966161 0.779740 0.821486 1.000000 0.679553 0.736778 0.775128 0.784722 0.837899 0.972941 0.918933 0.635731 0.287451
64N-90N 0.824861 0.877802 0.704238 0.911440 0.693170 0.679553 1.000000 0.830752 0.790121 0.700286 0.668809 0.729404 0.526732 0.293047 0.447521
44N-64N 0.890983 0.934521 0.777761 0.959961 0.770717 0.736778 0.830752 1.000000 0.876222 0.766224 0.757346 0.799065 0.575175 0.368080 0.401001
24N-44N 0.917213 0.950878 0.815276 0.953353 0.818276 0.775128 0.790121 0.876222 1.000000 0.827706 0.791810 0.844759 0.609620 0.387811 0.518752
EQU-24N 0.934765 0.915667 0.899446 0.817847 0.988717 0.784722 0.700286 0.766224 0.827706 1.000000 0.957501 0.826890 0.676261 0.400921 0.257518
24S-EQU 0.939024 0.882181 0.950137 0.790858 0.989776 0.837899 0.668809 0.757346 0.791810 0.957501 1.000000 0.867680 0.745877 0.425460 0.207955
44S-24S 0.944705 0.875729 0.965006 0.845237 0.857811 0.972941 0.729404 0.799065 0.844759 0.826890 0.867680 1.000000 0.848053 0.540485 0.331289
64S-44S 0.782213 0.658797 0.876082 0.608663 0.720218 0.918933 0.526732 0.575175 0.609620 0.676261 0.745877 0.848053 1.000000 0.547706 0.161505
90S-64S 0.491374 0.402197 0.562960 0.377788 0.418141 0.635731 0.293047 0.368080 0.387811 0.400921 0.425460 0.540485 0.547706 1.000000 0.144904
Texas 0.366355 0.428655 0.262310 0.486877 0.235556 0.287451 0.447521 0.401001 0.518752 0.257518 0.207955 0.331289 0.161505 0.144904 1.000000

We are now ready to start doing some statistics. Fitting a model in statsmodels typically involves 3 easy steps:

  1. Use the model class to describe the model
  2. Fit the model using a class method
  3. Inspect the results using a summary method

In [17]:
# 1. Describe the model
mod = sm.OLS(df['Texas'], df.drop('Texas', axis=1))   # Ordinary Least Squares

# 2. Fit the model
res = mod.fit()

# 3. Summarize the model fit
res.summary()


Out[17]:
OLS Regression Results
Dep. Variable: Texas R-squared: 0.459
Model: OLS Adj. R-squared: 0.389
Method: Least Squares F-statistic: 6.545
Date: Tue, 11 Apr 2017 Prob (F-statistic): 2.10e-09
Time: 10:27:42 Log-Likelihood: -147.33
No. Observations: 122 AIC: 322.7
Df Residuals: 108 BIC: 361.9
Df Model: 14
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Glob 39.6494 26.754 1.482 0.141 -13.382 92.681
NHem 10.4157 23.999 0.434 0.665 -37.154 57.985
SHem 5.0687 19.720 0.257 0.798 -34.019 44.157
24N-90N 5.5436 15.300 0.362 0.718 -24.784 35.871
24S-24N 3.9676 21.881 0.181 0.856 -39.404 47.339
90S-24S -14.1363 13.191 -1.072 0.286 -40.284 12.011
64N-90N -3.2933 1.928 -1.708 0.091 -7.115 0.529
44N-64N -8.1399 4.102 -1.984 0.050 -16.271 -0.009
24N-44N -9.1017 6.488 -1.403 0.164 -21.961 3.758
EQU-24N -15.4135 14.561 -1.059 0.292 -44.275 13.448
24S-EQU -12.4807 13.499 -0.925 0.357 -39.237 14.276
44S-24S -0.3458 1.751 -0.197 0.844 -3.817 3.126
64S-44S -0.9190 0.998 -0.921 0.359 -2.897 1.059
90S-64S -0.0188 0.177 -0.106 0.916 -0.369 0.331
Omnibus: 1.777 Durbin-Watson: 1.803
Prob(Omnibus): 0.411 Jarque-Bera (JB): 1.487
Skew: 0.110 Prob(JB): 0.475
Kurtosis: 2.506 Cond. No. 529.

In [18]:
res.pvalues  # lower is better. Significant usually for p < 0.05


Out[18]:
Glob       0.141256
NHem       0.665145
SHem       0.797638
24N-90N    0.717821
24S-24N    0.856450
90S-24S    0.286272
64N-90N    0.090527
44N-64N    0.049765
24N-44N    0.163502
EQU-24N    0.292158
24S-EQU    0.357239
44S-24S    0.843875
64S-44S    0.359064
90S-64S    0.915527
dtype: float64

In [ ]:

Visualizing datasets

We can use seaborn to visulize joint dataset distributions


In [19]:
sn.jointplot(df['Texas'], df['24N-44N'])


Out[19]:
<seaborn.axisgrid.JointGrid at 0x7fcea82bce80>

In [23]:
# Seaborn also plays nice with pandas -- you can provide the dataframe as 'data'
# and then just reference the columns.
# Excercise -- Try kind as 'reg', 'kde', and 'hex'
sn.jointplot('Texas', '24N-44N', data=df, kind='reg')


/opt/anaconda3/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[23]:
<seaborn.axisgrid.JointGrid at 0x7fcea00a6978>

In [24]:
# We can quicly visualize a 'heatmap' of the correlation coefficients.
sn.heatmap(df.corr())


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcea0371f60>

In [38]:
fig = plt.figure(figsize=(12, 4))
# sn.boxplot(df)
sn.violinplot(df.drop('Texas', axis=1))


/opt/anaconda3/lib/python3.5/site-packages/seaborn/categorical.py:2342: UserWarning: The violinplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcec42b6198>

In [26]:
df[['Glob', 'NHem', 'SHem', 'Texas']].plot(figsize=(15, 5))


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce60fe9ac8>

In [27]:
import statsmodels.formula.api as smf

model = smf.ols(formula="Texas ~ NHem + SHem", data=df).fit()
model.summary()


Out[27]:
OLS Regression Results
Dep. Variable: Texas R-squared: 0.244
Model: OLS Adj. R-squared: 0.231
Method: Least Squares F-statistic: 19.17
Date: Tue, 11 Apr 2017 Prob (F-statistic): 6.05e-08
Time: 10:39:52 Log-Likelihood: -162.27
No. Observations: 122 AIC: 330.5
Df Residuals: 119 BIC: 339.0
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 0.1166 0.091 1.277 0.204 -0.064 0.297
NHem 2.5136 0.479 5.246 0.000 1.565 3.462
SHem -1.7995 0.586 -3.072 0.003 -2.959 -0.640
Omnibus: 6.713 Durbin-Watson: 1.856
Prob(Omnibus): 0.035 Jarque-Bera (JB): 3.605
Skew: 0.196 Prob(JB): 0.165
Kurtosis: 2.254 Cond. No. 8.82

In [29]:
df['year'] = df.index.year
model = smf.ols(formula="Glob ~ year", data=df).fit()
model.summary()


Out[29]:
OLS Regression Results
Dep. Variable: Glob R-squared: 0.787
Model: OLS Adj. R-squared: 0.785
Method: Least Squares F-statistic: 442.6
Date: Tue, 11 Apr 2017 Prob (F-statistic): 4.47e-42
Time: 10:42:00 Log-Likelihood: 57.654
No. Observations: 122 AIC: -111.3
Df Residuals: 120 BIC: -105.7
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -16.0278 0.765 -20.959 0.000 -17.542 -14.514
year 0.0082 0.000 21.039 0.000 0.007 0.009
Omnibus: 1.182 Durbin-Watson: 0.531
Prob(Omnibus): 0.554 Jarque-Bera (JB): 1.252
Skew: 0.223 Prob(JB): 0.535
Kurtosis: 2.784 Cond. No. 1.09e+05

In [30]:
sm.stats.anova_lm(model)


/opt/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
/opt/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
/opt/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
Out[30]:
df sum_sq mean_sq F PR(>F)
year 1.0 10.239312 10.239312 442.629368 4.467934e-42
Residual 120.0 2.775951 0.023133 NaN NaN

In [ ]:


In [ ]: