In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime, timedelta
# We will explore statistics looking at these packages
import pandas as pd
import seaborn as sn
import statsmodels.api as sm
We will look at two datasets. The first is the Goddard Institute for Space Studies (GISS) surface temperature analysis dataset from NASA. There are many interesting products here; we will look at zonal mean temperature anomalies. The 'base' period that defines the mean is 1951 through 1980.
In [4]:
# Annual zonal-mean temperature anomalies from GISTEMP. The first CSV column
# (the year) is parsed as dates and used as the index.
url = 'https://data.giss.nasa.gov/gistemp/tabledata_v3/ZonAnn.Ts+dSST.csv'
df_zonal = pd.read_csv(url, parse_dates=True, index_col=0)
# `describe` is a quick way to see the statistical properties (count, mean,
# spread, quartiles) of a single variable or a whole dataset; start with the
# global-mean column.
df_zonal.loc[:, 'Glob'].describe()
Out[4]:
count 137.000000
mean 0.025985
std 0.324288
min -0.470000
25% -0.200000
50% -0.070000
75% 0.190000
max 0.980000
Name: Glob, dtype: float64
In [3]:
# Display the zonal anomaly table: one row per year, one column per latitude band.
df_zonal
Out[3]:
Glob
NHem
SHem
24N-90N
24S-24N
90S-24S
64N-90N
44N-64N
24N-44N
EQU-24N
24S-EQU
44S-24S
64S-44S
90S-64S
Year
1880-01-01
-0.20
-0.33
-0.06
-0.41
-0.14
-0.06
-0.95
-0.62
-0.21
-0.22
-0.07
-0.02
-0.08
0.34
1881-01-01
-0.11
-0.19
-0.03
-0.30
0.00
-0.06
-0.67
-0.47
-0.13
-0.01
0.01
-0.06
-0.02
0.32
1882-01-01
-0.09
-0.17
-0.02
-0.23
-0.09
0.03
-1.37
-0.26
-0.01
-0.08
-0.09
0.03
0.08
0.37
1883-01-01
-0.20
-0.30
-0.09
-0.36
-0.21
-0.02
-0.34
-0.63
-0.18
-0.21
-0.20
-0.01
0.00
0.32
1884-01-01
-0.27
-0.41
-0.13
-0.56
-0.17
-0.12
-1.31
-0.62
-0.38
-0.18
-0.15
-0.14
-0.05
0.36
1885-01-01
-0.31
-0.41
-0.22
-0.59
-0.17
-0.20
-1.16
-0.71
-0.40
-0.11
-0.24
-0.27
-0.07
0.33
1886-01-01
-0.30
-0.39
-0.22
-0.48
-0.24
-0.20
-1.21
-0.46
-0.36
-0.23
-0.25
-0.17
-0.21
0.23
1887-01-01
-0.33
-0.36
-0.29
-0.46
-0.27
-0.27
-1.54
-0.56
-0.19
-0.21
-0.33
-0.24
-0.29
0.16
1888-01-01
-0.20
-0.22
-0.17
-0.42
0.08
-0.34
-1.40
-0.47
-0.20
0.09
0.06
-0.29
-0.38
0.11
1889-01-01
-0.11
-0.15
-0.07
-0.26
0.05
-0.17
-0.84
-0.18
-0.19
0.01
0.09
-0.15
-0.17
0.14
1890-01-01
-0.36
-0.39
-0.34
-0.45
-0.37
-0.26
-1.30
-0.46
-0.28
-0.31
-0.44
-0.16
-0.40
-0.01
1891-01-01
-0.24
-0.24
-0.25
-0.37
-0.15
-0.24
-1.30
-0.25
-0.25
-0.04
-0.26
-0.20
-0.27
0.04
1892-01-01
-0.27
-0.31
-0.22
-0.32
-0.30
-0.16
-1.29
-0.35
-0.11
-0.30
-0.30
-0.03
-0.33
0.00
1893-01-01
-0.30
-0.40
-0.20
-0.38
-0.40
-0.07
-0.84
-0.39
-0.28
-0.42
-0.38
0.04
-0.21
-0.03
1894-01-01
-0.30
-0.36
-0.24
-0.27
-0.38
-0.23
-1.28
-0.20
-0.07
-0.51
-0.25
-0.15
-0.31
-0.16
1895-01-01
-0.21
-0.26
-0.15
-0.33
-0.13
-0.19
-0.89
-0.38
-0.15
-0.16
-0.09
-0.20
-0.13
-0.06
1896-01-01
-0.14
-0.20
-0.08
-0.33
0.03
-0.19
-1.22
-0.36
-0.08
0.01
0.06
-0.15
-0.21
-0.01
1897-01-01
-0.10
-0.11
-0.09
-0.29
0.17
-0.28
-0.76
-0.32
-0.15
0.17
0.17
-0.20
-0.37
0.00
1898-01-01
-0.28
-0.26
-0.30
-0.27
-0.27
-0.29
-1.22
-0.03
-0.18
-0.24
-0.30
-0.26
-0.30
0.03
1899-01-01
-0.15
-0.16
-0.15
-0.17
-0.12
-0.18
-1.05
0.03
-0.09
-0.12
-0.12
-0.13
-0.22
0.08
1900-01-01
-0.08
-0.03
-0.13
-0.12
0.14
-0.35
-0.57
-0.05
-0.04
0.12
0.17
-0.31
-0.38
0.07
1901-01-01
-0.14
-0.07
-0.21
-0.11
-0.05
-0.30
-0.53
-0.02
-0.05
-0.01
-0.08
-0.30
-0.26
-0.04
1902-01-01
-0.28
-0.31
-0.24
-0.51
-0.06
-0.34
-1.61
-0.42
-0.27
-0.01
-0.10
-0.27
-0.42
0.01
1903-01-01
-0.36
-0.35
-0.37
-0.42
-0.31
-0.35
-0.51
-0.31
-0.46
-0.24
-0.39
-0.37
-0.25
-0.59
1904-01-01
-0.44
-0.45
-0.44
-0.47
-0.45
-0.40
-0.35
-0.55
-0.46
-0.41
-0.50
-0.37
-0.38
-1.26
1905-01-01
-0.28
-0.28
-0.27
-0.38
-0.13
-0.38
-0.22
-0.18
-0.54
-0.13
-0.13
-0.38
-0.34
-0.37
1906-01-01
-0.22
-0.20
-0.25
-0.21
-0.21
-0.25
-0.40
0.01
-0.30
-0.19
-0.24
-0.24
-0.22
-0.66
1907-01-01
-0.40
-0.46
-0.33
-0.57
-0.34
-0.30
-0.80
-0.72
-0.42
-0.29
-0.38
-0.25
-0.31
-1.18
1908-01-01
-0.43
-0.46
-0.41
-0.47
-0.47
-0.34
-0.49
-0.53
-0.43
-0.44
-0.51
-0.31
-0.38
0.65
1909-01-01
-0.47
-0.47
-0.48
-0.48
-0.51
-0.41
-0.84
-0.52
-0.36
-0.45
-0.57
-0.38
-0.41
-0.49
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1987-01-01
0.33
0.26
0.40
0.07
0.58
0.25
-0.28
0.13
0.14
0.55
0.61
0.33
0.21
0.09
1988-01-01
0.40
0.37
0.43
0.41
0.37
0.44
0.81
0.50
0.23
0.31
0.42
0.39
0.18
1.14
1989-01-01
0.28
0.29
0.28
0.43
0.14
0.33
0.44
0.67
0.27
0.09
0.20
0.37
0.25
0.34
1990-01-01
0.44
0.51
0.36
0.62
0.36
0.36
0.64
0.84
0.48
0.35
0.37
0.38
0.33
0.35
1991-01-01
0.42
0.41
0.43
0.47
0.38
0.44
0.79
0.60
0.28
0.33
0.42
0.34
0.34
1.02
1992-01-01
0.23
0.13
0.32
0.08
0.29
0.28
-0.14
0.38
-0.02
0.19
0.39
0.22
0.34
0.41
1993-01-01
0.24
0.20
0.28
0.14
0.32
0.23
0.67
0.24
-0.08
0.28
0.35
0.26
0.35
-0.15
1994-01-01
0.31
0.37
0.26
0.44
0.30
0.21
0.39
0.47
0.43
0.26
0.35
0.28
0.25
-0.13
1995-01-01
0.45
0.58
0.33
0.70
0.43
0.23
1.38
0.95
0.33
0.40
0.46
0.33
0.16
0.06
1996-01-01
0.35
0.29
0.40
0.27
0.32
0.45
0.86
0.20
0.13
0.31
0.34
0.35
0.31
1.09
1997-01-01
0.48
0.54
0.42
0.56
0.51
0.35
0.81
0.88
0.27
0.52
0.51
0.44
0.38
-0.01
1998-01-01
0.63
0.74
0.53
0.82
0.70
0.36
0.99
0.93
0.69
0.63
0.78
0.44
0.32
0.17
1999-01-01
0.41
0.52
0.31
0.74
0.24
0.32
0.47
0.83
0.78
0.19
0.29
0.50
0.16
0.02
2000-01-01
0.42
0.52
0.32
0.72
0.26
0.33
1.12
0.77
0.57
0.23
0.29
0.47
0.10
0.34
2001-01-01
0.54
0.65
0.43
0.81
0.42
0.43
1.06
0.82
0.72
0.41
0.43
0.58
0.20
0.43
2002-01-01
0.63
0.72
0.54
0.84
0.58
0.48
1.36
0.97
0.59
0.55
0.62
0.49
0.29
0.85
2003-01-01
0.62
0.74
0.50
0.83
0.61
0.41
1.55
0.95
0.53
0.60
0.63
0.47
0.23
0.53
2004-01-01
0.54
0.68
0.41
0.76
0.55
0.32
0.62
0.93
0.69
0.57
0.53
0.50
0.21
-0.07
2005-01-01
0.69
0.84
0.53
1.00
0.63
0.46
2.01
1.17
0.57
0.62
0.63
0.52
0.21
0.79
2006-01-01
0.63
0.81
0.46
0.97
0.55
0.40
1.70
1.06
0.68
0.56
0.55
0.54
0.19
0.33
2007-01-01
0.66
0.84
0.47
1.10
0.46
0.48
1.95
1.30
0.70
0.46
0.47
0.54
0.06
1.14
2008-01-01
0.54
0.67
0.40
0.89
0.38
0.39
1.44
1.03
0.62
0.34
0.41
0.56
0.10
0.44
2009-01-01
0.64
0.72
0.58
0.75
0.67
0.50
1.25
0.59
0.69
0.67
0.67
0.61
0.18
0.81
2010-01-01
0.71
0.89
0.54
1.00
0.68
0.48
1.99
0.87
0.75
0.73
0.62
0.66
0.23
0.35
2011-01-01
0.60
0.72
0.48
0.94
0.36
0.58
2.11
0.91
0.59
0.39
0.33
0.68
0.26
0.92
2012-01-01
0.63
0.78
0.48
0.99
0.51
0.45
1.89
0.91
0.74
0.48
0.53
0.60
0.25
0.32
2013-01-01
0.65
0.76
0.54
0.89
0.56
0.54
1.18
1.04
0.70
0.58
0.55
0.64
0.30
0.68
2014-01-01
0.74
0.92
0.57
1.06
0.66
0.55
1.80
1.14
0.77
0.71
0.61
0.75
0.23
0.49
2015-01-01
0.86
1.13
0.60
1.26
0.91
0.41
1.64
1.46
1.02
0.94
0.87
0.75
0.19
-0.31
2016-01-01
0.98
1.25
0.71
1.48
0.96
0.51
2.85
1.41
1.08
0.93
0.99
0.70
0.28
0.34
137 rows × 14 columns
In [ ]:
In [5]:
# Summary statistics for every column (latitude band) at once.
df_zonal.describe()
Out[5]:
Glob
NHem
SHem
24N-90N
24S-24N
90S-24S
64N-90N
44N-64N
24N-44N
EQU-24N
24S-EQU
44S-24S
64S-44S
90S-64S
count
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
137.000000
mean
0.025985
0.052701
0.000073
0.069708
0.031971
-0.025182
0.075985
0.102628
0.033869
0.029854
0.034672
0.018029
-0.070803
-0.117518
std
0.324288
0.374762
0.295171
0.443739
0.319204
0.291414
0.881053
0.504323
0.332215
0.313287
0.333107
0.310445
0.260751
0.707656
min
-0.470000
-0.500000
-0.480000
-0.590000
-0.590000
-0.490000
-1.610000
-0.730000
-0.540000
-0.650000
-0.570000
-0.430000
-0.620000
-2.550000
25%
-0.200000
-0.210000
-0.220000
-0.260000
-0.210000
-0.260000
-0.530000
-0.270000
-0.180000
-0.210000
-0.220000
-0.230000
-0.270000
-0.480000
50%
-0.070000
-0.010000
-0.080000
0.010000
-0.030000
-0.110000
0.010000
0.010000
-0.050000
-0.010000
-0.030000
-0.090000
-0.090000
0.000000
75%
0.190000
0.170000
0.250000
0.230000
0.240000
0.230000
0.610000
0.360000
0.140000
0.210000
0.290000
0.260000
0.160000
0.340000
max
0.980000
1.250000
0.710000
1.480000
0.960000
0.580000
2.850000
1.460000
1.080000
0.940000
0.990000
0.750000
0.390000
1.270000
The second dataset is monthly mean temperature data for individual states, published by NOAA. The README file for this dataset describes how the time, location, and category codes are packed together in the first column. NOAA also has a wide range of other data available.
In [7]:
# Region IDs according to the README file. A new first value 'null' has been
# added so that the index of the list will correspond to the code (i.e., 'Alabama' is 001)
# We will only deal with the contiguous 48, so other regions are ignored.
Region_ID = ['null', 'Alabama', 'Arizona', 'Arkansas', 'California',
'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Idaho',
'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
In [32]:
import urllib.request  # standard-library client used to download the raw file
url = 'https://www1.ncdc.noaa.gov/pub/data/cirs/climdiv/climdiv-tmpcst-v1.0.0-20170404'
with urllib.request.urlopen(url) as response:
    html = response.read()  # the entire file in one object -- `bytes`, not `str`
# Because the payload is `bytes`, the separator has to be a bytes literal too
# (hence the b prefix on the newline).
lines = html.split(b'\n')
In [33]:
# Parse the NOAA records into one monthly temperature series per state.
# Every non-empty line is an identifier field followed by that year's monthly
# values, all separated by whitespace.
times = {state: [] for state in Region_ID}
monthly_temp = {state: [] for state in Region_ID}
for line in lines:
    data = line.split()
    if not data:
        continue  # blank line (e.g. the empty piece after the final newline)
    # First parse the first element into state, division, element, and year
    # according to the README file. `lines` holds bytes, and indexing bytes
    # with a single integer returns the byte's ordinal (b'0'[0] == 48) rather
    # than the digit, so every field is taken with a slice.
    code = data[0]
    state_code = int(code[:3])
    if state_code > 48:
        continue  # ignore regions outside the contiguous 48.
    state = Region_ID[state_code]
    division = int(code[3:4])  # Zero for area-averaged element. We won't use this
    element = int(code[4:6])  # Should all be 02 (average temperature) for this dataset
    year = int(code[6:])
    temps = [float(temp) for temp in data[1:]]
    monthly_temp[state] += temps
    # One mid-month timestamp per value actually present on the line, so the
    # value list and the time list always stay the same length.
    times[state] += [datetime(year, month, 15) for month in range(1, len(temps) + 1)]
dfs = [pd.DataFrame(monthly_temp[state], index=times[state], columns=[state])
       for state in Region_ID]
df_states = pd.concat(dfs[1:], axis=1)  # concatenate and remove the 'null' state
# Values of exactly -99.9 are treated as missing.
df_states[df_states == -99.9] = np.nan
# Remove 1951 - 1980 mean to get anomalies similar to GISS data. A single mean
# per state is subtracted (not a per-calendar-month mean), so the monthly
# values still contain the seasonal cycle; it averages out in annual means.
df_states -= df_states.loc['1951':'1980'].mean()
In [34]:
# Display the per-state monthly table (rows: mid-month dates, columns: states).
df_states
Out[34]:
Alabama
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Idaho
...
South Dakota
Tennessee
Texas
Utah
Vermont
Virginia
Washington
West Virginia
Wisconsin
Wyoming
1895-01-15
-19.417222
-18.876111
-23.9925
-16.913333
-23.001944
-25.381111
-23.037222
-12.364722
-18.031389
-22.516944
...
-37.505556
-23.211944
-19.858889
-24.768611
-26.4425
-23.437222
-19.864167
-24.651944
-34.793889
-26.373333
1895-02-15
-25.117222
-15.776111
-27.0925
-11.413333
-25.401944
-29.481111
-29.937222
-19.264722
-24.431389
-18.916944
...
-32.305556
-27.911944
-25.958889
-22.768611
-29.8425
-29.037222
-11.364167
-29.651944
-33.193889
-26.273333
1895-03-15
-8.017222
-9.176111
-8.8925
-9.613333
-12.901944
-16.681111
-14.637222
-6.364722
-7.931389
-12.016944
...
-15.405556
-10.211944
-8.058889
-12.868611
-19.1425
-11.937222
-8.364167
-12.251944
-16.393889
-15.473333
1895-04-15
0.882778
-0.976111
2.1075
-3.713333
-0.101944
-3.681111
-3.037222
-2.064722
-0.331389
0.883056
...
7.194444
1.088056
0.341111
-0.568611
-1.9425
-1.337222
-0.464167
0.348056
4.206111
1.226667
1895-05-15
6.982778
6.523889
8.2075
3.886667
6.998056
9.618889
6.562778
4.835278
6.468611
6.083056
...
11.094444
7.488056
5.241111
6.931389
14.1575
6.962778
4.335833
8.848056
14.106111
6.026667
1895-06-15
14.982778
12.323889
16.6075
11.186667
13.898056
19.318889
18.062778
9.435278
15.068611
11.983056
...
17.694444
18.288056
13.541111
13.731389
23.4575
18.062778
11.735833
19.348056
24.106111
12.426667
1895-07-15
16.682778
19.023889
18.6075
14.986667
18.698056
18.218889
17.762778
10.835278
15.868611
20.283056
...
25.394444
18.088056
15.841111
21.231389
21.5575
17.662778
15.935833
18.048056
24.606111
20.426667
1895-08-15
16.982778
18.523889
18.7075
14.886667
20.298056
20.518889
21.462778
11.035278
16.468611
21.183056
...
24.594444
18.788056
16.941111
21.831389
20.5575
19.962778
16.235833
20.848056
25.106111
21.526667
1895-09-15
15.282778
12.223889
16.3075
8.286667
14.198056
15.218889
16.362778
9.435278
13.968611
8.483056
...
16.894444
16.388056
12.641111
12.031389
15.2575
16.862778
4.935833
17.348056
19.906111
11.726667
1895-10-15
-2.817222
1.923889
-4.3925
2.586667
0.498056
-3.181111
-3.137222
0.435278
-1.431389
3.183056
...
-0.905556
-4.711944
-2.758889
0.431389
-2.0425
-3.537222
1.535833
-4.251944
-2.293889
0.726667
1895-11-15
-9.317222
-13.276111
-12.4925
-9.213333
-15.401944
-6.981111
-7.937222
-6.364722
-8.831389
-11.516944
...
-17.305556
-9.311944
-14.458889
-15.968611
-7.8425
-8.537222
-10.564167
-7.351944
-13.193889
-15.873333
1895-12-15
-17.617222
-21.776111
-18.4925
-16.413333
-23.701944
-16.481111
-15.637222
-14.564722
-17.731389
-20.416944
...
-23.105556
-17.311944
-19.058889
-27.068611
-18.2425
-17.037222
-15.064167
-16.251944
-21.893889
-24.973333
1896-01-15
-19.017222
-14.676111
-19.8925
-11.913333
-17.201944
-27.081111
-22.737222
-15.864722
-19.031389
-15.616944
...
-27.505556
-19.811944
-19.258889
-18.568611
-27.9425
-21.137222
-15.864167
-20.651944
-26.693889
-19.673333
1896-02-15
-14.817222
-13.976111
-16.1925
-10.313333
-17.501944
-22.381111
-19.137222
-12.864722
-14.531389
-12.216944
...
-20.305556
-17.011944
-15.058889
-17.568611
-25.0425
-17.737222
-9.564167
-18.651944
-23.993889
-16.773333
1896-03-15
-10.017222
-7.976111
-12.3925
-7.713333
-13.001944
-19.381111
-17.037222
-7.864722
-10.131389
-11.516944
...
-22.605556
-12.711944
-10.658889
-10.768611
-20.6425
-14.437222
-9.664167
-16.251944
-18.893889
-16.473333
1896-04-15
5.582778
-4.076111
7.3075
-8.413333
-1.001944
-0.381111
0.162778
0.335278
4.668611
-4.516944
...
0.294444
6.588056
2.841111
-4.268611
0.4575
3.162778
-4.164167
6.348056
4.306111
-3.773333
1896-05-15
13.382778
6.523889
14.1075
1.486667
8.498056
12.018889
12.462778
6.735278
13.168611
1.183056
...
14.494444
14.788056
12.941111
3.931389
14.4575
14.562778
2.335833
16.548056
19.406111
5.926667
1896-06-15
14.882778
18.623889
17.2075
13.486667
19.198056
15.118889
15.062778
10.035278
15.068611
16.683056
...
22.194444
15.888056
17.541111
20.031389
18.5575
16.062778
11.635833
17.648056
23.906111
19.226667
1896-07-15
18.682778
19.223889
23.2075
18.386667
22.398056
22.318889
21.562778
11.335278
17.668611
24.983056
...
25.794444
20.388056
17.541111
23.031389
25.3575
20.762778
20.435833
21.248056
26.506111
23.326667
1896-08-15
19.682778
18.023889
22.1075
15.586667
21.298056
20.918889
20.362778
12.335278
18.368611
21.083056
...
24.894444
20.488056
18.441111
20.831389
22.2575
19.762778
16.835833
20.548056
25.406111
21.426667
1896-09-15
13.382778
12.223889
12.8075
8.986667
11.898056
11.518889
12.062778
9.135278
13.168611
10.583056
...
11.094444
12.388056
10.841111
11.431389
13.1575
12.262778
8.035833
13.048056
12.006111
9.926667
1896-10-15
0.682778
1.523889
-0.4925
2.586667
0.098056
-1.381111
-1.337222
1.135278
0.568611
0.983056
...
-1.405556
-0.511944
-1.058889
0.631389
0.4575
-1.137222
0.735833
-1.451944
-0.193889
0.426667
1896-11-15
-5.217222
-10.976111
-8.4925
-9.713333
-15.601944
-5.181111
-3.937222
-1.264722
-4.231389
-15.716944
...
-30.005556
-6.411944
-10.658889
-15.468611
-5.7425
-4.037222
-16.564167
-3.751944
-17.193889
-21.373333
1896-12-15
-16.117222
-15.976111
-15.4925
-11.813333
-15.801944
-22.181111
-20.537222
-13.564722
-16.731389
-12.816944
...
-19.405556
-17.411944
-15.958889
-17.668611
-23.8425
-18.837222
-12.064167
-17.451944
-20.393889
-13.473333
1897-01-15
-20.717222
-18.076111
-22.3925
-15.413333
-23.101944
-23.681111
-23.837222
-16.164722
-20.031389
-20.416944
...
-33.205556
-22.911944
-23.658889
-22.668611
-25.5425
-22.937222
-18.064167
-24.151944
-29.393889
-24.273333
1897-02-15
-11.417222
-17.376111
-14.2925
-15.213333
-19.901944
-21.881111
-19.337222
-7.564722
-11.631389
-17.216944
...
-29.305556
-13.711944
-13.158889
-21.168611
-23.8425
-16.237222
-13.564167
-16.051944
-22.893889
-21.373333
1897-03-15
-2.317222
-14.176111
-4.9925
-14.313333
-14.401944
-13.181111
-10.337222
-0.864722
-4.031389
-17.416944
...
-23.305556
-4.711944
-5.358889
-17.768611
-15.2425
-7.437222
-14.364167
-6.551944
-17.493889
-19.173333
1897-04-15
-0.117222
-1.876111
0.4075
-1.413333
-2.301944
-1.681111
-2.737222
-0.964722
-0.031389
-0.416944
...
-0.505556
0.288056
0.041111
-2.268611
-0.5425
-0.837222
1.235833
-0.451944
0.306111
-3.173333
1897-05-15
6.482778
8.823889
7.6075
6.386667
10.898056
8.418889
6.562778
3.335278
6.668611
13.583056
...
12.894444
5.888056
7.041111
11.731389
10.8575
6.862778
9.835833
6.648056
10.706111
12.226667
1897-06-15
18.682778
13.423889
17.6075
10.586667
16.798056
13.418889
13.262778
11.935278
18.268611
13.983056
...
20.094444
17.888056
15.241111
15.331389
16.2575
16.762778
11.035833
17.148056
19.906111
17.226667
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2015-07-15
19.082778
19.323889
21.8075
17.586667
21.998056
24.018889
22.762778
12.835278
18.568611
22.583056
...
27.694444
21.088056
18.741111
22.531389
24.7575
20.762778
22.535833
20.948056
26.306111
23.126667
2015-08-15
16.482778
21.323889
18.0075
19.086667
22.298056
23.818889
20.962778
12.435278
16.568611
23.283056
...
25.494444
16.988056
18.841111
23.431389
25.1575
18.662778
20.535833
18.748056
24.006111
23.526667
2015-09-15
11.882778
15.723889
14.7075
14.786667
17.898056
19.618889
17.862778
10.035278
10.968611
14.683056
...
21.494444
13.688056
14.841111
17.931389
20.9575
14.762778
10.135833
15.948056
22.606111
18.626667
2015-10-15
2.482778
5.223889
3.4075
7.586667
6.798056
3.218889
2.362778
4.435278
1.868611
8.183056
...
7.194444
2.488056
4.741111
6.831389
2.8575
1.762778
6.335833
2.848056
6.406111
7.926667
2015-11-15
-3.317222
-10.976111
-5.8925
-9.813333
-10.001944
-2.081111
-2.537222
2.235278
-3.231389
-12.616944
...
-7.905556
-3.711944
-7.158889
-13.068611
-2.2425
-3.437222
-10.064167
-2.851944
-2.693889
-11.573333
2015-12-15
-5.617222
-18.376111
-10.0925
-15.113333
-18.801944
-4.681111
-3.337222
-0.364722
-4.031389
-18.216944
...
-20.105556
-5.911944
-12.958889
-20.968611
-6.3425
-4.937222
-13.964167
-4.851944
-10.293889
-19.473333
2016-01-15
-19.517222
-17.376111
-20.6925
-12.813333
-19.001944
-18.681111
-19.837222
-12.964722
-18.431389
-16.716944
...
-24.605556
-22.411944
-18.058889
-20.868611
-20.5425
-21.437222
-13.064167
-23.551944
-25.893889
-18.073333
2016-02-15
-12.817222
-8.776111
-13.1925
-5.613333
-12.301944
-16.381111
-15.737222
-10.564722
-12.931389
-10.616944
...
-13.205556
-14.811944
-9.958889
-13.768611
-17.5425
-16.137222
-6.864167
-16.251944
-19.393889
-10.773333
2016-03-15
-2.917222
-4.376111
-4.5925
-4.813333
-6.601944
-5.181111
-4.437222
-1.164722
-1.931389
-5.516944
...
-4.305556
-3.211944
-3.158889
-5.468611
-7.9425
-2.737222
-4.264167
-2.551944
-5.293889
-6.273333
2016-04-15
1.082778
-0.776111
2.4075
0.586667
-0.901944
-1.581111
-1.037222
1.135278
1.068611
4.483056
...
1.894444
2.488056
1.541111
0.231389
-3.2425
0.662778
5.535833
1.748056
0.806111
1.026667
2016-05-15
7.682778
5.423889
7.0075
4.786667
6.198056
9.918889
7.362778
6.135278
7.868611
8.183056
...
12.294444
8.088056
7.141111
6.831389
12.5575
7.362778
9.135833
8.348056
14.006111
7.026667
2016-06-15
16.682778
21.023889
18.8075
16.086667
21.798056
19.018889
17.962778
11.935278
16.868611
18.683056
...
26.194444
19.188056
15.641111
23.231389
20.5575
17.362778
13.835833
18.348056
23.806111
23.026667
2016-07-15
19.282778
23.123889
21.9075
19.386667
24.598056
25.918889
25.262778
14.035278
19.868611
22.383056
...
28.494444
22.588056
20.941111
26.131389
25.8575
22.862778
17.635833
22.548056
27.806111
25.926667
2016-08-15
18.982778
18.323889
19.8075
18.786667
19.898056
25.518889
24.362778
13.135278
18.668611
22.283056
...
26.494444
21.688056
17.141111
22.631389
26.0575
22.562778
19.835833
23.948056
27.406111
22.626667
2016-09-15
15.982778
12.523889
15.6075
12.086667
15.098056
17.918889
17.762778
11.235278
14.668611
12.183056
...
17.894444
16.888056
13.141111
13.531389
18.4575
17.462778
10.435833
18.248056
20.506111
14.526667
2016-10-15
5.882778
6.823889
6.8075
2.986667
7.198056
5.218889
5.962778
4.835278
4.968611
3.283056
...
7.194444
7.388056
6.441111
5.631389
6.3575
5.262778
1.335833
6.248056
8.306111
5.626667
2016-11-15
-4.817222
-6.076111
-4.4925
-4.613333
-4.401944
-4.681111
-5.337222
-3.264722
-5.031389
-3.716944
...
-2.405556
-4.511944
-3.358889
-5.868611
-4.6425
-6.137222
-3.064167
-5.551944
-0.493889
-3.773333
2016-12-15
-12.517222
-14.476111
-17.5925
-13.913333
-19.301944
-15.681111
-15.237222
-3.964722
-10.731389
-23.316944
...
-29.005556
-16.311944
-14.858889
-20.168611
-17.5425
-15.437222
-19.864167
-15.551944
-22.193889
-24.073333
2017-01-15
-10.217222
-16.776111
-14.8925
-15.213333
-19.201944
-15.481111
-15.037222
-7.264722
-9.931389
-23.816944
...
-27.905556
-12.211944
-13.658889
-21.668611
-16.7425
-14.537222
-21.264167
-13.551944
-22.593889
-24.273333
2017-02-15
-7.017222
-9.876111
-7.5925
-10.113333
-9.101944
-13.781111
-10.037222
-4.464722
-6.531389
-12.916944
...
-15.005556
-8.411944
-5.958889
-11.168611
-15.7425
-8.637222
-14.964167
-8.951944
-14.293889
-13.073333
2017-03-15
-4.017222
-2.476111
-4.3925
-4.013333
-2.101944
-15.081111
-10.837222
-3.964722
-5.231389
-4.016944
...
-8.905556
-6.111944
-0.758889
-2.368611
-17.7425
-8.637222
-6.864167
-8.951944
-11.793889
-2.473333
2017-04-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-05-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-06-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-07-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-08-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-09-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-10-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-11-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2017-12-15
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
1476 rows × 48 columns
In [35]:
# Monthly Texas series over the 1951-1980 base period, selected in a single
# label-based lookup (rows by date range, column by name).
df_states.loc['1951':'1980', 'Texas'].plot()
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce60d0fc88>
In [36]:
# Annual means of the Texas series ('AS' bins are labelled at the start of
# each year); only the one column is resampled before plotting.
df_states['Texas'].resample('AS').mean().plot()
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcec41c0780>
In [37]:
# Put the annual Texas means next to the zonal bands. Both are indexed by
# start-of-year dates, so they line up row by row; `dropna` then keeps only
# the years for which every column has a value.
df = pd.concat(
    [df_zonal, df_states['Texas'].resample('AS').mean()],
    axis=1,
).dropna()
In [15]:
# Summary statistics for the combined annual frame (zonal bands plus Texas).
df.describe()
Out[15]:
Glob
NHem
SHem
24N-90N
24S-24N
90S-24S
64N-90N
44N-64N
24N-44N
EQU-24N
24S-EQU
44S-24S
64S-44S
90S-64S
Texas
count
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
122.000000
mean
0.058607
0.097131
0.020984
0.126311
0.058525
-0.008852
0.223033
0.169590
0.064590
0.055902
0.061885
0.035164
-0.057295
-0.152623
0.323010
std
0.327970
0.372418
0.304602
0.436466
0.324694
0.302849
0.813104
0.491238
0.337492
0.317742
0.339100
0.322947
0.268590
0.740347
1.056474
min
-0.470000
-0.500000
-0.480000
-0.570000
-0.590000
-0.490000
-1.610000
-0.730000
-0.540000
-0.650000
-0.570000
-0.430000
-0.620000
-2.550000
-2.133889
25%
-0.187500
-0.167500
-0.220000
-0.170000
-0.200000
-0.277500
-0.335000
-0.157500
-0.130000
-0.190000
-0.190000
-0.237500
-0.245000
-0.567500
-0.492222
50%
-0.020000
0.035000
-0.065000
0.065000
0.020000
-0.095000
0.125000
0.065000
-0.020000
0.035000
0.010000
-0.080000
-0.075000
-0.010000
0.320278
75%
0.265000
0.250000
0.277500
0.317500
0.282500
0.250000
0.667500
0.387500
0.167500
0.252500
0.312500
0.290000
0.190000
0.340000
0.936944
max
0.980000
1.250000
0.710000
1.480000
0.960000
0.580000
2.850000
1.460000
1.080000
0.940000
0.990000
0.750000
0.390000
1.270000
3.324444
In [16]:
# Pairwise correlation coefficients between all the columns, computed in a
# single call; the result is a square table labelled by column name.
df.corr()
Out[16]:
Glob
NHem
SHem
24N-90N
24S-24N
90S-24S
64N-90N
44N-64N
24N-44N
EQU-24N
24S-EQU
44S-24S
64S-44S
90S-64S
Texas
Glob
1.000000
0.975517
0.963644
0.935436
0.947599
0.912153
0.824861
0.890983
0.917213
0.934765
0.939024
0.944705
0.782213
0.491374
0.366355
NHem
0.975517
1.000000
0.881598
0.980091
0.908790
0.815613
0.877802
0.934521
0.950878
0.915667
0.882181
0.875729
0.658797
0.402197
0.428655
SHem
0.963644
0.881598
1.000000
0.818097
0.935815
0.966161
0.704238
0.777761
0.815276
0.899446
0.950137
0.965006
0.876082
0.562960
0.262310
24N-90N
0.935436
0.980091
0.818097
1.000000
0.813556
0.779740
0.911440
0.959961
0.953353
0.817847
0.790858
0.845237
0.608663
0.377788
0.486877
24S-24N
0.947599
0.908790
0.935815
0.813556
1.000000
0.821486
0.693170
0.770717
0.818276
0.988717
0.989776
0.857811
0.720218
0.418141
0.235556
90S-24S
0.912153
0.815613
0.966161
0.779740
0.821486
1.000000
0.679553
0.736778
0.775128
0.784722
0.837899
0.972941
0.918933
0.635731
0.287451
64N-90N
0.824861
0.877802
0.704238
0.911440
0.693170
0.679553
1.000000
0.830752
0.790121
0.700286
0.668809
0.729404
0.526732
0.293047
0.447521
44N-64N
0.890983
0.934521
0.777761
0.959961
0.770717
0.736778
0.830752
1.000000
0.876222
0.766224
0.757346
0.799065
0.575175
0.368080
0.401001
24N-44N
0.917213
0.950878
0.815276
0.953353
0.818276
0.775128
0.790121
0.876222
1.000000
0.827706
0.791810
0.844759
0.609620
0.387811
0.518752
EQU-24N
0.934765
0.915667
0.899446
0.817847
0.988717
0.784722
0.700286
0.766224
0.827706
1.000000
0.957501
0.826890
0.676261
0.400921
0.257518
24S-EQU
0.939024
0.882181
0.950137
0.790858
0.989776
0.837899
0.668809
0.757346
0.791810
0.957501
1.000000
0.867680
0.745877
0.425460
0.207955
44S-24S
0.944705
0.875729
0.965006
0.845237
0.857811
0.972941
0.729404
0.799065
0.844759
0.826890
0.867680
1.000000
0.848053
0.540485
0.331289
64S-44S
0.782213
0.658797
0.876082
0.608663
0.720218
0.918933
0.526732
0.575175
0.609620
0.676261
0.745877
0.848053
1.000000
0.547706
0.161505
90S-64S
0.491374
0.402197
0.562960
0.377788
0.418141
0.635731
0.293047
0.368080
0.387811
0.400921
0.425460
0.540485
0.547706
1.000000
0.144904
Texas
0.366355
0.428655
0.262310
0.486877
0.235556
0.287451
0.447521
0.401001
0.518752
0.257518
0.207955
0.331289
0.161505
0.144904
1.000000
We are now ready to start doing some statistics. Fitting a model in statsmodels typically involves 3 easy steps:
In [17]:
# Step 1 -- describe the model: Ordinary Least Squares with the Texas series
# as the response and every other column as a predictor.
# NOTE: no constant column is added to the predictors, so this fit has no
# intercept term.
mod = sm.OLS(endog=df['Texas'], exog=df.drop('Texas', axis='columns'))
# Step 2 -- fit the model.
res = mod.fit()
# Step 3 -- summarize the model fit.
res.summary()
Out[17]:
OLS Regression Results
Dep. Variable: Texas R-squared: 0.459
Model: OLS Adj. R-squared: 0.389
Method: Least Squares F-statistic: 6.545
Date: Tue, 11 Apr 2017 Prob (F-statistic): 2.10e-09
Time: 10:27:42 Log-Likelihood: -147.33
No. Observations: 122 AIC: 322.7
Df Residuals: 108 BIC: 361.9
Df Model: 14
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Glob 39.6494 26.754 1.482 0.141 -13.382 92.681
NHem 10.4157 23.999 0.434 0.665 -37.154 57.985
SHem 5.0687 19.720 0.257 0.798 -34.019 44.157
24N-90N 5.5436 15.300 0.362 0.718 -24.784 35.871
24S-24N 3.9676 21.881 0.181 0.856 -39.404 47.339
90S-24S -14.1363 13.191 -1.072 0.286 -40.284 12.011
64N-90N -3.2933 1.928 -1.708 0.091 -7.115 0.529
44N-64N -8.1399 4.102 -1.984 0.050 -16.271 -0.009
24N-44N -9.1017 6.488 -1.403 0.164 -21.961 3.758
EQU-24N -15.4135 14.561 -1.059 0.292 -44.275 13.448
24S-EQU -12.4807 13.499 -0.925 0.357 -39.237 14.276
44S-24S -0.3458 1.751 -0.197 0.844 -3.817 3.126
64S-44S -0.9190 0.998 -0.921 0.359 -2.897 1.059
90S-64S -0.0188 0.177 -0.106 0.916 -0.369 0.331
Omnibus: 1.777 Durbin-Watson: 1.803
Prob(Omnibus): 0.411 Jarque-Bera (JB): 1.487
Skew: 0.110 Prob(JB): 0.475
Kurtosis: 2.506 Cond. No. 529.
In [18]:
# Per-coefficient p-values from the fit. A smaller value is stronger evidence
# that the coefficient differs from zero; p < 0.05 is the usual threshold.
res.pvalues
Out[18]:
Glob 0.141256
NHem 0.665145
SHem 0.797638
24N-90N 0.717821
24S-24N 0.856450
90S-24S 0.286272
64N-90N 0.090527
44N-64N 0.049765
24N-44N 0.163502
EQU-24N 0.292158
24S-EQU 0.357239
44S-24S 0.843875
64S-44S 0.359064
90S-64S 0.915527
dtype: float64
In [ ]:
In [19]:
# Scatter plot with marginal distributions of Texas against the 24N-44N band.
# x and y are passed by keyword: recent seaborn releases accept them only as
# keyword arguments, and older releases accept the keyword form as well.
sn.jointplot(x=df['Texas'], y=df['24N-44N'])
Out[19]:
<seaborn.axisgrid.JointGrid at 0x7fcea82bce80>
In [23]:
# Seaborn also plays nicely with pandas -- you can provide the dataframe as 'data'
# and then just reference the columns by name. x and y are given as keywords,
# which works in both old and current seaborn releases.
# Exercise -- Try kind as 'reg', 'kde', and 'hex'
sn.jointplot(x='Texas', y='24N-44N', data=df, kind='reg')
/opt/anaconda3/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[23]:
<seaborn.axisgrid.JointGrid at 0x7fcea00a6978>
In [24]:
# We can quickly visualize a 'heatmap' of the correlation coefficients.
sn.heatmap(df.corr())
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcea0371f60>
In [38]:
# Wide figure so that all fourteen band labels fit along the x axis.
fig = plt.figure(figsize=(12, 4))
# One violin per zonal band (Texas is left out). The frame is passed as
# `data=`: handing it over positionally makes older seaborn releases guess at
# the intended arguments and emit an "API has been changed" warning.
# A box plot is a drop-in alternative: sn.boxplot(data=df.drop('Texas', axis=1))
sn.violinplot(data=df.drop('Texas', axis=1))
/opt/anaconda3/lib/python3.5/site-packages/seaborn/categorical.py:2342: UserWarning: The violinplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
warnings.warn(msg, UserWarning)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcec42b6198>
In [26]:
# Time series of the global and hemispheric means together with Texas.
df.loc[:, ['Glob', 'NHem', 'SHem', 'Texas']].plot(figsize=(15, 5))
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce60fe9ac8>
In [27]:
import statsmodels.formula.api as smf  # formula-string interface to the models
# Texas regressed on the two hemispheric means. Written as a formula, the
# fitted model also contains an intercept term even though none is spelled out.
model = smf.ols('Texas ~ NHem + SHem', data=df).fit()
model.summary()
Out[27]:
OLS Regression Results
Dep. Variable: Texas R-squared: 0.244
Model: OLS Adj. R-squared: 0.231
Method: Least Squares F-statistic: 19.17
Date: Tue, 11 Apr 2017 Prob (F-statistic): 6.05e-08
Time: 10:39:52 Log-Likelihood: -162.27
No. Observations: 122 AIC: 330.5
Df Residuals: 119 BIC: 339.0
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 0.1166 0.091 1.277 0.204 -0.064 0.297
NHem 2.5136 0.479 5.246 0.000 1.565 3.462
SHem -1.7995 0.586 -3.072 0.003 -2.959 -0.640
Omnibus: 6.713 Durbin-Watson: 1.856
Prob(Omnibus): 0.035 Jarque-Bera (JB): 3.605
Skew: 0.196 Prob(JB): 0.165
Kurtosis: 2.254 Cond. No. 8.82
In [29]:
# Linear trend of the global-mean series against calendar year.
# The year column is attached to a temporary copy with `assign` instead of
# being written into `df`: adding it in place would change what earlier cells
# such as `df.drop('Texas', axis=1)` return if they are run again.
model = smf.ols(formula="Glob ~ year", data=df.assign(year=df.index.year)).fit()
model.summary()
Out[29]:
OLS Regression Results
Dep. Variable: Glob R-squared: 0.787
Model: OLS Adj. R-squared: 0.785
Method: Least Squares F-statistic: 442.6
Date: Tue, 11 Apr 2017 Prob (F-statistic): 4.47e-42
Time: 10:42:00 Log-Likelihood: 57.654
No. Observations: 122 AIC: -111.3
Df Residuals: 120 BIC: -105.7
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -16.0278 0.765 -20.959 0.000 -17.542 -14.514
year 0.0082 0.000 21.039 0.000 0.007 0.009
Omnibus: 1.182 Durbin-Watson: 0.531
Prob(Omnibus): 0.554 Jarque-Bera (JB): 1.252
Skew: 0.223 Prob(JB): 0.535
Kurtosis: 2.784 Cond. No. 1.09e+05
In [30]:
# Analysis-of-variance table for the most recently fitted formula model.
sm.stats.anova_lm(model)
/opt/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater
return (self.a < x) & (x < self.b)
/opt/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less
return (self.a < x) & (x < self.b)
/opt/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal
cond2 = cond0 & (x <= self.a)
Out[30]:
df
sum_sq
mean_sq
F
PR(>F)
year
1.0
10.239312
10.239312
442.629368
4.467934e-42
Residual
120.0
2.775951
0.023133
NaN
NaN
In [ ]:
In [ ]:
Content source: hetland/python4geosciences
Similar notebooks: