In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the annual lake-level series; the row index doubles as the time axis.
lake = pd.read_csv('data/lake.dat', names=['level'])
lake['t'] = lake.index

print(lake.shape)

# Level changing over time — label the figure so it stands alone,
# and suppress the bare Line2D repr with a trailing semicolon.
plt.plot(lake['t'], lake['level'])
plt.xlabel('t (time index)')
plt.ylabel('level')
plt.title('Lake level over time');


(98, 2)
Out[2]:
[<matplotlib.lines.Line2D at 0x108802ba8>]

In [3]:
# Ordinary least squares: is there a linear trend in level over time?
# `res` is reused by the plotting cell below.
res = smf.ols('level ~ t', data=lake).fit()

print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  level   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.265
Method:                 Least Squares   F-statistic:                     35.95
Date:                Mon, 12 Jan 2015   Prob (F-statistic):           3.55e-08
Time:                        15:49:43   Log-Likelihood:                -150.05
No. Observations:                  98   AIC:                             304.1
Df Residuals:                      96   BIC:                             309.3
Df Model:                           1                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     10.1778      0.227     44.912      0.000         9.728    10.628
t             -0.0242      0.004     -5.996      0.000        -0.032    -0.016
==============================================================================
Omnibus:                        1.626   Durbin-Watson:                   0.439
Prob(Omnibus):                  0.444   Jarque-Bera (JB):                1.274
Skew:                          -0.039   Prob(JB):                        0.529
Kurtosis:                       2.447   Cond. No.                         111.
==============================================================================

In [4]:
# Overlay the raw series with polynomial trend fits of increasing order.
plt.plot(lake['t'], lake['level'])

# Order 1: reuse the linear fit from the previous cell.
plt.plot(lake['t'], res.predict(lake))

# Order 2: materialize t^2 as a column so the formula can reference it.
lake['t2'] = lake['t'].pow(2)
res2 = smf.ols('level ~ t + t2', data=lake).fit()
plt.plot(lake['t'], res2.predict(lake))

# Order 3: same approach with t^3.
lake['t3'] = lake['t'].pow(3)
res3 = smf.ols('level ~ t + t2 + t3', data=lake).fit()
plt.plot(lake['t'], res3.predict(lake))


Out[4]:
[<matplotlib.lines.Line2D at 0x108b31128>]

The third-order polynomial fit essentially overlaps the second-order one because the coefficient of the cubic term is nearly zero (≈2.3e-07) and not statistically significant (p = 0.969).


In [5]:
# Full regression table for the quadratic fit: both t and t2 are
# significant at p < 0.001, so the curvature is real.
print(res2.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  level   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.396
Method:                 Least Squares   F-statistic:                     32.78
Date:                Mon, 12 Jan 2015   Prob (F-statistic):           1.49e-11
Time:                        15:49:44   Log-Likelihood:                -139.92
No. Observations:                  98   AIC:                             285.8
Df Residuals:                      95   BIC:                             293.6
Df Model:                           2                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     11.2262      0.304     36.893      0.000        10.622    11.830
t             -0.0897      0.014     -6.188      0.000        -0.119    -0.061
t2             0.0007      0.000      4.670      0.000         0.000     0.001
==============================================================================
Omnibus:                        0.375   Durbin-Watson:                   0.539
Prob(Omnibus):                  0.829   Jarque-Bera (JB):                0.538
Skew:                          -0.028   Prob(JB):                        0.764
Kurtosis:                       2.642   Cond. No.                     1.25e+04
==============================================================================

Warnings:
[1] The condition number is large, 1.25e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

In [6]:
# Cubic fit summary: the t3 coefficient is ~2.3e-07 with p = 0.969 and the
# log-likelihood matches the quadratic fit, so the cubic term adds nothing.
print(res3.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  level   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.389
Method:                 Least Squares   F-statistic:                     21.62
Date:                Mon, 12 Jan 2015   Prob (F-statistic):           9.81e-11
Time:                        15:49:44   Log-Likelihood:                -139.92
No. Observations:                  98   AIC:                             287.8
Df Residuals:                      94   BIC:                             298.2
Df Model:                           3                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     11.2160      0.401     27.985      0.000        10.420    12.012
t             -0.0884      0.036     -2.459      0.016        -0.160    -0.017
t2             0.0006      0.001      0.743      0.459        -0.001     0.002
t3          2.288e-07   5.85e-06      0.039      0.969     -1.14e-05  1.19e-05
==============================================================================
Omnibus:                        0.356   Durbin-Watson:                   0.539
Prob(Omnibus):                  0.837   Jarque-Bera (JB):                0.522
Skew:                          -0.026   Prob(JB):                        0.770
Kurtosis:                       2.646   Cond. No.                     1.35e+06
==============================================================================

Warnings:
[1] The condition number is large, 1.35e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

In [7]:
plt.plot(lake['t'], lake['level'])

# Same quadratic/cubic fits as above, but let the formula build the powers
# via I(...) instead of storing t2/t3 columns on the frame.
resquad = smf.ols('level ~ 1 + t + I(t**2)', data=lake).fit()
res4 = smf.ols('level ~ 1 + t + I(t**2) + I(t**3)', data=lake).fit()

# Draw the fitted curves in the same order as before (quadratic, then cubic).
for fitted in (resquad, res4):
    plt.plot(lake['t'], fitted.predict(lake))


Out[7]:
[<matplotlib.lines.Line2D at 0x108d1b8d0>]

In [7]: