In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the lake data (file is read without a header row; its column is named
# `level`) and expose the row index as an explicit time variable `t`.
lake = pd.read_csv('data/lake.dat', names=['level']).assign(
    t=lambda frame: frame.index
)
print(lake.shape)
# Raw series: level against the time index
plt.plot(lake['t'], lake['level'])
Out[2]:
In [3]:
# Straight-line trend: ordinary least squares of `level` on the time index `t`.
# The unfitted model (`mod`) and the fitted results (`res`) are kept separately.
mod = smf.ols(formula='level ~ t', data=lake)
res = mod.fit()
print(res.summary())
In [4]:
# Overlay the observed series with polynomial trend fits of order 1 to 3.
# Every curve carries a label (and the cubic is dashed) so that fits lying
# almost on top of each other can still be told apart in the figure.
plt.plot(lake['t'], lake['level'], label='observed')
# linear fit (uses `res`, the model fitted in the previous cell)
plt.plot(lake['t'], res.predict(lake), label='linear')
# quadratic fit: the squared term is stored as an explicit column so the
# formula can refer to it by name
lake['t2'] = lake['t'] ** 2
res2 = smf.ols('level ~ t + t2', data=lake).fit()
plt.plot(lake['t'], res2.predict(lake), label='quadratic')
# cubic fit
lake['t3'] = lake['t'] ** 3
res3 = smf.ols('level ~ t + t2 + t3', data=lake).fit()
plt.plot(lake['t'], res3.predict(lake), label='cubic', linestyle='--')
plt.xlabel('t')
plt.ylabel('level')
plt.legend();
Out[4]:
The third-order polynomial fit almost exactly overlaps the second-order fit because the estimated coefficient of the third-order term is nearly 0.
In [5]:
# Regression summary for the quadratic model (level ~ t + t2).
print(res2.summary())
In [6]:
# Regression summary for the cubic model (level ~ t + t2 + t3); the t3 row
# reports the coefficient of the third-order term.
print(res3.summary())
In [7]:
# Quadratic and cubic trend fits with the powers of `t` built inside the
# formula itself. I(...) makes the formula parser evaluate `t**2` / `t**3` as
# ordinary Python arithmetic instead of treating `**` as a formula operator,
# so no extra columns have to be added to `lake`.
# Curves are labelled (cubic dashed) so overlapping fits stay distinguishable.
plt.plot(lake['t'], lake['level'], label='observed')
resquad = smf.ols('level ~ 1 + t + I(t**2)', data=lake).fit()
plt.plot(lake['t'], resquad.predict(lake), label='quadratic')
res4 = smf.ols('level ~ 1 + t + I(t**2) + I(t**3)', data=lake).fit()
plt.plot(lake['t'], res4.predict(lake), label='cubic', linestyle='--')
plt.xlabel('t')
plt.ylabel('level')
plt.legend();
Out[7]:
In [7]: