Ordinary Least Squares
In [3]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
%matplotlib inline
In [13]:
# artificial data
nsample = 100
x = np.linspace(0, 10, nsample)
X = np.column_stack((x, x**2))
# three coefficients: intercept, x, and x**2 (the constant column is added below)
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
In [14]:
# add a column of 1s as the intercept
X = sm.add_constant(X)
y = np.dot(X, beta) + e
In [15]:
# fit the OLS model
model = sm.OLS(y, X).fit()
print(model.summary())
In [16]:
print('Parameters: ', model.params)
print('R2: ', model.rsquared)
print('Adjusted R2: ', model.rsquared_adj)
print('BIC: ', model.bic)
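To see how close the estimates come to the true coefficients (1, 0.1, 10), you can also inspect the confidence intervals. A minimal sketch using the standard conf_int method of the results object:
# 95% confidence intervals, one row per coefficient in the order [const, x, x**2]
print(model.conf_int(alpha=0.05))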
In [17]:
# a model that is nonlinear in x (but linear in the parameters):
# y = 0.5*x + 0.5*sin(x) - 0.02*(x-5)**2 + 5 + noise
nsample = 50
sig = 0.5
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, np.sin(x), (x-5)**2, np.ones(nsample)))
beta = [0.5, 0.5, -0.02, 5.0]
y = np.dot(X, beta) + sig * np.random.normal(size=nsample)
In [18]:
model = sm.OLS(y, X).fit()
print(model.summary())
In [20]:
print('Parameters: ', model.params)
print('Standard errors: ', model.bse)
print('Predicted values: ', model.predict())
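Calling predict() with no arguments returns the in-sample fitted values. To predict at new points, pass a design matrix with the same four columns. A minimal sketch, with x_new as a hypothetical grid of new x values:
# out-of-sample prediction: the new exog must have columns [x, sin(x), (x-5)**2, 1]
x_new = np.linspace(0, 20, 5)
X_new = np.column_stack((x_new, np.sin(x_new), (x_new - 5)**2, np.ones(5)))
print(model.predict(X_new))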
In [23]:
# plot the data, the OLS fitted values, and the prediction-interval bounds
prstd, iv_l, iv_u = wls_prediction_std(model)
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x,y,'o')
ax.plot(x, model.fittedvalues, 'r--.')
ax.plot(x, iv_u, 'b--')
ax.plot(x, iv_l, 'g--')
plt.show()
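Note that wls_prediction_std lives in the statsmodels sandbox. In more recent statsmodels versions the same intervals are exposed through the results object's get_prediction method; a sketch, assuming a version that provides it:
# obs_ci_lower / obs_ci_upper are the lower and upper prediction-interval bounds
pred = model.get_prediction()
frame = pred.summary_frame(alpha=0.05)
print(frame[['obs_ci_lower', 'obs_ci_upper']].head())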
In [24]:
nsample = 50
groups = np.zeros(nsample, int)
groups[20:40] = 1
groups[40:] = 2
dummy = sm.categorical(groups, drop=True)
dummy
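Depending on your statsmodels version, sm.categorical may be deprecated; pandas.get_dummies builds the same indicator matrix. A sketch of the alternative:
import pandas as pd
# one column per group level, matching sm.categorical(groups, drop=True)
dummy = pd.get_dummies(groups).values.astype(float)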
In [27]:
x = np.linspace(0, 20, nsample)
# drop the first dummy column to avoid perfect collinearity with the constant
X = np.column_stack((x, dummy[:,1:]))
X = sm.add_constant(X, prepend=False)
beta = [1., 3, -3, 10]
y_true = np.dot(X,beta)
e = np.random.normal(size=nsample)
y = y_true + e
In [28]:
model = sm.OLS(y, X).fit()
print(model.summary())
In [29]:
prstd, iv_l, iv_u = wls_prediction_std(model)
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="Data")
ax.plot(x, y_true, 'b-', label="True")
ax.plot(x, model.fittedvalues, 'r--.', label="Predicted")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
legend = ax.legend(loc="best")
In [30]:
# each row of R selects one coefficient; f_test checks the joint hypothesis
# R @ beta = 0, i.e. that both group-dummy coefficients are zero
R = np.array([[0, 1, 0, 0], [0, 0, 1, 0]])
print(model.f_test(R))
You can also use formula-like syntax to test hypotheses. Since the design matrix was built without column names, statsmodels labels the coefficients x1, x2, x3, and const, so 'x2=x3=0' is the same joint test on the group dummies.
In [31]:
print(model.f_test('x2=x3=0'))
In [32]:
print(model.f_test('x1=0'))
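A single linear restriction can equivalently be tested with t_test; for one restriction the F statistic is the square of the t statistic, so the p-values agree:
# same hypothesis as the F-test above
print(model.t_test('x1=0'))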
In [36]:
# regenerate the data with small group effects; the joint test should now be much weaker
beta = [0.001, 0.3, -0.0, 10]
y_true = np.dot(X, beta)
y = y_true + np.random.normal(size=nsample)
model = sm.OLS(y, X).fit()
print(model.f_test(R))
In [37]:
print(model.f_test('x2=x3=0'))
In [38]:
print(model.f_test('x1=0'))
In [39]:
# the Longley dataset is a classic example of severe multicollinearity
from statsmodels.datasets.longley import load_pandas
y = load_pandas().endog
X = load_pandas().exog
X = sm.add_constant(X)
In [40]:
model = sm.OLS(y,X).fit()
print(model.summary())
In [46]:
# normalize each non-constant column to unit length
norm_x = X.values
for i, name in enumerate(X):
    if name == 'const':
        continue
    norm_x[:, i] = X[name] / np.linalg.norm(X[name])
norm_xtx = np.dot(norm_x.T, norm_x)
Then we take the square root of the ratio of the largest to the smallest eigenvalue of norm_xtx; this is the condition number of the normalized design matrix.
In [47]:
eigs = np.linalg.eigvals(norm_xtx)
condition_number = np.sqrt(eigs.max() / eigs.min())
print(condition_number)
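As a sanity check, np.linalg.cond gives the same number directly from the normalized design matrix, since the 2-norm condition number of norm_x is the square root of the eigenvalue ratio of norm_x.T @ norm_x:
# should match condition_number computed above
print(np.linalg.cond(norm_x))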