In [ ]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
np.random.seed(9876789)
In [ ]:
nsample = 100
x = np.linspace(0, 10, 100)
X = np.column_stack((x, x**2))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
Our model needs an intercept so we add a column of 1s:
In [ ]:
X = sm.add_constant(X)
y = np.dot(X, beta) + e
Inspect data:
In [ ]:
print(X[:5, :])
print(y[:5])
Fit and summary:
In [ ]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
Quantities of interest can be extracted directly from the fitted model. Type dir(results) for a full list. Here are some examples:
In [ ]:
print('Parameters: ', results.params)
print('R2: ', results.rsquared)
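A few more extractable quantities, as a quick sketch (these attributes are part of the standard results API):
In [ ]:
# standard errors, t statistics and 95% confidence intervals of the coefficients
print('Standard errors: ', results.bse)
print('t-values: ', results.tvalues)
print('Confidence intervals:\n', results.conf_int())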
In [ ]:
nsample = 50
sig = 0.5
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, np.sin(x), (x-5)**2, np.ones(nsample)))
beta = [0.5, 0.5, -0.02, 5.]
y_true = np.dot(X, beta)
y = y_true + sig * np.random.normal(size=nsample)
Fit and summary:
In [ ]:
res = sm.OLS(y, X).fit()
print(res.summary())
Extract other quantities of interest:
In [ ]:
print('Parameters: ', res.params)
print('Standard errors: ', res.bse)
print('Predicted values: ', res.predict())
Draw a plot to compare the true relationship to OLS predictions. Confidence intervals around the predictions are built using the wls_prediction_std command.
In [ ]:
prstd, iv_l, iv_u = wls_prediction_std(res)
fig, ax = plt.subplots()
ax.plot(x, y, 'o', label="data")
ax.plot(x, y_true, 'b-', label="True")
ax.plot(x, res.fittedvalues, 'r--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.legend(loc='best');
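As an aside, recent statsmodels releases (0.8 and later) also expose prediction intervals directly on the results object via get_prediction; a minimal sketch:
In [ ]:
# summary_frame() returns the fitted mean together with confidence and prediction intervals
pred = res.get_prediction()
print(pred.summary_frame(alpha=0.05).head())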
We next simulate data with three groups whose effects are modelled using dummy variables; group 0 is the reference category.
In [ ]:
nsample = 50
groups = np.zeros(nsample, int)
groups[20:40] = 1
groups[40:] = 2
#dummy = (groups[:,None] == np.unique(groups)).astype(float)
dummy = sm.categorical(groups, drop=True)
x = np.linspace(0, 20, nsample)
# drop reference category
X = np.column_stack((x, dummy[:,1:]))
X = sm.add_constant(X, prepend=False)
beta = [1., 3, -3, 10]
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + e
Inspect the data:
In [ ]:
print(X[:5, :])
print(y[:5])
print(groups)
print(dummy[:5, :])
Fit and summary:
In [ ]:
res2 = sm.OLS(y, X).fit()
print(res2.summary())
Draw a plot to compare the true relationship to OLS predictions:
In [ ]:
prstd, iv_l, iv_u = wls_prediction_std(res2)
fig, ax = plt.subplots()
ax.plot(x, y, 'o', label="Data")
ax.plot(x, y_true, 'b-', label="True")
ax.plot(x, res2.fittedvalues, 'r--.', label="Predicted")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.legend(loc="best")
We use an F test to examine the joint hypothesis that the coefficients on both dummy variables are zero, writing the restrictions as a matrix R:
In [ ]:
R = [[0, 1, 0, 0], [0, 0, 1, 0]]
print(np.array(R))
print(res2.f_test(R))
You can also use formula-like syntax to test hypotheses:
In [ ]:
print(res2.f_test("x2 = x3 = 0"))
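A single linear restriction can be checked the same way with t_test; a small sketch (x2 and x3 are the automatically generated names of the dummy columns):
In [ ]:
# test whether the two group effects are equal
print(res2.t_test("x2 = x3"))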
If we generate artificial data with smaller group effects, the F test may no longer reject the null hypothesis:
In [ ]:
beta = [1., 0.3, -0.0, 10]
y_true = np.dot(X, beta)
y = y_true + np.random.normal(size=nsample)
res3 = sm.OLS(y, X).fit()
In [ ]:
print(res3.f_test(R))
In [ ]:
print(res3.f_test("x2 = x3 = 0"))
Next we look at multicollinearity using the Longley dataset, a classic example of strongly correlated regressors:
In [ ]:
from statsmodels.datasets.longley import load_pandas
y = load_pandas().endog
X = load_pandas().exog
X = sm.add_constant(X)
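Inspect the data (a quick look via pandas, since load_pandas returns DataFrames):
In [ ]:
print(X.head())
print(y.head())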
Fit and summary:
In [ ]:
ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()
print(ols_results.summary())
One way to assess multicollinearity is the condition number of the design matrix. First, normalize the independent variables to have unit length:
In [ ]:
norm_x = X.values.copy()
for i, name in enumerate(X):
    if name == "const":
        continue
    norm_x[:, i] = X[name] / np.linalg.norm(X[name])
norm_xtx = np.dot(norm_x.T, norm_x)
Then, we take the square root of the ratio of the largest to the smallest eigenvalue:
In [ ]:
eigs = np.linalg.eigvals(norm_xtx)
condition_number = np.sqrt(eigs.max() / eigs.min())
print(condition_number)
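Equivalently, NumPy can compute this directly from the normalized design matrix, since the 2-norm condition number is the ratio of the largest to smallest singular value:
In [ ]:
# same quantity as above, computed from the singular values of norm_x
print(np.linalg.cond(norm_x))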
Dropping a few observations can change the estimates considerably; refit using only the first 14 rows and compare the coefficients:
In [ ]:
ols_results2 = sm.OLS(y.iloc[:14], X.iloc[:14]).fit()
print("Percentage change %4.2f%%\n" * 7 % tuple((ols_results2.params - ols_results.params) / ols_results.params * 100))
We can also look at formal statistics for this, such as the DFBETAS -- a standardized measure of how much each coefficient changes when that observation is left out.
In [ ]:
infl = ols_results.get_influence()
In general we may consider DFBETAS in absolute value greater than $2/\sqrt{N}$ to be influential observations:
In [ ]:
2./len(X)**.5
In [ ]:
print(infl.summary_frame().filter(regex="dfb"))
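To see which observations exceed that rule of thumb, one might filter the DFBETAS columns against the threshold; a minimal sketch:
In [ ]:
# keep rows where any coefficient's DFBETA exceeds 2/sqrt(N) in absolute value
dfbetas = infl.summary_frame().filter(regex="dfb")
threshold = 2. / len(X) ** .5
print(dfbetas[(dfbetas.abs() > threshold).any(axis=1)])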