In [1]:
import numpy as np
import statsmodels.api as sm
import pandas as pd
from scipy import stats
pd.options.display.float_format = '{:,.4f}'.format
In [2]:
X = sm.add_constant(np.random.randn(100,3)*10)
w = np.array([0.25, 0.1, 0.3, -1.5])
y = np.dot(X,w) + np.random.randn(X.shape[0])
print X.shape, y.shape, w.shape
In [3]:
model = sm.OLS(y, X)
res = model.fit()
print res.summary2()
In [4]:
def R2(y, X, coeffs):
y_hat = np.dot(X, coeffs)
y_mean = np.mean(y)
SST = np.sum((y-y_mean)**2)
SSR = np.sum((y_hat - y_mean)**2)
SSE = np.sum((y_hat - y)**2)
#R_squared = SSR / SST
R_squared = SSE / SST
return 1- R_squared
In [5]:
R2(y, X, res.params)
Out[5]:
In [6]:
def se_coeff(y, X, coeffs):
# Reference: https://en.wikipedia.org/wiki/Ordinary_least_squares#Finite_sample_properties
s2 = np.dot(y, y - np.dot(X, coeffs)) / (X.shape[0] - X.shape[1]) # Calculate S-squared
XX = np.diag(np.linalg.inv(np.dot(X.T, X))) # Calculate
return np.sqrt(s2*XX)
In [7]:
coeffs = res.params
N, K = X.shape
se = se_coeff(y, X, coeffs)
t = coeffs / se
p = stats.t.sf(np.abs(t), N - K)*2
ci = stats.t.ppf((1 + 0.95)/2, N-K)*se
pd.DataFrame(np.vstack((coeffs, se, t, p, coeffs - ci, coeffs + ci)).T, columns=["coeff", "S.E.", "t", "p-value", "ci-", "ci+"])
Out[7]:
In [ ]: