Measuring importance of coefficients of OLS

$s^2 = \frac{y^TMy}{n -p} = \frac{y^T(y - X\hat{\beta})}{n -p}$

$s.e.(\hat{\beta_{j}}) = \sqrt{s^2(X^TX)^{-1}_{jj}}$

$t = \frac{\hat{\beta}}{s.e.(\hat{\beta})}$

$p = SF(|t|, n-p) * 2$

$c.i. = PPF((1 + confidence)/2, n-p)$



In [1]:

    
import numpy as np
import statsmodels.api as sm
import pandas as pd
from scipy import stats
pd.options.display.float_format = '{:,.4f}'.format



In [2]:

    
X = sm.add_constant(np.random.randn(100,3)*10)
w = np.array([0.25, 0.1, 0.3, -1.5])
y = np.dot(X,w) + np.random.randn(X.shape[0])
print X.shape, y.shape, w.shape









    



(100, 4) (100,) (4,)



In [3]:

    
model = sm.OLS(y, X)
res = model.fit()
print res.summary2()









    



            Results: Ordinary least squares
=======================================================
Model:              OLS   AIC:                297.7558 
Dependent Variable: y     BIC:                308.1765 
No. Observations:   100   Log-Likelihood:     -144.88  
Df Model:           3     F-statistic:        7662.    
Df Residuals:       96    Prob (F-statistic): 4.06e-114
R-squared:          0.996 Scale:              1.1057   
Adj. R-squared:     0.996                              
-------------------------------------------------------
       Coef.  Std.Err.     t     P>|t|   [0.025  0.975]
-------------------------------------------------------
const  0.3729   0.1060    3.5167 0.0007  0.1624  0.5834
x1     0.0833   0.0108    7.6968 0.0000  0.0618  0.1048
x2     0.2898   0.0113   25.5539 0.0000  0.2673  0.3123
x3    -1.5043   0.0101 -149.0287 0.0000 -1.5243 -1.4842
-------------------------------------------------------
Omnibus:            0.005    Durbin-Watson:       1.747
Prob(Omnibus):      0.997    Jarque-Bera (JB):    0.050
Skew:               0.005    Prob(JB):            0.976
Kurtosis:           2.891    Condition No.:       11   
=======================================================



In [4]:

    
def R2(y, X, coeffs):
    y_hat = np.dot(X, coeffs)
    y_mean = np.mean(y)
    SST = np.sum((y-y_mean)**2)
    SSR = np.sum((y_hat - y_mean)**2)
    SSE = np.sum((y_hat - y)**2)
    #R_squared = SSR / SST
    R_squared = SSE / SST
    return 1- R_squared



In [5]:

    
R2(y, X, res.params)









    Out[5]:





0.9958407589373941



In [6]:

    
def se_coeff(y, X, coeffs):
    # Reference: https://en.wikipedia.org/wiki/Ordinary_least_squares#Finite_sample_properties
    s2 = np.dot(y, y - np.dot(X, coeffs)) / (X.shape[0] - X.shape[1]) # Calculate S-squared
    XX = np.diag(np.linalg.inv(np.dot(X.T, X))) # Calculate 
    return np.sqrt(s2*XX)



In [7]:

    
coeffs = res.params
N, K = X.shape
se = se_coeff(y, X, coeffs)
t = coeffs / se
p = stats.t.sf(np.abs(t), N - K)*2
ci = stats.t.ppf((1 + 0.95)/2, N-K)*se
pd.DataFrame(np.vstack((coeffs, se, t, p, coeffs - ci, coeffs + ci)).T, columns=["coeff", "S.E.", "t", "p-value", "ci-", "ci+"])



In [ ]:

	coeff	S.E.	t	p-value	ci-	ci+
0	0.3729	0.1060	3.5167	0.0007	0.1624	0.5834
1	0.0833	0.0108	7.6968	0.0000	0.0618	0.1048
2	0.2898	0.0113	25.5539	0.0000	0.2673	0.3123
3	-1.5043	0.0101	-149.0287	0.0000	-1.5243	-1.4842