In [10]:
import numpy as np
from selection.lasso import lasso
%pylab inline
In [11]:
n, p, sigma = 100, 50, 1.5
def instance(n, p, sigma, frac=0.9, rho=0, beta=None):
X = np.sqrt(1-rho**2) * np.random.standard_normal((n,p)) + np.sqrt(rho) * np.random.standard_normal(n)[:,None]
X -= X.mean(0)[None,:]
X /= np.sqrt((X**2).sum(0))[None,:]
y = np.random.standard_normal(n) * sigma
if beta is not None:
mu = np.dot(X, beta)
else:
mu = np.zeros(n)
y += mu
L = lasso(y, X, frac=frac, sigma_epsilon=sigma)
L.alpha = 0.1 # 90% selection intervals
return L, mu
In [12]:
L, mu = instance(n,p,sigma)
In [13]:
L.l1norm_interval
Out[13]:
In [14]:
def form_intervals(n, p, sigma, frac=0.9, beta=None, plot=True, rho=0):
results = []
unadjusted_results = []
L, mu = instance(n, p, sigma, frac=frac, beta=beta, rho=rho)
for i, eta, center, interval in L.intervals:
lower, upper = interval
lower = max(-10e6,lower); upper = min(10e6,upper)
truth = (eta * mu).sum()
if plot:
plt.scatter(i, truth, s=30)
if truth > lower and truth < upper:
covered = True
else:
covered = False
if plot:
if not covered:
plt.plot([i,i], [lower, upper], linewidth=5, color='red', alpha=0.4)
else:
plt.plot([i,i], [lower, upper], linewidth=5, color='green', alpha=0.4)
results.append((truth, covered, lower, upper))
for i, eta, center, interval in L.unadjusted_intervals:
lower, upper = interval
lower = max(-10e6,lower); upper = min(10e6,upper)
truth = (eta * mu).sum()
if plot:
plt.scatter(i, truth, s=30)
if truth > lower and truth < upper:
covered = True
else:
covered = False
if plot:
plt.plot([i,i], [lower, upper], 'k--', linewidth=2)
unadjusted_results.append((truth, covered, lower, upper))
if plot:
plt.gca().set_xlim([-1,p+1])
plt.gca().set_ylim([-12,12])
return plt.gcf(), results, unadjusted_results, L
else:
return None, results, unadjusted_results, L
In [15]:
f = form_intervals(100, 20, 1, frac=0.7)[0]
Now, add a little correlation:
In [16]:
f = form_intervals(100, 20, 1, frac=0.7, rho=0.2)[0]
In [17]:
f = form_intervals(100, 50, 1, frac=0.7)[0]
In [18]:
f = form_intervals(100, 200, 1, frac=0.7)[0]
We'll repeat the experiment including some strong signals. In the intervals below, the light gray region are the truly active coefficients.
In [19]:
p = 20
beta = np.zeros(p)
beta[:4] = np.linspace(3.5,4.5,4)
f = form_intervals(100,p,1,frac=0.7, beta=beta, rho=0.2)[0]
a = f.gca()
ylim = a.get_ylim()
a.fill([0,4,4,0],[-100,-100,100,100], alpha=0.2, facecolor='yellow')
a.set_ylim(ylim)
a.set_xlabel(r'Feature $i$')
a.set_ylabel(r'Coefficient: $\hat{\beta}_{{\cal E},i}^{OLS}$')
Out[19]:
In [20]:
p = 50
beta = np.zeros(p)
beta[:4] = np.linspace(3.5,4.5,4)
f = form_intervals(100,p,1,frac=0.5, beta=beta)[0]
a = f.gca()
ylim = a.get_ylim()
a.fill([0,4,4,0],[-100,-100,100,100], alpha=0.2, facecolor='yellow')
a.set_ylim(ylim)
a.set_xlabel(r'Feature $i$')
a.set_ylabel(r'Coefficient: $\hat{\beta}_{{\cal E},i}^{OLS}$')
Out[20]:
In [21]:
p = 200
beta = np.zeros(p)
beta[:4] = np.linspace(2,2.5,4) * np.sqrt(2*np.log(p))
f = form_intervals(100,p,1,frac=0.2, beta=beta, rho=0.1)[0]
a = f.gca()
ylim = a.get_ylim()
a.fill([0,4,4,0],[-100,-100,100,100], alpha=0.2, facecolor='yellow')
a.set_ylim(ylim)
a.set_xlabel(r'Feature $i$')
a.set_ylabel(r'Coefficient: $\hat{\beta}_{{\cal E},i}^{OLS}$')
a.set_ylim([0,15])
Out[21]:
Here's a zoom on the strong coefficients.
In [22]:
f = a.figure
a.set_xlim([-1,11])
f
Out[22]:
When it has included all true variables in the model, the signal intervals should have center at the true value of $\beta[1:4]$.
In [23]:
results = []
unadjusted_results = []
nsim = 50
min_results = 150
p = 200
beta = np.zeros(p)
beta[:4] = [6,7,8,9]
for _ in range(nsim):
r, ur = form_intervals(100,p,1,frac=0.5, beta=beta, plot=False)[1:3]
results.extend(r)
unadjusted_results.extend(ur)
if len(results) > min_results:
break
In [24]:
f = plt.figure(figsize=(12,7))
idx = 0
for truth, covered, lower, upper in results:
lower = max(-10e6,lower); upper = min(10e6,upper)
plt.scatter(idx, truth, s=30)
if not covered:
plt.plot([idx,idx], [lower, upper], linewidth=5, color='red', alpha=0.3)
else:
plt.plot([idx,idx], [lower, upper], linewidth=5, color='green', alpha=0.3)
idx += 1
idx = 0
for truth, covered, lower, upper in unadjusted_results:
plt.scatter(idx, truth, s=30)
plt.plot([idx,idx], [lower, upper], linewidth=1, color='black')
idx += 1
plt.gca().set_xlim([-1,idx+1])
plt.gca().set_ylim([-10,25])
Out[24]:
But, if we add correlation, the picture gets a little worse:
In [25]:
results = []
unadjusted_results = []
nsim = 50
min_results = 150
p = 200
beta = np.zeros(p)
beta[:4] = [6,7,8,9]
for _ in range(nsim):
r, ur = form_intervals(100,p,1,frac=0.5, beta=beta, rho=0.3, plot=False)[1:3]
results.extend(r)
unadjusted_results.extend(ur)
if len(results) > min_results:
break
In [26]:
f = plt.figure(figsize=(12,7))
idx = 0
for truth, covered, lower, upper in results:
lower = max(-10e6,lower); upper = min(10e6,upper)
plt.scatter(idx, truth, s=30)
if not covered:
plt.plot([idx,idx], [lower, upper], linewidth=5, color='red', alpha=0.3)
else:
plt.plot([idx,idx], [lower, upper], linewidth=5, color='green', alpha=0.3)
idx += 1
idx = 0
for truth, covered, lower, upper in unadjusted_results:
plt.scatter(idx, truth, s=30)
plt.plot([idx,idx], [lower, upper], linewidth=1, color='black')
idx += 1
plt.gca().set_xlim([-1,idx+1])
plt.gca().set_ylim([-10,25])
Out[26]:
We have an estimate of the support and signs of $\beta$, with it we can form an interval for $$ \hat{s}^T\hat{\beta}_{\hat{\cal E}}(\mu) \overset{?}{\approx} \|\beta\|_1. $$ It's certainly a lower bound on $\|\beta\|_1$ when the linear model is correct.
In [27]:
results = []
nsim = 100
min_results = 100
p = 200
beta = np.zeros(p)
beta[:4] = [3,4,5,6]
for _ in range(nsim):
L = form_intervals(100,p,1,frac=0.7, beta=beta, rho=0.3, plot=False)[3]
lower, upper = L.l1norm_interval
truth = np.sum(beta[L.active] * L.signs)
if truth > lower and truth < upper:
covered = True
else:
covered = False
results.extend([(truth, covered, lower, upper)])
In [28]:
f = plt.figure(figsize=(12,7))
idx = 0
for truth, covered, lower, upper in results:
lower = max(-10e6,lower); upper = min(10e6,upper)
plt.scatter(idx, truth, s=30)
if not covered:
plt.plot([idx,idx], [lower, upper], linewidth=5, color='red', alpha=0.3)
else:
plt.plot([idx,idx], [lower, upper], linewidth=5, color='green', alpha=0.3)
idx += 1
plt.gca().set_xlim([-1,idx+1])
plt.gca().set_ylim([-10,25])
Out[28]:
Hmm.... this one is not working.
In [28]: