In [10]:
import numpy as np
from selection.lasso import lasso
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [11]:
n, p, sigma = 100, 50, 1.5

def instance(n, p, sigma, frac=0.9, rho=0, beta=None):
    X = np.sqrt(1-rho**2) * np.random.standard_normal((n,p)) + np.sqrt(rho) * np.random.standard_normal(n)[:,None]
    X -= X.mean(0)[None,:]
    X /= np.sqrt((X**2).sum(0))[None,:]
    y = np.random.standard_normal(n) * sigma
    if beta is not None:
        mu = np.dot(X, beta)
    else:
        mu = np.zeros(n)
    y += mu
    L = lasso(y, X, frac=frac, sigma_epsilon=sigma)
    L.alpha = 0.1 # 90% selection intervals
    return L, mu

In [12]:
L, mu = instance(n,p,sigma)

In [13]:
L.l1norm_interval


/Users/jonathantaylor/canopy/lib/python2.7/site-packages/selection/truncated.py:174: RuntimeWarning: invalid value encountered in double_scalars
  x - mu, use_R=self.use_R)) / P.sum()
Out[13]:
(-inf, 5.6651008744740761)

In [14]:
def form_intervals(n, p, sigma, frac=0.9, beta=None, plot=True, rho=0):
    results = []
    unadjusted_results = []
    L, mu = instance(n, p, sigma, frac=frac, beta=beta, rho=rho)
    for i, eta, center, interval in L.intervals:
        lower, upper = interval
        lower = max(-10e6,lower); upper = min(10e6,upper)
        truth = (eta * mu).sum()
        if plot:
            plt.scatter(i, truth, s=30)
        if truth > lower and truth < upper:
            covered = True
        else:
            covered = False
        if plot:
            if not covered:
                plt.plot([i,i], [lower, upper], linewidth=5, color='red', alpha=0.4)
            else:
                plt.plot([i,i], [lower, upper], linewidth=5, color='green', alpha=0.4)
        results.append((truth, covered, lower, upper))
    for i, eta, center, interval in L.unadjusted_intervals:
        lower, upper = interval
        lower = max(-10e6,lower); upper = min(10e6,upper)
        truth = (eta * mu).sum()
        if plot:
            plt.scatter(i, truth, s=30)
        if truth > lower and truth < upper:
            covered = True
        else:
            covered = False
        if plot:
            plt.plot([i,i], [lower, upper], 'k--', linewidth=2)
        unadjusted_results.append((truth, covered, lower, upper))
        
    if plot:
        plt.gca().set_xlim([-1,p+1])
        plt.gca().set_ylim([-12,12])
        return plt.gcf(), results, unadjusted_results, L
    else:
        return None, results, unadjusted_results, L

Null case

Here are a few null simulations:


In [15]:
f = form_intervals(100, 20, 1, frac=0.7)[0]


Now, add a little correlation:


In [16]:
f = form_intervals(100, 20, 1, frac=0.7, rho=0.2)[0]



In [17]:
f = form_intervals(100, 50, 1, frac=0.7)[0]



In [18]:
f = form_intervals(100, 200, 1, frac=0.7)[0]


A few strong signals

We'll repeat the experiment including some strong signals. In the intervals below, the light gray region are the truly active coefficients.


In [19]:
p = 20
beta = np.zeros(p)
beta[:4] =  np.linspace(3.5,4.5,4)
f = form_intervals(100,p,1,frac=0.7, beta=beta, rho=0.2)[0]
a = f.gca()
ylim = a.get_ylim()
a.fill([0,4,4,0],[-100,-100,100,100], alpha=0.2, facecolor='yellow')
a.set_ylim(ylim)
a.set_xlabel(r'Feature $i$')
a.set_ylabel(r'Coefficient: $\hat{\beta}_{{\cal E},i}^{OLS}$')


Out[19]:
<matplotlib.text.Text at 0x1104826d0>

In [20]:
p = 50
beta = np.zeros(p)
beta[:4] = np.linspace(3.5,4.5,4)
f = form_intervals(100,p,1,frac=0.5, beta=beta)[0]
a = f.gca()
ylim = a.get_ylim()
a.fill([0,4,4,0],[-100,-100,100,100], alpha=0.2, facecolor='yellow')
a.set_ylim(ylim)
a.set_xlabel(r'Feature $i$')
a.set_ylabel(r'Coefficient: $\hat{\beta}_{{\cal E},i}^{OLS}$')


Out[20]:
<matplotlib.text.Text at 0x1104e2490>

In [21]:
p = 200
beta = np.zeros(p)
beta[:4] = np.linspace(2,2.5,4) * np.sqrt(2*np.log(p))
f = form_intervals(100,p,1,frac=0.2, beta=beta, rho=0.1)[0]
a = f.gca()
ylim = a.get_ylim()
a.fill([0,4,4,0],[-100,-100,100,100], alpha=0.2, facecolor='yellow')
a.set_ylim(ylim)
a.set_xlabel(r'Feature $i$')
a.set_ylabel(r'Coefficient: $\hat{\beta}_{{\cal E},i}^{OLS}$')
a.set_ylim([0,15])


Out[21]:
(0, 15)

Here's a zoom on the strong coefficients.


In [22]:
f = a.figure
a.set_xlim([-1,11])
f


Out[22]:

When it has included all true variables in the model, the signal intervals should have center at the true value of $\beta[1:4]$.

Combining several instances

Let's take several of the strong signal instances, compute several intervals and plot them.


In [23]:
results = []
unadjusted_results = []
nsim = 50
min_results = 150
p = 200
beta = np.zeros(p)
beta[:4] = [6,7,8,9]

for _ in range(nsim):
    r, ur = form_intervals(100,p,1,frac=0.5, beta=beta, plot=False)[1:3]
    results.extend(r)
    unadjusted_results.extend(ur)
    if len(results) > min_results:
        break

In [24]:
f = plt.figure(figsize=(12,7))
idx = 0
for truth, covered, lower, upper in results:
    lower = max(-10e6,lower); upper = min(10e6,upper)
    plt.scatter(idx, truth, s=30)
    if not covered:
        
        plt.plot([idx,idx], [lower, upper], linewidth=5, color='red', alpha=0.3)
    else:
        plt.plot([idx,idx], [lower, upper], linewidth=5, color='green', alpha=0.3)
    idx += 1
    
idx = 0
for truth, covered, lower, upper in unadjusted_results:
    plt.scatter(idx, truth, s=30)
    plt.plot([idx,idx], [lower, upper], linewidth=1, color='black')
    idx += 1
    
plt.gca().set_xlim([-1,idx+1])
plt.gca().set_ylim([-10,25])


Out[24]:
(-10, 25)

But, if we add correlation, the picture gets a little worse:


In [25]:
results = []
unadjusted_results = []
nsim = 50
min_results = 150
p = 200
beta = np.zeros(p)
beta[:4] = [6,7,8,9]

for _ in range(nsim):
    r, ur = form_intervals(100,p,1,frac=0.5, beta=beta, rho=0.3, plot=False)[1:3]
    results.extend(r)
    unadjusted_results.extend(ur)
    if len(results) > min_results:
        break

In [26]:
f = plt.figure(figsize=(12,7))
idx = 0
for truth, covered, lower, upper in results:
    lower = max(-10e6,lower); upper = min(10e6,upper)
    plt.scatter(idx, truth, s=30)
    if not covered:
        plt.plot([idx,idx], [lower, upper], linewidth=5, color='red', alpha=0.3)
    else:
        plt.plot([idx,idx], [lower, upper], linewidth=5, color='green', alpha=0.3)
    idx += 1
    
idx = 0
for truth, covered, lower, upper in unadjusted_results:
    plt.scatter(idx, truth, s=30)
    plt.plot([idx,idx], [lower, upper], linewidth=1, color='black')
    idx += 1
    
plt.gca().set_xlim([-1,idx+1])
plt.gca().set_ylim([-10,25])


Out[26]:
(-10, 25)

Interval for (pseudo) $\ell_1$ norm of $\beta$

We have an estimate of the support and signs of $\beta$, with it we can form an interval for $$ \hat{s}^T\hat{\beta}_{\hat{\cal E}}(\mu) \overset{?}{\approx} \|\beta\|_1. $$ It's certainly a lower bound on $\|\beta\|_1$ when the linear model is correct.


In [27]:
results = []
nsim = 100
min_results = 100
p = 200
beta = np.zeros(p)
beta[:4] = [3,4,5,6]

for _ in range(nsim):
    L = form_intervals(100,p,1,frac=0.7, beta=beta, rho=0.3, plot=False)[3]
    lower, upper = L.l1norm_interval
    truth = np.sum(beta[L.active] * L.signs)
    if truth > lower and truth < upper:
        covered = True
    else:
        covered = False
    results.extend([(truth, covered, lower, upper)])

In [28]:
f = plt.figure(figsize=(12,7))
idx = 0
for truth, covered, lower, upper in results:
    lower = max(-10e6,lower); upper = min(10e6,upper)
    plt.scatter(idx, truth, s=30)
    if not covered:
        
        plt.plot([idx,idx], [lower, upper], linewidth=5, color='red', alpha=0.3)
    else:
        plt.plot([idx,idx], [lower, upper], linewidth=5, color='green', alpha=0.3)
    idx += 1
    
plt.gca().set_xlim([-1,idx+1])
plt.gca().set_ylim([-10,25])


Out[28]:
(-10, 25)

Hmm.... this one is not working.


In [28]: