In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import curve_fit
%matplotlib inline
A common approach when working with binary data (situations where the response $y \in \{0,1\}$) is to perform a logistic (or similar) regression on the data to estimate the probability of one class or the other from a set of input variables, $X$. Once this regression has been performed, the scientist or engineer can use it to make an informed decision about which class an out-of-sample observation $x_i$ will belong to. Ultimately the goal is an accurate prediction of the class to which a set of observations belongs, often along with some measure of error.
This notebook visualizes different situations that can arise in both the response variable and the input variables.
First let us examine the binary regression model:
$\begin{equation} y_i \sim \mathrm{Bernoulli}(\pi_i), \qquad g(\pi_i) = \eta_i = x_i^\top \beta, \end{equation}$
where $y_i \in \{0,1\}, i=1,\dots,n$ is a binary response variable for a collection of $n$ objects, with $p$ covariate measurements $x_i = (x_{i1}, \dots, x_{ip})$. Here $g(u)$ is a link function, $\eta_i$ denotes the linear predictor, and $\beta$ represents a $(p\times1)$ column vector of regression coefficients.
Under this model we can implement iteratively reweighted least squares (IRWLS) to solve for $\beta$ in a variety of settings. If a Bayesian estimate were necessary, methods such as those of Albert and Chib (1993) or Holmes and Held (2006) for probit or logistic regression could be used. This notebook will focus on iteratively reweighted least squares for speed of computation.
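Concretely, for the canonical logit link the log-likelihood has score vector and Fisher information
$\begin{equation} U(\beta) = X^\top (y - \pi), \qquad \mathcal{I}(\beta) = X^\top W X, \qquad W = \mathrm{diag}\big(\pi_i(1-\pi_i)\big), \end{equation}$
and each iteration applies the Fisher scoring update
$\begin{equation} \beta^{(t+1)} = \beta^{(t)} + \mathcal{I}\big(\beta^{(t)}\big)^{-1}\, U\big(\beta^{(t)}\big), \end{equation}$
which is what the score, information, and betaDelta pieces of the code below compute.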
In [2]:
def IRWLS(yi, X, link='logit', tol=1e-8, max_iter=100, verbose=False):
    """
    Iteratively Re-Weighted Least Squares for binary regression
    """
    try:
        nobs, npar = X.shape
    except ValueError:
        nobs = X.shape[0]
        npar = 1
    W = np.identity(nobs)
    # Ordinary least squares as the first guess for beta
    betas = wls(X, W, yi)
    lold = binLklihood(X, yi, betas, link)
    delt = betaDelta(lold)
    step = 0
    while np.sum(np.abs(delt)) > tol and step < max_iter:
        step += 1
        delt = betaDelta(lold)
        lnew = binLklihood(X, yi, betas + delt, link)
        if lnew.likelihood < lold.likelihood:
            # The full step decreased the likelihood: halve it before accepting
            delt = delt / 2.
            betas = betas + delt
            lold = binLklihood(X, yi, betas, link)
        else:
            betas = betas + delt
            lold = lnew
        if verbose:
            print("Step {0}:\nLikelihood: {1}".format(step, lold.likelihood))
    return betas, lold.variance
class binLklihood:
    """Log-likelihood, score, and information for a binary GLM."""
    def __init__(self, X, y, betas, link='logit'):
        self.X = X
        self.y = y
        self.betas = betas
        self.link = link
        if link == 'logit':
            self.pi = invlogit(np.dot(X, betas))
        elif link == 'probit':
            self.pi = stats.norm.cdf(np.dot(X, betas))
        self.W = np.diag(self.pi * (1 - self.pi))
        self.likelihood = loglike(self.y, self.pi)
        self.score = X.transpose().dot(y - self.pi)
        self.information = X.transpose().dot(self.W).dot(X)
        # Asymptotic variance of the MLE: inverse of the Fisher information
        self.variance = np.linalg.pinv(self.information)
def betaDelta(binlk):
    """
    Newton step (change in beta) for a given binomial likelihood object
    """
    return np.linalg.pinv(binlk.information).dot(binlk.score)

def invlogit(val):
    """
    Inverse Logit Function
    """
    return np.exp(val) / (np.exp(val) + 1)

def wls(X, W, yi):
    """
    Weighted Least Squares
    """
    XtWX = X.transpose().dot(W).dot(X)
    XtWy = X.transpose().dot(W).dot(yi)
    return np.linalg.pinv(XtWX).dot(XtWy)

def loglike(yi, pi):
    """
    Binary Log-Likelihood
    """
    vect_loglike = yi * np.log(pi) + (1 - yi) * np.log(1 - pi)
    return np.sum(vect_loglike)
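As a quick sanity check (a sketch of my own, not a cell from the original notebook; names like X_sim and true_beta are made up), we can simulate data from known coefficients and confirm that IRWLS approximately recovers them:
# Simulate a logistic model with known coefficients and refit it
n_sim = 5000
X_sim = np.column_stack((np.ones(n_sim), stats.norm.rvs(size=n_sim)))
true_beta = np.array([-1.0, 2.0])
p_sim = invlogit(np.dot(X_sim, true_beta))
y_sim = (stats.uniform.rvs(size=n_sim) < p_sim).astype(float)
beta_hat, var_hat = IRWLS(y_sim, X_sim)
print(beta_hat)                    # should land near [-1, 2]
print(np.sqrt(np.diag(var_hat)))   # rough standard errors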
Simulation 1 - No Overlap of groups. Balanced Data
In [3]:
def makesamples(p_success, m1, s1, m2, s2, nsim):
    # Coerce to int so float arguments like 1000. still work as a size
    nsim = int(nsim)
    y = stats.bernoulli.rvs(p_success, size=nsim)
    X = np.vstack((np.ones(nsim),
                   np.array([stats.norm.rvs(m1, s1) if i == 1
                             else stats.norm.rvs(m2, s2) for i in y]))).transpose()
    F = plt.figure()
    plt.subplot(211)
    plt.plot(X[:, 1], y, marker='o', linestyle='')
    plt.subplot(212)
    F.set_size_inches(10, 3)
    plt.hist(X[:, 1][y == 1], bins=50, color='r', alpha=0.5)
    plt.hist(X[:, 1][y == 0], bins=50, color='b', alpha=0.5)
    plt.show()
    return X, y

X, y = makesamples(0.5, 10, 1, 5, 1, 1000)
In [4]:
def probability_curve(X, y):
    beta, var = IRWLS(y, X)
    # Evaluate the fitted curve over the observed range of the covariate
    testpts = np.arange(np.min(X[:, 1]), np.max(X[:, 1]), 0.01)
    testmat = np.vstack((np.ones(len(testpts)), testpts)).transpose()
    probs = invlogit(np.dot(testmat, beta))
    F = plt.figure()
    plt.plot(X[:, 1], y, marker='o', linestyle='')
    plt.plot(testpts, probs)
    plt.axhline(y=0.5, linestyle='--')
    F.set_size_inches(10, 3)
    plt.show()
    return beta, var
In [5]:
beta, var = probability_curve(X, y)
Read about ROC curves at the link below; the main idea is that if the curve lies above the red dashed diagonal, the model does better than chance on that type of dataset. If the ROC curve matches the diagonal exactly, then no matter what model you've fit using this method, you'll only do as well as random guessing.
http://en.wikipedia.org/wiki/Receiver_operating_characteristic
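If you want a single number to go with the plot, the area under the ROC curve can be estimated directly. A minimal sketch (the auc helper below is my addition, not part of the notebook's code), using the fact that the AUC equals the probability that a randomly chosen positive example outscores a randomly chosen negative one:
def auc(y, pred):
    """Mann-Whitney estimate of the area under the ROC curve."""
    pos = pred[y == 1]
    neg = pred[y == 0]
    # Compare every positive score against every negative score;
    # ties contribute one half
    wins = (pos[:, None] > neg[None, :]).mean()
    ties = (pos[:, None] == neg[None, :]).mean()
    return wins + 0.5 * ties

# e.g. auc(y, np.dot(X, beta)) after fitting: 0.5 is random guessing, 1.0 is perfect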
In [6]:
def calcRates(y, pred, checkpt, verbose=True):
    true_positive = 0.
    true_negative = 0.
    false_positive = 0.
    false_negative = 0.
    totalP = len(y[y == 1])
    totalN = len(y[y == 0])
    for i in np.arange(len(y)):
        # Predict the positive class whenever the score exceeds the cutoff
        if y[i] == 1 and pred[i] <= checkpt:
            false_negative += 1.
        if y[i] == 1 and pred[i] > checkpt:
            true_positive += 1.
        if y[i] == 0 and pred[i] > checkpt:
            false_positive += 1.
        if y[i] == 0 and pred[i] <= checkpt:
            true_negative += 1.
    # Normalize by the actual class totals: TPR and FNR by the number of
    # positives, TNR and FPR by the number of negatives
    TPR = true_positive / totalP
    TNR = true_negative / totalN
    FPR = false_positive / totalN
    FNR = false_negative / totalP
    if verbose:
        print("True Positive Rate = {0}\n"
              "True Negative Rate = {1}\n"
              "False Positive Rate = {2}\n"
              "False Negative Rate = {3}\n".format(TPR, TNR, FPR, FNR))
    return TPR, TNR, FPR, FNR
In [7]:
def plotROC(y, pred, verbose=False):
    results = [1., 1.]
    # Sweep the cutoff across the range of the scores, so the curve is
    # correct whether pred holds probabilities or linear predictors
    for i in np.linspace(np.min(pred), np.max(pred), 101):
        TPR, TNR, FPR, FNR = calcRates(y, pred, i, verbose)
        results = np.vstack((results, [FPR, TPR]))
    results = np.vstack((results, [0.0, 0.0]))
    F = plt.figure()
    plt.plot(results[:, 0], results[:, 1])
    plt.plot(np.array([0, 1]), np.array([0, 1]), color='r', linestyle="--")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
In [8]:
plotROC(y, np.dot(X,beta))
Simulation 2 - Partial Overlap of groups. Balanced Data
In [9]:
X, y = makesamples(0.5, 6, 1, 5, 1, 1000)
In [10]:
beta, var = probability_curve(X, y)
In [11]:
plotROC(y, np.dot(X,beta))
Simulation 3 - Total Overlap of groups. Balanced Data
In [12]:
X, y = makesamples(0.5, 5, 1, 5, 1, 1000)
In [13]:
beta, var = probability_curve(X, y)
In [14]:
plotROC(y, np.dot(X,beta))
Simulation 4 - No Overlap of groups. Unbalanced Data
In [15]:
x, y = makesamples(0.2, 5, 0.5, 8, 1, 10000)
beta, var = probability_curve(x,y)
plotROC(y, np.dot(x, beta))
Simulation 5 - Partial overlap of groups. Unbalanced Data
In [16]:
x, y = makesamples(0.2, 5, 0.5, 7, 1, 10000)
beta, var = probability_curve(x,y)
plotROC(y, np.dot(x, beta))
Simulation 6 - Overlap of groups. Unbalanced Data
In [17]:
x, y = makesamples(0.2, 5, 0.5, 6, 1, 10000)
beta, var = probability_curve(x,y)
plotROC(y, np.dot(x, beta))
Simulation 7 - No Overlap of groups. Unbalanced Data. Two Dimensions
In [18]:
def makesamples2d(p_success, m_x1, m_y1, s1, m_x2, m_y2, s2, nsim):
    # Coerce to int so float arguments like 10000. still work as a size
    nsim = int(nsim)
    response = stats.bernoulli.rvs(p_success, size=nsim)
    X = np.vstack((np.ones(nsim),
                   np.array([stats.norm.rvs(m_x1, s1) if i == 1
                             else stats.norm.rvs(m_x2, s2) for i in response]),
                   np.array([stats.norm.rvs(m_y1, s1) if i == 1
                             else stats.norm.rvs(m_y2, s2) for i in response]))).transpose()
    F = plt.figure()
    plt.plot(X[response == 1][:, 1], X[response == 1][:, 2], marker='.', linestyle='')
    plt.plot(X[response == 0][:, 1], X[response == 0][:, 2], marker='.', color='r', linestyle='')
    F.set_size_inches(6, 6)
    plt.show()
    return X, response

X, y = makesamples2d(0.2, 7.2, 7.2, 0.5, 5, 5, 1, 10000)
In [19]:
def probability_curve(X, response):
    beta, var = IRWLS(response, X)
    testX1 = np.arange(np.min(X[:, 1]), np.max(X[:, 1]), 0.01)
    F = plt.figure()
    plt.plot(X[response == 1][:, 1], X[response == 1][:, 2], marker='.', linestyle='')
    plt.plot(X[response == 0][:, 1], X[response == 0][:, 2], marker='.', color='r', linestyle='')
    # Lines along which the fitted probability is constant
    plt.plot(testX1, makepline(testX1, beta, 0.5), color='black', linestyle='dotted', marker='')
    plt.plot(testX1, makepline(testX1, beta, 0.25), color='red', linestyle='dashed', marker='')
    plt.plot(testX1, makepline(testX1, beta, 0.05), color='red', linestyle='-', marker='')
    plt.plot(testX1, makepline(testX1, beta, 0.75), color='blue', linestyle='dashed', marker='')
    plt.plot(testX1, makepline(testX1, beta, 0.95), color='blue', linestyle='-', marker='')
    F.set_size_inches(6, 6)
    plt.show()
    return beta, var

def makepline(X, beta, p):
    """
    Solve beta0 + beta1*x1 + beta2*x2 = logit(p) for x2, giving the
    constant-probability line at level p
    """
    beta3 = beta[2]
    beta_star = beta[:-1]
    X_star = np.vstack((np.ones(len(X)), X)).transpose()
    x3 = (logiT(p) - np.dot(X_star, beta_star)) / beta3
    return x3

def logiT(p):
    """
    Logit function
    """
    return np.log(p / (1 - p))
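In other words, each contour in the plot is the straight line obtained by fixing the fitted probability at $p$ and solving the linear predictor for $x_2$:
$\begin{equation} \beta_0 + \beta_1 x_1 + \beta_2 x_2 = \mathrm{logit}(p) \;\Longrightarrow\; x_2 = \frac{\mathrm{logit}(p) - \beta_0 - \beta_1 x_1}{\beta_2}. \end{equation}$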
In [20]:
beta, var = probability_curve(X,y)
In [21]:
plotROC(y, np.dot(X,beta))
Simulation 8 - Partial overlap. Unbalanced Data. Two Dimensions
In [22]:
X, y = makesamples2d(0.2, 6.5, 6.5, 0.5, 5, 5, 1, 10000)
beta, var = probability_curve(X,y)
plotROC(y, np.dot(X,beta))
Simulation 9 - Complete overlap. Unbalanced Data. Two Dimensions
In [23]:
X, y = makesamples2d(0.2, 5, 5, 0.5, 5, 5, 1, 10000)
beta, var = probability_curve(X,y)
plotROC(y, np.dot(X,beta))
In [24]:
def makeChandraSamples2d(p_success, m_x1, m_y1, sx1, sy1, m_x2, m_y2, sx2, sy2, nsim):
    response = stats.bernoulli.rvs(p_success, size=nsim)
    X = np.vstack((np.ones(nsim),
                   np.array([stats.norm.rvs(m_x1, sx1) if i == 1
                             else stats.norm.rvs(m_x2, sx2) for i in response]),
                   np.array([stats.norm.rvs(m_y1, sy1) if i == 1
                             else stats.norm.rvs(m_y2, sy2) for i in response]))).transpose()
    F = plt.figure()
    plt.plot(X[response == 0][:, 1], X[response == 0][:, 2], marker='.', color='b', linestyle='')
    plt.plot(X[response == 1][:, 1], X[response == 1][:, 2], marker='x', color='r', linestyle='')
    plt.show()
    return X, response
In [25]:
X, y = makeChandraSamples2d(0.25, 9.7, 0.12, 0.5, 0.02, 8.5, 0.12, 1, 0.02, 8000)
beta, var = probability_curve(X,y)
plotROC(y, np.dot(X, beta), verbose=False)
Now let's play with some non-linear regression! We expand the linear predictor with quadratic terms:
$\begin{equation} \eta = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_1^2 + \beta_4 x_2^2 + \beta_5 x_1 x_2 \end{equation}$
(the interaction term $\beta_5 x_1 x_2$ is left commented out in the code below).
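With the squared terms included, the level sets of the fitted probability are conic sections rather than straight lines:
$\begin{equation} \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_1^2 + \beta_4 x_2^2 = \mathrm{logit}(p), \end{equation}$
which is generically an ellipse when $\beta_3$ and $\beta_4$ share a sign, so the fitted decision boundary can wrap around one of the groups.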
In [37]:
def makeNonLinear2d(p_success, m_x1, m_y1, sx1, sy1, m_x2, m_y2, sx2, sy2, nsim):
    response = stats.bernoulli.rvs(p_success, size=nsim)
    X_linear = np.vstack((np.ones(nsim),
                          np.array([stats.norm.rvs(m_x1, sx1) if i == 1
                                    else stats.norm.rvs(m_x2, sx2) for i in response]),
                          np.array([stats.norm.rvs(m_y1, sy1) if i == 1
                                    else stats.norm.rvs(m_y2, sy2) for i in response])))
    # Augment the design matrix with squared terms; the interaction
    # X_linear[1,:]*X_linear[2,:] could be appended here as well
    X_nonlinear = np.vstack((X_linear, X_linear[1, :]**2, X_linear[2, :]**2)).transpose()
    X_linear = X_linear.transpose()
    F = plt.figure()
    plt.plot(X_linear[response == 0][:, 1], X_linear[response == 0][:, 2], marker='.', color='b', linestyle='')
    plt.plot(X_linear[response == 1][:, 1], X_linear[response == 1][:, 2], marker='x', color='r', linestyle='')
    plt.show()
    return X_linear, X_nonlinear, response
In [38]:
def probability_curve_nonlinear(X, Xnl, response):
    beta_linear, var_linear = IRWLS(response, X)
    beta_nonlinear, var_nonlinear = IRWLS(response, Xnl)
    x1 = np.arange(-2, 12, 0.1)
    x2 = np.arange(-2, 12, 0.1)
    xx1, xx2 = np.meshgrid(x1, x2, sparse=True)
    # Fitted probability surfaces on the grid; the interaction term
    # beta_nonlinear[5]*xx1*xx2 would slot in here if it were included
    zz_nl = invlogit(beta_nonlinear[0] + beta_nonlinear[1]*xx1 + beta_nonlinear[2]*xx2
                     + beta_nonlinear[3]*xx1*xx1 + beta_nonlinear[4]*xx2*xx2)
    zz_l = invlogit(beta_linear[0] + beta_linear[1]*xx1 + beta_linear[2]*xx2)
    F = plt.figure()
    plt.plot(X[response == 0][:, 1], X[response == 0][:, 2], marker='.', linestyle='')
    plt.plot(X[response == 1][:, 1], X[response == 1][:, 2], marker='x', color='r', linestyle='')
    CS = plt.contour(x1, x2, zz_nl, colors='k')
    plt.clabel(CS, inline=1, fontsize=10)
    plt.show()
    F = plt.figure()
    plt.plot(X[response == 0][:, 1], X[response == 0][:, 2], marker='.', linestyle='')
    plt.plot(X[response == 1][:, 1], X[response == 1][:, 2], marker='x', color='r', linestyle='')
    CS = plt.contour(x1, x2, zz_l, colors='k')
    plt.clabel(CS, inline=1, fontsize=10)
    plt.show()
    return beta_linear, var_linear, beta_nonlinear, var_nonlinear
In [40]:
Xl, Xnl, y = makeNonLinear2d(0.1, 3, 3, 1, 1, 5, 5, 2, 2, 1000)
b1, v1, b2, v2 = probability_curve_nonlinear(Xl, Xnl, y)
plotROC(y, np.dot(Xl,b1))
plotROC(y, np.dot(Xnl,b2))