Use a binomial likelihood instead of Bernoulli
$l_i(\theta) = - y_i \log(p_i) - (1-y_i) \log(1-p_i)$, $y_i \in \{0,1\}$
becomes
$l_i(\theta) = - k_i \log(p_i) - (n_i-k_i) \log(1-p_i) $
where $p_i = f(x_i^T \theta + \theta_0)$, $k_i$ is the number of successes and $n_i$ the number of trials for example $i$. The residual and gradients become
$ r = n \odot f(X^T \theta + \theta_0) - k $
$ \nabla_0 l = \frac{\sum r}{\sum n} $
$ \nabla l = \frac{X^T r + \lambda \theta}{\sum n} $
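A quick finite-difference sanity check of these expressions on a tiny made-up problem (a minimal sketch; the random `X`, `n`, `k` below are stand-ins, and `lam` plays the role of $\lambda$ in the $\frac{\lambda}{2}\|\theta\|^2 / \sum n$ ridge penalty used in the code below):

import numpy as np
from scipy.optimize import check_grad

rng = np.random.RandomState(0)
M, N = 50, 4
X = (rng.rand(M, N) < 0.3).astype(float)   # small binary design matrix
n = rng.randint(1, 20, size=M)             # trials per row
k = rng.binomial(n, 0.4).astype(float)     # successes per row
n = n.astype(float)
lam = 1.0                                  # stand-in for the ridge penalty lambda

def objective(params):
    intercept, theta = params[0], params[1:]
    p = 1/(1 + np.exp(-(X.dot(theta) + intercept)))
    loss = (-k*np.log(p) - (n - k)*np.log(1 - p)).sum()/n.sum()
    return loss + lam/2*(theta*theta).sum()/n.sum()

def gradient(params):
    intercept, theta = params[0], params[1:]
    r = n/(1 + np.exp(-(X.dot(theta) + intercept))) - k   # r = n*p - k
    return np.hstack([r.sum()/n.sum(), (X.T.dot(r) + lam*theta)/n.sum()])

print(check_grad(objective, gradient, rng.randn(N + 1)))  # should be tiny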
In [33]:
from scipy.optimize import fmin_l_bfgs_b
import numpy as np

ALPHA = 1.0
FACTR = 1e10

def logistic(x):
    return 1/(1+np.exp(-x))

def log_loss(yp_orig, n, k, eps=1e-15):
    # clip predictions away from 0 and 1 so the logs stay finite
    yp = yp_orig.copy()
    yp[yp < eps] = eps
    yp[yp > 1 - eps] = 1 - eps
    x = -k * np.log(yp) - (n - k) * np.log(1 - yp)
    return x.sum()/n.sum()

def lbfgs_func(x, X, n, k, alpha=ALPHA):
    # objective and gradient for fmin_l_bfgs_b; x packs [intercept, coefficients]
    intercept, theta = x[0], x[1:]
    yp = logistic(X.dot(theta) + intercept)
    penalty = alpha/2*(theta*theta).sum()/n.sum()
    obj = log_loss(yp, n, k) + penalty
    r = n*logistic(X.dot(theta)+intercept) - k
    grad_intercept = r.sum()/n.sum()
    grad_coefs = (X.T.dot(r) + alpha*theta)/n.sum()
    grad = np.hstack([grad_intercept, grad_coefs])
    return obj, grad
#x0 = np.zeros(X.shape[1]+1)
#x_opt, _, info = fmin_l_bfgs_b(lbfgs_func, x0, factr=FACTR, args=(X, n, k))
Fake some data
In [86]:
import pandas as pd
import numpy as np
import scipy.sparse
from scipy.stats import norm as norm_dist, uniform
In [212]:
n_features = 25
n_datapoints = 10**6  # use an int: sparse.rand and rvs expect integer sizes
sigma = 2
intercept = 1.14
theta = norm_dist.rvs(0, sigma, size=(n_features, 1))
X = np.ceil(scipy.sparse.rand(n_datapoints, n_features, .3)) # binary features
p = logistic(X.dot(theta) + intercept)
y = uniform.rvs(size=(n_datapoints, 1)) < p
In [213]:
n = np.ones_like(y)[:, 0].astype(float)  # one trial per row
k = y[:, 0].astype(float)                # successes per row
In [214]:
%%time
factr = 1e7
alpha = sigma**-2
x0 = np.zeros(X.shape[1]+1)
x_opt, _, info = fmin_l_bfgs_b(lbfgs_func, x0, factr=factr, args=(X, n, k, alpha))
In [215]:
%precision 2
x_opt[0], x_opt[1:]
Out[215]:
In [216]:
%precision 2
intercept, theta[:, 0]
Out[216]:
In [217]:
%%time
# collapse identical feature rows: sum of y gives successes k, len gives trials n
df = pd.DataFrame(X.todense())
cols = list(df.columns)
df['y'] = y.astype(float)
summarized = df.groupby(cols).aggregate([np.sum, len]).reset_index()
In [218]:
X = summarized.values[:, :-2]  # unique feature rows
n = summarized.values[:, -1]   # trials per row (group size, from len)
k = summarized.values[:, -2]   # successes per row (from sum of y)
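To make the column layout concrete, here is the same collapse on a hypothetical two-feature toy frame (not the simulated data above): after the groupby with `aggregate([np.sum, len])`, the success count ends up in the second-to-last column and the group size in the last.

toy = pd.DataFrame({0: [1, 1, 0, 1],
                    1: [0, 0, 1, 0],
                    'y': [1.0, 0.0, 1.0, 1.0]})
toy_summarized = toy.groupby([0, 1]).aggregate([np.sum, len]).reset_index()
toy_X = toy_summarized.values[:, :-2]  # unique feature rows
toy_k = toy_summarized.values[:, -2]   # successes per row: [1., 2.]
toy_n = toy_summarized.values[:, -1]   # trials per row:    [1., 3.]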
In [219]:
%%time
factr = 1e7
alpha = sigma**-2
x0 = np.zeros(X.shape[1]+1)
x_opt, _, info = fmin_l_bfgs_b(lbfgs_func, x0, factr=factr, args=(X, n, k, alpha))
In [206]:
%precision 2
x_opt[0], x_opt[1:]
Out[206]:
Searching around the internet, I couldn't find an expression for the gradient of regularized logistic regression with sample weights. Here it is:
$$r = f(X^T \theta + \theta_0) - y$$
$$\nabla_0 l = \frac{r^T w}{\sum w}$$
$$\nabla l = \frac{X^T (w \odot r) + \lambda \theta}{\sum w}$$
where $\odot$ indicates element-wise multiplication. The loss function here is
$l(\theta) = -w^T \left(y \odot \log(p) + (1-y) \odot \log(1-p)\right) / \sum w $
with $p = f(X^T \theta + \theta_0)$
i.e. the $w$-weighted average of the usual log-loss.
Note that I'm not regularizing the intercept term, $\theta_0$.
Scikit-learn's LogisticRegression classifier (which uses LIBLINEAR) doesn't support weighted samples. SGDClassifier does support weighted samples, but it can be tricky to tune. For my application, solving the optimization problem without scikit-learn using L-BFGS worked best.
First, the standard unweighted case; in other words, all samples have weight $1.0$. There are $M$ training examples and $N$ features, one of which is the intercept.
$\theta$ - real vector of $N$ coefficients
$y$ - response variable, binary vector of $M$ failures or successes
$X$ - $M$x$N$ design matrix. Element $(i, j)$ of $X$ is the value of the $j$th feature for training example $i$. One of the $N$ columns is a dummy column of ones for the intercept.
$f(x) = \frac{1}{1+e^{-x}}$ - logistic function
$l(\theta)$ - loss function. This is what we're trying to minimize when we're considering different coefficients $\theta$.
$p = f(X^T \theta)$ - real vector of $M$ predictions.
And finally
$l_i(\theta) = - y_i \log(p_i) - (1-y_i) \log(1-p_i) $
$l(\theta) = \sum_i l_i(\theta)/M$ is the average log-loss
$r = f(X^T \theta) - y$ - residual, real vector of $M$ elements
$\nabla l = X^T r / M$ - gradient of the loss function, real vector of $N$ elements
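A quick sanity check on made-up data (a minimal sketch): at $\theta = 0$ every prediction is $0.5$, the average log-loss is $\log 2$, and the residual is simply $0.5 - y$.

import numpy as np

rng = np.random.RandomState(1)
M, N = 100, 3
X = (rng.rand(M, N) < 0.5).astype(float)
y = (rng.rand(M) < 0.5).astype(float)

theta = np.zeros(N)
p = 1/(1 + np.exp(-X.dot(theta)))            # every prediction is 0.5 at theta = 0
r = p - y                                     # residual
loss = (-y*np.log(p) - (1 - y)*np.log(1 - p)).mean()
grad = X.T.dot(r)/M

print(np.isclose(loss, np.log(2)))            # True
print(np.allclose(grad, X.T.dot(0.5 - y)/M))  # True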
Now the weighted, regularized case, with no regularization on the intercept. Again there are $M$ training examples and $N$ features.
$\theta \in \mathbb{R}^N$ - coefficients
$\theta_0 \in \mathbb{R}$ - intercept
$y \in \{0, 1\}^M$ - response variable ($M \times 1$ vector)
$w \in (0, \infty)^M$ - per-example weight ($M \times 1$ vector)
$X \in \{0, 1\}^{M \times N}$ - $M \times N$ design matrix
$f(x) = \frac{1}{1+e^{-x}}$ - logistic function
$l(\theta)$ - loss function
$l(\theta) = -w^T \left(y \odot \log(p) + (1-y) \odot \log(1-p)\right) / \sum w $
where $p = f(X^T \theta + \theta_0)$
aka average of the usual log-loss, weighted by $w$
$$r = f(X^T \theta + \theta_0) - y$$
$$\nabla_0 l = \frac{r^T w}{\sum w}$$
$$\nabla l = \frac{X^T (w \odot r) + \lambda \theta}{\sum w}$$
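A minimal finite-difference check of these weighted expressions on made-up data (the random `X`, `y`, `w` below are stand-ins, and `lam` plays the role of $\lambda$, with the same $\frac{\lambda}{2}\|\theta\|^2/\sum w$ penalty used in the code below):

import numpy as np
from scipy.optimize import check_grad

rng = np.random.RandomState(2)
M, N = 60, 5
X = (rng.rand(M, N) < 0.3).astype(float)
y = (rng.rand(M) < 0.5).astype(float)
w = rng.rand(M) + 0.5                        # strictly positive example weights
lam = 1.0                                    # stand-in for lambda

def objective(params):
    intercept, theta = params[0], params[1:]
    p = 1/(1 + np.exp(-(X.dot(theta) + intercept)))
    loss = -(w*(y*np.log(p) + (1 - y)*np.log(1 - p))).sum()/w.sum()
    return loss + lam/2*(theta*theta).sum()/w.sum()

def gradient(params):
    intercept, theta = params[0], params[1:]
    r = 1/(1 + np.exp(-(X.dot(theta) + intercept))) - y
    return np.hstack([r.dot(w)/w.sum(), (X.T.dot(w*r) + lam*theta)/w.sum()])

print(check_grad(objective, gradient, rng.randn(N + 1)))  # should be tiny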
In [2]:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
In [3]:
def logistic(x):
    return 1/(1+np.exp(-x))

REG = 1.0
In [4]:
def old_method(X, y, w, reg=REG):
    # weighted formulation: one 0/1 row per (example, outcome), with per-row weights w
    def log_loss(yp_orig, yt, w, eps=1e-15):
        yp = yp_orig.copy()
        yp[yp < eps] = eps
        yp[yp > 1 - eps] = 1 - eps
        x = -w * (yt * np.log(yp) + (1 - yt) * np.log(1 - yp))
        return x.sum()/w.sum()

    def lbfgs_func(x, X, y, w, reg):
        m, n = X.shape
        intercept, theta = x[0], x[1:].reshape(n, 1)
        yp = logistic(X.dot(theta) + intercept)[:, 0]
        penalty = reg/2*(theta*theta).sum()/w.sum()
        obj = log_loss(yp, y, w) + penalty
        y, w = y.reshape((m, 1)), w.reshape((m, 1))
        r = logistic(X.dot(theta)+intercept) - y
        grad_intercept = np.average(r, weights=w)
        grad_coefs = (X.T.dot(w*r) + reg*theta)/w.sum()
        grad = np.hstack([grad_intercept, grad_coefs[:, 0]])
        return obj, grad

    def lr(X, y, w, reg, factr=1e10):
        x0 = np.zeros(X.shape[1]+1)  # a coefficient for each feature, plus one for the intercept
        x_opt, _, info = fmin_l_bfgs_b(lbfgs_func, x0, factr=factr, args=(X, y, w, reg))
        assert info['warnflag'] == 0, 'l-BFGS did not converge'
        return x_opt

    x = lr(X, y, w, reg)
    intercept, theta = x[0], x[1:]
    y_pred = logistic(X.dot(theta) + intercept)
    return log_loss(y_pred, y, w)
In [33]:
def new_method(X, n, k, reg=REG):
    # binomial formulation: one row per example, with n trials and k successes
    def log_loss(yp_orig, n, k, eps=1e-15):
        yp = yp_orig.copy()
        yp[yp < eps] = eps
        yp[yp > 1 - eps] = 1 - eps
        x = -k * np.log(yp) - (n - k) * np.log(1 - yp)
        return x.sum()/n.sum()

    def lbfgs_func(x, X, n, k, reg):
        intercept, theta = x[0], x[1:]
        yp = logistic(X.dot(theta) + intercept)
        penalty = reg/2*(theta*theta).sum()/n.sum()
        obj = log_loss(yp, n, k) + penalty
        r = n*logistic(X.dot(theta)+intercept) - k
        grad_intercept = r.sum()/n.sum()
        grad_coefs = (X.T.dot(r) + reg*theta)/n.sum()
        grad = np.hstack([grad_intercept, grad_coefs])
        return obj, grad

    def lr(X, n, k, reg, factr=1e10):
        x0 = np.zeros(X.shape[1]+1)  # a coefficient for each feature, plus one for the intercept
        x_opt, _, info = fmin_l_bfgs_b(lbfgs_func, x0, factr=factr, args=(X, n, k, reg))
        assert info['warnflag'] == 0, 'l-BFGS did not converge'
        return x_opt

    x = lr(X, n, k, reg)
    intercept, theta = x[0], x[1:]
    y_pred = logistic(X.dot(theta) + intercept)
    return log_loss(y_pred, n, k)
In [36]:
def log_loss(yp_orig, n, k, eps=1e-15):
    yp = yp_orig.copy()
    yp[yp < eps] = eps
    yp[yp > 1 - eps] = 1 - eps
    x = -k * np.log(yp) - (n - k) * np.log(1 - yp)
    return x.sum()/n.sum()
In [30]:
x = np.zeros(X.shape[1]+1)
k, n = f.raw_data.leads.values, f.raw_data.clicks.values
In [28]:
# evaluate the objective and gradient by hand, using (M, 1) column vectors
M, N = X.shape
intercept, theta = x[0], x[1:].reshape(N, 1)
yp = logistic(X.dot(theta) + intercept)[:, 0]
penalty = reg/2*(theta*theta).sum()/n.sum()
obj = log_loss(yp, n, k) + penalty
print(obj)
n, k = n.reshape((M, 1)), k.reshape((M, 1))
r = n*logistic(X.dot(theta)+intercept) - k
grad_intercept = r.sum()/n.sum()
grad_coefs = (X.T.dot(r) + reg*theta)/n.sum()
grad = np.hstack([grad_intercept, grad_coefs[:, 0]])
grad
Out[28]:
In [31]:
# the same objective, computed with 1-d arrays
intercept, theta = x[0], x[1:]
yp = logistic(X.dot(theta) + intercept)
penalty = reg/2*(theta*theta).sum()/n.sum()
obj = log_loss(yp, n, k) + penalty
print(obj)
In [32]:
r = n*logistic(X.dot(theta)+intercept) - k
grad_intercept = r.sum()/n.sum()
grad_coefs = (X.T.dot(r) + reg*theta)/n.sum()
grad = np.hstack([grad_intercept, grad_coefs])
grad
Out[32]:
In [8]:
import pandas as pd
import scipy.sparse
from sys import path
path.insert(0, '/home/seanharnett/Dropbox/cvr_redux/cvr-logistic-regression/cvrlr')
from assemble_features import Features, get_feature_categories
In [9]:
data_file = '/home/seanharnett/Documents/vw/db_data_20141003.csv'
data = pd.read_csv(data_file, nrows=100000)
In [10]:
f = Features()
f.raw_data = data
f.feature_categories = {0: [],
                        2: ['adconfiguration'],
                        3: ['provider'],
                        4: ['segment'],
                        7: ['target'],
                        11: ['provider', 'segment'],
                        13: ['provider', 'servicegroup']}
f.encoded_data, f.reverse_encodings = f.encode_feature_dataframe()
f.matrix, f.breaks = f.sparsify()
In [11]:
reg = 15
In [12]:
def assemble_response_variable(leads, clicks):
    """
    Creates the left-hand-side 'y' vector that we're trying to predict from the
    design matrix X, as well as the vector of weights for each example.
    """
    successes = leads
    failures = clicks - leads
    sample_weight = np.hstack([failures, successes])
    y = np.hstack([np.zeros_like(failures), np.ones_like(successes)])
    return y, sample_weight
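For example, with hypothetical counts `clicks = [3, 1, 2]` and `leads = [1, 0, 2]`, the three ads expand to six rows: the first three labeled 0 and weighted by the failure counts, the last three labeled 1 and weighted by the success counts. Zero-weight rows are harmless, since they contribute nothing to the weighted loss.

leads = np.array([1, 0, 2])
clicks = np.array([3, 1, 2])
y_toy, w_toy = assemble_response_variable(leads, clicks)
print(y_toy)  # [0 0 0 1 1 1]
print(w_toy)  # [2 1 0 1 0 2]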
In [34]:
%%time
y, w = assemble_response_variable(f.raw_data.leads, f.raw_data.clicks)
# stack the design matrix twice: one copy for the failure rows, one for the success rows
X = scipy.sparse.vstack([f.matrix, f.matrix])
print(old_method(X, y, w, reg))
In [35]:
%%time
X = f.matrix
k, n = f.raw_data.leads.values, f.raw_data.clicks.values  # successes = leads, trials = clicks
print(new_method(X, n, k, reg))