In [1]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG
Version information of the code analyzed:
~/W/g/scikit-learn ❯❯❯ git log -n 1
commit d161bfaa1a42da75f4940464f7f1c524ef53484f
Author: John B Nelson <jnelso11@gmu.edu>
Date: Thu May 26 18:36:37 2016 -0400
Add missing double quote (#6831)
In [2]:
SVG("./res/sklearn_lr.svg")
Out[2]:
As in the note on the Spark implementation of logistic regression, we focus on the algorithmic core, namely the optimization solvers and the loss functions.

sklearn supports four solvers: liblinear, sag, lbfgs and newton-cg. lbfgs comes from the scipy package, liblinear wraps the LibLinear library, and the remaining two are implemented in sklearn itself. The dispatch code is easy to locate and its logic is straightforward:
704 if solver == 'lbfgs':
705 try:
706 w0, loss, info = optimize.fmin_l_bfgs_b(
707 func, w0, fprime=None,
708 args=(X, target, 1. / C, sample_weight),
709 iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
710 except TypeError:
711 # old scipy doesn't have maxiter
712 w0, loss, info = optimize.fmin_l_bfgs_b(
713 func, w0, fprime=None,
714 args=(X, target, 1. / C, sample_weight),
715 iprint=(verbose > 0) - 1, pgtol=tol)
716 if info["warnflag"] == 1 and verbose > 0:
717 warnings.warn("lbfgs failed to converge. Increase the number "
718 "of iterations.")
719 try:
720 n_iter_i = info['nit'] - 1
721 except:
722 n_iter_i = info['funcalls'] - 1
723 elif solver == 'newton-cg':
724 args = (X, target, 1. / C, sample_weight)
725 w0, n_iter_i = newton_cg(hess, func, grad, w0, args=args,
726 maxiter=max_iter, tol=tol)
727 elif solver == 'liblinear':
728 coef_, intercept_, n_iter_i, = _fit_liblinear(
729 X, target, C, fit_intercept, intercept_scaling, None,
730 penalty, dual, verbose, max_iter, tol, random_state,
731 sample_weight=sample_weight)
732 if fit_intercept:
733 w0 = np.concatenate([coef_.ravel(), intercept_])
734 else:
735 w0 = coef_.ravel()
736
737 elif solver == 'sag':
738 if multi_class == 'multinomial':
739 target = target.astype(np.float64)
740 loss = 'multinomial'
741 else:
742 loss = 'log'
743
744 w0, n_iter_i, warm_start_sag = sag_solver(
745 X, target, sample_weight, loss, 1. / C, max_iter, tol,
746 verbose, random_state, False, max_squared_sum, warm_start_sag)
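Before moving on to the multi-class handling, a quick usage-level sketch: the solver argument of the public LogisticRegression estimator is what selects the branch taken in the dispatch code above. The toy data below is made up purely for illustration.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Illustrative data only; each solver ends up in one branch of the
# dispatch shown above.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)

for solver in ("liblinear", "lbfgs", "newton-cg", "sag"):
    clf = LogisticRegression(solver=solver, max_iter=200, random_state=0)
    clf.fit(X, y)
    print(solver, clf.n_iter_)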
sklearn supports two multi-class schemes: ovr (one vs rest) and multinomial.

The default is ovr, which trains one binary classifier per label, $K$ classifiers in total. The training code is here:
1230 fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1231 backend=backend)(
1232 path_func(X, y, pos_class=class_, Cs=[self.C],
1233 fit_intercept=self.fit_intercept, tol=self.tol,
1234 verbose=self.verbose, solver=self.solver, copy=False,
1235 multi_class=self.multi_class, max_iter=self.max_iter,
1236 class_weight=self.class_weight, check_input=False,
1237 random_state=self.random_state, coef=warm_start_coef_,
1238 max_squared_sum=max_squared_sum,
1239 sample_weight=sample_weight)
1240 for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
Note how the for (class_, warm_start_coef_) in zip(classes_, warm_start_coef) at 1240L, combined with pos_class=class_ at 1232L, implements the one-label-at-a-time training logic.
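To make that loop explicit, here is a stripped-down sketch of the one-vs-rest idea; fit_binary is a hypothetical stand-in for a single call to logistic_regression_path with pos_class set, not actual sklearn code.

import numpy as np

def ovr_fit_sketch(X, y, fit_binary):
    # Simplified one-vs-rest: train one binary classifier per label.
    classes = np.unique(y)
    coefs = []
    for class_ in classes:                     # analogue of the loop at 1240L
        target = np.where(y == class_, 1, -1)  # analogue of pos_class=class_ at 1232L
        coefs.append(fit_binary(X, target))
    return classes, coefs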
As described above, ovr iterates over the labels and trains them one by one. To reuse that loop, the genuinely binary case needs an adjustment:
1201 if len(self.classes_) == 2:
1202 n_classes = 1
1203 classes_ = classes_[1:]
Similarly, multinomial handles all labels in a single fit, so it also needs an adjustment:
1217 # Hack so that we iterate only once for the multinomial case.
1218 if self.multi_class == 'multinomial':
1219 classes_ = [None]
1220 warm_start_coef = [warm_start_coef]
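The effect of these two adjustments is visible in the shape of the fitted coefficients. A small illustrative check on toy data (default settings; the exact numbers are not the point):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Binary problem: thanks to the n_classes = 1 adjustment above, only one
# weight vector is stored, not one per class.
Xb, yb = make_classification(n_samples=200, n_features=6, random_state=0)
print(LogisticRegression().fit(Xb, yb).coef_.shape)   # (1, 6)

# Three-class problem with the default ovr scheme: one row per class.
Xm, ym = make_classification(n_samples=300, n_features=6, n_informative=4,
                             n_classes=3, random_state=0)
print(LogisticRegression().fit(Xm, ym).coef_.shape)   # (3, 6)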
Next, let's look at how the multinomial loss and its gradient are computed, namely the _multinomial_loss_grad function.
The multi-class code in sklearn uses the same formulas as the note on the logistic regression algorithm and its Python implementation, namely:
\begin{align} L(\beta) &= \log\left(\sum_i e^{\beta_{i0} + \beta_i x}\right) - (\beta_{k0} + \beta_k x) \\ \frac{\partial L}{\partial \beta_k} &= x \left( \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} - I(y = k) \right) \end{align}
Concretely, the loss function:
244 def _multinomial_loss(w, X, Y, alpha, sample_weight):
245 #+-- 37 lines: """Computes multinomial loss and class probabilities.---
282 n_classes = Y.shape[1]
283 n_features = X.shape[1]
284 fit_intercept = w.size == (n_classes * (n_features + 1))
285 w = w.reshape(n_classes, -1)
286 sample_weight = sample_weight[:, np.newaxis]
287 if fit_intercept:
288 intercept = w[:, -1]
289 w = w[:, :-1]
290 else:
291 intercept = 0
292 p = safe_sparse_dot(X, w.T)
293 p += intercept
294 p -= logsumexp(p, axis=1)[:, np.newaxis]
295 loss = -(sample_weight * Y * p).sum()
296 loss += 0.5 * alpha * squared_norm(w)
297 p = np.exp(p, p)
298 return loss, p, w
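Line 294 above, p -= logsumexp(p, axis=1)[:, np.newaxis], is where the numerically stable log-softmax is computed. As an aside, this is roughly the trick the logsumexp helper applies; a minimal sketch of my own, not its actual implementation:

import numpy as np

def logsumexp_sketch(a, axis=1):
    # Stable log(sum(exp(a), axis)): subtract the row-wise max so that the
    # largest exponent is exp(0) = 1 and exp() cannot overflow.
    m = np.max(a, axis=axis, keepdims=True)
    return m.squeeze(axis) + np.log(np.exp(a - m).sum(axis=axis))

a = np.array([[1000.0, 1001.0, 1002.0]])  # naive exp() would overflow here
print(logsumexp_sketch(a))                # ~ [1002.4076]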
So the overflow protection lives entirely inside the logsumexp call; the principle is the same as in the note on the Spark implementation. Next, the gradient computation:
301 def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
302 #+-- 37 lines: """Computes the multinomial loss, gradient and class probabilities.---
339 n_classes = Y.shape[1]
340 n_features = X.shape[1]
341 fit_intercept = (w.size == n_classes * (n_features + 1))
342 grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
343 loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
344 sample_weight = sample_weight[:, np.newaxis]
345 diff = sample_weight * (p - Y)
346 grad[:, :n_features] = safe_sparse_dot(diff.T, X)
347 grad[:, :n_features] += alpha * w
348 if fit_intercept:
349 grad[:, -1] = diff.sum(axis=0)
350 return loss, grad.ravel(), p
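A quick way to convince yourself that this analytic gradient matches the loss is a finite-difference check. A hedged sketch on toy data, calling the private functions directly; the import path matches the sklearn version analyzed here and may differ in later releases:

import numpy as np
from scipy.optimize import check_grad
from sklearn.linear_model.logistic import (_multinomial_loss,
                                            _multinomial_loss_grad)

rng = np.random.RandomState(0)
n_samples, n_features, n_classes = 20, 5, 3
X = rng.randn(n_samples, n_features)
Y = np.eye(n_classes)[rng.randint(n_classes, size=n_samples)]  # one-hot labels
sample_weight = np.ones(n_samples)
w0 = rng.randn(n_classes * n_features)   # no intercept term

loss = lambda w: _multinomial_loss(w, X, Y, 1.0, sample_weight)[0]
grad = lambda w: _multinomial_loss_grad(w, X, Y, 1.0, sample_weight)[1]
print(check_grad(loss, grad, w0))        # should be close to 0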
Note also that sklearn supports Newton's method, which needs the Hessian matrix; see the Wikipedia entry Hessian matrix for the definition,
\begin{equation} {\mathbf H}={\begin{bmatrix}{\dfrac {\partial ^{2}f}{\partial x_{1}^{2}}}&{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{n}}}\\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{2}^{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{n}}}\\[2.2ex]\vdots &\vdots &\ddots &\vdots \\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{n}^{2}}}\end{bmatrix}}. \end{equation}
that is, simply the matrix of second-order partial derivatives. The derivation is omitted here; if interested, see Logistic Regression - Jia Li or Logistic regression: a simple ANN by Nando de Freitas.
The key formula is $\mathbf{H} = \mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X}$, where $\pi_i = \operatorname{sigm}(x_i \beta)$.
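For completeness, a brief sketch of where this comes from (my own notation, binary case with labels $y_i \in \{-1, +1\}$; the $\alpha$ terms are the L2 penalty, which the code adds separately):
\begin{align} L(\beta) &= \sum_i \log\left(1 + e^{-y_i x_i \beta}\right) + \frac{\alpha}{2}\lVert\beta\rVert^2 \\ \nabla L &= \sum_i \left(\pi_i - I(y_i = 1)\right) x_i^T + \alpha\beta, \qquad \pi_i = \operatorname{sigm}(x_i \beta) \\ \mathbf{H} = \nabla^2 L &= \sum_i \pi_i (1 - \pi_i)\, x_i^T x_i + \alpha \mathbf{I} = \mathbf{X}^T \operatorname{diag}\left(\pi_i(1-\pi_i)\right) \mathbf{X} + \alpha \mathbf{I} \end{align}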
167 def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
168 #+-- 33 lines: """Computes the gradient and the Hessian, in the case of a logistic loss.
201 w, c, yz = _intercept_dot(w, X, y)
202 #+-- 4 lines: if sample_weight is None:---------
206 z = expit(yz)
207 #+-- 8 lines: z0 = sample_weight * (z - 1) * y---
215 # The mat-vec product of the Hessian
216 d = sample_weight * z * (1 - z)
217 if sparse.issparse(X):
218 dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
219 shape=(n_samples, n_samples)), X)
220 else:
221 # Precompute as much as possible
222 dX = d[:, np.newaxis] * X
223
224 if fit_intercept:
225 # Calculate the double derivative with respect to intercept
226 # In the case of sparse matrices this returns a matrix object.
227 dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))
228
229 def Hs(s):
230 ret = np.empty_like(s)
231 ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
232 ret[:n_features] += alpha * s[:n_features]
233
234 # For the fit intercept case.
235 if fit_intercept:
236 ret[:n_features] += s[-1] * dd_intercept
237 ret[-1] = dd_intercept.dot(s[:n_features])
238 ret[-1] += d.sum() * s[-1]
239 return ret
240
241 return grad, Hs
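To check that Hs really is the Hessian-vector product of the formula above (ignoring the intercept row and column), a small sketch with an explicitly built Hessian; again the private import path matches the sklearn version analyzed here:

import numpy as np
from sklearn.linear_model.logistic import _logistic_grad_hess

rng = np.random.RandomState(0)
n_samples, n_features = 30, 4
X = rng.randn(n_samples, n_features)
y = rng.choice([-1.0, 1.0], size=n_samples)
w = rng.randn(n_features)                 # no intercept term
alpha = 1.0

grad, Hs = _logistic_grad_hess(w, X, y, alpha)

# Build the Hessian explicitly: H = X^T diag(pi * (1 - pi)) X + alpha * I
pi = 1.0 / (1.0 + np.exp(-X.dot(w)))
H = X.T.dot(np.diag(pi * (1 - pi))).dot(X) + alpha * np.eye(n_features)

s = rng.randn(n_features)
print(np.allclose(Hs(s), H.dot(s)))       # True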
Admittedly, for now I only understand how this part works rather than the full why; I will dig into it further when time allows.