In [1]:
import numpy as np

class LinearSVM:
    def __init__(self):
        self._w = self._b = None

    def fit(self, x, y, c=1, lr=0.01, epoch=10000):
        x, y = np.asarray(x, np.float32), np.asarray(y, np.float32)
        self._w = np.zeros(x.shape[1])
        self._b = 0.
        for _ in range(epoch):
            self._w *= 1 - lr
            err = 1 - y * self.predict(x, True)
            idx = np.argmax(err)
            # Note: even when every sample satisfies w·x + b >= 1,
            # the loss still contains the squared norm of w,
            # so training cannot simply terminate; we can only
            # skip the hinge part of this gradient step
            if err[idx] <= 0:
                continue
            delta = lr * c * y[idx]
            self._w += delta * x[idx]
            self._b += delta

    def predict(self, x, raw=False):
        x = np.asarray(x, np.float32)
        y_pred = x.dot(self._w) + self._b
        if raw:
            return y_pred
        return np.sign(y_pred).astype(np.float32)
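As the comment in fit notes, these updates perform gradient descent on the soft-margin objective L(w, b) = ||w||^2 / 2 + C * max_i max(0, 1 - y_i (w·x_i + b)): the self._w *= 1 - lr line is the descent step on the ||w||^2 / 2 term, and the delta step handles the single worst-violating sample. Below is a minimal sketch of a helper for monitoring this loss during experiments (svm_loss is a hypothetical addition, not part of the original code):

def svm_loss(svm, x, y, c=1):
    # 0.5 * ||w||^2 plus C times the worst hinge violation
    margin_err = 1 - np.asarray(y, np.float32) * svm.predict(x, raw=True)
    return 0.5 * svm._w.dot(svm._w) + c * max(float(np.max(margin_err)), 0.0)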
In [2]:
from Util import gen_two_clusters
x, y = gen_two_clusters()
svm = LinearSVM()
svm.fit(x, y)
print("准确率:{:8.6} %".format((svm.predict(x) == y).mean() * 100))
In [3]:
from Util import visualize2d
visualize2d(svm, x, y)
visualize2d(svm, x, y, True)
In [4]:
# Note that all we did was move the center parameter (i.e., the "center"
# of the positive/negative clusters) from the origin (0, 0) (the default)
# to (5, 5) (breaking some of the symmetry), and pull the positive and
# negative samples slightly closer together (the dis parameter);
# yet the result is already disastrous
x, y = gen_two_clusters(center=5, dis=1)
svm = LinearSVM()
svm.fit(x, y)
print("准确率:{:8.6} %".format((svm.predict(x) == y).mean() * 100))
visualize2d(svm, x, y)
visualize2d(svm, x, y, True)
The animation below gives an intuitive feel for how LinearSVM trains under max-gradient descent (i.e., updating only on the single worst sample):

As we can see, LinearSVM really does get stuck in an odd place. I won't venture to state the underlying principle; the best I can offer is a rather hand-wavy intuition, and I'll leave the rigorous theory to expert readers ( σ'ω')σ
In [5]:
# Inherit from the previous LinearSVM to reuse its code
class LinearSVM2(LinearSVM):
    # The batch_size parameter is the n in "Top n"
    def fit(self, x, y, c=1, lr=0.01, batch_size=128, epoch=10000):
        x, y = np.asarray(x, np.float32), np.asarray(y, np.float32)
        # If batch_size is larger than the number of samples, clip it to the sample count
        batch_size = min(batch_size, len(y))
        self._w = np.zeros(x.shape[1])
        self._b = 0.
        for _ in range(epoch):
            self._w *= 1 - lr
            err = 1 - y * self.predict(x, True)
            # Use argsort to pull out the Top n directly; note that argsort
            # sorts ascending, so take the last n indices and reverse with [::-1]
            batch = np.argsort(err)[-batch_size:][::-1]
            err = err[batch]
            if err[0] <= 0:
                continue
            # Only the margin-violating samples can contribute to this
            # descent step, because the hinge part of the gradient is 0
            # at correctly classified samples
            mask = err > 0
            batch = batch[mask]
            # Average the per-sample gradients and take one descent step
            delta = lr * c * y[batch]
            self._w += np.mean(delta[..., None] * x[batch], axis=0)
            self._b += np.mean(delta)
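A quick, illustrative sanity check of the Top-n selection trick used above (the numbers here are made up):

err = np.array([0.3, -1.2, 2.5, 0.7])
print(np.argsort(err)[-2:][::-1])  # [2 3]: indices of the two largest errors, in descending order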
In [6]:
x, y = gen_two_clusters(center=5, dis=1)
svm = LinearSVM2()
svm.fit(x, y)
print("准确率:{:8.6} %".format((svm.predict(x) == y).mean() * 100))
visualize2d(svm, x, y)
visualize2d(svm, x, y, True)
In [7]:
class LinearSVM3(LinearSVM):
    def fit(self, x, y, c=1, lr=0.01, batch_size=128, epoch=10000):
        x, y = np.asarray(x, np.float32), np.asarray(y, np.float32)
        batch_size = min(batch_size, len(y))
        self._w = np.zeros(x.shape[1])
        self._b = 0.
        for _ in range(epoch):
            self._w *= 1 - lr
            # Randomly draw batch_size samples
            batch = np.random.choice(len(x), batch_size)
            x_batch, y_batch = x[batch], y[batch]
            err = 1 - y_batch * self.predict(x_batch, True)
            if np.max(err) <= 0:
                continue
            mask = err > 0
            delta = lr * c * y_batch[mask]
            self._w += np.mean(delta[..., None] * x_batch[mask], axis=0)
            self._b += np.mean(delta)
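One detail worth noting: np.random.choice samples with replacement by default, so a mini-batch can contain duplicate samples. If that is undesirable, a plausible one-line tweak (an assumption on my part, not something the original code does) is:

batch = np.random.choice(len(x), batch_size, replace=False)  # draw without replacement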
In [8]:
# Pull the positive and negative samples even closer together to probe performance
x, y = gen_two_clusters(center=5, dis=0.5)
top_n_svm = LinearSVM2()
top_n_svm.fit(x, y)
print("Top n LinearSVM 准确率:{:8.6} %".format((top_n_svm.predict(x) == y).mean() * 100))
mbgd_svm = LinearSVM3()
mbgd_svm.fit(x, y)
print("MBGD LinearSVM 准确率:{:8.6} %".format((mbgd_svm.predict(x) == y).mean() * 100))
visualize2d(top_n_svm, x, y)
visualize2d(mbgd_svm, x, y)
In [9]:
# Change scale from 1 (the default) to 5
x, y = gen_two_clusters(center=5, scale=5)
top_n_svm = LinearSVM2()
top_n_svm.fit(x, y)
print("Top n LinearSVM 准确率:{:8.6} %".format((top_n_svm.predict(x) == y).mean() * 100))
mbgd_svm = LinearSVM3()
mbgd_svm.fit(x, y)
print("MBGD LinearSVM 准确率:{:8.6} %".format((mbgd_svm.predict(x) == y).mean() * 100))
visualize2d(top_n_svm, x, y)
visualize2d(mbgd_svm, x, y)
The animation below gives an intuitive feel for how LinearSVM trains when the scale of the data is large:

As we can see, the model does keep oscillating. One plausible intuition: the hinge-term update delta * x is proportional to x itself, so a large feature scale effectively inflates the step size and the iterates keep overshooting.
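A tiny, made-up illustration of that effect (hypothetical numbers, not from the experiments above):

def step_norm(x_i, y_i=1.0, lr=0.01, c=1):
    # norm of the hinge-term update delta * x_i used in the fit loops above
    return np.linalg.norm(lr * c * y_i * x_i)

print(step_norm(np.ones(2)))      # ~0.014 at feature scale 1
print(step_norm(5 * np.ones(2)))  # ~0.071 at feature scale 5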
In [10]:
x, y = gen_two_clusters(center=5, dis=1, scale=5)
# Normalize the data (zero mean, unit variance per feature)
x -= x.mean(axis=0)
x /= x.std(axis=0)
# Top-1 gradient descent is exactly the max-gradient descent from before
top_1_svm = LinearSVM()
top_1_svm.fit(x, y)
print("Top 1 LinearSVM 准确率:{:8.6} %".format((top_1_svm.predict(x) == y).mean() * 100))
top_n_svm = LinearSVM2()
top_n_svm.fit(x, y)
print("Top n LinearSVM 准确率:{:8.6} %".format((top_n_svm.predict(x) == y).mean() * 100))
mbgd_svm = LinearSVM3()
mbgd_svm.fit(x, y)
print("MBGD LinearSVM 准确率:{:8.6} %".format((mbgd_svm.predict(x) == y).mean() * 100))
visualize2d(top_1_svm, x, y)
visualize2d(top_n_svm, x, y)
visualize2d(mbgd_svm, x, y)
As we can see, after normalization even the simple max-gradient descent performs quite well on fairly demanding data (center=5, dis=1, scale=5).
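Since the normalization statistics computed on the training set must also be applied to any future inputs, a natural follow-up is to fold them into the model itself. Here is a minimal sketch of that idea (NormalizedLinearSVM is a hypothetical addition, not part of the original code): train on standardized features, then fold the scaling back into (w, b) so that predict works directly on raw inputs.

class NormalizedLinearSVM(LinearSVM):
    def fit(self, x, y, **kwargs):
        x = np.asarray(x, np.float32)
        mean, std = x.mean(axis=0), x.std(axis=0)
        super().fit((x - mean) / std, y, **kwargs)
        # w·(x - mean)/std + b == (w/std)·x + (b - (w/std)·mean),
        # so rescale (w, b) once and the inherited predict takes raw inputs
        self._w = self._w / std
        self._b = self._b - self._w.dot(mean)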