In [3]:
# %load ../../../preconfig.py
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei')
#plt.rcParams['axes.grid'] = False
import numpy as np
import pandas as pd
pd.options.display.max_rows = 20
#import sklearn
#import itertools
import logging
logger = logging.getLogger()
References:
9.2 Tree-Based Methods - The Elements of Statistical Learning
Classification and Regression Trees (CART) Theory and Applications
In [4]:
from sklearn.datasets import load_iris
data = load_iris()
In [5]:
# prepare the feature data
X = pd.DataFrame(data.data,
                 columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
X.head(2)
Out[5]:
In [6]:
# prepare the label data
y = pd.DataFrame(data.target, columns=['target'])
y.replace(to_replace=range(3), value=data.target_names, inplace=True)
y.head(3)
Out[6]:
In [7]:
# assemble the samples as [features, label]
samples = pd.concat([X, y], axis=1) #, keys=["x", "y"])
samples.head(3)
Out[7]:
In [8]:
def splitter(samples, feature, threshold):
    # split the samples on feature f at threshold t
    left_nodes = samples.query("{f} < {t}".format(f=feature, t=threshold))
    right_nodes = samples.query("{f} >= {t}".format(f=feature, t=threshold))
    return {"left_nodes": left_nodes, "right_nodes": right_nodes}
In [9]:
split = splitter(samples, "sepal_length", 5)
# left subset
x_l = split["left_nodes"].loc[:, "target"].value_counts()
x_l
Out[9]:
In [10]:
# right subset
x_r = split["right_nodes"].loc[:, "target"].value_counts()
x_r
Out[10]:
In [11]:
def calc_class_proportion(node):
    # proportion of each class among the samples in the node
    y = node["target"]
    return y.value_counts() / y.count()
In [12]:
calc_class_proportion(split["left_nodes"])
Out[12]:
In [13]:
calc_class_proportion(split["right_nodes"])
Out[13]:
There are three main criterion functions; each measures the impurity of a node (the larger the value, the more mixed the node).
First, some notation to make the descriptions easier:
Suppose node $m$ corresponds to region $R_m$ and contains $N_m$ samples.
If there are $K$ classes in total, the proportion of class $k$ in this node is
\begin{equation} \hat{p}_{m k} = \frac{1}{N_m} \sum_{x_i \in R_m} I(y_i = k) \end{equation}
Let $k(m) = \operatorname{arg max}_k \hat{p}_{m k}$ denote the majority class of the node.
A node is usually classified as its majority class, so samples of any other class that fall in the node are misclassified. The misclassification error is therefore $1 - \hat{p}_{m k(m)}$.
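For example, if a node had class proportions $(0.7, 0.2, 0.1)$ (made-up numbers, not the iris split above), the majority class accounts for $0.7$ of the samples and
\begin{equation} 1 - \hat{p}_{m k(m)} = 1 - 0.7 = 0.3 \end{equation}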
In [14]:
def misclassification_error(node):
    p_mk = calc_class_proportion(node)
    return 1 - p_mk.max()
In [15]:
misclassification_error(split["left_nodes"])
Out[15]:
In [16]:
misclassification_error(split["right_nodes"])
Out[16]:
For a binary classification problem:
In [17]:
binary_class = pd.Series(np.arange(0, 1.01, 0.01)).to_frame(name="p")
binary_class["1-p"] = 1 - binary_class["p"]
binary_class.head(3)
Out[17]:
The misclassification error as a function of the proportion $p$ can be plotted as:
In [18]:
binary_class["misclass"] = binary_class.apply(lambda x: 1 - x.max(), axis=1)
binary_class.plot(x="p", y="misclass")
Out[18]:
When $p=0.5$ the two classes are equally represented and the impurity is highest; when $p=0$ or $p=1$ the node contains only one class and the impurity is lowest.
The Gini index used here is not the economic indicator of distributional equality (the Gini coefficient).
The idea is: draw an element at random from the node, then assign it a label at random according to the node's class distribution; the probability of a wrong assignment is the Gini index.
Concretely, for node $m$, the proportion of class $k$ is $\hat{p}_{m k}$. The probability of drawing an element of class $k$ is $\hat{p}_{m k}$, and the probability of drawing such an element and then assigning it some other label is $\sum_{k' \neq k} \hat{p}_{m k} \hat{p}_{m k'}$. Summing over all classes gives:
\begin{equation} G(m) = \displaystyle \sum_{k \neq k'} \hat{p}_{m k} \hat{p}_{m k'} = \sum_{k = 1}^{K} \hat{p}_{m k} (1 - \hat{p}_{m k}) \end{equation}
where the second equality uses $\sum_{k' \neq k} \hat{p}_{m k'} = 1 - \hat{p}_{m k}$.
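For a binary problem ($K = 2$) with class proportions $p$ and $1 - p$, this reduces to
\begin{equation} G = p (1 - p) + (1 - p) p = 2 p (1 - p) \end{equation}
which is the expression plotted a few cells below.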
In [19]:
def gini_index(node):
    p_mk = calc_class_proportion(node)
    return (p_mk * (1 - p_mk)).sum()
In [20]:
gini_index(split["left_nodes"])
Out[20]:
In [21]:
gini_index(split["right_nodes"])
Out[21]:
In the binary case, the Gini index as a function of $p$ can be plotted as:
In [22]:
binary_class["gini"] = (binary_class["p"] * binary_class["1-p"] * 2)
binary_class.plot(x="p", y="gini")
Out[22]:
ref:
Qualitively what is Cross Entropy
The idea for this criterion comes from information theory: an event that occurs with probability $p$ needs at least $\log_2 (1/p)$ bits to encode. Over all events, the average code length of an optimal encoding is $\sum_i p_i \log_2 (1 / p_i)$.
Borrowing this idea, the more mixed a node's contents are, the more bits are needed to describe them. So the cross-entropy here is defined as:
\begin{equation} C(m) = \displaystyle \sum_{k=1}^K \hat{p}_{m k} \log (1 / \hat{p}_{m k}) = - \sum_{k=1}^K \hat{p}_{m k} \log \hat{p}_{m k} \end{equation}
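In the binary case this becomes
\begin{equation} C = - p \log p - (1 - p) \log (1 - p) \end{equation}
The implementation below uses the natural logarithm, so the maximum value, reached at $p = 0.5$, is $\ln 2 \approx 0.693$.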
In [23]:
def cross_entropy(node):
    # natural log; changing the base only rescales the value
    p_mk = calc_class_proportion(node)
    return - (p_mk * p_mk.apply(np.log)).sum()
In [24]:
cross_entropy(split["left_nodes"])
Out[24]:
In [25]:
cross_entropy(split["right_nodes"])
Out[25]:
In the binary case, the cross-entropy as a function of $p$ can be plotted as:
In [26]:
x = binary_class[["p", "1-p"]]
binary_class["cross_entropy"] = -(x * np.log(x)).sum(axis=1)
binary_class.plot(x="p", y="cross_entropy")
Out[26]:
A comparison of the three criteria for the binary case:
In [27]:
binary_class.plot(x="p", y=["misclass", "gini", "cross_entropy"])
Out[27]:
To make the comparison easier, we rescale cross_entropy so that it also passes through the point $(0.5, 0.5)$.
In [28]:
binary_class["cross_entropy_scaled"] = binary_class["cross_entropy"] / binary_class["cross_entropy"].max() * 0.5
binary_class.plot(x="p", y=["misclass", "gini", "cross_entropy_scaled"], ylim=[0,0.55])
Out[28]:
As the plot shows, the misclassification error changes at a uniform (linear) rate over the whole interval, whereas cross-entropy changes more sharply the closer a node gets to being pure. Because cross-entropy is more sensitive to purity, it tends to produce purer child subsets, and it is used relatively often in practice.
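A quick numeric check with made-up class proportions illustrates this (binary_impurities is just a throwaway helper, not part of the tree code; it reuses the numpy import from the top of the notebook): moving a two-class node from 90% to 99% pure lowers the misclassification error by 0.09 but lowers the cross-entropy by roughly 0.27.
def binary_impurities(p):
    # impurities of a two-class node with proportions p and 1-p
    q = np.array([p, 1 - p])
    return {"misclass": 1 - q.max(),
            "gini": (q * (1 - q)).sum(),
            "cross_entropy": -(q * np.log(q)).sum()}

for p in (0.9, 0.99):
    # roughly: misclass 0.10 / 0.01, gini 0.18 / 0.02, cross-entropy 0.33 / 0.06
    print(p, binary_impurities(p))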
In [29]:
def calc_impurity_measure(node, feature, threshold, measure, min_nodes=5):
    # impurity decrease obtained by splitting `node` on `feature` at `threshold`
    child = splitter(node, feature, threshold)
    left = child["left_nodes"]
    right = child["right_nodes"]
    # reject splits that leave too few samples on either side
    if left.shape[0] <= min_nodes or right.shape[0] <= min_nodes:
        return 0
    impurity = pd.DataFrame([],
                            columns=["score", "rate"],
                            index=[])
    impurity.loc["all"] = [measure(node), node.shape[0]]
    impurity.loc["left"] = [-measure(left), left.shape[0]]
    impurity.loc["right"] = [-measure(right), right.shape[0]]
    # weight each child by its share of the parent's samples
    impurity["rate"] /= impurity.at["all", "rate"]
    logger.info(impurity)
    return (impurity["score"] * impurity["rate"]).sum()
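In other words, the returned value is the impurity decrease achieved by the split:
\begin{equation} \Delta I = I(m) - \frac{N_{\text{left}}}{N_m} I(\text{left}) - \frac{N_{\text{right}}}{N_m} I(\text{right}) \end{equation}
where $I$ is the chosen impurity measure and $N$ counts the samples in each node; larger values mean a better split.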
In [30]:
calc_impurity_measure(samples, "sepal_length", 5, gini_index)
Out[30]:
In [31]:
calc_impurity_measure(samples, "sepal_length", 1, gini_index)
Out[31]:
In [32]:
def find_best_threshold(node, feature, measure):
    # candidate thresholds: the 0%, 20%, 40%, 60% and 80% quantiles of the feature
    threshold_candidates = node[feature].quantile(np.arange(0, 1, 0.2))
    res = pd.Series(dtype=float, name=feature)
    for t in threshold_candidates:
        res[t] = calc_impurity_measure(node, feature, t, measure)
    logger.info(res)
    if res.max() == 0:
        return None
    else:
        # return the threshold (index label) with the largest impurity decrease
        return res.idxmax()
In [33]:
find_best_threshold(samples, "sepal_width", gini_index)
Out[33]:
In [34]:
find_best_threshold(samples, "sepal_length", gini_index)
Out[34]:
In [35]:
def find_best_split(node, measure):
    # stop if the node is already pure
    if node["target"].unique().shape[0] <= 1:
        return None
    purity_gain = pd.Series(dtype=float)
    thresholds = pd.Series(dtype=float)
    for f in node.drop("target", axis=1).columns:
        t = find_best_threshold(node, f, measure)
        if t is None:
            continue
        thresholds[f] = t
        purity_gain[f] = calc_impurity_measure(node, f, t, measure)
    if purity_gain.empty:
        return None
    # pick the feature whose best threshold gives the largest purity gain
    best_feature = purity_gain.idxmax()
    best_split = {"feature": best_feature, "threshold": thresholds[best_feature]}
    best_split["child"] = splitter(node, **best_split)
    return best_split
In [36]:
best_split = find_best_split(samples, gini_index)
[best_split[x] for x in ["feature", "threshold"]]
Out[36]:
In [37]:
class BinaryNode:
    def __init__(self, samples, max_depth, measure=gini_index):
        self.samples = samples
        self.max_depth = max_depth
        self.measure = measure
        self.is_leaf = False
        self.class_ = None
        self.left = None
        self.right = None
        self.best_split = None

    def split(self, depth):
        if depth > self.max_depth:
            self.is_leaf = True
            self.class_ = self.samples["target"].value_counts().idxmax()
            return
        best_split = find_best_split(self.samples, self.measure)
        if best_split is None:
            self.is_leaf = True
            self.class_ = self.samples["target"].value_counts().idxmax()
            return
        self.best_split = best_split
        # each child drops the feature it was split on, so a feature is used at most once per path
        left = self.best_split["child"]["left_nodes"]
        self.left = BinaryNode(left.drop(best_split["feature"], axis=1), self.max_depth, self.measure)
        right = self.best_split["child"]["right_nodes"]
        self.right = BinaryNode(right.drop(best_split["feature"], axis=1), self.max_depth, self.measure)
        # grow the children depth-first (pre-order)
        self.left.split(depth + 1)
        self.right.split(depth + 1)
In [38]:
binaryNode = BinaryNode(samples, 3)
binaryNode.split(0)
In [39]:
def show(node, depth):
    # in-order traversal: left subtree, then this node, then right subtree
    if node.left:
        show(node.left, depth + 1)
    if node.is_leaf:
        print("{}{}".format("\t" * (depth + 2), node.class_))
        return
    else:
        print("{}{}: {}".format("\t" * depth,
                                node.best_split["feature"],
                                node.best_split["threshold"]))
    if node.right:
        show(node.right, depth + 1)
In [40]:
show(binaryNode, 0)
Inspecting the output, this tree clearly has problems: for example, both leaves under petal_width: 0.4 are labeled setosa. Once the tree has been grown, a pruning step can be applied to make it more compact; we will not go into the details here.