In [1]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False
from IPython.display import SVG
def show_image(filename, figsize=None):
    """Display an image file inline in the notebook.

    Parameters
    ----------
    filename : str
        Path to an image file readable by ``plt.imread``.
    figsize : tuple of (width, height), optional
        When truthy, a new figure of this size is created first;
        otherwise the image is drawn on the current figure.
    """
    if figsize:
        plt.figure(figsize=figsize)
    image_data = plt.imread(filename)
    plt.imshow(image_data)
In [3]:
SVG("./res/uml/Model___tree_3.svg")
Out[3]:
主体代码如下:
195 with nogil:
196 # push root node onto stack
197 rc = stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0)
198 #+-- 4 lines: if rc == -1:-----------------------------------------------------------------
202
203 while not stack.is_empty():
204 stack.pop(&stack_record)
205 #+-- 9 lines: start = stack_record.start---------------------------------------------------
214 n_node_samples = end - start
215 splitter.node_reset(start, end, &weighted_n_node_samples)
216
217 is_leaf = ((depth >= max_depth) or
218 (n_node_samples < min_samples_split) or
219 (n_node_samples < 2 * min_samples_leaf) or
220 (weighted_n_node_samples < min_weight_leaf))
221
222 if first:
223 impurity = splitter.node_impurity()
224 first = 0
225
226 is_leaf = is_leaf or (impurity <= MIN_IMPURITY_SPLIT)
227
228 if not is_leaf:
229 splitter.node_split(impurity, &split, &n_constant_features)
230 is_leaf = is_leaf or (split.pos >= end)
231
232 node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
233 split.threshold, impurity, n_node_samples,
234 weighted_n_node_samples)
235
236 #+-- 4 lines: if node_id == <SIZE_t>(-1):--------------------------------------------------
240 # Store value for all nodes, to facilitate tree/model
241 # inspection and interpretation
242 splitter.node_value(tree.value + node_id * tree.value_stride)
243
244 if not is_leaf:
245 # Push right child on stack
246 rc = stack.push(split.pos, end, depth + 1, node_id, 0,
247 split.impurity_right, n_constant_features)
248 #+-- 3 lines: if rc == -1:-----------------------------------------------------------------
251 # Push left child on stack
252 rc = stack.push(start, split.pos, depth + 1, node_id, 1,
253 split.impurity_left, n_constant_features)
254 #+-- 2 lines: if rc == -1:-----------------------------------------------------------------
256
sklearn 用数组实现了二叉树,我比较感兴趣的函数是计算特征重要性的 compute_feature_importances。
这个函数的想法其实也很简单,就是遍历决策的中间节点,汇总各个特征对纯净度的贡献量。代码很短,很好理解。
1033 cpdef compute_feature_importances(self, normalize=True):
1034 """Computes the importance of each feature (aka variable)."""
1035 #+-- 3 lines: cdef Node* left--------------------------------------------------------------
1038 cdef Node* node = nodes
1039 cdef Node* end_node = node + self.node_count
1040 #+-- 3 lines: cdef double normalizer = 0.--------------------------------------------------
1043 cdef np.ndarray[np.float64_t, ndim=1] importances
1044 #+-- 2 lines: importances = np.zeros((self.n_features,))-----------------------------------
1046
1047 with nogil:
1048 while node != end_node:
1049 if node.left_child != _TREE_LEAF:
1050 # ... and node.right_child != _TREE_LEAF:
1051 left = &nodes[node.left_child]
1052 right = &nodes[node.right_child]
1053
1054 importance_data[node.feature] += (
1055 node.weighted_n_node_samples * node.impurity -
1056 left.weighted_n_node_samples * left.impurity -
1057 right.weighted_n_node_samples * right.impurity)
1058 node += 1
1059
1060 importances /= nodes[0].weighted_n_node_samples
1061
1062 if normalize:
1063 normalizer = np.sum(importances)
1064 #+-- 3 lines: if normalizer > 0.0:---------------------------------------------------------
1067 importances /= normalizer
1068
1069 return importances
另外,在 tree.py
模块里决策树分类结果也可以计算出概率值,这个概率其实是预测类的样本在此叶子的占比。
整个计算路径是:
_tree.pyx:Tree.predict
通过 _tree.pyx:Tree.apply
找到叶子节点,结合 _tree.pyx:Tree._get_value_ndarray
得到所在叶子节点的各个类统计数。
tree.py:DecisionTreeClassifier.predict_proba
计算占比。
In [ ]: