In [1]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False
from IPython.display import SVG
def show_image(filename, figsize=None):
    if figsize:
        plt.figure(figsize=figsize)
    plt.imshow(plt.imread(filename))
In [2]:
SVG("./res/uml/Model___splitter_2.svg")
Out[2]:
Splitter
Splitter is the interface class of the splitting module. It is declared in the _splitter.pxd file and implemented in _splitter.pyx.
To record the outcome of each split, the SplitRecord struct is defined first; its field comments are self-explanatory, so no elaboration is needed.
23 cdef struct SplitRecord:
24     # Data to track sample split
25     SIZE_t feature         # Which feature to split on.
26     SIZE_t pos             # Split samples array at the given position,
27                            # i.e. count of samples below threshold for feature.
28                            # pos is >= end if the node is a leaf.
29     double threshold       # Threshold to split at.
30     double improvement     # Impurity improvement given parent node.
31     double impurity_left   # Impurity of the left split.
32     double impurity_right  # Impurity of the right split.
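As a quick mental model, here is a rough Python mirror of the struct (purely illustrative; not part of sklearn):

```python
from dataclasses import dataclass

@dataclass
class PySplitRecord:
    """Illustrative Python mirror of the Cython SplitRecord struct."""
    feature: int           # which feature to split on
    pos: int               # start of the right child in samples[]; >= end for a leaf
    threshold: float       # threshold to split at
    improvement: float     # impurity improvement relative to the parent node
    impurity_left: float   # impurity of the left child
    impurity_right: float  # impurity of the right child
```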
The member variables fall roughly into five groups:
Impurity criterion and stopping conditions:
34 cdef class Splitter:
35     #+-- 5 lines: The splitter searches in the input space for a feature and a thresho
40     # Internal structures
41     cdef public Criterion criterion      # Impurity criterion
42     cdef public SIZE_t max_features      # Number of features to test
43     cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
44     cdef public double min_weight_leaf   # Minimum weight in a leaf
Random number seed, used when drawing features and thresholds:
46     cdef object random_state             # Random state
47     cdef UINT32_t rand_r_state           # sklearn_rand_r random number state
Sample indices and feature indices. One member deserves a note: constant_features. At some nodes, certain features (mainly discrete ones) have already collapsed to a single value and can no longer be split on. To avoid wasting computation, sklearn records at every split which features are already constant, and hands the list down from parent to child, generation after generation:
49     cdef SIZE_t* samples                 # Sample indices in X, y
50     cdef SIZE_t n_samples                # X.shape[0]
51     cdef double weighted_n_samples       # Weighted number of samples
52     cdef SIZE_t* features                # Feature indices in X
53     cdef SIZE_t* constant_features       # Constant features indices
54     cdef SIZE_t n_features               # X.shape[1]
55     cdef DTYPE_t* feature_values         # temp. array holding feature values
Node start and end: sklearn stores X and y in arrays and, after each split, groups the indices of the samples belonging to the same child node together, so a node only needs its first and last positions.
57     cdef SIZE_t start                    # Start position for the current node
58     cdef SIZE_t end                      # End position for the current node
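A minimal Python sketch of this bookkeeping (all names hypothetical): the tree shares one index array, a split only rearranges the slice samples[start:end] in place, and each child is again described by just two positions.

```python
# The whole tree shares one index array over X and y.
samples = list(range(8))

def partition(samples, start, end, goes_left):
    """Rearrange samples[start:end] in place so that samples going to the
    left child come first; left = [start, pos), right = [pos, end)."""
    pos, right = start, end
    while pos < right:
        if goes_left(samples[pos]):
            pos += 1
        else:
            right -= 1
            samples[pos], samples[right] = samples[right], samples[pos]
    return pos

pos = partition(samples, 0, len(samples), lambda i: i % 2 == 0)
print(samples, pos)  # the first pos entries are the even indices; pos == 4
```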
Presorting: an index matrix sorted by each feature's values is available ahead of time:
60     cdef bint presort                    # Whether to use presorting, only
61                                          # allowed on dense data
Labels and sample weights:
63     cdef DOUBLE_t* y
64     cdef SIZE_t y_stride
65     cdef DOUBLE_t* sample_weight
init
init is the initialization function invoked at call time.
119     cdef void init(self,
120                    object X,
121                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
122                    DOUBLE_t* sample_weight,
123                    np.ndarray X_idx_sorted=None) except *:
124         """Initialize the splitter.
125         +-- 15 lines: Take in the input data X, the target Y, and optional sample weights
140         """
141
142         self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
143         cdef SIZE_t n_samples = X.shape[0]
144         #+-- 3 lines: Create a new array which will be used to store nonzero-------------
147         cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)
148         #+-- 5 lines: cdef SIZE_t i, j---------------------------------------------------
153         for i in range(n_samples):
154             # Only work with positively weighted samples
155             if sample_weight == NULL or sample_weight[i] != 0.0:
156                 samples[j] = i
157                 j += 1
158
159             if sample_weight != NULL:
160                 weighted_n_samples += sample_weight[i]
161             else:
162                 weighted_n_samples += 1.0
163
164         # Number of samples is number of positively weighted samples
165         self.n_samples = j
166         self.weighted_n_samples = weighted_n_samples
167
168         cdef SIZE_t n_features = X.shape[1]
169         cdef SIZE_t* features = safe_realloc(&self.features, n_features)
170
171         for i in range(n_features):
172             features[i] = i
173
174         self.n_features = n_features
175
176         #+-- 7 lines: safe_realloc(&self.feature_values, n_samples)----------------------
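The gist of this function in plain Python (a simplified sketch, not the real implementation): collect the indices of samples with nonzero weight, while the weighted total still sums over all samples.

```python
import numpy as np

def init_samples(n_samples, sample_weight=None):
    """Sketch of Splitter.init's sample filtering."""
    samples = []
    weighted_n_samples = 0.0
    for i in range(n_samples):
        # Only keep samples with nonzero weight
        if sample_weight is None or sample_weight[i] != 0.0:
            samples.append(i)
        weighted_n_samples += 1.0 if sample_weight is None else sample_weight[i]
    return np.asarray(samples), weighted_n_samples

idx, w = init_samples(5, np.array([1.0, 0.0, 2.0, 0.0, 1.0]))
print(idx, w)  # [0 2 4] 4.0
```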
Main flow:
The remaining functions are all very simple, so no elaboration is needed.
88     cdef void node_reset(self, SIZE_t start, SIZE_t end,
89                          double* weighted_n_node_samples) nogil
90
91     cdef void node_split(self,
92                          double impurity,   # Impurity of the node
93                          SplitRecord* split,
94                          SIZE_t* n_constant_features) nogil
95
96     cdef void node_value(self, double* dest) nogil
97
98     cdef double node_impurity(self) nogil
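To see how these calls fit together, here is a hypothetical sketch of a depth-first builder driving a splitter. The signatures are Pythonized (the real Cython methods communicate through out-parameters), and max_depth stands in for the full set of stopping rules.

```python
def build(splitter, start, end, impurity, depth, max_depth, n_constants):
    """Hypothetical driver showing the Splitter call sequence."""
    splitter.node_reset(start, end)              # focus on samples[start:end]
    if depth >= max_depth or end - start < 2:
        return {"value": splitter.node_value()}  # leaf: store the prediction
    split, n_constants = splitter.node_split(impurity, n_constants)
    return {
        "feature": split.feature,
        "threshold": split.threshold,
        "left": build(splitter, start, split.pos, split.impurity_left,
                      depth + 1, max_depth, n_constants),
        "right": build(splitter, split.pos, end, split.impurity_right,
                       depth + 1, max_depth, n_constants),
    }
```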
BaseDenseSplitter
BaseDenseSplitter adds member variables for the dense matrix X and the corresponding presorted index X_idx_sorted, and overrides init to set them up.
232 cdef class BaseDenseSplitter(Splitter):
233     cdef DTYPE_t* X
234     cdef SIZE_t X_sample_stride
235     cdef SIZE_t X_feature_stride
236
237     cdef np.ndarray X_idx_sorted
238     cdef INT32_t* X_idx_sorted_ptr
239     cdef SIZE_t X_idx_sorted_stride
240     cdef SIZE_t n_total_samples
241     cdef SIZE_t* sample_mask
242
243     #+-- 16 lines: def __cinit__(self, Criterion criterion, SIZE_t max_features,------
259
260     cdef void init(self,
261                    object X,
262                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
263                    DOUBLE_t* sample_weight,
264                    np.ndarray X_idx_sorted=None) except *:
265         """Initialize the splitter."""
266
267         # Call parent init
268         Splitter.init(self, X, y, sample_weight)
269
270         # Initialize X
271         cdef np.ndarray X_ndarray = X
272
273         self.X = <DTYPE_t*> X_ndarray.data
274         #+-- 2 lines: self.X_sample_stride = <SIZE_t> X.strides[0] / <SIZE_t> X.itemsize-
276
277         if self.presort == 1:
278             self.X_idx_sorted = X_idx_sorted
279             #+-- 4 lines: self.X_idx_sorted_ptr = <INT32_t*> self.X_idx_sorted.data----------
283             self.n_total_samples = X.shape[0]
284             safe_realloc(&self.sample_mask, self.n_total_samples)
285             memset(self.sample_mask, 0, self.n_total_samples*sizeof(SIZE_t))
Main flow: besides delegating to Splitter.init, it wires up the pointer and strides for X and, when presort is enabled, X_idx_sorted (pointer and stride into the presorted index matrix) and sample_mask (allocated and zeroed; it later marks which samples belong to the current node).
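What presorting buys, in a Python sketch (assuming X_idx_sorted[:, f] lists all sample indices ordered by feature f): instead of re-sorting each node's samples for every candidate feature, scan the presorted column once and keep the samples whose mask bit is set.

```python
import numpy as np

def node_sorted_samples(X_idx_sorted, sample_mask, f):
    """Return the current node's samples ordered by feature f (sketch)."""
    return [int(i) for i in X_idx_sorted[:, f] if sample_mask[i]]

X = np.array([[3.0], [1.0], [2.0], [0.5]])
X_idx_sorted = np.argsort(X, axis=0)  # computed once for the whole tree
sample_mask = np.zeros(4, dtype=bool)
sample_mask[[0, 2]] = True            # samples in the current node
print(node_sorted_samples(X_idx_sorted, sample_mask, 0))  # [2, 0]
```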
BestSplitter
BestSplitter randomly draws at most max_features features. For each drawn feature, it sorts the node's samples by that feature's values and evaluates every value in turn as a candidate threshold. It then picks the best-scoring split (feature and threshold) among all candidates.
The core logic is concentrated in the node_split function; we walk through the code piece by piece:
288 cdef class BestSplitter(BaseDenseSplitter):
289     #+-- 8 lines: """Splitter for finding the best split."""-----------------------------------
297
298     cdef void node_split(self, double impurity, SplitRecord* split,
299                          SIZE_t* n_constant_features) nogil:
300         """Find the best split on node samples[start:end]."""
301         #+-- 20 lines: Find the best split----------------------------------------------------------
321
The beginning of the function mostly copies members into local variables and initializes them; nothing worth dwelling on.
322         #+--- 4 lines: cdef SplitRecord best, current----------------------------------------------
326         cdef SIZE_t f_i = n_features
327         #+--- 18 lines: cdef SIZE_t f_j-------------------------------------------------------------
345
346         _init_split(&best, end)
347
348         if self.presort == 1:
349             for p in range(start, end):
350                 sample_mask[samples[p]] = 1
322L-350L set up the variables involved in splitting: 346L initializes the best split record, and 348L-350L mark the presorted samples of this node.
Next comes the body of the split search. Before going into detail, we briefly describe how the features index array is organized.
As mentioned above, some features take a single value within a node's subset; they are then constants and can no longer be split on. To avoid wasting computation on constant features, the features array keeps marking the features that become constant during splitting and gathers them at the front of the array. To tell apart constant from non-constant and drawn from not-yet-drawn entries, several boundary indices partition the array; understanding where these boundaries sit and what they mean makes the code much easier to follow.
The main boundaries partition the features array as follows:
| [0, n_drawn_constants) | [n_drawn_constants, n_known_constants) | [n_known_constants, n_total_constants) | [n_total_constants, f_i) | [f_i, n_features) |
|---|---|---|---|---|
| old constants, drawn | old constants, not drawn | new constants (drawn) | not drawn | non-constant, drawn |
Old constants are inherited from the parent node and known before this split; new constants are discovered at the current node while drawing split candidates.
The basic procedure for finding a split feature is then:

- Randomly draw a feature f_j from [n_drawn_constants, f_i).
- If f_j falls in [n_drawn_constants, n_known_constants), it is an old constant: swap f_j into the drawn old constants [0, n_drawn_constants).
- If f_j falls in [n_total_constants, f_i), its status is unknown, so examine it further:
  - if it turns out constant, move it into the new constants [n_known_constants, n_total_constants);
  - if it is non-constant, move it into the drawn non-constants [f_i, n_features).

Two points deserve mention:

- By construction, f_j never lands in the newly found constants region [n_known_constants, n_total_constants); how this is achieved is covered below.
- "Moving" a feature here really means swapping two entries and then shifting a boundary, as the sketch after this list shows. It sounds abstract, but the trick is so common in algorithms that the code makes it obvious at a glance.
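A minimal sketch of the swap-then-shift move (hypothetical names): the chosen entry trades places with the first entry outside the target region, and the boundary grows by one.

```python
def move_into_region(features, f_j, boundary):
    """Move features[f_j] into the region [0, boundary); return new boundary."""
    features[boundary], features[f_j] = features[f_j], features[boundary]
    return boundary + 1

features = [4, 0, 3, 1, 2]
n_drawn_constants = 1  # region [0, 1) already holds feature 4
n_drawn_constants = move_into_region(features, 3, n_drawn_constants)
print(features, n_drawn_constants)  # [4, 1, 3, 0, 2] 2
```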
The main body of the code:
361         while (f_i > n_total_constants and  # Stop early if remaining features
362                                             # are constant
363                (n_visited_features < max_features or
364                 # At least one drawn features must be non constant
365                 n_visited_features <= n_found_constants + n_drawn_constants)):
366
367             n_visited_features += 1
368             #+-- 12 lines: Loop invariant: elements of features in--------------------------------------
380             # Draw a feature at random
381             f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
382                            random_state)
383
384             if f_j < n_known_constants:
385                 #+-- 6 lines: f_j in the interval [n_drawn_constants, n_known_constants[-------------------
391
392             else:
393                 # f_j in the interval [n_known_constants, f_i - n_found_constants[
394                 f_j += n_found_constants
395                 # f_j in the interval [n_total_constants, f_i[
396                 current.feature = features[f_j]
397                 feature_offset = self.X_feature_stride * current.feature
398
399                 #+-- 4 lines: Sort samples along that feature; either by utilizing-------------------------
403                 if self.presort == 1:
404                     #+-- 9 lines: p = start--------------------------------------------------------------------
413                 else:
414                     #+--- 3 lines: for i in range(start, end):-------------------------------------------------
417                     sort(Xf + start, samples + start, end - start)
418
419                 if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
420                     #+-- 6 lines: features[f_j] = features[n_total_constants]----------------------------------
426                 else:
427                     f_i -= 1
428                     features[f_i], features[f_j] = features[f_j], features[f_i]
429
430                     # Evaluate all splits
431                     self.criterion.reset()
432                     p = start
433
434                     while p < end:
435                         #+-- 11 lines: while (p + 1 < end and-------------------------------------------------------
446                         current.pos = p
447
448                         # Reject if min_samples_leaf is not guaranteed
449                         #+-- 3 lines: if (((current.pos - start) < min_samples_leaf) or----------------------------
452
453                         self.criterion.update(current.pos)
454
455                         # Reject if min_weight_leaf is not satisfied
456                         #+-- 3 lines: if ((self.criterion.weighted_n_left < min_weight_leaf) or--------------------
459
460                         current_proxy_improvement = self.criterion.proxy_impurity_improvement()
461
462                         if current_proxy_improvement > best_proxy_improvement:
463                             #+-- 6 lines: best_proxy_improvement = current_proxy_improvement---------------------------
469                             best = current  # copy
470
The rough flow:

- 361L-365L, the termination conditions of the search: the loop stops once all remaining features are constant, or once max_features features have been visited; the second clause lets it keep going past max_features as long as every visited feature so far was constant.
- Drawing: the random draw at 381L, combined with the addition at 394L, guarantees that f_j only lands in the two not-yet-drawn regions. The figure below illustrates this; note red circles 1 and 2 and it is clear at a glance.
- 384L-391L handle the case where f_j is an old constant, as described above.
- Then comes the unknown case for f_j: if the node's values for this feature span less than FEATURE_THRESHOLD (419L), f_j is a newly found constant feature; otherwise f_j is swapped into the drawn non-constants (427L-428L) and every candidate split position is evaluated (430L-469L), keeping the best so far.
In [6]:
show_image("./res/BestSplitter.jpg", figsize=(10,15))
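The draw at 381L plus the shift at 394L can be mimicked in Python like this (a sketch with made-up boundary values): drawing from a range shrunk by n_found_constants and shifting any draw past n_known_constants skips the newly found constants region entirely.

```python
import random

def draw_feature_index(n_drawn, n_known, n_found, f_i):
    """Mimic 381L/394L: f_j never lands in [n_known, n_known + n_found)."""
    f_j = random.randrange(n_drawn, f_i - n_found)  # 381L
    if f_j >= n_known:                              # else branch of 384L
        f_j += n_found                              # 394L
    return f_j

# n_drawn=1, n_known=3, n_found=2 (new constants at [3, 5)), f_i=8:
hits = {draw_feature_index(1, 3, 2, 8) for _ in range(2000)}
print(sorted(hits))  # [1, 2, 5, 6, 7] -- indices 3 and 4 are never drawn
```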
After the best split point is found, what remains is housekeeping.
Partition the node's samples around the split point:
471         # Reorganize into samples[start:best.pos] + samples[best.pos:end]
472         if best.pos < end:
473             feature_offset = X_feature_stride * best.feature
474             partition_end = end
475             p = start
476
477             while p < partition_end:
478                 if X[X_sample_stride * samples[p] + feature_offset] <= best.threshold:
479                     p += 1
480
481                 else:
482                     partition_end -= 1
483
484                     tmp = samples[partition_end]
485                     samples[partition_end] = samples[p]
486                     samples[p] = tmp
487
488             self.criterion.reset()
489             self.criterion.update(best.pos)
490             best.improvement = self.criterion.impurity_improvement(impurity)
491             self.criterion.children_impurity(&best.impurity_left,
492                                              &best.impurity_right)
493
Copy the constant-feature information, keeping features and constant_features consistent:
494         # Reset sample mask
495         #+-- 3 lines: if self.presort == 1:--------------------------------------------------------
498
499         # Respect invariant for constant features: the original order of
500         # element in features[:n_known_constants] must be preserved for sibling
501         # and child nodes
502         memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)
503
504         # Copy newly found constant features
505         memcpy(constant_features + n_known_constants,
506                features + n_known_constants,
507                sizeof(SIZE_t) * n_found_constants)
508
Return the values:
509         # Return values
510         split[0] = best
511         n_constant_features[0] = n_total_constants
Appendix: the sorting algorithm used on the feature values will be covered when there is a chance.
RandomSplitter
RandomSplitter and BestSplitter are much alike; the main difference is how the split threshold is found. BestSplitter scans all of a feature's values to determine the best threshold, whereas RandomSplitter just finds the feature's minimum and maximum and draws a threshold uniformly at random within that range, so the split is rather a matter of luck.
The differing code is shown below and needs no further comment:
726         # Find min, max
727         min_feature_value = X[X_sample_stride * samples[start] + feature_stride]
728         max_feature_value = min_feature_value
729         Xf[start] = min_feature_value
730
731         for p in range(start + 1, end):
732             #+-- 7 lines: current_feature_value = X[X_sample_stride * samples[p] + feature_stride]-----
739
740         if max_feature_value <= min_feature_value + FEATURE_THRESHOLD:
741             #+-- 6 lines: features[f_j] = features[n_total_constants]----------------------------------
747         else:
748             f_i -= 1
749             features[f_i], features[f_j] = features[f_j], features[f_i]
750
751             # Draw a random threshold
752             current.threshold = rand_uniform(min_feature_value,
753                                              max_feature_value,
754                                              random_state)
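At the user level this difference surfaces through the splitter parameter of sklearn's tree estimators. A quick, illustrative comparison:

```python
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

best = DecisionTreeClassifier(splitter="best", random_state=0).fit(X, y)
rand = DecisionTreeClassifier(splitter="random", random_state=0).fit(X, y)

# "best" scans every threshold of each candidate feature; "random" draws one
# threshold uniformly between the feature's min and max, typically growing
# a different tree.
print(best.tree_.node_count, rand.tree_.node_count)
```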
In [ ]: