#
# The decision tree classifier has an attribute called ``tree_`` which allows
# access to low-level attributes such as ``node_count``, the total number of
# nodes, and ``max_depth``, the maximal depth of the tree. The
# ``tree_.compute_node_depths()`` method computes the depth of each node in the
# tree. ``tree_`` also stores the entire binary tree structure, represented as
# a number of parallel arrays. The i-th element of each array holds information
# about the node ``i``. Node 0 is the tree's root. Some of the arrays only
# apply to either leaves or split nodes. In this example, the arrays
# ``feature`` and ``threshold`` only apply to split nodes, so their values for
# leaf nodes are arbitrary.
#
# Among these arrays, we have:
#
# - ``children_left[i]``: id of the left child of node ``i`` or -1 if leaf
#   node
# - ``children_right[i]``: id of the right child of node ``i`` or -1 if leaf
#   node
# - ``feature[i]``: feature used for splitting node ``i``
# - ``threshold[i]``: threshold value at node ``i``
# - ``n_node_samples[i]``: the number of training samples reaching node
#   ``i``
# - ``impurity[i]``: the impurity at node ``i``
# - ``weighted_n_node_samples[i]``: the weighted number of training samples
#   reaching node ``i``
# - ``value[i, j, k]``: the summary of the training samples that reached node
#   ``i`` for output ``j`` and class ``k``
#
# Using the arrays, we can traverse the tree structure to compute various
# properties. Below, we will compute the depth of each node and whether or not
# it is a leaf.

n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
values = clf.tree_.value
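
# A quick look at two of the scalar attributes described above (the exact
# numbers depend on the fitted tree, so treat this as an illustrative check):
print("node_count:", clf.tree_.node_count)
print("max_depth:", clf.tree_.max_depth)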

node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)

# Traverse the tree starting from the root to record each node's depth and
# whether it is a leaf
stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
while len(stack) > 0:
    # `pop` ensures each node is visited only once
    node_id, depth = stack.pop()
    node_depth[node_id] = depth

    # If the left and right child of a node differ, it is a split node
    is_split_node = children_left[node_id] != children_right[node_id]
    if is_split_node:
        # Append the children and their depth so we can loop through them
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))
    else:
        is_leaves[node_id] = True

print(
    "The binary tree structure has {n} nodes and has "
    "the following tree structure:\n".format(n=n_nodes)
)

for i in range(n_nodes):
    if is_leaves[i]:
        print(
            "{space}node={node} is a leaf node with value={value}.".format(
                space=node_depth[i] * "\t", node=i, value=values[i]
            )
        )
    else:
        print(
            "{space}node={node} is a split node with value={value}: "
            "go to node {left} if X[:, {feature}] <= {threshold} "
            "else to node {right}.".format(
                space=node_depth[i] * "\t",
                node=i,
                left=children_left[i],
                feature=feature[i],
                threshold=threshold[i],
                right=children_right[i],
                value=values[i],
            )
        )
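
# The ``tree_.compute_node_depths()`` method mentioned earlier returns the
# depth of every node in a single call; a sketch of the cross-check (note the
# helper may use a different depth convention for the root than ``node_depth``
# above):
print(clf.tree_.compute_node_depths())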

# %%
# What is the values array used here?
# -----------------------------------
# The ``tree_.value`` array is a 3D array of shape
# [``n_nodes``, ``n_outputs``, ``n_classes``] which summarizes the training
# samples that reached each node, for each output and class: entry
# ``value[i, j, k]`` is the weighted number of training samples of class ``k``
# for output ``j`` that reached node ``i``.
#
# For example, in the above tree built on the iris dataset, the root node has
# ``value = [37, 34, 41]``, indicating there are 37 samples of class 0, 34
# samples of class 1, and 41 samples of class 2 at the root node. Traversing
# the tree, the samples are split and, as a result, the ``value`` array at
# each node changes. The left child of the root node has
# ``value = [37, 0, 0]`` because all 37 samples in the left child node are
# from class 0.
#
# Note: In this example, ``n_outputs=1``, but the tree classifier can also
# handle multi-output problems; the ``value`` array at each node then holds
# one row per output.
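#
# As a quick check of the shape and the root-node counts (using the ``clf``
# fitted above; the exact counts depend on the train/test split):

print("value array shape:", clf.tree_.value.shape)  # (n_nodes, n_outputs, n_classes)
print("root node value:", clf.tree_.value[0])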

##############################################################################
# We can compare the above output to the plot of the decision tree.
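
# A minimal sketch of that plot (assuming matplotlib is available;
# ``plot_tree`` lives in ``sklearn.tree``):
import matplotlib.pyplot as plt

from sklearn import tree

tree.plot_tree(clf)
plt.show()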