DMML2023 Lecture05 19jan2023
Setup
First, let's import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated, so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥ 0.20.
In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
# Common imports
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:]  # the attributes of the data items: keep only petal length and width, drop the two sepal attributes
y = iris.target       # the class labels of the data items
In [59]:
# this fit is missing from the extract but is implied by Out[59] below
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)
Out[59]:
DecisionTreeClassifier(max_depth=2, random_state=42)
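The cells below rely on IMAGES_PATH and a save_fig helper that the setup text mentions but the extract never shows. A minimal sketch of what they could look like; the directory layout, image format and resolution are assumptions:

# hypothetical reconstruction of the figure-saving setup described above
IMAGES_PATH = os.path.join(".", "images", "decision_trees")  # assumed location
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    # saves the current Matplotlib figure under IMAGES_PATH
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)  # matches the "Saving figure ..." lines below
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)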
In [60]:
# for visualizing the decision tree
from graphviz import Source
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file=os.path.join(IMAGES_PATH, "iris_tree.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)
Source.from_file(os.path.join(IMAGES_PATH, "iris_tree.dot"))
Out[60]:
plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf, X, y)
save_fig("decision_tree_decision_boundaries_plot")
plt.show()
Saving figure decision_tree_decision_boundaries_plot
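plot_decision_boundary, used in the cell above and several times below, is a helper from the lecture's setup code that the extract does not include. A minimal sketch, assuming it rasterizes the classifier's predictions on a grid and shades the predicted regions; the original's colors and legend handling may differ:

from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=None, iris=True, legend=True):
    # assumed reconstruction: evaluate clf on a dense grid and shade each predicted class
    if axes is None:
        axes = [X[:, 0].min() - 0.5, X[:, 0].max() + 0.5,
                X[:, 1].min() - 0.5, X[:, 1].max() + 0.5]
    x1s = np.linspace(axes[0], axes[1], 200)
    x2s = np.linspace(axes[2], axes[3], 200)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    plt.contourf(x1, x2, y_pred, alpha=0.3,
                 cmap=ListedColormap(["#fafab0", "#9898ff", "#a0faa0"]))
    for k in np.unique(y):
        plt.plot(X[:, 0][y == k], X[:, 1][y == k], "o", label=str(k))
    plt.axis(axes)
    if iris:
        plt.xlabel("Petal length")
        plt.ylabel("Petal width")
    if legend:
        plt.legend(loc="lower right")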
In [5]:
tree_clf.predict_proba([[5, 1.5]]) # probabilities of each class
Out[5]:
array([[0. , 0.90740741, 0.09259259]])
In [6]:
tree_clf.predict([[5, 1.5]]) # outputs the class with the highest probability
Out[6]:
array([1])
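Class 1 is versicolor (iris.target_names[1]). The probabilities in Out[5] are just the class proportions in the leaf that [5, 1.5] reaches: petal length 5 > 2.45 and petal width 1.5 <= 1.75 lead to the leaf holding 0 setosa, 49 versicolor and 5 virginica training samples, and 49/54 = 0.9074... and 5/54 = 0.0926... match the array exactly.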
In [76]:
X[(X[:, 0]==X[:, 0][y==1].max()) & (y==1)] # longest Iris versicolor flower
Out[76]:
array([[5.1, 1.6]])
In [77]:
X[(X[:, 0]==X[:, 0][y==2].min()) & (y==2)] # shortest Iris virginica flower
Out[77]:
array([[4.5, 1.7]])
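Note that these two flowers overlap: the longest versicolor (petal length 5.1 cm) is longer than the shortest virginica (4.5 cm), so no single petal-length threshold separates the two classes cleanly. The next cell removes both flowers to show how much the fitted tree can change when just a few training points disappear.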
In [79]:
# mask of rows to keep: versicolors with petal length <= 5.0, virginicas with
# petal length >= 4.6, and all setosas, i.e. drop the longest versicolor and
# the shortest virginica found above
keep_mask = ((X[:, 0]<=5.0) & (y==1)) | ((X[:, 0]>=4.6) & (y==2)) | (y==0)
X_tweaked = X[keep_mask]
y_tweaked = y[keep_mask]

# refit on the tweaked data (this fit is not shown in the extract; parameters assumed to match tree_clf)
tree_clf_tweaked = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf_tweaked.fit(X_tweaked, y_tweaked)

plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf_tweaked, X_tweaked, y_tweaked, legend=False)
save_fig("decision_tree_instability_plot")
plt.show()
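The next line loads iris_tree_tweaked.dot, but the export call that must have produced it does not appear in the extract. A plausible reconstruction, mirroring the earlier export_graphviz call:

export_graphviz(
    tree_clf_tweaked,
    out_file=os.path.join(IMAGES_PATH, "iris_tree_tweaked.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)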
Source.from_file(os.path.join(IMAGES_PATH, "iris_tree_tweaked.dot"))
Out[82]:
# the two-moons data (Xm, ym) is not constructed anywhere in the extract;
# a plausible reconstruction (n_samples, noise and seed are assumptions):
from sklearn.datasets import make_moons
Xm, ym = make_moons(n_samples=100, noise=0.25, random_state=53)

deep_tree_clf1 = DecisionTreeClassifier(random_state=42)  # unrestricted: grows until every leaf is pure
deep_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4, random_state=42)  # a split is kept only if each child leaf retains at least 4 samples
deep_tree_clf1.fit(Xm, ym)
deep_tree_clf2.fit(Xm, ym)
save_fig("min_samples_leaf_plot")
plt.show()
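The next two outputs load deep_tree_clf1.dot and deep_tree_clf2.dot, but the export calls that produced them are not in the extract. A hypothetical reconstruction, mirroring the earlier export_graphviz usage:

for clf, name in [(deep_tree_clf1, "deep_tree_clf1"), (deep_tree_clf2, "deep_tree_clf2")]:
    export_graphviz(
        clf,
        out_file=os.path.join(IMAGES_PATH, name + ".dot"),
        rounded=True,
        filled=True
    )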
Source.from_file(os.path.join(IMAGES_PATH, "deep_tree_clf1.dot"))
Out[12]:
[tree graph for deep_tree_clf1; root node: X[1] <= -0.092, gini = 0.5, samples = 100, value = [50, 50]]
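Here gini is the node's Gini impurity, G = 1 - sum_k p_k^2, computed from the class proportions in the node: with value = [50, 50], G = 1 - 0.5^2 - 0.5^2 = 0.5, and for the three-class iris root shown later (value = [50, 50, 50]), G = 1 - 3*(1/3)^2 = 0.667.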
Source.from_file(os.path.join(IMAGES_PATH, "deep_tree_clf2.dot"))
Out[13]:
[tree graph for deep_tree_clf2; root node: X[1] <= -0.092, gini = 0.5, samples = 100, value = [50, 50]]
# Xr is not constructed anywhere in the extract; a plausible reconstruction
# rotating the iris features by 20 degrees (the angle is an assumption):
angle = np.pi / 180 * 20
rotation_matrix = np.array([[np.cos(angle), -np.sin(angle)],
                            [np.sin(angle), np.cos(angle)]])
Xr = X.dot(rotation_matrix)

tree_clf_r = DecisionTreeClassifier(random_state=42)
tree_clf_r.fit(Xr, y)
plt.figure(figsize=(8, 3))
plot_decision_boundary(tree_clf_r, Xr, y, axes=[0.5, 7.5, -1.0, 1], iris=False)
plt.show()
In [15]:
export_graphviz(
    tree_clf_r,
    out_file=os.path.join(IMAGES_PATH, "tree_clf_r.dot"),
    rounded=True,
    filled=True
)
Source.from_file(os.path.join(IMAGES_PATH, "tree_clf_r.dot"))
Out[15]:
[tree graph for tree_clf_r; root node: X[0] <= 2.559, gini = 0.667, samples = 150, value = [50, 50, 50]]
# the square dataset (Xs, ys) is not constructed in the extract; a plausible
# reconstruction (the seed and the labelling rule are assumptions):
np.random.seed(6)
Xs = np.random.rand(100, 2) - 0.5
ys = (Xs[:, 0] > 0).astype(np.float32) * 2

angle = np.pi / 4
rotation_matrix = np.array([[np.cos(angle), -np.sin(angle)],
                            [np.sin(angle), np.cos(angle)]])
Xsr = Xs.dot(rotation_matrix)  # the same points rotated by 45 degrees

tree_clf_s = DecisionTreeClassifier(random_state=42)
tree_clf_s.fit(Xs, ys)
tree_clf_sr = DecisionTreeClassifier(random_state=42)
tree_clf_sr.fit(Xsr, ys)
save_fig("sensitivity_to_rotation_plot")
plt.show()
Source.from_file(os.path.join(IMAGES_PATH, "tree_clf_s.dot"))
Out[17]:
Source.from_file(os.path.join(IMAGES_PATH, "tree_clf_sr.dot"))
Out[18]:
[tree graph for tree_clf_sr; root node: X[0] <= 0.125, gini = 0.495, samples = 100, value = [55, 45]]
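Because a decision tree tests one feature at a time, its decision boundaries are always axis-aligned. The tree trained on Xs can separate the two classes with a single vertical split, while the tree trained on the rotated copy Xsr has to approximate the same (now diagonal) boundary with a staircase of splits, which is what the deeper root structure above reflects. This sensitivity to rotation is one aspect of tree instability; transforming the data first (for example with PCA) is a common mitigation.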