Decision Trees
Process:
1. Splitting the data on feature values
2. Selecting the best split using an impurity measure such as the Gini index (see the short sketch below)
3. Stopping criteria: pure node or max depth reached
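The Gini index scores a node's impurity as 1 minus the sum of squared class probabilities, so a pure node scores 0. A minimal sketch of the measure (the gini_index helper is illustrative only; the implementation below uses entropy instead):

def gini_index(labels):
    """Gini impurity: 1 - sum(p_i^2). Returns 0 for a pure node."""
    total = len(labels)
    if total == 0:
        return 0
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    return 1 - sum((count / total) ** 2 for count in counts.values())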
Implementation:
1. Implement a basic Decision Tree Classifier from scratch.
2. Use Entropy (a measure of disorder) and Information Gain (the reduction in
entropy achieved by a split) to determine the best feature to split on.
Worked example: a node holding 30 samples of Class A and 20 of Class B.
Class A: 30
Class B: 20
Total: 50
P(A) = 30/50 = 0.6
P(B) = 20/50 = 0.4
Entropy = -(0.6 * log2(0.6) + 0.4 * log2(0.4)) ≈ 0.971 bits
import math

# Helper functions

def calculate_entropy(data):
    """
    Calculate the entropy of a dataset.
    data: List of target labels
    """
    total = len(data)
    if total == 0:
        return 0
    counts = {}
    for label in data:
        counts[label] = counts.get(label, 0) + 1
    entropy = 0
    for count in counts.values():
        prob = count / total
        entropy -= prob * math.log2(prob)
    return entropy
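A quick check against the worked example above (30 samples of Class A, 20 of Class B):

labels = ['A'] * 30 + ['B'] * 20
print(calculate_entropy(labels))  # ~0.971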
def split_data(dataset, feature_index):
    """
    Split the dataset based on a feature.
    dataset: List of lists where each inner list is a data point
    feature_index: Index of the feature to split on
    """
    splits = {}
    for row in dataset:
        key = row[feature_index]
        if key not in splits:
            splits[key] = []
        splits[key].append(row)
    return splits
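For example, splitting on feature 0 groups rows by that feature's value (toy rows for illustration only):

rows = [['Sunny', 'No'], ['Rainy', 'Yes'], ['Sunny', 'Yes']]
print(split_data(rows, 0))
# {'Sunny': [['Sunny', 'No'], ['Sunny', 'Yes']], 'Rainy': [['Rainy', 'Yes']]}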
The weighted-entropy loop belongs inside an information-gain helper: the gain of a split is the parent's entropy minus the weighted entropy of the child subsets.

def information_gain(dataset, feature_index, target_index):
    """
    Information gain = entropy of the full dataset minus the
    weighted entropy of the subsets produced by the split.
    """
    base_entropy = calculate_entropy([row[target_index] for row in dataset])
    splits = split_data(dataset, feature_index)
    total_samples = len(dataset)
    weighted_entropy = 0
    for subset in splits.values():
        prob = len(subset) / total_samples
        subset_entropy = calculate_entropy([row[target_index] for row in subset])
        weighted_entropy += prob * subset_entropy
    return base_entropy - weighted_entropy
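A perfectly separating split should recover all of the parent's entropy (toy rows for illustration only):

toy = [['Sunny', 'No'], ['Sunny', 'No'], ['Rainy', 'Yes'], ['Rainy', 'Yes']]
print(information_gain(toy, 0, 1))  # 1.0: the split isolates each class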
# Example usage: a weather/play toy dataset.
# Column meanings inferred from the values:
# Outlook, Temperature, Humidity, Play (target)
dataset = [
    ['Sunny', 'Hot', 'High', 'No'],
    ['Sunny', 'Hot', 'High', 'No'],
    ['Overcast', 'Hot', 'High', 'Yes'],
    ['Rainy', 'Mild', 'High', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Yes'],
    ['Sunny', 'Mild', 'High', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Yes'],
    ['Rainy', 'Mild', 'Normal', 'Yes']
]
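The example below calls a DecisionTree class the notes never define. Here is one possible minimal sketch, assuming an ID3-style recursive builder on top of the helpers above; the class body (including _build and the majority-vote leaves) is an assumption made to match the fit(dataset, features, target_index) call:

class DecisionTree:
    """Minimal ID3-style classifier for categorical features (assumed sketch)."""
    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, dataset, features, target_index):
        self.tree = self._build(dataset, features, target_index, depth=0)

    def _build(self, dataset, features, target_index, depth):
        labels = [row[target_index] for row in dataset]
        # Stop at a pure node or at the maximum depth; return the majority label
        if len(set(labels)) == 1 or depth >= self.max_depth:
            return max(set(labels), key=labels.count)
        # Choose the feature with the highest information gain
        gains = [(information_gain(dataset, i, target_index), i)
                 for i in range(len(features))]
        best_gain, best_index = max(gains)
        if best_gain == 0:
            return max(set(labels), key=labels.count)
        # Recurse into each subset produced by the best split
        node = {features[best_index]: {}}
        for value, subset in split_data(dataset, best_index).items():
            node[features[best_index]][value] = self._build(
                subset, features, target_index, depth + 1)
        return node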
features = ['Outlook', 'Temperature', 'Humidity']  # names assumed, see above
target_index = 3

tree = DecisionTree(max_depth=3)
tree.fit(dataset, features, target_index)
print(tree.tree)  # nested dict, e.g. {'Outlook': {'Overcast': 'Yes', ...}}