Implement the decision tree algorithm in plain Python
Below is a simplified decision tree (a CART-style classifier splitting on Gini impurity) implemented in Python; note that this first version relies on NumPy for array operations:
```python
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        # Class labels are assumed to be integers 0..n_classes-1.
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _gini(self, y):
        # Gini impurity: 1 minus the sum of squared class proportions.
        m = len(y)
        return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in np.unique(y))

    def _best_split(self, X, y):
        m = len(y)
        if m <= 1:
            return None, None
        # Count over all class indices (not just those present at this node)
        # so that num_left/num_right can be indexed by class label below.
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = self._gini(y)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            # Sort samples along this feature and sweep candidate thresholds.
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                # Move sample i-1 from the right partition to the left.
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                # No valid threshold lies between two equal feature values.
                if thresholds[i] == thresholds[i - 1]:
                    continue
                gini_left = 1.0 - sum((num_left[k] / i) ** 2 for k in range(self.n_classes_))
                gini_right = 1.0 - sum((num_right[k] / (m - i)) ** 2 for k in range(self.n_classes_))
                # Weighted average of the children's impurities.
                gini = (i * gini_left + (m - i) * gini_right) / m
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2  # midpoint
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        # Keep splitting until max_depth is reached (None means unlimited).
        if self.max_depth is None or depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X[indices_left], y[indices_left], depth + 1)
                node.right = self._grow_tree(X[~indices_left], y[~indices_left], depth + 1)
        return node

    def _predict(self, inputs):
        # Walk from the root to a leaf following the learned thresholds.
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

# Example data
X = np.array([[2, 3], [1, 2], [3, 6], [5, 7], [6, 5], [7, 2]])
y = np.array([0, 0, 1, 1, 1, 0])

# Build and train the decision tree
clf = DecisionTree(max_depth=3)
clf.fit(X, y)

# Predict on the training data
print(clf.predict(X))
```
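As an optional sanity check (not part of the original snippet, and assuming scikit-learn is installed), the predictions above can be compared with scikit-learn's `DecisionTreeClassifier` trained on the same data. The two trees may still disagree on individual points, since equally good splits can be tie-broken differently:

```python
# Optional cross-check against scikit-learn (assumes scikit-learn is installed).
from sklearn.tree import DecisionTreeClassifier

sk_clf = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=0)
sk_clf.fit(X, y)
print("ours:   ", clf.predict(X))
print("sklearn:", list(sk_clf.predict(X)))
```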
Make it more professional
The decision tree is a common machine learning algorithm used for classification and regression tasks. Below is an implementation in plain Python (standard library only), using information gain with entropy as the split criterion:
```python
import math
from collections import Counter

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        return [self._predict_one(x, self.tree) for x in X]

    def _build_tree(self, X, y, depth):
        # Stop at the depth limit or on a pure node; leaves store the majority label.
        if depth == self.max_depth or len(set(y)) == 1:
            return Counter(y).most_common(1)[0][0]
        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]
        # Partition the samples with plain list comprehensions (no NumPy).
        left_X = [x for x in X if x[best_feature] <= best_threshold]
        left_y = [label for x, label in zip(X, y) if x[best_feature] <= best_threshold]
        right_X = [x for x in X if x[best_feature] > best_threshold]
        right_y = [label for x, label in zip(X, y) if x[best_feature] > best_threshold]
        left_subtree = self._build_tree(left_X, left_y, depth + 1)
        right_subtree = self._build_tree(right_X, right_y, depth + 1)
        return {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def _find_best_split(self, X, y):
        # Only accept splits with strictly positive information gain,
        # otherwise return None and let the caller make a leaf.
        best_gain = 0.0
        best_feature = None
        best_threshold = None
        for feature in range(len(X[0])):
            thresholds = sorted(set(x[feature] for x in X))
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        parent_entropy = self._entropy(y)
        left_y = [label for x, label in zip(X, y) if x[feature] <= threshold]
        right_y = [label for x, label in zip(X, y) if x[feature] > threshold]
        # A split that leaves one side empty provides no information.
        if not left_y or not right_y:
            return 0.0
        n = len(y)
        child_entropy = (len(left_y) / n) * self._entropy(left_y) + (len(right_y) / n) * self._entropy(right_y)
        return parent_entropy - child_entropy

    def _entropy(self, y):
        # Shannon entropy: -sum(p * log2(p)) over the class proportions.
        counter = Counter(y)
        probabilities = [count / len(y) for count in counter.values()]
        return -sum(p * math.log2(p) for p in probabilities)

    def _predict_one(self, x, tree):
        # Leaves are bare labels; internal nodes are dicts.
        if not isinstance(tree, dict):
            return tree
        feature = tree['feature']
        threshold = tree['threshold']
        if x[feature] <= threshold:
            return self._predict_one(x, tree['left'])
        else:
            return self._predict_one(x, tree['right'])

# Example data
X = [[0, 1], [1, 0], [1, 1], [0, 0]]
y = [0, 1, 1, 0]

# Build and train the decision tree
tree = DecisionTree(max_depth=2)
tree.fit(X, y)

# Predict on the training data
predictions = tree.predict(X)
print(predictions)  # Output: [0, 1, 1, 0]
```
Code explanation

- Class definition: the `DecisionTree` class holds the tree-building and prediction logic.
- Tree construction: `_build_tree` recursively grows the tree until the maximum depth is reached or all samples at a node share one class, at which point it returns the majority label.
- Best split: `_find_best_split` scans every feature and candidate threshold for the split that maximizes information gain.
- Information gain: `_information_gain` computes the gain of a given feature/threshold split as the parent entropy minus the weighted entropies of the two children (see the worked example below).
- Entropy: `_entropy` computes the Shannon entropy of a label set.
- Prediction: `_predict_one` recursively walks the tree to classify a single sample.
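To make the entropy and gain numbers concrete, here is the computation for the sample data above, worked by hand and then re-checked in a standalone sketch: with `y = [0, 1, 1, 0]` the parent entropy is -(0.5 * log2 0.5 + 0.5 * log2 0.5) = 1.0, and splitting on feature 0 at threshold 0 yields two pure children with entropy 0, so the gain is 1.0 - 0 = 1.0, which is exactly the split `_find_best_split` selects:

```python
import math
from collections import Counter

# Standalone re-computation of the entropy/gain numbers from the example above.
def entropy(labels):
    counts = Counter(labels)
    return -sum((c / len(labels)) * math.log2(c / len(labels)) for c in counts.values())

y = [0, 1, 1, 0]
parent = entropy(y)           # 1.0: two classes, split 50/50
left, right = [0, 0], [1, 1]  # child labels after splitting on feature 0 at threshold 0
gain = parent - (len(left) / len(y)) * entropy(left) - (len(right) / len(y)) * entropy(right)
print(gain)                   # 1.0: both children are pure
```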
Example data

`X` is a two-dimensional list representing the feature matrix; `y` is a one-dimensional list of labels.
Output

- The predicted label for each sample is printed; on this training data the tree reproduces the labels exactly: `[0, 1, 1, 0]`.
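The trained tree can also classify inputs it has not seen. A quick sketch (on this toy data the learned root split tests feature 0 against threshold 0, so predictions simply follow the first feature):

```python
# Classifying unseen points with the tree trained in the example above.
new_points = [[0, 9], [1, -3]]
print(tree.predict(new_points))  # expected: [0, 1] (root split is feature 0 <= 0)
```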