Note

Go to the end to download the full example code, or to run this example in your browser via Binder.

随机森林的袋外误差

RandomForestClassifier 使用 自助聚合 进行训练，其中每棵新树都是从训练观测值的自助样本 \(z_i = (x_i, y_i)\) 中拟合的。袋外 (OOB) 误差是使用不包含 \(z_i\) 的树的预测计算的每个 \(z_i\) 的平均误差。这使得 RandomForestClassifier 在训练的同时可以进行拟合和验证 [1]。

下面的示例演示了在训练过程中每增加一棵新树时如何测量 OOB 误差。生成的图表允许实践者估算误差稳定时 n_estimators 的合适值。

# 作者：scikit-learn 开发者
# SPDX-License-Identifier: BSD-3-Clause

from collections import OrderedDict

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that every run of this script produces the same results.
RANDOM_STATE = 123

# Build a synthetic binary classification problem: 500 samples with 25
# features, 15 of which are informative, and one cluster per class.
X, y = make_classification(
    n_samples=500,
    n_features=25,
    n_informative=15,
    n_clusters_per_class=1,
    random_state=RANDOM_STATE,
)

# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles, but it is necessary for tracking the
# OOB error trajectory during training.
#
# One (label, classifier) pair is built per `max_features` setting; the `!r`
# conversion renders string settings with quotes and `None` without them.
ensemble_clfs = [
    (
        f"RandomForestClassifier, max_features={max_features!r}",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features=max_features,
            random_state=RANDOM_STATE,
        ),
    )
    for max_features in ("sqrt", "log2", None)
]

# Map each classifier label to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((name, []) for name, _ in ensemble_clfs)

# Inclusive range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 150

for name, forest in ensemble_clfs:
    trajectory = error_rate[name]
    for n_trees in range(min_estimators, max_estimators + 1, 5):
        # Raise the target ensemble size and refit on the same data; the
        # classifiers were created with `warm_start=True`.
        forest.set_params(n_estimators=n_trees)
        forest.fit(X, y)

        # Record the OOB error (1 - OOB score) for this `n_estimators` value.
        trajectory.append((n_trees, 1 - forest.oob_score_))

# Plot "OOB error rate" against "n_estimators", one curve per classifier.
for name, points in error_rate.items():
    # Split the recorded (n_estimators, error) pairs into two parallel series.
    n_values, oob_errors = zip(*points)
    plt.plot(n_values, oob_errors, label=name)

# Label the axes and restrict the x-axis to the explored range.
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.xlim(min_estimators, max_estimators)
plt.legend(loc="upper right")
plt.show()