.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/applications/plot_prediction_latency.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_examples_applications_plot_prediction_latency.py>`
        to download the full example code or to run this example in your browser via Binder.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_applications_plot_prediction_latency.py:

==================
Prediction Latency
==================

This is an example showing the prediction latency of various scikit-learn
estimators.

The goal is to measure the latency one can expect when doing predictions
either in bulk or atomic (i.e. one by one) mode.

The plots represent the distribution of the prediction latency as a boxplot.

.. GENERATED FROM PYTHON SOURCE LINES 13-38

.. code-block:: Python


    # Authors: The scikit-learn developers
    # SPDX-License-Identifier: BSD-3-Clause

    import gc
    import time
    from collections import defaultdict

    import matplotlib.pyplot as plt
    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Ridge, SGDRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR
    from sklearn.utils import shuffle


    def _not_in_sphinx():
        # Hack to detect whether we are run by the sphinx builder
        return "__file__" in globals()


.. GENERATED FROM PYTHON SOURCE LINES 39-41

Benchmark and plot helper functions
-----------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 41-287

.. code-block:: Python


    def atomic_benchmark_estimator(estimator, X_test, verbose=False):
        """Measure the prediction runtime for each instance."""
        n_instances = X_test.shape[0]
        runtimes = np.zeros(n_instances, dtype=float)
        for i in range(n_instances):
            instance = X_test[[i], :]
            start = time.time()
            estimator.predict(instance)
            runtimes[i] = time.time() - start
        if verbose:
            print(
                "atomic_benchmark runtimes:",
                min(runtimes),
                np.percentile(runtimes, 50),
                max(runtimes),
            )
        return runtimes


    def bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose):
        """Measure the prediction runtime for the whole input at once."""
        n_instances = X_test.shape[0]
        runtimes = np.zeros(n_bulk_repeats, dtype=float)
        for i in range(n_bulk_repeats):
            start = time.time()
            estimator.predict(X_test)
            runtimes[i] = time.time() - start
        runtimes = np.array(list(map(lambda x: x / float(n_instances), runtimes)))
        if verbose:
            print(
                "bulk_benchmark runtimes:",
                min(runtimes),
                np.percentile(runtimes, 50),
                max(runtimes),
            )
        return runtimes


    def benchmark_estimator(estimator, X_test, n_bulk_repeats=30, verbose=False):
        """Measure prediction runtimes in both atomic and bulk mode.

        Parameters
        ----------
        estimator : already trained estimator supporting `predict()`
        X_test : test input
        n_bulk_repeats : how many times to repeat when evaluating bulk mode

        Returns
        -------
        atomic_runtimes, bulk_runtimes : a pair of `np.array` of the runtimes
        in seconds.
        """
        atomic_runtimes = atomic_benchmark_estimator(estimator, X_test, verbose)
        bulk_runtimes = bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose)
        return atomic_runtimes, bulk_runtimes


    def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
        """Generate a regression dataset with the given parameters."""
        if verbose:
            print("generating dataset...")

        X, y, coef = make_regression(
            n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True
        )

        random_seed = 13
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=n_train, test_size=n_test, random_state=random_seed
        )
        X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

        X_scaler = StandardScaler()
        X_train = X_scaler.fit_transform(X_train)
        X_test = X_scaler.transform(X_test)

        y_scaler = StandardScaler()
        y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
        y_test = y_scaler.transform(y_test[:, None])[:, 0]

        gc.collect()
        if verbose:
            print("ok")
        return X_train, y_train, X_test, y_test
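
    # Note (not part of the original example): time.time() is a wall-clock timer with
    # limited resolution on some platforms. For sub-millisecond latencies, the
    # monotonic, higher-resolution time.perf_counter() is usually a safer choice.
    # A minimal sketch of an alternative per-instance timer with the same shape of
    # output as atomic_benchmark_estimator:
    def atomic_benchmark_estimator_perf(estimator, X_test):
        """Sketch: per-instance prediction latencies using time.perf_counter()."""
        runtimes = np.zeros(X_test.shape[0], dtype=float)
        for i in range(X_test.shape[0]):
            start = time.perf_counter()
            estimator.predict(X_test[[i], :])
            runtimes[i] = time.perf_counter() - start
        return runtimes
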

    def boxplot_runtimes(runtimes, pred_type, configuration):
        """Plot a new `Figure` with boxplots of prediction runtimes.

        Parameters
        ----------
        runtimes : list of `np.array` of latencies in micro-seconds
        pred_type : 'bulk' or 'atomic'
        configuration : benchmark configuration dict used to label the boxes
        """
        fig, ax1 = plt.subplots(figsize=(10, 6))
        bp = plt.boxplot(
            runtimes,
        )

        cls_infos = [
            "%s\n(%d %s)"
            % (
                estimator_conf["name"],
                estimator_conf["complexity_computer"](estimator_conf["instance"]),
                estimator_conf["complexity_label"],
            )
            for estimator_conf in configuration["estimators"]
        ]
        plt.setp(ax1, xticklabels=cls_infos)
        plt.setp(bp["boxes"], color="black")
        plt.setp(bp["whiskers"], color="black")
        plt.setp(bp["fliers"], color="red", marker="+")

        ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
        ax1.set_axisbelow(True)
        ax1.set_title(
            "Prediction Time per Instance - %s, %d feats."
            % (pred_type.capitalize(), configuration["n_features"])
        )
        ax1.set_ylabel("Prediction Time (us)")

        plt.show()


    def benchmark(configuration):
        """Run the whole benchmark."""
        X_train, y_train, X_test, y_test = generate_dataset(
            configuration["n_train"], configuration["n_test"], configuration["n_features"]
        )

        stats = {}
        for estimator_conf in configuration["estimators"]:
            print("Benchmarking", estimator_conf["instance"])
            estimator_conf["instance"].fit(X_train, y_train)
            gc.collect()
            a, b = benchmark_estimator(estimator_conf["instance"], X_test)
            stats[estimator_conf["name"]] = {"atomic": a, "bulk": b}

        cls_names = [
            estimator_conf["name"] for estimator_conf in configuration["estimators"]
        ]
        runtimes = [1e6 * stats[clf_name]["atomic"] for clf_name in cls_names]
        boxplot_runtimes(runtimes, "atomic", configuration)
        runtimes = [1e6 * stats[clf_name]["bulk"] for clf_name in cls_names]
        boxplot_runtimes(runtimes, "bulk (%d)" % configuration["n_test"], configuration)


    def n_feature_influence(estimators, n_train, n_test, n_features, percentile):
        """Estimate the influence of the number of features on prediction time.

        Parameters
        ----------
        estimators : dict of (name (str), estimator) pairs to benchmark
        n_train : number of training instances (int)
        n_test : number of testing instances (int)
        n_features : list of feature-space dimensionalities to test (int)
        percentile : percentile at which to measure the speed (int [0-100])

        Returns
        -------
        percentiles : dict(estimator_name, dict(n_features, percentile_perf_in_us))
        """
        percentiles = defaultdict(defaultdict)
        for n in n_features:
            print("benchmarking with %d features" % n)
            X_train, y_train, X_test, y_test = generate_dataset(n_train, n_test, n)
            for cls_name, estimator in estimators.items():
                estimator.fit(X_train, y_train)
                gc.collect()
                runtimes = bulk_benchmark_estimator(estimator, X_test, 30, False)
                percentiles[cls_name][n] = 1e6 * np.percentile(runtimes, percentile)
        return percentiles


    def plot_n_features_influence(percentiles, percentile):
        fig, ax1 = plt.subplots(figsize=(10, 6))
        colors = ["r", "g", "b"]
        for i, cls_name in enumerate(percentiles.keys()):
            x = np.array(sorted(percentiles[cls_name].keys()))
            y = np.array([percentiles[cls_name][n] for n in x])
            plt.plot(
                x,
                y,
                color=colors[i],
            )
        ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
        ax1.set_axisbelow(True)
        ax1.set_title("Evolution of Prediction Time with #Features")
        ax1.set_xlabel("#Features")
        ax1.set_ylabel("Prediction Time at %d%%-ile (us)" % percentile)
        plt.show()


    def benchmark_throughputs(configuration, duration_secs=0.1):
        """Benchmark throughput for the different estimators."""
        X_train, y_train, X_test, y_test = generate_dataset(
            configuration["n_train"], configuration["n_test"], configuration["n_features"]
        )
        throughputs = dict()
        for estimator_config in configuration["estimators"]:
            estimator_config["instance"].fit(X_train, y_train)
            start_time = time.time()
            n_predictions = 0
            while (time.time() - start_time) < duration_secs:
                estimator_config["instance"].predict(X_test[[0]])
                n_predictions += 1
            throughputs[estimator_config["name"]] = n_predictions / duration_secs
        return throughputs
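
    # Sketch (not part of the original example): the throughput above is measured with
    # one-row predict() calls, so it is dominated by per-call overhead. Predicting the
    # whole test set in a single call amortizes that overhead and usually yields a much
    # higher number; a bulk-mode counterpart could look like this:
    def benchmark_bulk_throughput(estimator, X_test, duration_secs=0.1):
        """Rough bulk-mode throughput (predictions per second) for a fitted estimator."""
        start_time = time.time()
        n_predictions = 0
        while (time.time() - start_time) < duration_secs:
            estimator.predict(X_test)
            n_predictions += X_test.shape[0]
        return n_predictions / duration_secs
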

    def plot_benchmark_throughput(throughputs, configuration):
        fig, ax = plt.subplots(figsize=(10, 6))
        colors = ["r", "g", "b"]
        cls_infos = [
            "%s\n(%d %s)"
            % (
                estimator_conf["name"],
                estimator_conf["complexity_computer"](estimator_conf["instance"]),
                estimator_conf["complexity_label"],
            )
            for estimator_conf in configuration["estimators"]
        ]
        cls_values = [
            throughputs[estimator_conf["name"]]
            for estimator_conf in configuration["estimators"]
        ]
        plt.bar(range(len(throughputs)), cls_values, width=0.5, color=colors)
        ax.set_xticks(np.linspace(0.25, len(throughputs) - 0.75, len(throughputs)))
        ax.set_xticklabels(cls_infos, fontsize=10)
        ymax = max(cls_values) * 1.2
        ax.set_ylim((0, ymax))
        ax.set_ylabel("Throughput (predictions/sec)")
        ax.set_title(
            "Prediction Throughput for different estimators (%d features)"
            % configuration["n_features"]
        )
        plt.show()


.. GENERATED FROM PYTHON SOURCE LINES 288-290

Benchmark bulk/atomic prediction speed for various regressors
--------------------------------------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 290-320

.. code-block:: Python


    configuration = {
        "n_train": int(1e3),
        "n_test": int(1e2),
        "n_features": int(1e2),
        "estimators": [
            {
                "name": "Linear Model",
                "instance": SGDRegressor(
                    penalty="elasticnet", alpha=0.01, l1_ratio=0.25, tol=1e-4
                ),
                "complexity_label": "non-zero coefficients",
                "complexity_computer": lambda clf: np.count_nonzero(clf.coef_),
            },
            {
                "name": "RandomForest",
                "instance": RandomForestRegressor(),
                "complexity_label": "estimators",
                "complexity_computer": lambda clf: clf.n_estimators,
            },
            {
                "name": "SVR",
                "instance": SVR(kernel="rbf"),
                "complexity_label": "support vectors",
                "complexity_computer": lambda clf: len(clf.support_vectors_),
            },
        ],
    }
    benchmark(configuration)


.. rst-class:: sphx-glr-horizontal


    *

      .. image-sg:: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_001.png
         :alt: Prediction Time per Instance - Atomic, 100 feats.
         :srcset: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_001.png
         :class: sphx-glr-multi-img

    *

      .. image-sg:: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_002.png
         :alt: Prediction Time per Instance - Bulk (100), 100 feats.
         :srcset: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_002.png
         :class: sphx-glr-multi-img


.. rst-class:: sphx-glr-script-out

.. code-block:: none

    Benchmarking SGDRegressor(alpha=0.01, l1_ratio=0.25, penalty='elasticnet', tol=0.0001)
    Benchmarking RandomForestRegressor()
    Benchmarking SVR()


.. GENERATED FROM PYTHON SOURCE LINES 321-323

Benchmark n_features influence on prediction speed
--------------------------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 323-334

.. code-block:: Python


    percentile = 90
    percentiles = n_feature_influence(
        {"ridge": Ridge()},
        configuration["n_train"],
        configuration["n_test"],
        [100, 250, 500],
        percentile,
    )
    plot_n_features_influence(percentiles, percentile)


.. image-sg:: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_003.png
   :alt: Evolution of Prediction Time with #Features
   :srcset: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_003.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-script-out

.. code-block:: none

    benchmarking with 100 features
    benchmarking with 250 features
    benchmarking with 500 features
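
For a linear model such as Ridge, a single prediction is essentially one dot
product, so the measured latency is expected to grow roughly linearly with the
number of features. As an illustration only (not part of the original script),
the per-feature cost can be estimated from the ``percentiles`` dict computed
above with a simple least-squares line fit; the helper name below is
hypothetical and assumes the imports and results of the previous blocks.

.. code-block:: Python


    # Hypothetical helper: fit a straight line to (n_features, latency) pairs to get
    # a rough per-feature cost in microseconds, using the `percentiles` dict and the
    # `np` import from the blocks above.
    def per_feature_cost(percentiles, cls_name="ridge"):
        ns = sorted(percentiles[cls_name])
        latencies = [percentiles[cls_name][n] for n in ns]
        slope, intercept = np.polyfit(ns, latencies, 1)
        return slope, intercept

    # Example usage: `slope` approximates the extra latency (us) per added feature.
    # slope, intercept = per_feature_cost(percentiles)
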

.. GENERATED FROM PYTHON SOURCE LINES 335-337

Benchmark throughput
--------------------

.. GENERATED FROM PYTHON SOURCE LINES 337-341

.. code-block:: Python


    throughputs = benchmark_throughputs(configuration)
    plot_benchmark_throughput(throughputs, configuration)


.. image-sg:: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_004.png
   :alt: Prediction Throughput for different estimators (100 features)
   :srcset: /auto_examples/applications/images/sphx_glr_plot_prediction_latency_004.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 28.432 seconds)


.. _sphx_glr_download_auto_examples_applications_plot_prediction_latency.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: binder-badge

      .. image:: images/binder_badge_logo.svg
        :target: https://mybinder.org/v2/gh/scikit-learn/scikit-learn/main?urlpath=lab/tree/notebooks/auto_examples/applications/plot_prediction_latency.ipynb
        :alt: Launch binder
        :width: 150 px

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: plot_prediction_latency.ipynb <plot_prediction_latency.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: plot_prediction_latency.py <plot_prediction_latency.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: plot_prediction_latency.zip <plot_prediction_latency.zip>`

.. include:: plot_prediction_latency.recommendations

.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_