使用 Quantile DMatrix 的数据迭代器演示

Added in version 1.2.0.

这个演示定义了一个自定义迭代器,用于将数据分批传递给 xgboost.QuantileDMatrix,并使用该 QuantileDMatrix 进行训练。该功能主要用于减少分布式环境中训练所需的 GPU 内存。

看完演示后,你可能会问:为什么不使用更原生的 Python 迭代器?这是因为 XGBoost 要求迭代器提供 reset 函数,而借助 itertools.tee 来实现重置可能会带来显著的内存开销。

import cupy
import numpy

import xgboost

COLS = 64  # number of feature columns in every batch
ROWS_PER_BATCH = 1000  # data is split by rows
BATCHES = 32  # number of batches held by the demo iterator


class IterForDMatrixDemo(xgboost.core.DataIter):
    """A data iterator that feeds batches of data to an XGBoost DMatrix.

    `reset` and `next` are required for any data iterator, other functions here
    are utilities for demonstration purposes.

    """

    def __init__(self):
        """Generate some random data for demonstration.

        Actual data can be anything that is currently supported by XGBoost.
        """
        self.rows = ROWS_PER_BATCH
        self.cols = COLS
        rng = cupy.random.RandomState(1994)
        # Draw every batch separately. Replicating a one-element list
        # (``[arr] * BATCHES``) would only repeat references to a single
        # array, so all batches would be the very same object.
        self._data = [rng.randn(self.rows, self.cols) for _ in range(BATCHES)]
        self._labels = [rng.randn(self.rows) for _ in range(BATCHES)]
        self._weights = [rng.uniform(size=self.rows) for _ in range(BATCHES)]

        self.it = 0  # index of the batch that the next call to `next` hands out
        super().__init__()

    def as_array(self):
        """Return the features of all batches concatenated into one array."""
        return cupy.concatenate(self._data)

    def as_array_labels(self):
        """Return the labels of all batches concatenated into one array."""
        return cupy.concatenate(self._labels)

    def as_array_weights(self):
        """Return the weights of all batches concatenated into one array."""
        return cupy.concatenate(self._weights)

    def data(self):
        """Utility function for obtaining current batch of data."""
        return self._data[self.it]

    def labels(self):
        """Utility function for obtaining current batch of label."""
        return self._labels[self.it]

    def weights(self):
        """Utility function for obtaining current batch of weight."""
        return self._weights[self.it]

    def reset(self):
        """Reset the iterator so that iteration starts from the first batch."""
        self.it = 0

    def next(self, input_data):
        """Pass the current batch to `input_data` and advance the iterator.

        Parameters
        ----------
        input_data :
            Callback that receives the batch through the keyword arguments
            ``data``, ``label`` and ``weight``.

        Returns
        -------
        bool
            False when there is no more batch, True after a batch was passed
            to `input_data`.
        """
        if self.it == len(self._data):
            # Every batch has been consumed; signal the end of iteration.
            return False
        input_data(data=self.data(), label=self.labels(), weight=self.weights())
        self.it += 1
        return True


def main():
    """Train once from the batch iterator and once from a regular DMatrix.

    Both models use the same parameters and number of boosting rounds; their
    predictions on their own training matrices are compared at the end.
    """
    n_rounds = 100
    params = {"tree_method": "hist", "device": "cuda"}
    batch_iter = IterForDMatrixDemo()

    # Consuming an iterator requires `QuantileDMatrix`.
    #
    # The batches in this demo are cupy arrays, so the data processing step
    # (quantile sketching) runs on the GPU. Loading the data with CPU based
    # structures such as numpy or pandas moves that step to the CPU instead.
    dm_from_iter = xgboost.QuantileDMatrix(batch_iter)

    # Regular DMatrix built from all batches concatenated together.
    dm_full = xgboost.DMatrix(
        batch_iter.as_array(),
        batch_iter.as_array_labels(),
        weight=batch_iter.as_array_weights(),
    )

    # Both matrices must describe the same amount of data.
    assert dm_from_iter.num_col() == dm_full.num_col()
    assert dm_from_iter.num_row() == dm_full.num_row()

    # Tree method must be `hist`.
    booster_from_iter = xgboost.train(
        params, dm_from_iter, num_boost_round=n_rounds
    )
    preds_from_iter = booster_from_iter.predict(dm_from_iter)

    booster_full = xgboost.train(params, dm_full, num_boost_round=n_rounds)
    preds_full = booster_full.predict(dm_full)

    # NOTE(review): rtol=1e6 makes this comparison extremely permissive;
    # confirm whether a tight tolerance such as 1e-6 was intended.
    numpy.testing.assert_allclose(preds_from_iter, preds_full, rtol=1e6)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

由 Sphinx-Gallery 生成的图库