备注
前往结尾 下载完整示例代码。
使用 Quantile DMatrix 的数据迭代器演示
Added in version 1.2.0.
这个演示定义了一个自定义的迭代器,用于将数据批次传递给 xgboost.QuantileDMatrix
并使用这个 QuantileDMatrix
进行训练。该功能主要设计用于减少分布式环境中训练所需的GPU内存。
看完演示后,人们可能会问:为什么不使用更原生的 Python 迭代器?这是因为 XGBoost 要求迭代器提供 reset 函数,而根据 Python 官方文档对 itertools.tee 的说明(https://docs.python.org/3/library/itertools.html#itertools.tee),用它来实现重置可能会导致显著的内存占用。
import cupy
import numpy
import xgboost
# Number of feature columns in every generated batch.
COLS = 64
# Number of rows in each batch; the data is split by rows.
ROWS_PER_BATCH = 1000
# Number of batches served by the demo iterator.
BATCHES = 32
class IterForDMatrixDemo(xgboost.core.DataIter):
    """Batch-wise data iterator used to build an XGBoost DMatrix.

    ``reset`` and ``next`` must be implemented by every data iterator; the
    remaining methods are helpers that exist only for this demonstration.
    """

    def __init__(self):
        """Create the random demonstration data.

        Real applications may provide any input type that XGBoost currently
        supports.
        """
        self.rows = ROWS_PER_BATCH
        self.cols = COLS

        generator = cupy.random.RandomState(1994)
        # Draw in the order data -> labels -> weights so the seeded stream
        # produces the same values on every run.
        batch_data = generator.randn(self.rows, self.cols)
        batch_labels = generator.randn(self.rows)
        batch_weights = generator.uniform(size=self.rows)

        # Every list holds BATCHES references to one and the same array, so
        # all batches of the demo are identical.
        self._data = [batch_data for _ in range(BATCHES)]
        self._labels = [batch_labels for _ in range(BATCHES)]
        self._weights = [batch_weights for _ in range(BATCHES)]

        self.it = 0  # index of the batch handed out next
        super().__init__()

    def as_array(self):
        """Return all data batches concatenated into a single array."""
        return cupy.concatenate(self._data)

    def as_array_labels(self):
        """Return all label batches concatenated into a single array."""
        return cupy.concatenate(self._labels)

    def as_array_weights(self):
        """Return all weight batches concatenated into a single array."""
        return cupy.concatenate(self._weights)

    def data(self):
        """Return the data of the batch the iterator currently points at."""
        current = self.it
        return self._data[current]

    def labels(self):
        """Return the labels of the batch the iterator currently points at."""
        current = self.it
        return self._labels[current]

    def weights(self):
        """Return the weights of the batch the iterator currently points at."""
        current = self.it
        return self._weights[current]

    def reset(self):
        """Rewind the iterator to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Pass the current batch to ``input_data`` and advance by one.

        Returns 1 after a batch has been handed over and 0 once every batch
        has been consumed.
        """
        if self.it != len(self._data):
            input_data(data=self.data(), label=self.labels(), weight=self.weights())
            self.it += 1
            return 1
        # No batch left: signal the end of the iteration.
        return 0
def main():
    """Train once from the batch iterator and once from in-memory arrays.

    Builds a ``QuantileDMatrix`` from the iterator and a regular ``DMatrix``
    from the concatenated batches, checks that both have the same shape,
    trains a model on each and compares the predictions.
    """
    num_rounds = 100
    batches = IterForDMatrixDemo()

    # An iterator can only be consumed by `QuantileDMatrix`.
    # The batches here are cupy arrays, so the data processing (quantile
    # sketching) runs on the GPU. With CPU based containers such as numpy or
    # pandas, that step is performed on the CPU instead.
    quantile_matrix = xgboost.QuantileDMatrix(batches)

    # Reference: a regular DMatrix built from the concatenated batches.
    regular_matrix = xgboost.DMatrix(
        batches.as_array(),
        batches.as_array_labels(),
        weight=batches.as_array_weights(),
    )

    assert quantile_matrix.num_col() == regular_matrix.num_col()
    assert quantile_matrix.num_row() == regular_matrix.num_row()

    # Tree method must be `hist`.
    train_params = {"tree_method": "hist", "device": "cuda"}

    quantile_booster = xgboost.train(
        train_params, quantile_matrix, num_boost_round=num_rounds
    )
    quantile_predictions = quantile_booster.predict(quantile_matrix)

    regular_booster = xgboost.train(
        train_params, regular_matrix, num_boost_round=num_rounds
    )
    regular_predictions = regular_booster.predict(regular_matrix)

    # NOTE(review): rtol=1e6 allows a relative error of one million, so this
    # comparison is nearly vacuous. A tight tolerance such as 1e-6 may have
    # been intended -- confirm that both training paths agree that closely
    # before tightening it.
    numpy.testing.assert_allclose(
        quantile_predictions, regular_predictions, rtol=1e6
    )
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    main()