Speed comparison of gradient boosting libraries for computing SHAP values

Here we compare CatBoost, LightGBM and XGBoost on SHAP values computation. All boosting models were trained on GPU, but the SHAP evaluation itself was run on CPU.

We use the epsilon_normalized dataset from here.

[1]:
import copy
import datetime
import os

import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
import tqdm
import xgboost as xgb
from sklearn import datasets
[2]:
catboost.__version__, lgb.__version__, xgb.__version__
[2]:
('0.11.2', '2.2.2', '0.81')
[3]:
train_data, train_target = datasets.load_svmlight_file("epsilon_normalized")
test_data, test_target = datasets.load_svmlight_file(
    "epsilon_normalized.t",
)

Parameters

[4]:
num_iters = 1000
lr = 0.1
max_bin = 128
gpu_device = "0"  # specify your GPU (used only for training)
random_state = 0
[5]:
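# epsilon labels are {-1, +1}; map them to {0, 1} for binary logloss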
train_target[train_target == -1] = 0
test_target[test_target == -1] = 0
[6]:
def preprocess_data(data, label=None, mode="train", boosting=None):
    assert boosting is not None

    if boosting == "xgboost":
        return xgb.DMatrix(data, label)
    elif boosting == "lightgbm":
        if mode == "train":
            return lgb.Dataset(data, label)
        else:
            return data
    elif boosting == "catboost":
        data = catboost.FeaturesData(num_feature_data=data)
        return catboost.Pool(data, label)
    else:
        raise RuntimeError("Unknown boosting library")
[7]:
def create_parameters(base_params, boosting=None, **kwargs):
    assert boosting is not None
    assert isinstance(base_params, dict)

    params = copy.copy(base_params)
    if boosting == "xgboost":
        params["objective"] = "binary:logistic"
        params["max_depth"] = kwargs["depth"]
        params["tree_method"] = "gpu_hist"
        params["gpu_id"] = gpu_device
    elif boosting == "lightgbm":
        params["objective"] = "binary"
        params["device"] = "gpu"
        params["gpu_device_id"] = gpu_device
        params["num_leaves"] = 2 ** kwargs["depth"]
    elif boosting == "catboost":
        params["objective"] = "Logloss"
        params["task_type"] = "GPU"
        params["devices"] = gpu_device
        params["bootstrap_type"] = "Bernoulli"
        params["logging_level"] = "Silent"
    else:
        raise RuntimeError("Unknown boosting library")

    return params
[8]:
def train(data, params, num_iters, boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return xgb.train(params=params, dtrain=data, num_boost_round=num_iters)
    elif boosting == "lightgbm":
        return lgb.train(params=params, train_set=data, num_boost_round=num_iters)
    elif boosting == "catboost":
        return catboost.train(pool=data, params=params, num_boost_round=num_iters)
    else:
        raise RuntimeError("Unknown boosting library")
[9]:
def predict_shap(model, data, boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return model.predict(data, pred_contribs=True)
    elif boosting == "lightgbm":
        return model.predict(data, pred_contrib=True)
    elif boosting == "catboost":
        return model.get_feature_importance(data, fstr_type="ShapValues")
    else:
        raise RuntimeError("Unknown boosting library")
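All three calls return a matrix of shape (n_samples, n_features + 1): the last column is the bias (expected value) term, and each row sums to the raw, pre-sigmoid model output. A minimal self-contained sanity check with XGBoost on synthetic data (the `*_demo` names are illustrative and not part of the benchmark):

X_demo = np.random.RandomState(0).rand(100, 5).astype(np.float32)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
demo_bst = xgb.train(
    {"objective": "binary:logistic"}, xgb.DMatrix(X_demo, y_demo), num_boost_round=10
)
shap_demo = demo_bst.predict(xgb.DMatrix(X_demo), pred_contribs=True)    # (100, 6)
margin_demo = demo_bst.predict(xgb.DMatrix(X_demo), output_margin=True)  # log-odds
assert np.allclose(shap_demo.sum(axis=1), margin_demo, atol=1e-4)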
[10]:
def create_path(boosting, params):
    fname = [boosting]
    for key, value in sorted(params.items()):
        fname.append(str(key))
        fname.append(str(value))
    fname = "_".join(fname)
    fname = fname.replace(".", "")
    fname += ".model"
    return fname
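A quick illustration with hypothetical parameter values (not the benchmark configuration): the helper flattens the library name and parameters into a filesystem-safe file name.

create_path("xgboost", {"learning_rate": 0.1, "depth": 6})
# -> 'xgboost_depth_6_learning_rate_01.model'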
[11]:
def load_model(fname, boosting):
    if boosting == "xgboost":
        bst = xgb.Booster(model_file=fname)
    elif boosting == "lightgbm":
        bst = lgb.Booster(model_file=fname)
    elif boosting == "catboost":
        bst = catboost.CatBoost()
        bst.load_model(fname)
    else:
        raise RuntimeError("Unknown boosting")
    return bst
[12]:
base_params = {"learning_rate": lr, "max_bin": max_bin, "random_state": random_state}
[13]:
result = []

boosting_list = ["xgboost", "catboost", "lightgbm"]
depth_list = [2, 4, 6, 8, 10]
lens_list = [1000, 5000, 10000]


for gb_type in boosting_list:
    print(f"{gb_type} is going")

    for size_test in lens_list:
        print(f"size test {size_test}")
        sep_test_data = test_data[:size_test]
        sep_test_target = test_target[:size_test]

        # comment out the next line if all models have already been trained
        train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)

        dense_test = sep_test_data.toarray().astype(np.float32)

        for depth in tqdm.tqdm(depth_list):
            start_test_preproc = datetime.datetime.now()
            test_preprocessed = preprocess_data(
                dense_test, sep_test_target, mode="test", boosting=gb_type
            )

            finish_test_preproc = datetime.datetime.now()
            preprocessing_delta = finish_test_preproc - start_test_preproc
            preprocessing_delta = preprocessing_delta.total_seconds()

            params = create_parameters(base_params, boosting=gb_type, depth=depth)
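            # CatBoost reads its tree depth from "depth"; adding it here also makes the saved file name depth-specific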
            params["depth"] = depth
            fname = create_path(gb_type, params)
            if os.path.exists(fname):
                print("model exist")
                bst = load_model(fname, boosting=gb_type)
            else:
                print("model is training")
                start_train = datetime.datetime.now()
                bst = train(
                    train_preprocessed, params, num_iters=num_iters, boosting=gb_type
                )
                finish_train = datetime.datetime.now()
                delta_train = finish_train - start_train
                delta_train = int(delta_train.total_seconds() * 1000)
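                # training time is measured here but not included in the results table below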
                bst.save_model(fname)

            start_time = datetime.datetime.now()
            preds = predict_shap(bst, test_preprocessed, boosting=gb_type)
            assert preds.shape == (sep_test_data.shape[0], sep_test_data.shape[1] + 1)
            finish_time = datetime.datetime.now()

            delta = finish_time - start_time
            delta = delta.total_seconds()

            current_res = {
                "preprocessing_time": preprocessing_delta,
                "boosting": gb_type,
                "test_size": size_test,
                "depth": depth,
                "time": delta,
            }

            result.append(current_res)

        print("*" * 40)
[14]:
result_df = pd.DataFrame(result)
[16]:
result_df.to_csv(f"shap_benchmark_{max_bin}_max_bin_with_test_sizes.csv", index=False)
[17]:
result_df = pd.read_csv(
    "shap_benchmark_128_max_bin_with_test_sizes.csv",
)
result_df.pivot_table(index=["test_size", "depth"], columns="boosting", values="time")
[17]:
boosting               catboost     lightgbm      xgboost
test_size depth
1000      2            0.311027     0.090156     0.112515
          4            0.281931     0.578531     0.300671
          6            0.464603     4.159926     1.468442
          8            4.918599    23.844245     7.847191
          10          93.152000   119.527824    30.872254
5000      2            1.171963     0.284673     0.241316
          4            1.081119     2.094985     0.931881
          6            1.319114    20.624486     6.498283
          8            5.807985   118.552238    38.992395
          10          95.049909   601.251603   153.408904
10000     2            2.048301     0.621454     0.509722
          4            2.263058     4.291201     1.935541
          6            2.396371    42.788038    12.981580
          8            7.078056   240.614644    77.883250
          10          95.680684  1189.685032   306.529277
[18]:
result_df.pivot_table(
    index="test_size", columns="boosting", values="preprocessing_time"
)
[18]:
boosting    catboost  lightgbm   xgboost
test_size
1000        0.069569  0.002816  0.011025
5000        0.349831  0.000006  0.047836
10000       0.770179  0.000006  0.089032
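To visualise how the timings scale with tree depth, one can plot a slice of the first pivot table. A minimal sketch, assuming matplotlib is available (it is not imported in the benchmark above):

import matplotlib.pyplot as plt

timings = result_df.pivot_table(index=["test_size", "depth"], columns="boosting", values="time")
ax = timings.loc[10000].plot(logy=True, marker="o")  # fix test_size = 10000, vary depth
ax.set_xlabel("tree depth")
ax.set_ylabel("SHAP evaluation time, seconds (log scale)")
plt.show()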