SHAP values computation speed comparison of gradient boosting libraries
Here we compare how fast CatBoost, LightGBM, and XGBoost compute SHAP values. All boosting models are trained on GPU, while SHAP evaluation runs on CPU.
We use the epsilon_normalized dataset from here.
[1]:
import copy
import datetime
import os
import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
import tqdm
import xgboost as xgb
from sklearn import datasets
[2]:
catboost.__version__, lgb.__version__, xgb.__version__
[2]:
('0.11.2', '2.2.2', '0.81')
[3]:
train_data, train_target = datasets.load_svmlight_file("epsilon_normalized")
test_data, test_target = datasets.load_svmlight_file(
"epsilon_normalized.t",
)
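As an optional sanity check (not part of the original benchmark), the epsilon data should load as a sparse matrix with 2,000 features:
[ ]:
# Optional sanity check (illustrative, not in the original benchmark):
# epsilon_normalized is expected to have 2,000 dense features.
print(train_data.shape, test_data.shape)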
Parameters
[4]:
num_iters = 1000
lr = 0.1
max_bin = 128
gpu_device = "0" # specify your GPU (used only for training)
random_state = 0
[5]:
train_target[train_target == -1] = 0
test_target[test_target == -1] = 0
[6]:
def preprocess_data(data, label=None, mode="train", boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return xgb.DMatrix(data, label)
    elif boosting == "lightgbm":
        # LightGBM predicts directly on the raw matrix,
        # so test data needs no wrapping.
        if mode == "train":
            return lgb.Dataset(data, label)
        else:
            return data
    elif boosting == "catboost":
        # FeaturesData requires a dense float32 ndarray,
        # so densify sparse input first.
        if not isinstance(data, np.ndarray):
            data = data.toarray().astype(np.float32)
        data = catboost.FeaturesData(num_feature_data=data)
        return catboost.Pool(data, label)
    else:
        raise RuntimeError("Unknown boosting library")
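For illustration, here is how the same dense batch is wrapped for each library (a sketch only; this cell is not part of the benchmark):
[ ]:
# Illustrative only: wrap one dense batch for each library and
# inspect the container type each one expects.
batch = test_data[:100].toarray().astype(np.float32)
batch_target = test_target[:100]
for lib in ("xgboost", "lightgbm", "catboost"):
    wrapped = preprocess_data(batch, batch_target, mode="test", boosting=lib)
    print(lib, type(wrapped))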
[7]:
def create_parameters(base_params, boosting=None, **kwargs):
assert boosting is not None
assert isinstance(base_params, dict)
params = copy.copy(base_params)
if boosting == "xgboost":
params["objective"] = "binary:logistic"
params["max_depth"] = kwargs["depth"]
params["tree_method"] = "gpu_hist"
params["gpu_id"] = gpu_device
elif boosting == "lightgbm":
params["objective"] = "binary"
params["device"] = "gpu"
params["gpu_device_id"] = gpu_device
params["num_leaves"] = 2 ** kwargs["depth"]
elif boosting == "catboost":
params["objective"] = "Logloss"
params["task_type"] = "GPU"
params["devices"] = gpu_device
params["bootstrap_type"] = "Bernoulli"
params["logging_level"] = "Silent"
else:
raise RuntimeError("Unknown boosting library")
return params
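For example, the configuration built for LightGBM at depth 6 looks like this (illustrative; the benchmark below uses base_params defined further down):
[ ]:
# Illustrative only: inspect the per-library configuration.
# depth is translated to num_leaves = 2 ** depth for LightGBM.
create_parameters({"learning_rate": 0.1, "max_bin": 128}, boosting="lightgbm", depth=6)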
[8]:
def train(data, params, num_iters, boosting=None):
assert boosting is not None
if boosting == "xgboost":
return xgb.train(params=params, dtrain=data, num_boost_round=num_iters)
elif boosting == "lightgbm":
return lgb.train(params=params, train_set=data, num_boost_round=num_iters)
elif boosting == "catboost":
return catboost.train(pool=data, params=params, num_boost_round=num_iters)
else:
raise RuntimeError("Unknown boosting library")
[9]:
def predict_shap(model, data, boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return model.predict(data, pred_contribs=True)
    elif boosting == "lightgbm":
        return model.predict(data, pred_contrib=True)
    elif boosting == "catboost":
        return model.get_feature_importance(data, fstr_type="ShapValues")
    else:
        raise RuntimeError("Unknown boosting library")
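In all three libraries the returned matrix has one column per feature plus a final bias column, and each row sums to the model's raw (margin) prediction. A minimal end-to-end sketch on synthetic data (CPU defaults rather than the GPU settings above; the toy_ names are illustrative):
[ ]:
# Minimal end-to-end sketch on synthetic data (illustrative only).
rng = np.random.RandomState(0)
toy_X = rng.rand(200, 10).astype(np.float32)
toy_y = (toy_X[:, 0] > 0.5).astype(np.float64)
toy_pool = preprocess_data(toy_X, toy_y, boosting="catboost")
toy_params = {"objective": "Logloss", "depth": 2, "logging_level": "Silent"}
toy_model = train(toy_pool, toy_params, num_iters=10, boosting="catboost")
toy_shap = predict_shap(toy_model, toy_pool, boosting="catboost")
print(toy_shap.shape)  # (200, 11): 10 features + a bias column
print(toy_shap.sum(axis=1)[:3])  # row sums equal raw margin predictions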
[10]:
def create_path(boosting, params):
fname = [boosting]
for key, value in sorted(params.items()):
fname.append(str(key))
fname.append(str(value))
fname = "_".join(fname)
fname = fname.replace(".", "")
fname += ".model"
return fname
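For instance, dots are stripped so the name stays filesystem-friendly (illustrative call with a reduced params dict):
[ ]:
create_path("catboost", {"depth": 6, "learning_rate": 0.1})
# -> 'catboost_depth_6_learning_rate_01.model'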
[11]:
def load_model(fname, boosting):
    if boosting == "xgboost":
        # passing model_file to the constructor already loads the model
        bst = xgb.Booster(model_file=fname)
    elif boosting == "lightgbm":
        bst = lgb.Booster(model_file=fname)
    elif boosting == "catboost":
        bst = catboost.CatBoost()
        bst.load_model(fname)
    else:
        raise RuntimeError("Unknown boosting library")
    return bst
[12]:
base_params = {"learning_rate": lr, "max_bin": max_bin, "random_state": random_state}
[13]:
result = []
boosting_list = ["xgboost", "catboost", "lightgbm"]
depth_list = [2, 4, 6, 8, 10]
lens_list = [1000, 5000, 10000]
for gb_type in boosting_list:
print(f"{gb_type} is going")
for size_test in lens_list:
print(f"size test {size_test}")
sep_test_data = test_data[:size_test]
sep_test_target = test_target[:size_test]
        # comment the following line out if you have already trained all the models
        train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)
        dense_test = sep_test_data.toarray().astype(np.float32)
for depth in tqdm.tqdm(depth_list):
start_test_preproc = datetime.datetime.now()
test_preprocessed = preprocess_data(
dense_test, sep_test_target, mode="test", boosting=gb_type
)
finish_test_preproc = datetime.datetime.now()
preprocessing_delta = finish_test_preproc - start_test_preproc
preprocessing_delta = preprocessing_delta.total_seconds()
params = create_parameters(base_params, boosting=gb_type, depth=depth)
params["depth"] = depth
fname = create_path(gb_type, params)
            if os.path.exists(fname):
                print("model exists, loading it")
                bst = load_model(fname, boosting=gb_type)
            else:
                print("training model")
start_train = datetime.datetime.now()
bst = train(
train_preprocessed, params, num_iters=num_iters, boosting=gb_type
)
finish_train = datetime.datetime.now()
delta_train = finish_train - start_train
delta_train = int(delta_train.total_seconds() * 1000)
bst.save_model(fname)
start_time = datetime.datetime.now()
preds = predict_shap(bst, test_preprocessed, boosting=gb_type)
assert preds.shape == (sep_test_data.shape[0], sep_test_data.shape[1] + 1)
finish_time = datetime.datetime.now()
delta = finish_time - start_time
delta = delta.total_seconds()
current_res = {
"preprocessing_time": preprocessing_delta,
"boosting": gb_type,
"test_size": size_test,
"depth": depth,
"time": delta,
}
result.append(current_res)
print("*" * 40)
[14]:
result_df = pd.DataFrame(result)
[16]:
result_df.to_csv(f"shap_benchmark_{max_bin}_max_bin_with_test_sizes.csv", index=False)
[17]:
result_df = pd.read_csv(
"shap_benchmark_128_max_bin_with_test_sizes.csv",
)
result_df.pivot_table(index=["test_size", "depth"], columns="boosting", values="time")
[17]:
SHAP evaluation time in seconds:

| test_size | depth | catboost | lightgbm | xgboost |
|---|---|---|---|---|
| 1000 | 2 | 0.311027 | 0.090156 | 0.112515 |
| 1000 | 4 | 0.281931 | 0.578531 | 0.300671 |
| 1000 | 6 | 0.464603 | 4.159926 | 1.468442 |
| 1000 | 8 | 4.918599 | 23.844245 | 7.847191 |
| 1000 | 10 | 93.152000 | 119.527824 | 30.872254 |
| 5000 | 2 | 1.171963 | 0.284673 | 0.241316 |
| 5000 | 4 | 1.081119 | 2.094985 | 0.931881 |
| 5000 | 6 | 1.319114 | 20.624486 | 6.498283 |
| 5000 | 8 | 5.807985 | 118.552238 | 38.992395 |
| 5000 | 10 | 95.049909 | 601.251603 | 153.408904 |
| 10000 | 2 | 2.048301 | 0.621454 | 0.509722 |
| 10000 | 4 | 2.263058 | 4.291201 | 1.935541 |
| 10000 | 6 | 2.396371 | 42.788038 | 12.981580 |
| 10000 | 8 | 7.078056 | 240.614644 | 77.883250 |
| 10000 | 10 | 95.680684 | 1189.685032 | 306.529277 |
[18]:
result_df.pivot_table(
index="test_size", columns="boosting", values="preprocessing_time"
)
[18]:
Test preprocessing time in seconds:

| test_size | catboost | lightgbm | xgboost |
|---|---|---|---|
| 1000 | 0.069569 | 0.002816 | 0.011025 |
| 5000 | 0.349831 | 0.000006 | 0.047836 |
| 10000 | 0.770179 | 0.000006 | 0.089032 |
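LightGBM's preprocessing is effectively free here because preprocess_data returns the raw test matrix unchanged in test mode, while CatBoost pays for building FeaturesData and a Pool. To visualize how SHAP time scales with depth, a simple plot could look like this (a sketch, assuming matplotlib is available; not part of the original benchmark):
[ ]:
# Sketch: SHAP evaluation time vs. tree depth for each library,
# on the largest test size, log scale on the time axis.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
subset = result_df[result_df["test_size"] == 10000]
for lib, group in subset.groupby("boosting"):
    ax.plot(group["depth"], group["time"], marker="o", label=lib)
ax.set_xlabel("tree depth")
ax.set_ylabel("SHAP time, seconds")
ax.set_yscale("log")
ax.legend()
plt.show()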