Author: Khalid Salama
Date created: 2021/01/15
Last modified: 2021/01/15
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
This example provides an implementation of the Deep Neural Decision Forest model introduced by P. Kontschieder et al. for structured data classification. It demonstrates how to build a stochastic and differentiable decision tree model, train it end-to-end, and unify decision trees with deep representation learning.
This example uses the United States Census Income Dataset provided by the UC Irvine Machine Learning Repository. The task is binary classification: predict whether a person is likely to earn more than USD 50,000 a year.
The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features and 9 categorical features.
import keras
from keras import layers
from keras.layers import StringLookup
from keras import ops
from tensorflow import data as tf_data
import numpy as np
import pandas as pd
import math
CSV_HEADER = [
"age",
"workclass",
"fnlwgt",
"education",
"education_num",
"marital_status",
"occupation",
"relationship",
"race",
"gender",
"capital_gain",
"capital_loss",
"hours_per_week",
"native_country",
"income_bracket",
]
train_data_url = (
"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)
test_data_url = (
"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
print(f"训练数据集形状: {train_data.shape}")
print(f"测试数据集形状: {test_data.shape}")
训练数据集形状: (32561, 15)
测试数据集形状: (16282, 15)
We remove the first record (because it is not a valid data example) and the trailing 'dot' in the class labels.
test_data = test_data[1:]
test_data.income_bracket = test_data.income_bracket.apply(
lambda value: value.replace(".", "")
)
We store the training and test data splits locally as CSV files.
train_data_file = "train_data.csv"
test_data_file = "test_data.csv"
train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)
Here, we define the metadata of the dataset, which will be useful for reading, parsing, and encoding the input features.
# A list of the numerical feature names.
NUMERIC_FEATURE_NAMES = [
"age",
"education_num",
"capital_gain",
"capital_loss",
"hours_per_week",
]
# A dictionary of the categorical features and their vocabulary.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
"workclass": sorted(list(train_data["workclass"].unique())),
"education": sorted(list(train_data["education"].unique())),
"marital_status": sorted(list(train_data["marital_status"].unique())),
"occupation": sorted(list(train_data["occupation"].unique())),
"relationship": sorted(list(train_data["relationship"].unique())),
"race": sorted(list(train_data["race"].unique())),
"gender": sorted(list(train_data["gender"].unique())),
"native_country": sorted(list(train_data["native_country"].unique())),
}
# A list of the columns to ignore from the dataset.
IGNORE_COLUMN_NAMES = ["fnlwgt"]
# A list of the categorical feature names.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
# A list of all the input features.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
# A list of column default values for each feature.
COLUMN_DEFAULTS = [
[0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
for feature_name in CSV_HEADER
]
# The name of the target feature.
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target feature.
TARGET_LABELS = [" <=50K", " >50K"]
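As a quick sanity check (this snippet is our addition, not part of the original example), we can print each CSV column next to its parsing default: numeric columns, including the ignored fnlwgt, default to 0.0, while string columns default to "NA".
# Sanity check (illustrative only, not part of the original example):
# print each CSV column alongside its parsing default.
for column_name, column_default in zip(CSV_HEADER, COLUMN_DEFAULTS):
    print(f"{column_name}: {column_default}")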
We create an input function to read and parse the files, and to convert the features and labels into a tf_data.Dataset for training and validation. We also preprocess the input by mapping the target label to an index.
target_label_lookup = StringLookup(
vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)
lookup_dict = {}
for feature_name in CATEGORICAL_FEATURE_NAMES:
vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
    # Create a lookup to convert string values to integer indices.
    # Since we are not using a mask token, nor expecting any out-of-vocabulary
    # (oov) tokens, we set mask_token to None and num_oov_indices to 0.
lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
lookup_dict[feature_name] = lookup
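As a quick illustration (our addition, not part of the original example), the lookups map raw label and feature strings to their vocabulary indices:
# Illustration (not part of the original example): the lookups map raw
# strings to integer indices in their respective vocabularies.
print(target_label_lookup([" <=50K", " >50K"]))  # -> [0, 1]
print(lookup_dict["gender"](CATEGORICAL_FEATURES_WITH_VOCABULARY["gender"]))  # -> [0, 1]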
def encode_categorical(batch_x, batch_y):
for feature_name in CATEGORICAL_FEATURE_NAMES:
batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])
return batch_x, batch_y
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
dataset = (
tf_data.experimental.make_csv_dataset(
csv_file_path,
batch_size=batch_size,
column_names=CSV_HEADER,
column_defaults=COLUMN_DEFAULTS,
label_name=TARGET_FEATURE_NAME,
num_epochs=1,
header=False,
na_value="?",
shuffle=shuffle,
)
.map(lambda features, target: (features, target_label_lookup(target)))
.map(encode_categorical)
)
return dataset.cache()
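Before building the model, a quick peek at a single batch (our addition, not part of the original example) confirms that the categorical features arrive as integer indices and the labels as class indices; note that .numpy() assumes the TensorFlow backend used by tf_data here.
# Peek at one small batch (illustrative only, not part of the original example).
sample_dataset = get_dataset_from_csv(train_data_file, batch_size=3)
for batch_features, batch_labels in sample_dataset.take(1):
    for feature_name, values in batch_features.items():
        print(feature_name, values.numpy())
    print("labels:", batch_labels.numpy())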
def create_model_inputs():
inputs = {}
for feature_name in FEATURE_NAMES:
if feature_name in NUMERIC_FEATURE_NAMES:
inputs[feature_name] = layers.Input(
name=feature_name, shape=(), dtype="float32"
)
else:
inputs[feature_name] = layers.Input(
name=feature_name, shape=(), dtype="int32"
)
return inputs
def encode_inputs(inputs):
encoded_features = []
for feature_name in inputs:
if feature_name in CATEGORICAL_FEATURE_NAMES:
vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Reuse the lookup created earlier for this feature. It converts
            # string values to integer indices; since we use no mask token and
            # expect no out-of-vocabulary (oov) tokens, mask_token is None and
            # num_oov_indices is 0.
            lookup = lookup_dict[feature_name]
            value_index = inputs[feature_name]
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
            # Create an embedding layer with the specified dimensions.
embedding = layers.Embedding(
input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
)
            # Convert the index values to embedding representations.
encoded_feature = embedding(value_index)
else:
            # Use the numerical features as-is.
encoded_feature = inputs[feature_name]
if inputs[feature_name].shape[-1] is None:
encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
encoded_features.append(encoded_feature)
encoded_features = layers.concatenate(encoded_features)
return encoded_features
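As a small sketch (our addition, not part of the original example), wiring fresh symbolic inputs through encode_inputs shows the width of the flat feature vector that the trees will consume: the 5 numeric columns plus the embedding dimensions of the 8 categorical features.
# Sketch (illustrative only): build the encoding graph once to inspect the
# concatenated feature width.
demo_inputs = create_model_inputs()
demo_features = encode_inputs(demo_inputs)
print(demo_features.shape)  # (None, total_feature_width)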
A neural decision tree model has two sets of weights to learn. The first set is pi, which represents the probability distribution of the classes in the tree leaves. The second set is the weights of the routing layer decision_fn, which represents the probability of going to each leaf. The forward pass of the model works as follows:
1. The model expects input features as a single vector encoding all the features of an instance in the batch. This vector can be generated from a Convolutional Neural Network (CNN) applied to images, or from dense transformations applied to structured data features.
2. The model first applies a used_features_mask to randomly select a subset of the input features to use.
3. Then, the model computes the probabilities (mu) for the input instances to reach the tree leaves by iteratively performing a stochastic routing throughout the tree levels.
4. Finally, the probabilities of reaching the leaves are combined with the class probabilities at the leaves to produce the final outputs.
class NeuralDecisionTree(keras.Model):
def __init__(self, depth, num_features, used_features_rate, num_classes):
super().__init__()
self.depth = depth
self.num_leaves = 2**depth
self.num_classes = num_classes
        # Create a mask for the randomly selected features.
num_used_features = int(num_features * used_features_rate)
one_hot = np.eye(num_features)
sampled_feature_indices = np.random.choice(
np.arange(num_features), num_used_features, replace=False
)
self.used_features_mask = ops.convert_to_tensor(
one_hot[sampled_feature_indices], dtype="float32"
)
        # Initialize the weights of the classes in the leaves.
self.pi = self.add_weight(
initializer="random_normal",
shape=[self.num_leaves, self.num_classes],
dtype="float32",
trainable=True,
)
        # Initialize the stochastic routing layer.
self.decision_fn = layers.Dense(
units=self.num_leaves, activation="sigmoid", name="decision"
)
def call(self, features):
batch_size = ops.shape(features)[0]
        # Apply the feature mask to the input features.
features = ops.matmul(
features, ops.transpose(self.used_features_mask)
) # [batch_size, num_used_features]
        # Compute the routing probabilities.
decisions = ops.expand_dims(
self.decision_fn(features), axis=2
) # [batch_size, num_leaves, 1]
        # Concatenate the routing probabilities with their complements.
decisions = layers.concatenate(
[decisions, 1 - decisions], axis=2
) # [batch_size, num_leaves, 2]
mu = ops.ones([batch_size, 1, 1])
begin_idx = 1
end_idx = 2
        # Traverse the tree in a breadth-first order.
for level in range(self.depth):
mu = ops.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]
mu = ops.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]
level_decisions = decisions[
:, begin_idx:end_idx, :
] # [batch_size, 2 ** level, 2]
mu = mu * level_decisions # [batch_size, 2**level, 2]
begin_idx = end_idx
end_idx = begin_idx + 2 ** (level + 1)
mu = ops.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]
probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]
outputs = ops.matmul(mu, probabilities) # [batch_size, num_classes]
return outputs
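A minimal smoke test (our addition, not part of the original example) checks the routing math on a toy tree: because the leaf-reach probabilities mu sum to one across leaves, and each leaf distribution softmax(pi) sums to one across classes, every output row is a valid class distribution.
# Smoke test (illustrative only): a depth-2 tree over 8 features maps a batch
# of 4 instances to a [4, 2] matrix of class probabilities.
toy_tree = NeuralDecisionTree(depth=2, num_features=8, used_features_rate=0.5, num_classes=2)
toy_features = ops.convert_to_tensor(np.random.rand(4, 8).astype("float32"))
toy_outputs = toy_tree(toy_features)
print(ops.shape(toy_outputs))  # (4, 2)
print(ops.sum(toy_outputs, axis=1))  # each row sums to ~1.0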
The neural decision forest model consists of a set of neural decision trees that are trained simultaneously. The output of the forest model is the average output of its trees.
class NeuralDecisionForest(keras.Model):
def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
super().__init__()
        self.num_classes = num_classes  # stored so call() does not rely on a global
        self.ensemble = []
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features.
for _ in range(num_trees):
self.ensemble.append(
NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
)
def call(self, inputs):
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = ops.shape(inputs)[0]
        outputs = ops.zeros([batch_size, self.num_classes])
        # Aggregate the outputs of trees in the ensemble.
for tree in self.ensemble:
outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
outputs /= len(self.ensemble)
return outputs
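The same kind of smoke test (our addition, not part of the original example) applies to a small forest; averaging preserves the property that each output row is a valid class distribution.
# Smoke test (illustrative only): a forest of 3 depth-2 trees over 8 features.
toy_forest = NeuralDecisionForest(
    num_trees=3, depth=2, num_features=8, used_features_rate=0.5, num_classes=2
)
print(ops.shape(toy_forest(ops.convert_to_tensor(np.random.rand(4, 8).astype("float32")))))  # (4, 2)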
Finally, let's set up the code that will train and evaluate the model.
learning_rate = 0.01
batch_size = 265
num_epochs = 10
def run_experiment(model):
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
loss=keras.losses.SparseCategoricalCrossentropy(),
metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
print("开始训练模型...")
train_dataset = get_dataset_from_csv(
train_data_file, shuffle=True, batch_size=batch_size
)
model.fit(train_dataset, epochs=num_epochs)
print("模型训练完成")
print("正在对测试数据评估模型...")
test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)
_, accuracy = model.evaluate(test_dataset)
print(f"测试准确率: {round(accuracy * 100, 2)}%")
In this experiment, we train a single neural decision tree model where we use all input features.
num_trees = 10
depth = 10
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)
def create_tree_model():
inputs = create_model_inputs()
features = encode_inputs(inputs)
features = layers.BatchNormalization()(features)
num_features = features.shape[1]
tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
outputs = tree(features)
model = keras.Model(inputs=inputs, outputs=outputs)
return model
tree_model = create_tree_model()
run_experiment(tree_model)
Start training the model...
Epoch 1/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 5s 26ms/step - loss: 0.5308 - sparse_categorical_accuracy: 0.8150
Epoch 2/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3476 - sparse_categorical_accuracy: 0.8429
Epoch 3/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3312 - sparse_categorical_accuracy: 0.8478
Epoch 4/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3247 - sparse_categorical_accuracy: 0.8495
Epoch 5/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3202 - sparse_categorical_accuracy: 0.8512
Epoch 6/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3158 - sparse_categorical_accuracy: 0.8536
Epoch 7/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3116 - sparse_categorical_accuracy: 0.8572
Epoch 8/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3071 - sparse_categorical_accuracy: 0.8608
Epoch 9/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3026 - sparse_categorical_accuracy: 0.8630
Epoch 10/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.2975 - sparse_categorical_accuracy: 0.8653
Model training finished
Evaluating the model on the test data...
62/62 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 0.3279 - sparse_categorical_accuracy: 0.8463
Test accuracy: 85.08%
In this experiment, we train a neural decision forest with num_trees trees, where each tree uses a randomly selected 50% of the input features. You can control the number of features used in each tree by setting the used_features_rate variable. Moreover, we set the depth to 5 instead of 10 compared to the previous experiment.
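To make the feature subsampling concrete (our addition, not part of the original example), each tree's used_features_mask is a random row slice of an identity matrix, so multiplying the features by its transpose keeps only the sampled columns:
# Illustration (not part of the original example): with 8 features and
# used_features_rate = 0.5, the mask keeps 4 randomly chosen columns.
demo_mask = np.eye(8)[np.random.choice(8, 4, replace=False)]
print(demo_mask.shape)  # (4, 8)
print(np.matmul(np.random.rand(2, 8), demo_mask.T).shape)  # (2, 4)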
num_trees = 25
depth = 5
used_features_rate = 0.5
def create_forest_model():
inputs = create_model_inputs()
features = encode_inputs(inputs)
features = layers.BatchNormalization()(features)
num_features = features.shape[1]
forest_model = NeuralDecisionForest(
num_trees, depth, num_features, used_features_rate, num_classes
)
outputs = forest_model(features)
model = keras.Model(inputs=inputs, outputs=outputs)
return model
forest_model = create_forest_model()
run_experiment(forest_model)
Start training the model...
Epoch 1/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 47s 202ms/step - loss: 0.5469 - sparse_categorical_accuracy: 0.7915
Epoch 2/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3459 - sparse_categorical_accuracy: 0.8494
Epoch 3/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3268 - sparse_categorical_accuracy: 0.8523
Epoch 4/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3195 - sparse_categorical_accuracy: 0.8524
Epoch 5/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3149 - sparse_categorical_accuracy: 0.8539
Epoch 6/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3112 - sparse_categorical_accuracy: 0.8556
Epoch 7/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3079 - sparse_categorical_accuracy: 0.8566
Epoch 8/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.3050 - sparse_categorical_accuracy: 0.8582
Epoch 9/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.3021 - sparse_categorical_accuracy: 0.8595
Epoch 10/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.2992 - sparse_categorical_accuracy: 0.8617
Model training finished
Evaluating the model on the test data...
62/62 ━━━━━━━━━━━━━━━━━━━━ 5s 39ms/step - loss: 0.3145 - sparse_categorical_accuracy: 0.8503
Test accuracy: 85.55%