
Image classification with Vision Transformer

Author: Khalid Salama
Date created: 2021/01/18
Last modified: 2021/01/18
Description: Implementing the Vision Transformer (ViT) model for image classification.



Introduction

This example implements the Vision Transformer (ViT) model by Alexey Dosovitskiy et al. for image classification, and demonstrates it on the CIFAR-100 dataset. The ViT model applies the Transformer architecture with self-attention to sequences of image patches, without using convolution layers.


Setup

import os

os.environ["KERAS_BACKEND"] = "jax"  # @param ["tensorflow", "jax", "torch"]

import keras
from keras import layers
from keras import ops

import numpy as np
import matplotlib.pyplot as plt

Prepare the data

num_classes = 100
input_shape = (32, 32, 3)

(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()

print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")
x_train shape: (50000, 32, 32, 3) - y_train shape: (50000, 1)
x_test shape: (10000, 32, 32, 3) - y_test shape: (10000, 1)

Configure the hyperparameters

learning_rate = 0.001
weight_decay = 0.0001
batch_size = 256
num_epochs = 10  # For real training, use num_epochs=100. 10 is a test value
image_size = 72  # We'll resize input images to this size
patch_size = 6  # Size of the patches to be extracted from the input images
num_patches = (image_size // patch_size) ** 2
projection_dim = 64
num_heads = 4
transformer_units = [
    projection_dim * 2,
    projection_dim,
]  # Size of the transformer layers
transformer_layers = 8
mlp_head_units = [
    2048,
    1024,
]  # Size of the dense layers of the final classifier
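
As a quick sanity check (a small sketch added here, not part of the original example; the helper names are illustrative only), these hyperparameters imply 144 patches per image, each holding 108 values, which matches the patch-visualization output further below:

num_patches_per_side = image_size // patch_size  # 72 // 6 = 12
assert num_patches == num_patches_per_side**2  # 144 patches per image
elements_per_patch = patch_size * patch_size * 3  # 6 * 6 * 3 = 108 values per RGB patch
print(num_patches, elements_per_patch)  # 144 108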

Use data augmentation

data_augmentation = keras.Sequential(
    [
        layers.Normalization(),
        layers.Resizing(image_size, image_size),
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(factor=0.02),
        layers.RandomZoom(height_factor=0.2, width_factor=0.2),
    ],
    name="data_augmentation",
)
# Compute the mean and the variance of the training data for normalization.
data_augmentation.layers[0].adapt(x_train)
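
To illustrate the pipeline (a small usage sketch, not part of the original example; sample_batch and augmented_batch are illustrative names), applying it to a few training images normalizes them and resizes them to image_size:

sample_batch = x_train[:8]
augmented_batch = data_augmentation(sample_batch)
print(augmented_batch.shape)  # expected: (8, 72, 72, 3) after resizing to image_size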

Implement multilayer perceptron (MLP)

def mlp(x, hidden_units, dropout_rate):
    for units in hidden_units:
        x = layers.Dense(units, activation=keras.activations.gelu)(x)
        x = layers.Dropout(dropout_rate)(x)
    return x
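
For clarity (a small sketch, not part of the original example; the demo_* names are illustrative), the helper simply stacks Dense(GELU) + Dropout pairs, so the last entry of hidden_units sets the output width:

demo_inputs = keras.Input(shape=(16,))
demo_outputs = mlp(demo_inputs, hidden_units=[32, 8], dropout_rate=0.1)
print(demo_outputs.shape)  # (None, 8)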

Implement patch creation as a layer

class Patches(layers.Layer):
    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size

    def call(self, images):
        input_shape = ops.shape(images)
        batch_size = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        channels = input_shape[3]
        num_patches_h = height // self.patch_size
        num_patches_w = width // self.patch_size
        patches = keras.ops.image.extract_patches(images, size=self.patch_size)
        patches = ops.reshape(
            patches,
            (
                batch_size,
                num_patches_h * num_patches_w,
                self.patch_size * self.patch_size * channels,
            ),
        )
        return patches

    def get_config(self):
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

Let's display patches for a sample image

plt.figure(figsize=(4, 4))
image = x_train[np.random.choice(range(x_train.shape[0]))]
plt.imshow(image.astype("uint8"))
plt.axis("off")

resized_image = ops.image.resize(
    ops.convert_to_tensor([image]), size=(image_size, image_size)
)
patches = Patches(patch_size)(resized_image)
print(f"Image size: {image_size} X {image_size}")
print(f"Patch size: {patch_size} X {patch_size}")
print(f"Patches per image: {patches.shape[1]}")
print(f"Elements per patch: {patches.shape[-1]}")

n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(4, 4))
for i, patch in enumerate(patches[0]):
    ax = plt.subplot(n, n, i + 1)
    patch_img = ops.reshape(patch, (patch_size, patch_size, 3))
    plt.imshow(ops.convert_to_numpy(patch_img).astype("uint8"))
    plt.axis("off")

Image size: 72 X 72
Patch size: 6 X 6
Patches per image: 144
Elements per patch: 108



Implement the patch encoding layer

The PatchEncoder layer linearly transforms a patch by projecting it into a vector of size projection_dim. In addition, it adds a learnable position embedding to the projected vector.

class PatchEncoder(layers.Layer):
    def __init__(self, num_patches, projection_dim):
        super().__init__()
        self.num_patches = num_patches
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        positions = ops.expand_dims(
            ops.arange(start=0, stop=self.num_patches, step=1), axis=0
        )
        projected_patches = self.projection(patch)
        encoded = projected_patches + self.position_embedding(positions)
        return encoded

    def get_config(self):
        config = super().get_config()
        config.update({"num_patches": self.num_patches})
        return config
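
Continuing the sample-image demo above (a small sketch, not part of the original example; encoded is an illustrative name), encoding the extracted patches yields a (batch, num_patches, projection_dim) tensor:

encoded = PatchEncoder(num_patches, projection_dim)(patches)
print(encoded.shape)  # expected: (1, 144, 64)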

Build the ViT model

The ViT model consists of multiple Transformer blocks, which use the layers.MultiHeadAttention layer as a self-attention mechanism applied to the sequence of patches. The Transformer blocks produce a [batch_size, num_patches, projection_dim] tensor, which is processed by a classifier head with softmax to produce the final class probability outputs.

Unlike the technique described in the paper, which prepends a learnable embedding to the sequence of encoded patches to serve as the image representation, all the outputs of the final Transformer block are reshaped with layers.Flatten() and used as the image representation input to the classifier head. Note that a layers.GlobalAveragePooling1D layer could be used instead to aggregate the outputs of the Transformer block, especially when the number of patches and the projection dimensions are large (see the sketch after create_vit_classifier below).

def create_vit_classifier():
    inputs = keras.Input(shape=input_shape)
    # Augment data.
    augmented = data_augmentation(inputs)
    # Create patches.
    patches = Patches(patch_size)(augmented)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    # Add MLP.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    # Classify outputs.
    logits = layers.Dense(num_classes)(features)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model
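
As noted above, a pooled representation can be used instead of flattening. Here is a hedged sketch of that variant (not part of the original example; pooled_head is an illustrative helper) that would replace the representation lines inside create_vit_classifier:

def pooled_head(encoded_patches):
    # Aggregate the Transformer outputs with global average pooling instead of Flatten,
    # which keeps the classifier head much smaller.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.GlobalAveragePooling1D()(representation)  # (batch, projection_dim)
    representation = layers.Dropout(0.5)(representation)
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    return layers.Dense(num_classes)(features)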

Compile, train, and evaluate the model

def run_experiment(model):
    optimizer = keras.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
            keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
        ],
    )

    checkpoint_filepath = "/tmp/checkpoint.weights.h5"
    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        checkpoint_filepath,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
    )

    history = model.fit(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_split=0.1,
        callbacks=[checkpoint_callback],
    )

    model.load_weights(checkpoint_filepath)
    _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
    print(f"测试准确率: {round(accuracy * 100, 2)}%")
    print(f"测试前 5 准确率: {round(top_5_accuracy * 100, 2)}%")

    return history


vit_classifier = create_vit_classifier()
history = run_experiment(vit_classifier)


def plot_history(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_history("loss")
plot_history("top-5-accuracy")
Epoch 1/10
...
Epoch 10/10
 176/176 ━━━━━━━━━━━━━━━━━━━━ 449s 3s/step - accuracy: 0.0790 - loss: 3.9468 - top-5-accuracy: 0.2711 - val_accuracy: 0.0986 - val_loss: 3.8537 - val_top-5-accuracy: 0.3052

 313/313 ━━━━━━━━━━━━━━━━━━━━ 66s 198ms/step - accuracy: 0.1001 - loss: 3.8428 - top-5-accuracy: 0.3107
Test accuracy: 10.61%
Test top 5 accuracy: 31.51%
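
As a follow-up (a hedged sketch, not part of the original example; logits and predicted_classes are illustrative names), the trained classifier outputs logits, so predictions for a few test images can be obtained like this:

logits = vit_classifier.predict(x_test[:5])
predicted_classes = np.argmax(logits, axis=-1)
print(predicted_classes, y_test[:5].flatten())  # predicted vs. true class indices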


After 100 epochs, the ViT model achieves around 55% accuracy and 82% top-5 accuracy on the test data. These are not competitive results on the CIFAR-100 dataset, as a ResNet50V2 trained on the same data can reach 67% accuracy.

Note that the state-of-the-art results reported in the paper are achieved by pre-training the ViT model on the JFT-300M dataset and then fine-tuning it on the target dataset. To improve the model quality without pre-training, you can try training for more epochs, using more Transformer layers, resizing the input images, changing the patch size, or increasing the projection dimensions. Besides, as mentioned in the paper, the quality of the model is affected not only by architecture choices, but also by parameters such as the learning rate schedule, optimizer, and weight decay. In practice, it is recommended to fine-tune a ViT model that was pre-trained on a large, high-resolution dataset.
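
For instance, here is a hedged sketch of swapping in a cosine learning-rate schedule (not part of the original example; the decay_steps value is an assumption you would tune, based on the 45,000-image training split left by validation_split=0.1):

cosine_schedule = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=learning_rate,
    decay_steps=num_epochs * (45000 // batch_size),  # assumed: total training steps
)
optimizer = keras.optimizers.AdamW(
    learning_rate=cosine_schedule, weight_decay=weight_decay
)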