Transformers-源码解析-二十九-

Transformers 源码解析(二十九)

.\models\convbert\modeling_tf_convbert.py

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 ConvBERT model."""


from __future__ import annotations

from typing import Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFSequenceSummary,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_convbert import ConvBertConfig


# Module-level logger, obtained from the library's own `logging` helper.
logger = logging.get_logger(__name__)

# Checkpoint / config names; presumably consumed by the docstring decorators imported above
# (their use is not visible in this part of the file — confirm before renaming).
_CHECKPOINT_FOR_DOC = "YituTech/conv-bert-base"
_CONFIG_FOR_DOC = "ConvBertConfig"

# Identifiers of the pretrained ConvBERT checkpoints listed for this model.
TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "YituTech/conv-bert-base",
    "YituTech/conv-bert-medium-small",
    "YituTech/conv-bert-small",
    # See all ConvBERT models at https://huggingface.co/models?filter=convbert
]


# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->ConvBert
# NOTE(review): `build` below checks `self.built` *before* creating weights, which deviates from the
# copied source; keep that in mind if an automated copy-consistency check is run on this file.
class TFConvBertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config: ConvBertConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # Width of every embedding table created in `build`.
        self.embedding_size = config.embedding_size
        # Number of rows of the position-embedding table.
        self.max_position_embeddings = config.max_position_embeddings
        # Forwarded to `get_initializer` for all three embedding tables.
        self.initializer_range = config.initializer_range
        # Normalization applied to the summed embeddings in `call`.
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Dropout applied after normalization (active only when `training=True` is passed to `call`).
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """
        Create the word, token-type and position embedding tables and build the LayerNorm sub-layer.

        The `built` guard runs first so that a repeated call is a no-op and never registers a second copy
        of the embedding weights.
        """
        if self.built:
            return
        self.built = True

        # Each table is created inside its own name scope, which becomes part of the variable name.
        with tf.name_scope("word_embeddings"):
            # One row per vocabulary entry.
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            # One row per token type (segment id).
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            # One row per position index.
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                # LayerNorm normalizes the summed embeddings, whose last dimension is `embedding_size`.
                self.LayerNorm.build([None, None, self.config.embedding_size])

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        past_key_values_length=0,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Either `input_ids` or `inputs_embeds` must be given; when `input_ids` is given it takes precedence
        and `inputs_embeds` is looked up from the word-embedding table.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.

        Raises:
            ValueError: if both `input_ids` and `inputs_embeds` are `None`.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("Need to provide either `input_ids` or `input_embeds`.")

        if input_ids is not None:
            # Validate the ids against the vocabulary size before using them as gather indices.
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Shape of the embedded input without its last (embedding) dimension.
        input_shape = shape_list(inputs_embeds)[:-1]

        # Default every token to type 0.
        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        # Default positions: a single row counting up from `past_key_values_length`,
        # broadcast over the leading dimension when the embeddings are summed.
        if position_ids is None:
            position_ids = tf.expand_dims(
                tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
            )

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
        # Sum the three embeddings, then normalize and apply dropout.
        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings


class TFConvBertSelfAttention(keras.layers.Layer):
    """
    Sub-layers of the ConvBERT self-attention block: query/key/value projections plus the
    convolution-related projections (`key_conv_attn_layer`, `conv_kernel_layer`, `conv_out_layer`).

    NOTE(review): the forward pass (`call`) of this layer is not present in this excerpt of the file;
    it must be restored from the original module before the layer can be used.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # Reduce the number of attention heads by `head_ratio`, but never below one head.
        new_num_attention_heads = int(config.num_attention_heads / config.head_ratio)
        if new_num_attention_heads < 1:
            # The ratio would leave no head at all: fall back to a single head and record the
            # effective ratio as the configured head count.
            self.head_ratio = config.num_attention_heads
            num_attention_heads = 1
        else:
            num_attention_heads = new_num_attention_heads
            self.head_ratio = config.head_ratio

        self.num_attention_heads = num_attention_heads
        self.conv_kernel_size = config.conv_kernel_size

        if config.hidden_size % self.num_attention_heads != 0:
            raise ValueError("hidden_size should be divisible by num_attention_heads")

        # The per-head size is derived from the *configured* head count, so `all_head_size`
        # (reduced head count * per-head size) is smaller than `hidden_size` when heads were reduced.
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear projections for query, key and value.
        self.query = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )

        # Separable 1-D convolution over the sequence ("same" padding keeps the sequence length).
        self.key_conv_attn_layer = keras.layers.SeparableConv1D(
            self.all_head_size,
            self.conv_kernel_size,
            padding="same",
            activation=None,
            depthwise_initializer=get_initializer(1 / self.conv_kernel_size),
            pointwise_initializer=get_initializer(config.initializer_range),
            name="key_conv_attn_layer",
        )

        # Projection producing `conv_kernel_size` values per attention head.
        self.conv_kernel_layer = keras.layers.Dense(
            self.num_attention_heads * self.conv_kernel_size,
            activation=None,
            name="conv_kernel_layer",
            kernel_initializer=get_initializer(config.initializer_range),
        )

        # Projection of width `all_head_size` for the convolution branch.
        self.conv_out_layer = keras.layers.Dense(
            self.all_head_size,
            activation=None,
            name="conv_out_layer",
            kernel_initializer=get_initializer(config.initializer_range),
        )

        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
        self.config = config

    def transpose_for_scores(self, x, batch_size):
        """
        Split the last dimension of `x` into heads.

        `x` is reshaped to `(batch_size, -1, num_attention_heads, attention_head_size)` and then
        transposed with `perm=[0, 2, 1, 3]`, i.e. to `(batch_size, num_attention_heads, -1, attention_head_size)`.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def build(self, input_shape=None):
        """Build every sub-layer inside its own name scope; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])

        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])

        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.config.hidden_size])

        if getattr(self, "key_conv_attn_layer", None) is not None:
            with tf.name_scope(self.key_conv_attn_layer.name):
                self.key_conv_attn_layer.build([None, None, self.config.hidden_size])

        # Unlike the other sub-layers, this one is built for inputs of width `all_head_size`.
        if getattr(self, "conv_kernel_layer", None) is not None:
            with tf.name_scope(self.conv_kernel_layer.name):
                self.conv_kernel_layer.build([None, None, self.all_head_size])

        if getattr(self, "conv_out_layer", None) is not None:
            with tf.name_scope(self.conv_out_layer.name):
                self.conv_out_layer.build([None, None, self.config.hidden_size])
# Output sub-layer applied to the result of self-attention.
class TFConvBertSelfOutput(keras.layers.Layer):
    """Dense projection and dropout, followed by a residual add and layer normalization."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        kernel_init = get_initializer(config.initializer_range)
        # Projection back to `hidden_size`.
        self.dense = keras.layers.Dense(config.hidden_size, kernel_initializer=kernel_init, name="dense")
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states, input_tensor, training=False):
        """Project `hidden_states`, apply dropout, add `input_tensor` and normalize the sum."""
        projected = self.dense(hidden_states)
        projected = self.dropout(projected, training=training)
        return self.LayerNorm(projected + input_tensor)

    def build(self, input_shape=None):
        """Build `dense` and `LayerNorm` (in that order) under their own name scopes, once."""
        if self.built:
            return
        self.built = True

        for attr_name in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                # Both sub-layers are built for inputs of width `hidden_size`.
                sublayer.build([None, None, self.config.hidden_size])


# Attention block: the self-attention sub-layer followed by its output sub-layer.
class TFConvBertAttention(keras.layers.Layer):
    """Chain `TFConvBertSelfAttention` (named "self") and `TFConvBertSelfOutput` (named "output")."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.self_attention = TFConvBertSelfAttention(config, name="self")
        self.dense_output = TFConvBertSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False):
        """
        Run self-attention on `input_tensor`, then the output sub-layer with `input_tensor` as residual.

        Returns a tuple whose first element is the attention output; any further elements returned by
        the self-attention sub-layer are appended unchanged.
        """
        attn_results = self.self_attention(
            input_tensor, attention_mask, head_mask, output_attentions, training=training
        )
        projected = self.dense_output(attn_results[0], input_tensor, training=training)
        extras = attn_results[1:]  # attentions, if the self-attention sub-layer returned them
        return (projected,) + extras

    def build(self, input_shape=None):
        """Build both sub-layers under their own name scopes; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        for attr_name in ("self_attention", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


# Grouped linear projection: the feature dimension is split into `num_groups` groups and each
# group is multiplied by its own weight matrix.
class GroupedLinearLayer(keras.layers.Layer):
    """
    Linear layer whose input features are split into `num_groups` independent groups.

    Args:
        input_size: width of the last dimension of the input.
        output_size: width of the last dimension of the output.
        num_groups: number of groups; must divide both `input_size` and `output_size`.
        kernel_initializer: initializer used for the kernel (and, as in the original code, for the bias).

    Raises:
        ValueError: if `num_groups` is smaller than 1 or does not divide both sizes. Without this check
            the floor divisions below would silently drop features and the reshapes in `call` would
            either fail later or mix up values.
    """

    def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs):
        super().__init__(**kwargs)
        if num_groups < 1 or input_size % num_groups != 0 or output_size % num_groups != 0:
            raise ValueError(
                f"`input_size` ({input_size}) and `output_size` ({output_size}) must both be divisible by "
                f"`num_groups` ({num_groups}), which must be at least 1."
            )
        self.input_size = input_size
        self.output_size = output_size
        self.num_groups = num_groups
        self.kernel_initializer = kernel_initializer
        # Per-group input and output widths.
        self.group_in_dim = self.input_size // self.num_groups
        self.group_out_dim = self.output_size // self.num_groups

    def build(self, input_shape=None):
        """Create the per-group kernel and the bias, then defer to the parent `build`."""
        # The weight name is passed by keyword, matching the other `add_weight` calls in this file and
        # avoiding any dependence on the positional order of `add_weight`'s parameters.
        self.kernel = self.add_weight(
            name="kernel",
            shape=[self.group_out_dim, self.group_in_dim, self.num_groups],
            initializer=self.kernel_initializer,
            trainable=True,
        )

        self.bias = self.add_weight(
            name="bias",
            shape=[self.output_size],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True,
        )
        super().build(input_shape)

    def call(self, hidden_states):
        """Apply the grouped projection and return a tensor reshaped to `[batch_size, -1, output_size]`."""
        batch_size = shape_list(hidden_states)[0]
        # [-1, num_groups, group_in_dim] -> [num_groups, -1, group_in_dim]: groups become the batch axis.
        x = tf.transpose(tf.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]), [1, 0, 2])
        # Kernel is transposed to [num_groups, group_in_dim, group_out_dim]; one matmul per group.
        x = tf.matmul(x, tf.transpose(self.kernel, [2, 1, 0]))
        # Back to [-1, num_groups, group_out_dim] so the groups are adjacent again.
        x = tf.transpose(x, [1, 0, 2])
        # Merge the groups into a single feature axis of width `output_size`.
        x = tf.reshape(x, [batch_size, -1, self.output_size])
        x = tf.nn.bias_add(value=x, bias=self.bias)
        return x
class TFConvBertIntermediate(keras.layers.Layer):
    """Projection to `intermediate_size` (plain or grouped) followed by the configured activation."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        use_grouped = config.num_groups != 1
        kernel_init = get_initializer(config.initializer_range)
        if use_grouped:
            # More than one group: use the grouped projection.
            self.dense = GroupedLinearLayer(
                config.hidden_size,
                config.intermediate_size,
                num_groups=config.num_groups,
                kernel_initializer=kernel_init,
                name="dense",
            )
        else:
            self.dense = keras.layers.Dense(config.intermediate_size, kernel_initializer=kernel_init, name="dense")

        # `hidden_act` may be the name of an activation or already a callable.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states):
        """Return `activation(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))

    def build(self, input_shape=None):
        """Build the `dense` sub-layer under its own name scope; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        projection = getattr(self, "dense", None)
        if projection is None:
            return
        with tf.name_scope(projection.name):
            projection.build([None, None, self.config.hidden_size])


class TFConvBertOutput(keras.layers.Layer):
    """Projection back to `hidden_size` (plain or grouped), dropout, residual add and layer norm."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        use_grouped = config.num_groups != 1
        kernel_init = get_initializer(config.initializer_range)
        if use_grouped:
            # More than one group: use the grouped projection.
            self.dense = GroupedLinearLayer(
                config.intermediate_size,
                config.hidden_size,
                num_groups=config.num_groups,
                kernel_initializer=kernel_init,
                name="dense",
            )
        else:
            self.dense = keras.layers.Dense(config.hidden_size, kernel_initializer=kernel_init, name="dense")

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states, input_tensor, training=False):
        """Project `hidden_states`, apply dropout, add `input_tensor` and normalize the sum."""
        projected = self.dense(hidden_states)
        projected = self.dropout(projected, training=training)
        return self.LayerNorm(projected + input_tensor)

    def build(self, input_shape=None):
        """Build `LayerNorm` and then `dense` under their own name scopes; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build([None, None, self.config.hidden_size])

        projection = getattr(self, "dense", None)
        if projection is not None:
            with tf.name_scope(projection.name):
                # The projection is built for inputs of width `intermediate_size`.
                projection.build([None, None, self.config.intermediate_size])


class TFConvBertLayer(keras.layers.Layer):
    """One encoder layer: attention, intermediate projection and output sub-layer."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Sub-layers are named "attention", "intermediate" and "output" respectively.
        self.attention = TFConvBertAttention(config, name="attention")
        self.intermediate = TFConvBertIntermediate(config, name="intermediate")
        self.bert_output = TFConvBertOutput(config, name="output")

    def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
        """
        Run the three sub-layers in sequence.

        Returns a tuple whose first element is the layer output; any further elements returned by the
        attention sub-layer are appended unchanged.
        """
        attn_results = self.attention(
            hidden_states, attention_mask, head_mask, output_attentions, training=training
        )
        attended = attn_results[0]
        expanded = self.intermediate(attended)
        # The attention output also serves as the residual input of the output sub-layer.
        layer_output = self.bert_output(expanded, attended, training=training)
        return (layer_output,) + attn_results[1:]  # add attentions if we output them

    def build(self, input_shape=None):
        """Build the three sub-layers under their own name scopes; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        for attr_name in ("attention", "intermediate", "bert_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# Encoder: a stack of `config.num_hidden_layers` TFConvBertLayer instances.
class TFConvBertEncoder(keras.layers.Layer):
    """Apply the stacked layers in order, optionally collecting hidden states and attentions."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Each layer's name carries its index in the stack.
        self.layer = [TFConvBertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        output_attentions,
        output_hidden_states,
        return_dict,
        training=False,
    ):
        """
        Run `hidden_states` through every layer; `head_mask` is indexed once per layer.

        Returns a `TFBaseModelOutput` when `return_dict` is truthy, otherwise a tuple holding the last
        hidden state followed by whichever of the collected hidden states / attentions were requested.
        """
        hidden_trace = () if output_hidden_states else None
        attention_trace = () if output_attentions else None

        for index, block in enumerate(self.layer):
            # Record the input of each layer before it is transformed.
            if output_hidden_states:
                hidden_trace += (hidden_states,)

            block_out = block(hidden_states, attention_mask, head_mask[index], output_attentions, training=training)
            hidden_states = block_out[0]

            if output_attentions:
                attention_trace += (block_out[1],)

        # Record the output of the final layer as well.
        if output_hidden_states:
            hidden_trace += (hidden_states,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=hidden_trace, attentions=attention_trace
            )

        # Tuple form: drop whatever was not collected.
        return tuple(item for item in (hidden_states, hidden_trace, attention_trace) if item is not None)

    def build(self, input_shape=None):
        """Build every stacked layer under its own name scope; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        if getattr(self, "layer", None) is None:
            return
        for block in self.layer:
            with tf.name_scope(block.name):
                block.build(None)


# Transform of hidden states: dense projection, activation, layer normalization.
class TFConvBertPredictionHeadTransform(keras.layers.Layer):
    """Project hidden states to `config.embedding_size`, apply the activation and normalize."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Projection whose output width is `embedding_size`.
        self.dense = keras.layers.Dense(
            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )

        # `hidden_act` may be the name of an activation or already a callable.
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = get_tf_activation(config.hidden_act)
        else:
            self.transform_act_fn = config.hidden_act

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, hidden_states):
        """Return `LayerNorm(activation(dense(hidden_states)))`."""
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)

        return hidden_states

    def build(self, input_shape=None):
        """Build `dense` and `LayerNorm` under their own name scopes; a repeated call is a no-op."""
        if self.built:
            return
        self.built = True

        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                # LayerNorm consumes the output of `dense`, whose last dimension is `embedding_size`
                # (not `hidden_size`); building it with the wrong width breaks the layer whenever the
                # two sizes differ.
                self.LayerNorm.build([None, None, self.config.embedding_size])
# 使用 keras_serializable 装饰器标记这个类,表示它可以被序列化为 Keras 模型
@keras_serializable
class TFConvBertMainLayer(keras.layers.Layer):
    """ConvBERT backbone layer: embeddings, an optional embedding projection, and the encoder."""

    # Configuration class associated with this layer.
    config_class = ConvBertConfig

    def __init__(self, config, **kwargs):
        """Create the sub-layers.

        Args:
            config: Model configuration. `embedding_size` and `hidden_size` are read here, and the
                object is forwarded to the embeddings and encoder sub-layers.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)

        self.embeddings = TFConvBertEmbeddings(config, name="embeddings")

        # The projection only exists when the embedding width differs from the encoder width.
        if config.embedding_size != config.hidden_size:
            self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")

        self.encoder = TFConvBertEncoder(config, name="encoder")
        self.config = config

    def get_input_embeddings(self):
        """Return the embeddings sub-layer."""
        return self.embeddings

    def set_input_embeddings(self, value):
        """Replace the embedding weight and update `vocab_size` from its first dimension."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = value.shape[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    def get_extended_attention_mask(self, attention_mask, input_shape, dtype):
        """Build an additive attention mask of shape `(input_shape[0], 1, 1, input_shape[1])`.

        Args:
            attention_mask: Mask to convert, or `None` to use a mask filled with ones.
            input_shape: Sequence whose first two entries give the batch and sequence sizes.
            dtype: dtype the mask is cast to before the arithmetic below.

        Returns:
            A tensor that is zero where the mask is 1 and -10000.0 where the mask is 0.
        """
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)

        # Insert two singleton axes so the mask can broadcast
        # (presumably against per-head attention scores — TODO confirm against the attention layer).
        extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))

        # Convert the 1/0 mask into 0/-10000 so it can be added to raw scores before a softmax.
        extended_attention_mask = tf.cast(extended_attention_mask, dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        return extended_attention_mask

    def get_head_mask(self, head_mask):
        """Return one `None` entry per hidden layer.

        Raises:
            NotImplementedError: If `head_mask` is not `None`.
        """
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.config.num_hidden_layers

        return head_mask

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """Run embeddings, the optional projection, and the encoder.

        Exactly one of `input_ids` and `inputs_embeds` must be given.

        Returns:
            Whatever `self.encoder` returns for the given arguments.

        Raises:
            ValueError: If both or neither of `input_ids` / `inputs_embeds` are provided.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            # Drop the trailing feature axis of the embeddings.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Defaults: all-ones attention mask, all-zeros token type ids.
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)

        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        hidden_states = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, hidden_states.dtype)
        head_mask = self.get_head_mask(head_mask)

        # Only present when `embedding_size != hidden_size` (see `__init__`).
        if hasattr(self, "embeddings_project"):
            hidden_states = self.embeddings_project(hidden_states, training=training)

        hidden_states = self.encoder(
            hidden_states,
            extended_attention_mask,
            head_mask,
            output_attentions,
            output_hidden_states,
            return_dict,
            training=training,
        )

        return hidden_states

    def build(self, input_shape=None):
        """Build the sub-layers, each under its own name scope.

        The method is idempotent: once `self.built` is set, later calls return immediately.

        Args:
            input_shape: Unused.
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "embeddings_project", None) is not None:
            # The projection consumes the embedding output, hence `embedding_size`.
            with tf.name_scope(self.embeddings_project.name):
                self.embeddings_project.build([None, None, self.config.embedding_size])
"""
An abstract class representing a ConvBERT model for TensorFlow, inheriting from `TFPreTrainedModel`.
Provides functionality for weights initialization, pretrained model handling, and a simple interface for downloading and loading pretrained models.
"""

# NOTE(review): no `class ...:` header is present above this string and the two assignments below,
# although the string describes a class — presumably the header was lost; confirm against upstream.
# Configuration class is ConvBertConfig.
config_class = ConvBertConfig

# Prefix of the base model.
base_model_prefix = "convbert"
    def __init__(self, config, *inputs, **kwargs):
        """Initialise the parent class and attach the ConvBERT main layer.

        Args:
            config: Model configuration, passed to the parent and to the main layer.
            *inputs: Extra positional arguments forwarded to the parent initialiser.
            **kwargs: Extra keyword arguments forwarded to the parent initialiser.
        """
        super().__init__(config, *inputs, **kwargs)

        # The backbone is registered under the name "convbert".
        main_layer = TFConvBertMainLayer(config, name="convbert")
        self.convbert = main_layer

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: Optional[Union[np.array, tf.Tensor]] = None,
        token_type_ids: Optional[Union[np.array, tf.Tensor]] = None,
        position_ids: Optional[Union[np.array, tf.Tensor]] = None,
        head_mask: Optional[Union[np.array, tf.Tensor]] = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # No docstring on purpose: the decorators above assemble this method's documentation.
        # Every argument is handed unchanged to the main layer, whose result is returned as is.
        return self.convbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the `convbert` sub-layer once, under its own name scope.

        Args:
            input_shape: Unused.
        """
        if not self.built:
            # Flag first so that repeated calls become no-ops.
            self.built = True
            backbone = getattr(self, "convbert", None)
            if backbone is not None:
                with tf.name_scope(backbone.name):
                    backbone.build(None)
# 基于 Keras 的自定义层,用于 ConvBERT 模型中的 Masked Language Modeling 头部
class TFConvBertMaskedLMHead(keras.layers.Layer):
    """Masked-LM output head that reuses the weight of an embeddings layer plus its own bias."""

    def __init__(self, config, input_embeddings, **kwargs):
        """Store the configuration and the embeddings layer whose weight is reused.

        Args:
            config: Model configuration; `embedding_size` and `vocab_size` are read from it.
            input_embeddings: Object exposing `weight` and `vocab_size` attributes.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)

        self.config = config
        self.embedding_size = config.embedding_size
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        """Create the trainable, zero-initialised bias of length `config.vocab_size`."""
        self.bias = self.add_weight(
            shape=(self.config.vocab_size,),
            initializer="zeros",
            trainable=True,
            name="bias",
        )

        super().build(input_shape)

    def get_output_embeddings(self):
        """Return the embeddings layer this head is tied to."""
        return self.input_embeddings

    def set_output_embeddings(self, value):
        """Replace the tied embedding weight and refresh the embeddings layer's `vocab_size`."""
        embeddings = self.input_embeddings
        embeddings.weight = value
        embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self):
        """Return the bias wrapped in a dict under the key "bias"."""
        return {"bias": self.bias}

    def set_bias(self, value):
        """Replace the bias from `value["bias"]` and update `config.vocab_size` to its length."""
        new_bias = value["bias"]
        self.bias = new_bias
        self.config.vocab_size = shape_list(new_bias)[0]

    def call(self, hidden_states):
        """Compute vocabulary logits for every position.

        Args:
            hidden_states: Tensor whose second axis is the sequence length and whose elements can
                be reshaped to rows of width `embedding_size`.

        Returns:
            Tensor reshaped to `[-1, seq_length, config.vocab_size]` with the bias added.
        """
        seq_length = shape_list(tensor=hidden_states)[1]

        # Collapse to 2-D rows, multiply by the transposed embedding matrix, then restore the layout.
        flat_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
        flat_logits = tf.matmul(a=flat_states, b=self.input_embeddings.weight, transpose_b=True)
        logits = tf.reshape(tensor=flat_logits, shape=[-1, seq_length, self.config.vocab_size])

        return tf.nn.bias_add(value=logits, bias=self.bias)


# ConvBERT 模型中用于生成预测的自定义 Keras 层
class TFConvBertGeneratorPredictions(keras.layers.Layer):
    """Dense projection to `embedding_size`, followed by GELU and layer normalisation."""

    def __init__(self, config, **kwargs):
        """Create the `LayerNorm` and `dense` sub-layers and keep a reference to `config`."""
        super().__init__(**kwargs)

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dense = keras.layers.Dense(config.embedding_size, name="dense")
        self.config = config

    def call(self, generator_hidden_states, training=False):
        """Return `LayerNorm(gelu(dense(generator_hidden_states)))`.

        Args:
            generator_hidden_states: Tensor handed to the `dense` projection.
            training: Accepted for interface compatibility; not used in the body.
        """
        projected = self.dense(generator_hidden_states)
        gelu = get_tf_activation("gelu")
        activated = gelu(projected)
        return self.LayerNorm(activated)

    def build(self, input_shape=None):
        """Build both sub-layers once, each under its own name scope.

        Args:
            input_shape: Unused; the build shapes are taken from `self.config`.
        """
        if self.built:
            return
        self.built = True

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            # Normalisation is built for the width produced by `dense`.
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.embedding_size])

        dense = getattr(self, "dense", None)
        if dense is not None:
            # The projection is built for inputs of width `hidden_size`.
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])


@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss):
    # No class docstring on purpose: the decorator above supplies the class documentation.

    def __init__(self, config, *inputs, **kwargs):
        """Create the backbone, the generator prediction transform and the LM head.

        Args:
            config: Model configuration.
            *inputs: Accepted but not forwarded to the parent initialiser.
            **kwargs: Forwarded to the parent initialiser.
        """
        super().__init__(config, **kwargs)

        self.config = config
        self.convbert = TFConvBertMainLayer(config, name="convbert")
        self.generator_predictions = TFConvBertGeneratorPredictions(config, name="generator_predictions")

        # `hidden_act` may be an activation name or an already-resolved callable.
        hidden_act = config.hidden_act
        self.activation = get_tf_activation(hidden_act) if isinstance(hidden_act, str) else hidden_act

        # The LM head reuses the backbone's embeddings layer.
        self.generator_lm_head = TFConvBertMaskedLMHead(config, self.convbert.embeddings, name="generator_lm_head")

    def get_lm_head(self):
        """Return the language-modeling head layer."""
        return self.generator_lm_head

    def get_prefix_bias_name(self):
        """Return `"<model name>/<lm head name>"`."""
        return "/".join((self.name, self.generator_lm_head.name))

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFMaskedLMOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        backbone_outputs = self.convbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # First backbone output -> prediction transform -> LM head.
        sequence_output = backbone_outputs[0]
        transformed = self.generator_predictions(sequence_output, training=training)
        logits = self.generator_lm_head(transformed, training=training)

        # The loss is only computed when labels are supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFMaskedLMOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, loss first if present.
        output = (logits,) + backbone_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def build(self, input_shape=None):
        """Build the backbone, the prediction transform and the LM head once.

        Args:
            input_shape: Unused.
        """
        if self.built:
            return
        self.built = True

        # Each sub-layer is built under its own name scope, skipping any that is absent.
        for attr_name in ("convbert", "generator_predictions", "generator_lm_head"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
class TFConvBertClassificationHead(keras.layers.Layer):
    """Classification head applied to the first position of the hidden states."""

    def __init__(self, config, **kwargs):
        """Create the dense projection, the dropout layer and the output projection.

        Args:
            config: Model configuration; reads `hidden_size`, `initializer_range`,
                `classifier_dropout`, `hidden_dropout_prob` and `num_labels`.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # Fall back to the hidden dropout rate when no classifier-specific rate is configured.
        if config.classifier_dropout is not None:
            dropout_rate = config.classifier_dropout
        else:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = keras.layers.Dropout(dropout_rate)

        self.out_proj = keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="out_proj",
        )

        self.config = config

    def call(self, hidden_states, **kwargs):
        """Return the output projection computed from the first position of `hidden_states`.

        Args:
            hidden_states: Tensor indexed as `[:, 0, :]` to pick the first position.
            **kwargs: Accepted and ignored.
        """
        first_token = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
        first_token = self.dropout(first_token)
        projected = self.dense(first_token)
        activation_fn = get_tf_activation(self.config.hidden_act)
        activated = activation_fn(projected)
        activated = self.dropout(activated)
        return self.out_proj(activated)

    def build(self, input_shape=None):
        """Build `dense` and `out_proj` once, each for inputs of width `hidden_size`.

        Args:
            input_shape: Unused; the build shapes are taken from `self.config`.
        """
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])

        out_proj = getattr(self, "out_proj", None)
        if out_proj is not None:
            with tf.name_scope(out_proj.name):
                out_proj.build([None, None, self.config.hidden_size])
    ) -> Union[Tuple, TFSequenceClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 调用 ConvBert 模型进行前向传播,获取输出结果
        outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 将 ConvBert 的输出 logits 传递给分类器,得到分类器的预测结果
        logits = self.classifier(outputs[0], training=training)
        # 如果提供了标签,计算损失;否则损失为 None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        # 如果不要求返回字典形式的输出
        if not return_dict:
            # 组装输出结果,包括 logits 和 ConvBert 的其它输出
            output = (logits,) + outputs[1:]
            # 返回包含损失和输出结果的元组,如果损失为 None 则不包含损失
            return ((loss,) + output) if loss is not None else output

        # 如果要求返回字典形式的 TFSequenceClassifierOutput
        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the `convbert` backbone and the `classifier` head once.

        Args:
            input_shape: Unused.
        """
        if not self.built:
            # Flag first so that repeated calls become no-ops.
            self.built = True

            backbone = getattr(self, "convbert", None)
            if backbone is not None:
                with tf.name_scope(backbone.name):
                    backbone.build(None)

            head = getattr(self, "classifier", None)
            if head is not None:
                with tf.name_scope(head.name):
                    head.build(None)
@add_start_docstrings(
    """
    ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    CONVBERT_START_DOCSTRING,
)
class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLoss):
    # No class docstring on purpose: the decorator above supplies the class documentation.

    def __init__(self, config, *inputs, **kwargs):
        """Create the backbone, the sequence summary and the single-unit classifier.

        Args:
            config: Model configuration.
            *inputs: Forwarded to the parent initialiser.
            **kwargs: Forwarded to the parent initialiser.
        """
        super().__init__(config, *inputs, **kwargs)

        self.convbert = TFConvBertMainLayer(config, name="convbert")
        self.sequence_summary = TFSequenceSummary(
            config,
            initializer_range=config.initializer_range,
            name="sequence_summary",
        )
        # One output unit: a single score per flattened (example, choice) row.
        self.classifier = keras.layers.Dense(
            1,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(
        CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFMultipleChoiceModelOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """
        # Axes 1 and 2 of whichever input is present give the number of choices and the sequence length.
        reference = input_ids if input_ids is not None else inputs_embeds
        reference_dims = shape_list(reference)
        num_choices = reference_dims[1]
        seq_length = reference_dims[2]

        def _flatten_choices(tensor):
            # Merge the leading axes into rows of length `seq_length`; `None` passes through.
            if tensor is None:
                return None
            return tf.reshape(tensor, (-1, seq_length))

        flat_input_ids = _flatten_choices(input_ids)
        flat_attention_mask = _flatten_choices(attention_mask)
        flat_token_type_ids = _flatten_choices(token_type_ids)
        flat_position_ids = _flatten_choices(position_ids)

        # Embeddings keep their trailing feature axis (axis 3 of the original tensor).
        if inputs_embeds is not None:
            flat_inputs_embeds = tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
        else:
            flat_inputs_embeds = None

        outputs = self.convbert(
            flat_input_ids,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            flat_inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Summarise each flattened sequence, score it, then regroup the scores per example.
        summary = self.sequence_summary(outputs[0], training=training)
        scores = self.classifier(summary)
        reshaped_logits = tf.reshape(scores, (-1, num_choices))

        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, reshaped_logits)

        if return_dict:
            return TFMultipleChoiceModelOutput(
                loss=loss,
                logits=reshaped_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, loss first if present.
        output = (reshaped_logits,) + outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def build(self, input_shape=None):
        """Build the backbone, the sequence summary and the classifier once.

        Args:
            input_shape: Unused; the classifier shape is taken from `self.config`.
        """
        if self.built:
            return
        self.built = True

        backbone = getattr(self, "convbert", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)

        summary = getattr(self, "sequence_summary", None)
        if summary is not None:
            with tf.name_scope(summary.name):
                summary.build(None)

        classifier = getattr(self, "classifier", None)
        if classifier is not None:
            # The classifier is built for inputs of width `hidden_size`.
            with tf.name_scope(classifier.name):
                classifier.build([None, None, self.config.hidden_size])
"""
ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
    """
    ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    CONVBERT_START_DOCSTRING,
)
class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassificationLoss):
    # No class docstring on purpose: the decorator above supplies the class documentation.

    def __init__(self, config, *inputs, **kwargs):
        """Create the backbone, the dropout layer and the per-token classifier.

        Args:
            config: Model configuration; reads `num_labels`, `classifier_dropout`,
                `hidden_dropout_prob` and `initializer_range`.
            *inputs: Forwarded to the parent initialiser.
            **kwargs: Forwarded to the parent initialiser.
        """
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        self.convbert = TFConvBertMainLayer(config, name="convbert")

        # Fall back to the hidden dropout rate when no classifier-specific rate is configured.
        if config.classifier_dropout is not None:
            dropout_rate = config.classifier_dropout
        else:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = keras.layers.Dropout(dropout_rate)

        self.classifier = keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFTokenClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        backbone_outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # First backbone output -> dropout -> per-token classifier.
        token_states = self.dropout(backbone_outputs[0], training=training)
        logits = self.classifier(token_states)

        # The loss is only computed when labels are supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFTokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, loss first if present.
        output = (logits,) + backbone_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def build(self, input_shape=None):
        """Build the backbone and the classifier once.

        Args:
            input_shape: Unused; the classifier shape is taken from `self.config`.
        """
        if self.built:
            return
        self.built = True

        backbone = getattr(self, "convbert", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)

        classifier = getattr(self, "classifier", None)
        if classifier is not None:
            # The classifier is built for inputs of width `hidden_size`.
            with tf.name_scope(classifier.name):
                classifier.build([None, None, self.config.hidden_size])
# The decorator prepends the task description below to the shared ConvBERT class docstring.
@add_start_docstrings(
    """
    ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    CONVBERT_START_DOCSTRING,
)
class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        # Shared ConvBERT encoder.
        self.convbert = TFConvBertMainLayer(config, name="convbert")
        # Dense head producing `config.num_labels` values per token (split into start/end logits in `call`).
        self.qa_outputs = keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: tf.Tensor | None = None,
        end_positions: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFQuestionAnsweringModelOutput]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Run the encoder; element 0 of its output is the per-token sequence output.
        outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        span_logits = self.qa_outputs(outputs[0])

        # Split the last axis into two halves, then drop that (now size-1) axis from each half.
        start_part, end_part = tf.split(span_logits, 2, axis=-1)
        start_logits = tf.squeeze(start_part, axis=-1)
        end_logits = tf.squeeze(end_part, axis=-1)

        # A loss is only computed when both span boundaries are supplied.
        if start_positions is None or end_positions is None:
            loss = None
        else:
            labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        if return_dict:
            return TFQuestionAnsweringModelOutput(
                loss=loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple output: (loss?, start_logits, end_logits, *remaining encoder outputs).
        output = (start_logits, end_logits) + outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def build(self, input_shape=None):
        """Build the `convbert` encoder and the `qa_outputs` head once, each under its own name scope."""
        if not self.built:
            # Flag first so that a second call returns immediately.
            self.built = True
            backbone = getattr(self, "convbert", None)
            if backbone is not None:
                with tf.name_scope(backbone.name):
                    backbone.build(None)
            head = getattr(self, "qa_outputs", None)
            if head is not None:
                with tf.name_scope(head.name):
                    head.build([None, None, self.config.hidden_size])

.\models\convbert\tokenization_convbert.py

# 指定文件编码为 UTF-8
# 版权声明,基于 Apache License, Version 2.0
# 详细信息可查阅 http://www.apache.org/licenses/LICENSE-2.0
#
# 该脚本定义了 ConvBERT 的 tokenization 类

import collections  # 导入 collections 模块
import os  # 导入操作系统模块
import unicodedata  # 导入 unicodedata 模块
from typing import List, Optional, Tuple  # 导入类型提示模块中的 List, Optional, Tuple

# 导入 tokenization_utils 模块中的相关函数和类
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
# 导入 logging 模块中的 logger 对象
from ...utils import logging

# Module-level logger obtained from the library's logging utilities.
logger = logging.get_logger(__name__)

# Name of the vocabulary file, keyed by the tokenizer argument it is loaded into.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Download URL of the vocabulary file for each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/vocab.txt"
        for checkpoint in (
            "YituTech/conv-bert-base",
            "YituTech/conv-bert-medium-small",
            "YituTech/conv-bert-small",
        )
    }
}

# Positional-embedding size for each pretrained checkpoint (same checkpoints, same order as above).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(PRETRAINED_VOCAB_FILES_MAP["vocab_file"], 512)

# Default tokenizer init arguments for each pretrained checkpoint; every entry is its own dict.
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": True} for checkpoint in PRETRAINED_VOCAB_FILES_MAP["vocab_file"]
}

# Adapted from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
    """
    Load a vocabulary file into an ordered dictionary.

    The file is read as UTF-8 with one token per line; each token (with its trailing newline removed) is mapped
    to its zero-based line index. If a token appears more than once, it keeps the position of its first
    occurrence and the index of its last one.
    """
    with open(vocab_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(lines))

# Adapted from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Strip surrounding whitespace from `text` and split it on whitespace; blank input yields `[]`."""
    stripped = text.strip()
    return stripped.split() if stripped else []

# Adapted from transformers.models.bert.tokenization_bert.BertTokenizer (Bert -> ConvBert)
class ConvBertTokenizer(PreTrainedTokenizer):
    r"""
    Construct a ConvBERT tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path of the file containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to run basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token, used for tokens that are not in the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, used when building a sequence from multiple sequences.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token; it is the first token of a sequence built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values when training with masked language modeling.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If not specified, it is determined by the lowercasing setting.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # Fail early with an actionable message when the vocabulary file is missing.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = ConvBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup table: id -> token.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # The vocabulary and sub-tokenizers are set up before the parent initializer runs.
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """Whether the basic tokenizer lowercases its input."""
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the base vocabulary merged with the added-tokens encoder as a plain `dict`."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, split_special_tokens=False):
        """Tokenize `text` into WordPiece tokens, optionally running basic tokenization first."""
        split_tokens = []
        if self.do_basic_tokenize:
            # Special tokens are protected from splitting unless `split_special_tokens` is set.
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                if token in self.basic_tokenizer.never_split:
                    # Tokens that must never be split bypass WordPiece entirely.
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Joining with spaces and removing " ##" glues sub-word continuations back onto their word.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding special tokens. A ConvBERT sequence has
        the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve a mask marking the special tokens of a sequence built by `build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            # The parent class knows how to recognise special tokens that are already present.
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirror the `[CLS] A [SEP] (B [SEP])` layout produced by `build_inputs_with_special_tokens`.
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Generate token type IDs for a sequence or a sequence pair, including the positions of the special tokens added
        by `build_inputs_with_special_tokens`, so that the result has the same length as the built inputs:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        Args:
            token_ids_0 (`List[int]`):
                List of IDs representing the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs representing the second sequence in a sequence pair.

        Returns:
            `List[int]`: List of token type IDs.
        """
        # `create_mask` already accounts for [CLS]/[SEP]; reuse it so both methods always agree.
        return self.create_mask(token_ids_0, token_ids_1)

    def create_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ConvBERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # The first segment covers [CLS] + A + [SEP]; the second covers B + [SEP].
        first_segment = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is None:
            return first_segment
        return first_segment + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary of the model to a file, one token per line, ordered by token index.

        Args:
            save_directory (str):
                Directory in which to write the vocabulary file. If it is not an existing directory, the value is
                used as the file path itself.
            filename_prefix (str, *optional*):
                Prefix to be added to the vocabulary file name.

        Returns:
            Tuple[str]: Tuple containing the path to the saved vocabulary file.
        """
        index = 0
        prefix = filename_prefix + "-" if filename_prefix else ""
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
        else:
            vocab_file = prefix + save_directory

        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # A gap in the indices is reported, then counting resumes from the token's own index.
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return (vocab_file,)
# Adapted from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Basic tokenizer: text clean-up, optional lowercasing / accent stripping and punctuation splitting.

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        self.do_lower_case = do_lower_case
        # Stored as a set; a missing collection becomes the empty set.
        self.never_split = set() if never_split is None else set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # Per-call protected tokens are added on top of the instance-level set.
        if never_split:
            protected = self.never_split.union(set(never_split))
        else:
            protected = self.never_split

        cleaned = self._clean_text(text)
        # CJK characters are surrounded by spaces so that each becomes its own token.
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)

        # NFC normalization prevents the same character from appearing under different code point sequences.
        normalized = unicodedata.normalize("NFC", cleaned)

        pieces = []
        for word in whitespace_tokenize(normalized):
            if word not in protected:
                if self.do_lower_case:
                    word = word.lower()
                    # With lowercasing on, accents are stripped unless explicitly disabled.
                    drop_accents = self.strip_accents is not False
                else:
                    # With lowercasing off, accents are stripped only when explicitly enabled.
                    drop_accents = bool(self.strip_accents)
                if drop_accents:
                    word = self._run_strip_accents(word)
            pieces.extend(self._run_split_on_punc(word, protected))

        # Re-join and re-split so the result contains no empty or whitespace-carrying tokens.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD decomposition separates base characters from combining marks (category "Mn"), which are dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]

        groups = []
        begin_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                # Each punctuation character forms its own group and ends the current word.
                groups.append([ch])
                begin_new_group = True
                continue
            if begin_new_group:
                groups.append([])
                begin_new_group = False
            groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # Inclusive code point ranges treated as CJK by this tokenizer.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        kept = []
        for ch in text:
            # NUL, the replacement character and control characters are removed.
            if ord(ch) in (0, 0xFFFD) or _is_control(ch):
                continue
            # Any whitespace character is replaced by a plain space.
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
# Adapted from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # `vocab` is used for membership tests on candidate pieces; `unk_token` replaces words that cannot be
        # segmented; words longer than `max_input_chars_per_word` are replaced without attempting segmentation.
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        pieces = []
        for word in whitespace_tokenize(text):
            word_length = len(word)
            # Over-long words are mapped straight to the unknown token.
            if word_length > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            cursor = 0
            while cursor < word_length:
                match = None
                # Try the longest remaining substring first, shrinking from the right.
                for stop in range(word_length, cursor, -1):
                    candidate = word[cursor:stop]
                    if cursor > 0:
                        # Non-initial pieces carry the "##" continuation prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        match = candidate
                        break
                if match is None:
                    # No piece matches at this position: the whole word becomes the unknown token.
                    pieces.append(self.unk_token)
                    break
                word_pieces.append(match)
                cursor = stop
            else:
                # Reached only when the word was fully segmented.
                pieces.extend(word_pieces)
        return pieces

.\models\convbert\tokenization_convbert_fast.py

# coding=utf-8
# 版权归 HuggingFace Inc. 团队所有。
#
# 根据 Apache 许可证版本 2.0 授权使用此文件;
# 除非符合许可证的要求,否则您不能使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,本软件是基于“原样”发布的,
# 没有任何形式的明示或暗示保证或条件。
# 请参阅许可证以获取有关特定语言的权限和限制。
"""ConvBERT 的分词类。"""
import json
from typing import List, Optional, Tuple

from tokenizers import normalizers  # 导入 tokenizers 库中的 normalizers 模块

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入预训练的快速分词器类
from ...utils import logging  # 导入 logging 模块,用于记录日志
from .tokenization_convbert import ConvBertTokenizer  # 导入 ConvBERT 分词器类

logger = logging.get_logger(__name__)  # module-level logger from the library's logging utilities

# Name of the vocabulary file, keyed by the tokenizer argument it is loaded into.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Download URL of the vocabulary file for each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/vocab.txt"
        for checkpoint in (
            "YituTech/conv-bert-base",
            "YituTech/conv-bert-medium-small",
            "YituTech/conv-bert-small",
        )
    }
}

# Positional-embedding size for each pretrained checkpoint (same checkpoints, same order as above).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(PRETRAINED_VOCAB_FILES_MAP["vocab_file"], 512)

# Default tokenizer init arguments for each pretrained checkpoint; every entry is its own dict.
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": True} for checkpoint in PRETRAINED_VOCAB_FILES_MAP["vocab_file"]
}

# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with bert-base-cased->YituTech/conv-bert-base, Bert->ConvBert, BERT->ConvBERT
class ConvBertTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        tokenizer_file (`str`, *optional*):
            Forwarded unchanged to [`PreTrainedTokenizerFast`].
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        clean_text (`bool`, *optional*, defaults to `True`):
            Whether or not to clean the text before tokenization by removing any control characters and replacing all
            whitespaces by the classic one.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
            issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original ConvBERT).
        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
            The prefix for subwords.
    """

    # Class-level tables describing the pretrained checkpoints and the matching slow tokenizer.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = ConvBertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # Hand every option to the base class, which owns the backend tokenizer.
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # Read back the serialized state of the backend normalizer.
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # If the stored normalizer options disagree with the arguments given here, rebuild the
        # normalizer so that the explicit arguments win.
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            # The "type" entry names the normalizer class; it is removed from the state because the
            # remaining entries are passed to that class as keyword arguments.
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A ConvBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # [CLS] + first sequence + [SEP]
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # Second sequence, when given, is appended and closed with its own [SEP].
        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ConvBERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: zeros covering [CLS] + sequence + [SEP].
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Pair: zeros for the first segment, ones for the second sequence and its [SEP].
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the tokenizer's vocabulary to the specified directory.

        Args:
            save_directory (`str`):
                Directory where the vocabulary files will be saved.
            filename_prefix (`str`, *optional*):
                Prefix for the saved files.

        Returns:
            `Tuple[str]`: Tuple containing the filenames where the vocabulary was saved.
        """
        # Saving is delegated to the backend tokenizer's model.
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

.\models\convbert\__init__.py

# Package initializer for ConvBERT: declares which public names live in which submodule and
# registers them for lazy loading, so optional backends are only imported when available.
from typing import TYPE_CHECKING
# Availability probes, the "missing optional dependency" exception and the lazy module wrapper
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Maps each submodule name to the list of public names it provides
_import_structure = {
    "configuration_convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertOnnxConfig"],
    "tokenization_convbert": ["ConvBertTokenizer"],
}

# Register the fast tokenizer only when the `tokenizers` dependency is available;
# the raised exception is caught right away and the registration is skipped.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # `tokenizers` is available: expose the fast tokenizer
    _import_structure["tokenization_convbert_fast"] = ["ConvBertTokenizerFast"]

# Register the PyTorch models only when torch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the names provided by modeling_convbert
    _import_structure["modeling_convbert"] = [
        "CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ConvBertForMaskedLM",
        "ConvBertForMultipleChoice",
        "ConvBertForQuestionAnswering",
        "ConvBertForSequenceClassification",
        "ConvBertForTokenClassification",
        "ConvBertLayer",
        "ConvBertModel",
        "ConvBertPreTrainedModel",
        "load_tf_weights_in_convbert",
    ]

# Register the TensorFlow models only when TensorFlow is available
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the names provided by modeling_tf_convbert
    _import_structure["modeling_tf_convbert"] = [
        "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFConvBertForMaskedLM",
        "TFConvBertForMultipleChoice",
        "TFConvBertForQuestionAnswering",
        "TFConvBertForSequenceClassification",
        "TFConvBertForTokenClassification",
        "TFConvBertLayer",
        "TFConvBertModel",
        "TFConvBertPreTrainedModel",
    ]

# Under static type checking, perform the real imports so the names are visible to the checker
if TYPE_CHECKING:
    # Configuration classes and constants
    from .configuration_convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertOnnxConfig
    # Slow tokenizer
    from .tokenization_convbert import ConvBertTokenizer

    # Fast tokenizer, only when `tokenizers` is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # `tokenizers` is available: import the fast tokenizer class
        from .tokenization_convbert_fast import ConvBertTokenizerFast

    # PyTorch models, only when torch is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the PyTorch ConvBERT models and related names
        from .modeling_convbert import (
            CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            ConvBertForMaskedLM,
            ConvBertForMultipleChoice,
            ConvBertForQuestionAnswering,
            ConvBertForSequenceClassification,
            ConvBertForTokenClassification,
            ConvBertLayer,
            ConvBertModel,
            ConvBertPreTrainedModel,
            load_tf_weights_in_convbert,
        )

    try:
        # TensorFlow models, only when TensorFlow is available
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # TensorFlow is missing: skip the TensorFlow imports
        pass
    else:
        # TensorFlow is available: import the TensorFlow ConvBERT models and related names
        from .modeling_tf_convbert import (
            TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFConvBertForMaskedLM,
            TFConvBertForMultipleChoice,
            TFConvBertForQuestionAnswering,
            TFConvBertForSequenceClassification,
            TFConvBertForTokenClassification,
            TFConvBertLayer,
            TFConvBertModel,
            TFConvBertPreTrainedModel,
        )
else:
    # Needed to replace this module's entry in sys.modules
    import sys

    # At runtime, swap this module for a _LazyModule built from:
    #   __name__                 - this module's name
    #   globals()["__file__"]    - this module's file path
    #   _import_structure        - the submodule -> names table assembled above
    #   module_spec=__spec__     - this module's import spec
    # Presumably the wrapper defers importing each submodule until one of its names is accessed.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\convnext\configuration_convnext.py

# coding=utf-8
# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ConvNeXT model configuration"""

# 导入所需模块
from collections import OrderedDict
from typing import Mapping

from packaging import version

# 导入配置工具和相关模块
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices

# Module-level logger, named after this module
logger = logging.get_logger(__name__)

# Maps each pretrained checkpoint name to the URL of its hosted config.json
CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/convnext-tiny-224": "https://huggingface.co/facebook/convnext-tiny-224/resolve/main/config.json",
    # See all ConvNeXT models at https://huggingface.co/models?filter=convnext
}


class ConvNextConfig(BackboneConfigMixin, PretrainedConfig):
    r"""
    Configuration class holding the settings of a [`ConvNextModel`]. It is used to instantiate a ConvNeXT model
    according to the given arguments, which define the model architecture. Instantiating it with the defaults yields
    a configuration similar to that of the ConvNeXT
    [facebook/convnext-tiny-224](https://huggingface.co/facebook/convnext-tiny-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels.
        patch_size (`int`, *optional*, defaults to 4):
            Patch size.
        num_stages (`int`, *optional*, defaults to 4):
            Number of stages.
        hidden_sizes (`List[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
            Hidden size for each stage.
        depths (`List[int]`, *optional*, defaults to `[3, 3, 9, 3]`):
            Depth for each stage. Its length also determines how many `stageN` names are generated.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            Name of the hidden activation function.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Initializer range.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            Epsilon used by layer normalization.
        layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Drop-path rate.
        image_size (`int`, *optional*, defaults to 224):
            Image size.
        out_features (`List[str]`, *optional*):
            Names of the output features; aligned with `out_indices` against the stage names.
        out_indices (`List[int]`, *optional*):
            Indices of the output features; aligned with `out_features` against the stage names.
    """

    # Identifier string of this configuration class.
    model_type = "convnext"

    def __init__(
        self,
        num_channels=3,
        patch_size=4,
        num_stages=4,
        hidden_sizes=None,
        depths=None,
        hidden_act="gelu",
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        layer_scale_init_value=1e-6,
        drop_path_rate=0.0,
        image_size=224,
        out_features=None,
        out_indices=None,
        **kwargs,
    ):
        # Everything not named explicitly above is handled by the parent class.
        super().__init__(**kwargs)

        # Fall back to the default per-stage layout when none is supplied.
        if hidden_sizes is None:
            hidden_sizes = [96, 192, 384, 768]
        if depths is None:
            depths = [3, 3, 9, 3]

        self.num_channels = num_channels
        self.patch_size = patch_size
        self.num_stages = num_stages
        self.hidden_sizes = hidden_sizes
        self.depths = depths
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.layer_scale_init_value = layer_scale_init_value
        self.drop_path_rate = drop_path_rate
        self.image_size = image_size

        # "stem" followed by "stage1" ... "stageN", one entry per element of `depths`.
        numbered_stages = [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
        self.stage_names = ["stem"] + numbered_stages

        # Reconcile the requested feature names and indices against the stage names.
        aligned = get_aligned_output_features_output_indices(
            out_features=out_features,
            out_indices=out_indices,
            stage_names=self.stage_names,
        )
        self._out_features, self._out_indices = aligned
class ConvNextOnnxConfig(OnnxConfig):
    """ONNX export settings for ConvNeXT."""

    # Parsed version "1.11", stored as the minimum torch version for ONNX export.
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from input name to its `{axis index: axis name}` table."""
        pixel_value_axes = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return OrderedDict(pixel_values=pixel_value_axes)

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance used for validation."""
        return 1e-5

.\models\convnext\convert_convnext_to_pytorch.py

# 指定文件编码为UTF-8,确保代码中的中文等字符能正确处理
# 版权声明,声明代码版权归HuggingFace Inc.团队所有
#
# 根据Apache许可证2.0版,只有在符合许可证条件下才能使用此文件
# 您可以在以下网址获得许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则本软件按"原样"分发,不提供任何形式的明示或暗示保证
# 有关许可证的详细信息,请参阅许可证文本
"""从原始存储库中转换ConvNext检查点。

URL: https://github.com/facebookresearch/ConvNeXt
"""


import argparse  # 导入解析命令行参数的模块
import json  # 导入处理JSON格式数据的模块
from pathlib import Path  # 导入处理文件路径的模块

import requests  # 导入发送HTTP请求的模块
import torch  # 导入PyTorch深度学习框架
from huggingface_hub import hf_hub_download  # 导入从Hugging Face Hub下载模型和数据的功能
from PIL import Image  # 导入处理图像的模块

from transformers import ConvNextConfig, ConvNextForImageClassification, ConvNextImageProcessor  # 导入ConvNext模型及其相关组件
from transformers.utils import logging  # 导入用于记录日志的模块


logging.set_verbosity_info()  # set the logging verbosity to the info level
logger = logging.get_logger(__name__)  # module-level logger, named after this module


def get_convnext_config(checkpoint_url):
    """
    Build the `ConvNextConfig` matching an original ConvNeXT checkpoint URL.

    The model size and the label set are inferred from substrings of `checkpoint_url`.

    Args:
        checkpoint_url (`str`):
            URL of the original checkpoint. Must contain one of "tiny", "small", "base", "large" or "xlarge".
            URLs containing "1k" are treated as 1000-class checkpoints, all others as 21841-class ones.

    Returns:
        `Tuple[ConvNextConfig, Tuple[int, int]]`: the populated configuration and the expected logits shape
        `(1, num_labels)`.

    Raises:
        ValueError: If no known size keyword is found in `checkpoint_url`.
    """
    config = ConvNextConfig()

    # Most specific keyword first: "xlarge" also contains "large", so it must be tested before it.
    if "xlarge" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [256, 512, 1024, 2048]
    elif "large" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [192, 384, 768, 1536]
    elif "base" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [128, 256, 512, 1024]
    elif "small" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [96, 192, 384, 768]
    elif "tiny" in checkpoint_url:
        depths = [3, 3, 9, 3]
        hidden_sizes = [96, 192, 384, 768]
    else:
        raise ValueError(
            f"Cannot infer the ConvNeXT size (tiny/small/base/large/xlarge) from URL: {checkpoint_url}"
        )

    # Pick the label set from the URL.
    is_1k = "1k" in checkpoint_url
    if is_1k:
        num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        expected_shape = (1, 1000)
    else:
        num_labels = 21841
        filename = "imagenet-22k-id2label.json"
        expected_shape = (1, 21841)

    repo_id = "huggingface/label-files"
    # Fetch the id -> label file and close it deterministically once parsed.
    label_path = hf_hub_download(repo_id, filename, repo_type="dataset")
    with open(label_path, "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)
    # JSON object keys are strings; the config expects integer ids.
    id2label = {int(k): v for k, v in id2label.items()}

    if not is_1k:
        # The 22k label file lists more classes than the 21841 the model has; drop the extra ones.
        # See https://github.com/google-research/big_transfer/issues/18
        del id2label[9205]
        del id2label[15027]

    config.num_labels = num_labels
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    config.hidden_sizes = hidden_sizes
    config.depths = depths

    return config, expected_shape


def rename_key(name):
    """
    Translate a parameter name from the original ConvNeXT checkpoint into the name used by this implementation.

    The substitutions are applied in a fixed order, each one operating on the result of the previous one.

    Args:
        name (`str`): Parameter name as found in the original state dict.

    Returns:
        `str`: The renamed parameter name.
    """
    # Downsampling layers: index 0 becomes the embeddings, indices 1-3 move under their stage.
    downsample_renames = (
        ("downsample_layers.0.0", "embeddings.patch_embeddings"),
        ("downsample_layers.0.1", "embeddings.norm"),  # turned into "layernorm" further down
        ("downsample_layers.1.0", "stages.1.downsampling_layer.0"),
        ("downsample_layers.1.1", "stages.1.downsampling_layer.1"),
        ("downsample_layers.2.0", "stages.2.downsampling_layer.0"),
        ("downsample_layers.2.1", "stages.2.downsampling_layer.1"),
        ("downsample_layers.3.0", "stages.3.downsampling_layer.0"),
        ("downsample_layers.3.1", "stages.3.downsampling_layer.1"),
    )
    for old, new in downsample_renames:
        name = name.replace(old, new)

    # Stage entries that are not downsampling layers get ".layers" inserted right after the
    # first len("stages.0") characters of the name.
    if "stages" in name and "downsampling_layer" not in name:
        split_at = len("stages.0")
        name = name[:split_at] + ".layers" + name[split_at:]

    # Generic renames applied to every key.
    generic_renames = (
        ("stages", "encoder.stages"),
        ("norm", "layernorm"),
        ("gamma", "layer_scale_parameter"),
        ("head", "classifier"),
    )
    for old, new in generic_renames:
        name = name.replace(old, new)

    return name
def prepare_img():
    """Download a fixed COCO validation image and return it opened as a PIL image."""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Stream the HTTP response and let PIL read straight from the raw byte stream.
    response = requests.get(url, stream=True)
    return Image.open(response.raw)

# Reference values for the first three logits produced for the test image, per supported
# checkpoint URL. Note: these logits were obtained without center cropping.
_EXPECTED_LOGITS = {
    "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth": [-0.1210, -0.6605, 0.1918],
    "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth": [-0.4473, -0.1847, -0.6365],
    "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth": [0.4525, 0.7539, 0.0308],
    "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_384.pth": [0.3561, 0.6350, -0.0384],
    "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth": [0.4174, -0.0989, 0.1489],
    "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_384.pth": [0.2513, -0.1349, -0.1613],
    "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth": [1.2980, 0.3631, -0.1198],
    "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth": [1.2963, 0.1227, 0.1723],
    "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth": [1.7956, 0.8390, 0.2820],
    "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth": [-0.2822, -0.0502, -0.0878],
    "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth": [-0.5672, -0.0730, -0.4348],
    "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth": [0.2681, 0.2365, 0.6246],
    "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth": [-0.2642, 0.3931, 0.5116],
    "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth": [-0.6677, -0.1873, -0.8379],
    "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth": [-0.7749, -0.2967, -0.6444],
}


@torch.no_grad()
def convert_convnext_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak the original checkpoint's weights into our ConvNeXT structure.

    The converted model is checked against reference logits, saved together with its image processor
    to `pytorch_dump_folder_path`, and then pushed to the hub.

    Args:
        checkpoint_url (`str`):
            URL of the original ConvNeXT checkpoint. Must be one of the keys of `_EXPECTED_LOGITS`.
        pytorch_dump_folder_path (`str`):
            Output directory; created (including missing parents) if it does not exist.

    Raises:
        ValueError: If `checkpoint_url` is not a supported checkpoint.
        AssertionError: If the converted model's logits or their shape do not match the reference.
    """
    # Reject unsupported URLs before downloading anything or running the model.
    if checkpoint_url not in _EXPECTED_LOGITS:
        raise ValueError(f"Unknown URL: {checkpoint_url}")

    # Configuration and expected logits shape derived from the URL
    config, expected_shape = get_convnext_config(checkpoint_url)

    # Original weights
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"]

    # Rename every key; iterate over a snapshot because the dict is modified in the loop.
    for key in list(state_dict):
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val

    # Prefix everything except the classification head with the base model name.
    for key in list(state_dict):
        val = state_dict.pop(key)
        if not key.startswith("classifier"):
            key = "convnext." + key
        state_dict[key] = val

    # Load the converted weights into the HuggingFace model
    model = ConvNextForImageClassification(config)
    model.load_state_dict(state_dict)
    model.eval()

    # Run the test image through the model at the resolution named in the URL
    size = 224 if "224" in checkpoint_url else 384
    image_processor = ConvNextImageProcessor(size=size)
    pixel_values = image_processor(images=prepare_img(), return_tensors="pt").pixel_values

    logits = model(pixel_values).logits

    # Compare the first three logits and the output shape with the reference values
    expected_logits = torch.tensor(_EXPECTED_LOGITS[checkpoint_url])
    assert torch.allclose(logits[0, :3], expected_logits, atol=1e-3), "Converted logits do not match the reference"
    assert logits.shape == expected_shape, "Converted logits have an unexpected shape"

    # `parents=True` so that a nested output path whose parents do not exist yet also works
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    image_processor.save_pretrained(pytorch_dump_folder_path)

    print("Pushing model to the hub...")
    # Assemble the hub model name from the size, resolution and training-set markers in the URL
    model_name = "convnext"
    if "tiny" in checkpoint_url:
        model_name += "-tiny"
    elif "small" in checkpoint_url:
        model_name += "-small"
    elif "base" in checkpoint_url:
        model_name += "-base"
    elif "xlarge" in checkpoint_url:  # tested before "large", which it contains
        model_name += "-xlarge"
    elif "large" in checkpoint_url:
        model_name += "-large"
    if "224" in checkpoint_url:
        model_name += "-224"
    elif "384" in checkpoint_url:
        model_name += "-384"
    if "22k" in checkpoint_url and "1k" not in checkpoint_url:
        model_name += "-22k"
    if "22k" in checkpoint_url and "1k" in checkpoint_url:
        model_name += "-22k-1k"

    model.push_to_hub(
        repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
        organization="nielsr",
        commit_message="Add model",
    )
if __name__ == "__main__":
    # Script entry point: read the two command-line options and run the conversion.
    arg_parser = argparse.ArgumentParser()

    # Checkpoint to convert; falls back to the tiny 1k/224 checkpoint when omitted.
    arg_parser.add_argument(
        "--checkpoint_url",
        type=str,
        default="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
        help="URL of the original ConvNeXT checkpoint you'd like to convert.",
    )

    # Output directory; this option is mandatory.
    arg_parser.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        required=True,
        help="Path to the output PyTorch model directory.",
    )

    cli_args = arg_parser.parse_args()

    convert_convnext_checkpoint(cli_args.checkpoint_url, cli_args.pytorch_dump_folder_path)

.\models\convnext\feature_extraction_convnext.py

# coding=utf-8
# 定义文件的编码格式为 UTF-8

# 版权声明,声明代码版权归 The HuggingFace Inc. 团队所有,保留所有权利。
# 根据 Apache 许可证版本 2.0 进行许可,除非符合许可证的要求,否则不得使用此文件。
# 可以在以下网址获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0

# 如果不符合适用法律或书面同意的要求,本软件是基于“按原样”提供,没有任何明示或暗示的担保或条件。
# 请参阅许可证了解具体的法律规定。

"""ConvNeXT 的特征提取器类。"""

# 导入警告模块
import warnings

# 导入日志工具
from ...utils import logging

# 从 image_processing_convnext 模块导入 ConvNextImageProcessor 类
from .image_processing_convnext import ConvNextImageProcessor

# Module-level logger, named after this module
logger = logging.get_logger(__name__)


class ConvNextFeatureExtractor(ConvNextImageProcessor):
    """Deprecated alias of `ConvNextImageProcessor` that emits a `FutureWarning` on construction."""

    def __init__(self, *args, **kwargs) -> None:
        # Tell the caller about the deprecation, then behave exactly like the parent class.
        deprecation_message = (
            "The class ConvNextFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use ConvNextImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        super().__init__(*args, **kwargs)

.\models\convnext\image_processing_convnext.py

# 导入必要的模块和函数
from typing import Dict, List, Optional, Union  # 导入类型提示相关的模块

import numpy as np  # 导入 NumPy 库,用于处理数组数据

# 导入图像处理相关的工具函数和类
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    center_crop,  # 导入中心裁剪函数
    get_resize_output_image_size,  # 导入获取调整后图像尺寸函数
    resize,  # 导入调整图像尺寸函数
    to_channel_dimension_format,  # 导入转换为通道维度格式函数
)
# 导入图像处理工具函数和常量
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,  # 导入图像标准均值常量
    IMAGENET_STANDARD_STD,  # 导入图像标准标准差常量
    ChannelDimension,  # 导入通道维度枚举
    ImageInput,  # 导入图像输入类
    PILImageResampling,  # 导入 PIL 图像重采样枚举
    infer_channel_dimension_format,  # 导入推断通道维度格式函数
    is_scaled_image,  # 导入判断是否为缩放图像函数
    make_list_of_images,  # 导入创建图像列表函数
    to_numpy_array,  # 导入转换为 NumPy 数组函数
    valid_images,  # 导入验证图像函数
    validate_kwargs,  # 导入验证关键字参数函数
    validate_preprocess_arguments,  # 导入验证预处理参数函数
)
# 导入通用工具函数和类型检查相关函数
from ...utils import TensorType, is_vision_available, logging  # 导入张量类型和可视化库是否可用函数


if is_vision_available():  # 检查是否可用视觉处理库
    import PIL  # 导入 PIL 库,用于图像处理


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器对象


class ConvNextImageProcessor(BaseImageProcessor):
    r"""
    Constructs a ConvNeXT image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
            by `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 384}`):
            Resolution of the output image after `resize` is applied. If `size["shortest_edge"]` >= 384, the image is
            resized to `(size["shortest_edge"], size["shortest_edge"])`. Otherwise, the smaller edge of the image will
            be matched to `int(size["shortest_edge"]/crop_pct)`, after which the image is cropped to
            `(size["shortest_edge"], size["shortest_edge"])`. Only has an effect if `do_resize` is set to `True`. Can
            be overriden by `size` in the `preprocess` method.
        crop_pct (`float` *optional*, defaults to 224 / 256):
            Percentage of the image to crop. Only has an effect if `do_resize` is `True` and size < 384. Can be
            overriden by `crop_pct` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overriden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    # 列出模型输入名称列表,包含像素值
    model_input_names = ["pixel_values"]
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        crop_pct: float = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        """Store the processing options, filling in defaults for every argument left as `None`.

        `size` falls back to `{"shortest_edge": 384}`, `crop_pct` to `224 / 256`, and
        `image_mean` / `image_std` to the ImageNet-standard constants. Remaining keyword
        arguments are handed to the parent constructor.
        """
        super().__init__(**kwargs)

        # Resolve the `None` placeholders up front so the assignments below stay flat.
        if size is None:
            size = {"shortest_edge": 384}
        # Normalise the size spec; `default_to_square=False` is passed explicitly.
        resolved_size = get_size_dict(size, default_to_square=False)
        if crop_pct is None:
            crop_pct = 224 / 256
        if image_mean is None:
            image_mean = IMAGENET_STANDARD_MEAN
        if image_std is None:
            image_std = IMAGENET_STANDARD_STD

        # Resize / crop options.
        self.do_resize = do_resize
        self.size = resolved_size
        self.crop_pct = crop_pct
        self.resample = resample

        # Rescale options.
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor

        # Normalisation options.
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std

        # Names of the processing arguments this processor recognises.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "crop_pct",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        crop_pct: float,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary of the form `{"shortest_edge": int}`, specifying the size of the output image. If
                `size["shortest_edge"]` >= 384 image is resized to `(size["shortest_edge"], size["shortest_edge"])`.
                Otherwise, the smaller edge of the image will be matched to `int(size["shortest_edge"] / crop_pct)`,
                after which the image is cropped to `(size["shortest_edge"], size["shortest_edge"])`.
            crop_pct (`float`):
                Percentage of the image to crop. Only has an effect if size < 384.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred from the input
                image.

        Returns:
            `np.ndarray`: The resized (and, for sizes below 384, center-cropped) image.

        Raises:
            ValueError: If the normalised `size` dictionary has no `"shortest_edge"` key.
        """
        # Normalise the size spec and insist on the one key this method understands.
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" not in size:
            raise ValueError(f"Size dictionary must contain 'shortest_edge' key. Got {size.keys()}")

        shortest_edge = size["shortest_edge"]

        if shortest_edge < 384:
            # Enlarge the target by 1 / crop_pct so the center crop below lands on `shortest_edge`.
            resize_shortest_edge = int(shortest_edge / crop_pct)
            # Output size for the intermediate resize; `default_to_square=False` is passed to the helper.
            resize_size = get_resize_output_image_size(
                image, size=resize_shortest_edge, default_to_square=False, input_data_format=input_data_format
            )
            image = resize(
                image=image,
                size=resize_size,
                resample=resample,
                data_format=data_format,
                input_data_format=input_data_format,
                **kwargs,
            )
            # NOTE(review): `input_data_format` is forwarded unchanged even though the resize above was
            # asked to emit `data_format`; presumably callers never pass a differing `data_format` - confirm.
            return center_crop(
                image=image,
                size=(shortest_edge, shortest_edge),
                data_format=data_format,
                input_data_format=input_data_format,
                **kwargs,
            )
        else:
            # At 384 and above the image is resized straight to a square, with no cropping step.
            return resize(
                image,
                size=(shortest_edge, shortest_edge),
                resample=resample,
                data_format=data_format,
                input_data_format=input_data_format,
                **kwargs,
            )
    # 图像预处理函数,接受多种参数用于处理图像
    def preprocess(
        self,
        # Input image data (typed as `ImageInput`).
        images: ImageInput,
        # Whether to resize the images; `None` by default (presumably falls back to the
        # value stored on the instance - the method body is not visible here).
        do_resize: bool = None,
        # Target size dictionary for resizing; `None` by default.
        size: Dict[str, int] = None,
        # Crop percentage used together with resizing; `None` by default.
        crop_pct: float = None,
        # Resampling filter used when resizing; `None` by default.
        resample: PILImageResampling = None,
        # Whether to rescale pixel values; `None` by default.
        do_rescale: bool = None,
        # Factor applied to pixel values when rescaling; `None` by default.
        rescale_factor: float = None,
        # Whether to normalize the images; `None` by default.
        do_normalize: bool = None,
        # Mean used for normalization, a single float or a list of floats; `None` by default.
        image_mean: Optional[Union[float, List[float]]] = None,
        # Standard deviation used for normalization, a single float or a list of floats; `None` by default.
        image_std: Optional[Union[float, List[float]]] = None,
        # Requested return type, a string or a `TensorType`; `None` by default.
        return_tensors: Optional[Union[str, TensorType]] = None,
        # Channel layout of the output; defaults to `ChannelDimension.FIRST`.
        data_format: ChannelDimension = ChannelDimension.FIRST,
        # Channel layout of the input, a string or a `ChannelDimension`; `None` by default.
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        # Any additional keyword arguments.
        **kwargs,

.\models\convnext\modeling_convnext.py

# coding=utf-8
# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch ConvNext model."""

# Import necessary modules and functions from PyTorch and Transformers
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# Import various components from HuggingFace Transformers library
from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnext import ConvNextConfig

# Module-level logger, named after this module
logger = logging.get_logger(__name__)

# Name of the configuration class, used when building docstrings
_CONFIG_FOR_DOC = "ConvNextConfig"

# Checkpoint referenced by the base-model documentation
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
# Presumably the output shape expected from the checkpoint above in the generated code sample - TODO confirm
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Checkpoint and expected prediction passed to the image-classification code sample
_IMAGE_CLASS_CHECKPOINT = "facebook/convnext-tiny-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# List of pretrained model archives for ConvNext
CONVNEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/convnext-tiny-224",
    # See all ConvNext models at https://huggingface.co/models?filter=convnext
]

# Function definition for drop path, a form of stochastic depth regularization
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.

    Args:
        input: Tensor whose first dimension indexes the samples; one keep/drop decision is drawn per entry of
            that dimension and broadcast over the remaining ones.
        drop_prob: Probability of dropping a sample. `None` is treated like `0.0`.
        training: Paths are only dropped when this is truthy.

    Returns:
        `input` itself when nothing can be dropped, otherwise a tensor of the same shape in which kept samples
        are scaled by `1 / (1 - drop_prob)` and dropped samples are zero.
    """
    # NOTE: deviates from the BEiT function this was copied from in two edge cases only:
    # `drop_prob=None` (the default of `ConvNextDropPath`) used to raise a TypeError in training mode,
    # and `drop_prob=1.0` used to produce NaN through a division by zero.
    if not drop_prob or not training:
        return input
    keep_prob = 1 - drop_prob
    if keep_prob <= 0.0:
        # Every sample is dropped; return zeros directly rather than computing `inf * 0`.
        return torch.zeros_like(input)
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize: 1 keeps the sample, 0 drops it
    # Rescale the survivors by 1 / keep_prob, then zero out the dropped samples.
    output = input.div(keep_prob) * random_tensor
    return output
# 从 transformers.models.beit.modeling_beit.BeitDropPath 复制的代码,将 Beit 替换为 ConvNext
class ConvNextDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        # Probability forwarded to `drop_path` on every call.
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # The module's own `training` attribute decides whether dropping is active.
        is_training = self.training
        return drop_path(hidden_states, self.drop_prob, is_training)

    def extra_repr(self) -> str:
        # Extra text describing this module: the configured drop probability.
        return f"p={self.drop_prob}"


class ConvNextLayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        # Learnable scale and shift, starting out as the identity transform.
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        supported_formats = ("channels_last", "channels_first")
        if self.data_format not in supported_formats:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        # Stored as a 1-tuple, the form handed to `layer_norm` below.
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            # Channels are the trailing dimension, so the built-in functional applies directly.
            return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        if self.data_format != "channels_first":
            # Only reachable if `data_format` was changed after construction; input passes through.
            return x

        # channels_first: normalise over dim 1 by hand, doing the statistics in float32.
        original_dtype = x.dtype
        x = x.float()
        mean = x.mean(1, keepdim=True)
        variance = (x - mean).pow(2).mean(1, keepdim=True)
        x = (x - mean) / torch.sqrt(variance + self.eps)
        x = x.to(dtype=original_dtype)
        # Broadcast the per-channel scale and shift across the two trailing dimensions.
        scale = self.weight[:, None, None]
        shift = self.bias[:, None, None]
        return scale * x + shift


class ConvNextEmbeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class found in
    src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_sizes[0]
        patch_size = config.patch_size
        # Kernel size equals stride, so each patch is projected by exactly one convolution window.
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, embed_dim, kernel_size=patch_size, stride=patch_size
        )
        self.layernorm = ConvNextLayerNorm(embed_dim, eps=1e-6, data_format="channels_first")
        # Remembered so `forward` can validate its input.
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Project `pixel_values` into patch embeddings and layer-normalise them.

        Raises:
            ValueError: If dimension 1 of `pixel_values` differs from the configured channel count.
        """
        if pixel_values.shape[1] != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        patches = self.patch_embeddings(pixel_values)
        return self.layernorm(patches)
# 定义 ConvNeXT 阶段,包含可选的下采样层和多个残差块
class ConvNextStage(nn.Module):
    """ConvNeXT stage, consisting of an optional downsampling layer + multiple residual blocks.

    The residual blocks are stacked one after another according to the arguments below.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int`, defaults to 2): Kernel size of the downsampling convolution.
        stride (`int`, defaults to 2): Stride of the downsampling convolution.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
    """

    def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
        super().__init__()

        # A downsampling layer is only needed when the channel count changes or the stride exceeds 1.
        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = nn.Sequential(
                # Normalise the incoming feature map before it is downsampled.
                ConvNextLayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
            )
        else:
            # Same channel count and stride 1: pass the input through untouched.
            self.downsampling_layer = nn.Identity()

        # Without explicit rates, every block gets a stochastic depth rate of zero.
        drop_path_rates = drop_path_rates or [0.0] * depth

        # `depth` residual blocks applied in sequence, each with its own drop-path rate.
        self.layers = nn.Sequential(
            *[ConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Apply the (optional) downsampling layer and then the stacked residual blocks."""
        hidden_states = self.downsampling_layer(hidden_states)
        hidden_states = self.layers(hidden_states)
        return hidden_states
class ConvNextEncoder(nn.Module):
    """Sequence of `ConvNextStage` modules built from the configuration."""

    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()

        # One linearly increasing rate per residual block, split into one list per stage.
        per_block_rates = torch.linspace(0, config.drop_path_rate, sum(config.depths))
        drop_path_rates = [chunk.tolist() for chunk in per_block_rates.split(config.depths)]

        # The first stage receives `hidden_sizes[0]` channels; later stages receive the previous stage's output.
        in_chs = config.hidden_sizes[0]
        for stage_idx in range(config.num_stages):
            out_chs = config.hidden_sizes[stage_idx]
            self.stages.append(
                ConvNextStage(
                    config,
                    in_channels=in_chs,
                    out_channels=out_chs,
                    # Only stages after the first use a stride of 2.
                    stride=2 if stage_idx > 0 else 1,
                    depth=config.depths[stage_idx],
                    drop_path_rates=drop_path_rates[stage_idx],
                )
            )
            in_chs = out_chs

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        """Run every stage in order, optionally collecting the input of each stage plus the final output."""
        collected_states = () if output_hidden_states else None

        for stage in self.stages:
            if output_hidden_states:
                # Record the state entering this stage.
                collected_states = collected_states + (hidden_states,)
            hidden_states = stage(hidden_states)

        if output_hidden_states:
            # Finish with the output of the last stage.
            collected_states = collected_states + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=hidden_states,
                hidden_states=collected_states,
            )

        # Tuple form: entries that are `None` are omitted.
        return tuple(v for v in [hidden_states, collected_states] if v is not None)


class ConvNextPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ConvNextConfig
    base_model_prefix = "convnext"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version: weights are drawn from a normal distribution
            # whose standard deviation comes from the config.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Identity affine transform: unit scale, zero shift.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Shared docstring text passed to `add_start_docstrings` on the ConvNext model classes.
CONVNEXT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
    """
    ConvNext 模型输出裸特征,没有特定头部添加。
    """,
    CONVNEXT_START_DOCSTRING,
)


这段代码定义了一个类 `ConvNextModel`,继承自 `ConvNextPreTrainedModel`,用于构建 ConvNext 模型。在类的初始化函数中,首先调用父类的初始化方法,然后设置了模型的配置信息和各个模块的初始化,包括 `ConvNextEmbeddings` 和 `ConvNextEncoder`。此外,还初始化了一个 `LayerNorm` 层用于最终的归一化处理。

    self.embeddings = ConvNextEmbeddings(config)

这行代码初始化了 ConvNextEmbeddings 类的实例 self.embeddings,并传入了模型配置 config

    self.encoder = ConvNextEncoder(config)

这行代码初始化了 ConvNextEncoder 类的实例 self.encoder,并同样传入了模型配置 config

    self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

这行代码初始化了 LayerNorm 层 `self.layernorm`,其中 `config.hidden_sizes[-1]` 表示配置中定义的最后一个隐藏层尺寸,`eps=config.layer_norm_eps` 则是配置中定义的层归一化 epsilon 参数。
@add_start_docstrings(
    """
    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXT_START_DOCSTRING,
)
class ConvNextForImageClassification(ConvNextPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Number of labels for classification
        self.num_labels = config.num_labels
        # Backbone model producing the pooled features
        self.convnext = ConvNextModel(config)

        # Classifier head: a linear layer, or an identity function when the config declares no labels
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the config's setting when the caller does not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convnext(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # The pooled features are an attribute in dict mode and the second entry in tuple mode.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once, from the label count and label dtype, and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Single target: squeeze both sides before comparing them.
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple output: logits followed by the backbone outputs from index 2 on, with the loss first if present.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )

"""
ConvNeXt backbone, to be used with frameworks like DETR and MaskFormer.
"""

定义 `ConvNextBackbone` 类:这是一个可与 DETR、MaskFormer 等框架配合使用的 ConvNeXt 卷积神经网络骨干。

class ConvNextBackbone(ConvNextPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        # Backbone-specific setup; presumably provided by `BackboneMixin` - confirm.
        super()._init_backbone(config)

        self.embeddings = ConvNextEmbeddings(config)
        self.encoder = ConvNextEncoder(config)
        # One entry for the embedding output followed by one per stage.
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextLayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnext-tiny-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        # Fall back to the config for any option the caller left unset.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embeddings(pixel_values)

        # Hidden states are always requested here: the feature maps below are built from them.
        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # Keep only the requested stages, each passed through its own layer norm.
        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps += (hidden_state,)

        if not return_dict:
            # Tuple output: feature maps first, hidden states appended only when requested.
            output = (feature_maps,)
            if output_hidden_states:
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=hidden_states if output_hidden_states else None,
            attentions=None,
        )

# `.\models\convnext\modeling_tf_convnext.py`

```py
# 设置编码格式为utf-8
# 版权声明
from __future__ import annotations

from typing import List, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

# 导入相对当前文件的模块
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
# 导入包级别的模块
from ...tf_utils import shape_list
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
# 从configuration_convnext模块中导入ConvNextConfig类
from .configuration_convnext import ConvNextConfig

# Module-level logger, named after this module
logger = logging.get_logger(__name__)

# Configuration class name and checkpoint name used in the documentation
_CONFIG_FOR_DOC = "ConvNextConfig"
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"

# 定义TFConvNextDropPath类,继承于keras.layers.Layer
class TFConvNextDropPath(keras.layers.Layer):
    """Keras layer that randomly drops whole samples while training and rescales the ones it keeps."""

    def __init__(self, drop_path: float, **kwargs):
        super().__init__(**kwargs)
        # Probability of dropping a sample.
        self.drop_path = drop_path

    def call(self, x: tf.Tensor, training=None):
        # Outside of training the layer passes its input through unchanged.
        if not training:
            return x

        keep_prob = 1 - self.drop_path
        # One random draw per entry of the first dimension, broadcast over all remaining dimensions.
        leading_dim = tf.shape(x)[0]
        trailing_ones = (1,) * (len(tf.shape(x)) - 1)
        mask_shape = (leading_dim,) + trailing_ones
        # floor(keep_prob + U[0, 1)) turns the uniform draw into a 0/1 mask.
        keep_mask = tf.floor(keep_prob + tf.random.uniform(mask_shape, 0, 1))
        # Rescale by 1 / keep_prob, then zero out the dropped samples.
        return (x / keep_prob) * keep_mask

# 定义TFConvNextEmbeddings类,继承于keras.layers.Layer
class TFConvNextEmbeddings(keras.layers.Layer):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config: ConvNextConfig, **kwargs):
        super().__init__(**kwargs)

        # Patch embedding: a convolution whose kernel size and stride both equal the patch size,
        # producing `config.hidden_sizes[0]` channels.
        patch_size = config.patch_size
        self.patch_embeddings = keras.layers.Conv2D(
            filters=config.hidden_sizes[0],
            kernel_size=patch_size,
            strides=patch_size,
            name="patch_embeddings",
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer=keras.initializers.Zeros(),
        )
        # Normalization applied to the patch embeddings.
        self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
        self.num_channels = config.num_channels
        self.config = config

    def call(self, pixel_values):
        # A dict input carries the tensor under the "pixel_values" key.
        inputs = pixel_values["pixel_values"] if isinstance(pixel_values, dict) else pixel_values

        # Axis 1 of the input must hold the number of channels declared in the configuration.
        tf.debugging.assert_equal(
            shape_list(inputs)[1],
            self.num_channels,
            message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
        )

        # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
        # shape = (batch_size, in_height, in_width, in_channels)
        nhwc_inputs = tf.transpose(inputs, perm=(0, 2, 3, 1))

        return self.layernorm(self.patch_embeddings(nhwc_inputs))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True

        # Build each sublayer under its own name scope with the channel count it receives.
        sublayer_shapes = (
            ("patch_embeddings", [None, None, None, self.config.num_channels]),
            ("layernorm", [None, None, None, self.config.hidden_sizes[0]]),
        )
        for attr_name, shape in sublayer_shapes:
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(shape)
class TFConvNextLayer(keras.layers.Layer):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in
    (N, C, H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow
    NHWC ordering, we can just apply the operations straight-away without the permutation.

    Args:
        config (`ConvNextConfig`):
            Model configuration; `initializer_range`, `hidden_act` and `layer_scale_init_value` are read from it.
        dim (`int`):
            Number of channels of the block's input and output.
        drop_path (`float`, *optional*, defaults to 0.0):
            Drop-path rate applied to the residual branch; values `<= 0` replace it with a linear (identity)
            activation.
    """

    def __init__(self, config, dim, drop_path=0.0, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.config = config

        self.dwconv = keras.layers.Conv2D(
            filters=dim,
            kernel_size=7,
            padding="same",
            groups=dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="dwconv",
        )  # depthwise conv
        self.layernorm = keras.layers.LayerNormalization(
            epsilon=1e-6,
            name="layernorm",
        )
        self.pwconv1 = keras.layers.Dense(
            units=4 * dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="pwconv1",
        )  # pointwise/1x1 convs, implemented with linear layers
        self.act = get_tf_activation(config.hidden_act)
        self.pwconv2 = keras.layers.Dense(
            units=dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="pwconv2",
        )
        # Using `layers.Activation` instead of `tf.identity` to better control `training`
        # behaviour: both branches accept the `training` keyword in `call`.
        self.drop_path = (
            TFConvNextDropPath(drop_path, name="drop_path")
            if drop_path > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )

    def build(self, input_shape: tf.TensorShape = None):
        # Guard first, so that a repeated `build` call cannot register the layer-scale weight a second time.
        if self.built:
            return
        self.built = True

        # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa)
        self.layer_scale_parameter = (
            self.add_weight(
                shape=(self.dim,),
                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_parameter",
            )
            if self.config.layer_scale_init_value > 0
            else None
        )

        # Build every sublayer inside its own name scope with the channel count it receives.
        if getattr(self, "dwconv", None) is not None:
            with tf.name_scope(self.dwconv.name):
                self.dwconv.build([None, None, None, self.dim])
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, None, self.dim])
        if getattr(self, "pwconv1", None) is not None:
            with tf.name_scope(self.pwconv1.name):
                self.pwconv1.build([None, None, self.dim])
        if getattr(self, "pwconv2", None) is not None:
            with tf.name_scope(self.pwconv2.name):
                # `pwconv2` consumes the `4 * dim` features produced by `pwconv1`.
                self.pwconv2.build([None, None, 4 * self.dim])
        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)

    def call(self, hidden_states, training=False):
        residual = hidden_states
        x = self.dwconv(hidden_states)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)

        # Optional learnable per-channel scaling of the branch output.
        if self.layer_scale_parameter is not None:
            x = self.layer_scale_parameter * x

        # Residual connection around the (possibly dropped) branch.
        x = residual + self.drop_path(x, training=training)
        return x
class TFConvNextStage(keras.layers.Layer):
    """ConvNext stage: an optional downsampling step followed by a sequence of residual blocks.

    Args:
        config (`ConvNextConfig`):
            Model configuration class.
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`):
            Number of output channels.
        kernel_size (`int`, *optional*, defaults to 2):
            Kernel size of the downsampling convolution.
        stride (`int`, *optional*, defaults to 2):
            Stride of the downsampling convolution.
        depth (`int`, *optional*, defaults to 2):
            Number of residual blocks.
        drop_path_rates (`List[float]`, *optional*):
            Drop-path rate for each block; all zeros when omitted.
    """

    def __init__(
        self,
        config: ConvNextConfig,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 2,
        stride: int = 2,
        depth: int = 2,
        drop_path_rates: Optional[List[float]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        needs_downsampling = in_channels != out_channels or stride > 1
        if not needs_downsampling:
            # Same width and unit stride: nothing to downsample.
            self.downsampling_layer = [tf.identity]
        else:
            norm = keras.layers.LayerNormalization(
                epsilon=1e-6,
                name="downsampling_layer.0",
            )
            # Inputs to this layer will follow NHWC format since we
            # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings`
            # layer. All the outputs throughout the model will be in NHWC
            # from this point on until the output where we again change to
            # NCHW.
            conv = keras.layers.Conv2D(
                filters=out_channels,
                kernel_size=kernel_size,
                strides=stride,
                kernel_initializer=get_initializer(config.initializer_range),
                bias_initializer=keras.initializers.Zeros(),
                name="downsampling_layer.1",
            )
            self.downsampling_layer = [norm, conv]

        # One residual block per depth step, each with its own drop-path rate.
        rates = drop_path_rates or [0.0] * depth
        blocks = []
        for j in range(depth):
            blocks.append(
                TFConvNextLayer(
                    config,
                    dim=out_channels,
                    drop_path=rates[j],
                    name=f"layers.{j}",
                )
            )
        self.layers = blocks
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def call(self, hidden_states):
        # Downsampling first, then the residual blocks, in order.
        for module in (*self.downsampling_layer, *self.layers):
            hidden_states = module(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True

        for block in getattr(self, "layers", None) or []:
            with tf.name_scope(block.name):
                block.build(None)

        # The identity placeholder has nothing to build.
        if self.in_channels == self.out_channels and self.stride <= 1:
            return
        # Both the normalization and the convolution are built with `in_channels` input features.
        for sublayer in (self.downsampling_layer[0], self.downsampling_layer[1]):
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, None, self.in_channels])
# TFConvNextEncoder 类,继承自 keras.layers.Layer
class TFConvNextEncoder(keras.layers.Layer):
    """Sequence of `config.num_stages` `TFConvNextStage` layers applied one after another.

    Args:
        config:
            Model configuration; `drop_path_rate`, `depths`, `hidden_sizes` and `num_stages` are read from it.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.stages = []

        # Drop-path rates are spaced linearly from 0 to `config.drop_path_rate` over all blocks, then split into
        # one plain Python list per stage according to `config.depths`.
        drop_path_rates = tf.linspace(0.0, config.drop_path_rate, sum(config.depths))
        drop_path_rates = tf.split(drop_path_rates, config.depths)
        drop_path_rates = [x.numpy().tolist() for x in drop_path_rates]

        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = TFConvNextStage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                # The first stage keeps the resolution; later stages use stride 2.
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
                name=f"stages.{i}",
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def call(self, hidden_states, output_hidden_states=False, return_dict=True):
        """Run all stages.

        Returns:
            A `TFBaseModelOutput` when `return_dict` is truthy, otherwise a tuple holding the last hidden state
            and, if `output_hidden_states` is set, the tuple of the input plus every stage output.
        """
        all_hidden_states = () if output_hidden_states else None

        for layer_module in self.stages:
            # Record the input of each stage.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            hidden_states = layer_module(hidden_states)

        # Record the output of the last stage as well.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)

    def build(self, input_shape=None):
        # Same guard as the sibling layers, so repeated calls are no-ops.
        if self.built:
            return
        self.built = True
        for stage in self.stages:
            with tf.name_scope(stage.name):
                stage.build(None)



# TFConvNextMainLayer 类,继承自 keras.layers.Layer,并使用 keras_serializable 装饰器
@keras_serializable
class TFConvNextMainLayer(keras.layers.Layer):
    # Embeddings -> encoder -> (optional) global average pooling + layer normalization.
    # The layer takes NCHW pixel values and returns its feature maps in NCHW as well.
    config_class = ConvNextConfig

    def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
        self.encoder = TFConvNextEncoder(config, name="encoder")
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
        # We are setting the `data_format` like so because from here on we will revert to the
        # NCHW output format
        self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None

    @unpack_inputs
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """Embed and encode `pixel_values`.

        Returns:
            A `TFBaseModelOutputWithPooling` when `return_dict` is truthy, otherwise the tuple
            `(last_hidden_state, pooled_output, *hidden_states)`. `pooled_output` is `None` when the layer was
            created with `add_pooling_layer=False`.

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values, training=training)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        last_hidden_state = encoder_outputs[0]
        # Change to NCHW output format have uniformity in the modules
        last_hidden_state = tf.transpose(last_hidden_state, perm=(0, 3, 1, 2))

        # Without a pooling layer there is nothing to pool; calling `None` would raise a TypeError.
        pooled_output = self.layernorm(self.pooler(last_hidden_state)) if self.pooler is not None else None

        # Change the other hidden state outputs to NCHW as well
        hidden_states = None
        if output_hidden_states:
            hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1])

        if not return_dict:
            return (last_hidden_state, pooled_output) + (hidden_states if output_hidden_states else ())

        return TFBaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=hidden_states,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                # The normalization acts on pooled features of width `hidden_sizes[-1]`.
                self.layernorm.build([None, self.config.hidden_sizes[-1]])
# `CONVNEXT_INPUTS_DOCSTRING` is a raw string documenting the inputs expected by the ConvNext models and layers.
#
# TensorFlow models and layers in the `transformers` library support two input formats:
# - Inputs can be provided as keyword arguments, akin to PyTorch models.
# - Alternatively, inputs can be passed in a list, tuple, or dictionary within the first positional argument.
#
# Support for the second format is particularly useful for seamless integration with Keras methods such as
# `model.fit()`. When using Keras methods, inputs and labels can be passed in any format supported by `model.fit()`.
#
# Outside of Keras methods, e.g. in custom layers or models built with the Keras Functional API, there are three
# recommended ways of gathering the input tensors:
# - A single tensor containing `pixel_values` only: `model(pixel_values)`
# - A list of varying length containing input tensors in the documented order:
#   `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
# - A dictionary associating input tensors with their names:
#   `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
#
# Users creating models and layers through subclassing can pass inputs as they would to any other Python function.
CONVNEXT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Input pixel values. May be given in any of the container types listed above; each example must have
            the shape `(batch_size, num_channels, height, width)`.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used only
            in eager mode, in graph mode the value will always be set to True.
"""
@add_start_docstrings(
    "The bare ConvNext model outputting raw features without any specific head on top.",
    CONVNEXT_START_DOCSTRING,
)
class TFConvNextModel(TFConvNextPreTrainedModel):
    def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # All the work is delegated to the main layer; `add_pooling_layer` is forwarded to it.
        self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        r"""
        Returns:
            Depending on `return_dict`, either:
                - a `TFBaseModelOutputWithPooling` (if `return_dict=True`)
                - a `Tuple[tf.Tensor]` (if `return_dict=False`)

        Examples:
            Example usage of `TFConvNextModel` for inference (code sample not included here).

        """
        # Arguments left unset fall back to the values stored in the configuration.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        main_layer_outputs = self.convnext(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if return_dict:
            return TFBaseModelOutputWithPooling(
                last_hidden_state=main_layer_outputs.last_hidden_state,
                pooler_output=main_layer_outputs.pooler_output,
                hidden_states=main_layer_outputs.hidden_states,
            )

        # Tuple form: first element followed by all remaining ones.
        return (main_layer_outputs[0],) + main_layer_outputs[1:]

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        main_layer = getattr(self, "convnext", None)
        if main_layer is None:
            return
        with tf.name_scope(main_layer.name):
            main_layer.build(None)


@add_start_docstrings(
    """
    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXT_START_DOCSTRING,
)
class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        # Feature extractor (created with its default pooling layer).
        self.convnext = TFConvNextMainLayer(config, name="convnext")

        # Classifier head: a dense layer with `config.num_labels` output units.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="classifier",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:
            Returns either TFSequenceClassifierOutput or a tuple of tf.Tensor:
                TFSequenceClassifierOutput: Output containing loss, logits, and hidden states.
                Tuple[tf.Tensor]: Tuple of logits and additional hidden states.

        Examples:
            Example usage demonstrating image processing and classification using Transformers and TensorFlow.

            ```
            >>> from transformers import AutoImageProcessor, TFConvNextForImageClassification
            >>> import tensorflow as tf
            >>> from PIL import Image
            >>> import requests

            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
            >>> image = Image.open(requests.get(url, stream=True).raw)

            >>> image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
            >>> model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")

            >>> inputs = image_processor(images=image, return_tensors="tf")
            >>> outputs = model(**inputs)
            >>> logits = outputs.logits
            >>> # model predicts one of the 1000 ImageNet classes
            >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
            >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
            ```

        """
        # Arguments left unset fall back to the values stored in the configuration.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        outputs = self.convnext(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The pooled features are the second element of the tuple form.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # Drop last hidden state and pooled output; keep whatever follows them.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True

        if getattr(self, "convnext", None) is not None:
            with tf.name_scope(self.convnext.name):
                self.convnext.build(None)

        if getattr(self, "classifier", None) is not None:
            if hasattr(self.classifier, "name"):
                with tf.name_scope(self.classifier.name):
                    # The classifier consumes features of width `hidden_sizes[-1]`.
                    self.classifier.build([None, None, self.config.hidden_sizes[-1]])

.\models\convnext\__init__.py

# `TYPE_CHECKING` selects between real imports (for static type checkers) and the lazy module set up at the bottom.
from typing import TYPE_CHECKING

# Availability checks and lazy-module helpers from the package utilities.
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_torch_available,
    is_vision_available,
)

# Maps each submodule name to the public names it provides; later handed to `_LazyModule`.
_import_structure = {
    "configuration_convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig", "ConvNextOnnxConfig"]
}

# Vision-dependent objects are registered only when the vision dependency is available.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Available: register the feature extractor and the image processor.
    _import_structure["feature_extraction_convnext"] = ["ConvNextFeatureExtractor"]
    _import_structure["image_processing_convnext"] = ["ConvNextImageProcessor"]

# PyTorch models are registered only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Available: register the PyTorch model classes.
    _import_structure["modeling_convnext"] = [
        "CONVNEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ConvNextForImageClassification",
        "ConvNextModel",
        "ConvNextPreTrainedModel",
        "ConvNextBackbone",
    ]

# TensorFlow models are registered only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Available: register the TensorFlow model classes.
    _import_structure["modeling_tf_convnext"] = [
        "TFConvNextForImageClassification",
        "TFConvNextModel",
        "TFConvNextPreTrainedModel",
    ]

# Under static type checking, perform the real imports so the names resolve.
if TYPE_CHECKING:
    # Configuration objects have no optional dependency.
    from .configuration_convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig, ConvNextOnnxConfig

    try:
        # Same vision availability check as above.
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Available: import the feature extractor and the image processor.
        from .feature_extraction_convnext import ConvNextFeatureExtractor
        from .image_processing_convnext import ConvNextImageProcessor

    try:
        # Same torch availability check as above.
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Available: import the PyTorch model classes.
        from .modeling_convnext import (
            CONVNEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
            ConvNextBackbone,
            ConvNextForImageClassification,
            ConvNextModel,
            ConvNextPreTrainedModel,
        )

    try:
        # Same TensorFlow availability check as above.
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Available: import the TensorFlow model classes from `modeling_tf_convnext`.
        from .modeling_tf_convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel
else:
    # At runtime, `sys` is needed to swap this module's entry in `sys.modules`.
    import sys

    # Replace this module with a `_LazyModule` built from `_import_structure`
    # (presumably so submodules are imported only on first attribute access).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

.\models\convnextv2\configuration_convnextv2.py

# coding=utf-8
# Copyright 2023 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
ConvNeXTV2 model configuration
"""

from ...configuration_utils import PretrainedConfig  # base class for model configurations
from ...utils import logging  # library logging utilities
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices  # backbone config mixin and helper for aligning output features with output indices

logger = logging.get_logger(__name__)  # logger for this module

# Maps a pretrained checkpoint name to the URL of its configuration file.
CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/convnextv2-tiny-1k-224": "https://huggingface.co/facebook/convnextv2-tiny-1k-224/resolve/main/config.json",
}

class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig):
    r"""
    Configuration class holding the settings of a [`ConvNextV2Model`]. A ConvNeXTV2 model is instantiated from it, and
    the arguments define the model architecture. With the default values, the resulting configuration is similar to
    that of the ConvNeXTV2
    [facebook/convnextv2-tiny-1k-224](https://huggingface.co/facebook/convnextv2-tiny-1k-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 4):
            Patch size.
        num_stages (`int`, *optional*, defaults to 4):
            Number of stages in the model.
        hidden_sizes (`List[int]`, *optional*):
            Hidden size of each stage. `[96, 192, 384, 768]` is used when `None`.
        depths (`List[int]`, *optional*):
            Number of layers of each stage. `[3, 3, 9, 3]` is used when `None`.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            Activation function of the hidden layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Range used for parameter initialization.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            Epsilon used by the layer normalization layers.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Drop path rate.
        image_size (`int`, *optional*, defaults to 224):
            Size of the input images.
        out_features (`List[str]`, *optional*):
            Output feature names, aligned against `stage_names` together with `out_indices`.
        out_indices (`List[int]`, *optional*):
            Output feature indices, aligned against `stage_names` together with `out_features`.
    """

    model_type = "convnextv2"

    def __init__(
        self,
        num_channels=3,
        patch_size=4,
        num_stages=4,
        hidden_sizes=None,
        depths=None,
        hidden_act="gelu",
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        drop_path_rate=0.0,
        image_size=224,
        out_features=None,
        out_indices=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Fall back to the "tiny" architecture when no per-stage layout is given
        if hidden_sizes is None:
            hidden_sizes = [96, 192, 384, 768]
        if depths is None:
            depths = [3, 3, 9, 3]

        self.num_channels = num_channels
        self.patch_size = patch_size
        self.num_stages = num_stages
        self.hidden_sizes = hidden_sizes
        self.depths = depths
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.drop_path_rate = drop_path_rate
        self.image_size = image_size

        # "stem" followed by one name per stage: "stage1", "stage2", ...
        numbered_stages = [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
        self.stage_names = ["stem", *numbered_stages]

        # Reconcile the requested output features and indices with the stage names
        aligned = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )
        self._out_features, self._out_indices = aligned
        # 调用函数获取对齐后的输出特征和输出特征索引

.\models\convnextv2\convert_convnextv2_to_pytorch.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ConvNeXTV2 checkpoints from the original repository.

URL: https://github.com/facebookresearch/ConvNeXt"""

import argparse  # 导入用于处理命令行参数的模块
import json  # 导入用于处理JSON格式数据的模块
import os  # 导入用于操作系统相关功能的模块

import requests  # 导入用于发送HTTP请求的模块
import torch  # 导入PyTorch深度学习库
from huggingface_hub import hf_hub_download  # 导入Hugging Face Hub下载模块
from PIL import Image  # 导入Python Imaging Library,用于图像处理

from transformers import ConvNextImageProcessor, ConvNextV2Config, ConvNextV2ForImageClassification  # 导入转换器模块
from transformers.image_utils import PILImageResampling  # 导入图像处理工具模块
from transformers.utils import logging  # 导入日志记录工具模块


logging.set_verbosity_info()  # Set the logging verbosity to the info level
logger = logging.get_logger(__name__)  # Logger for this module


def get_convnextv2_config(checkpoint_url):
    """
    Build the [`ConvNextV2Config`] that matches an original ConvNeXt V2 checkpoint.

    The model size is inferred from a keyword contained in `checkpoint_url` ("atto", "femto", "pico", "nano", "tiny",
    "base", "large" or "huge"). The ImageNet-1k label mapping is downloaded from the `huggingface/label-files`
    dataset repository and attached to the config.

    Args:
        checkpoint_url (`str`):
            URL of the original checkpoint; only its text is inspected here.

    Returns:
        `tuple`: `(config, expected_shape)` where `expected_shape` is the logits shape `(1, 1000)` expected for a
        single image.

    Raises:
        ValueError: If `checkpoint_url` contains none of the known size keywords.
    """
    # (keyword, depths, hidden_sizes). The entries are tested in this order and, should several keywords occur in
    # the URL, the last matching entry wins.
    size_presets = (
        ("atto", [2, 2, 6, 2], [40, 80, 160, 320]),
        ("femto", [2, 2, 6, 2], [48, 96, 192, 384]),
        ("pico", [2, 2, 6, 2], [64, 128, 256, 512]),
        ("nano", [2, 2, 8, 2], [80, 160, 320, 640]),
        ("tiny", [3, 3, 9, 3], [96, 192, 384, 768]),
        ("base", [3, 3, 27, 3], [128, 256, 512, 1024]),
        ("large", [3, 3, 27, 3], [192, 384, 768, 1536]),
        ("huge", [3, 3, 27, 3], [352, 704, 1408, 2816]),
    )

    depths = hidden_sizes = None
    for keyword, preset_depths, preset_hidden_sizes in size_presets:
        if keyword in checkpoint_url:
            depths, hidden_sizes = preset_depths, preset_hidden_sizes

    # Fail with a clear message (and before any download) instead of hitting an unbound local further down
    if depths is None:
        known_sizes = ", ".join(keyword for keyword, _, _ in size_presets)
        raise ValueError(
            f"Cannot infer the model size from checkpoint URL: {checkpoint_url} (expected one of: {known_sizes})"
        )

    config = ConvNextV2Config()

    num_labels = 1000
    filename = "imagenet-1k-id2label.json"
    expected_shape = (1, 1000)

    repo_id = "huggingface/label-files"
    config.num_labels = num_labels
    # Use a context manager so the downloaded label file is closed deterministically
    with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)
    # JSON object keys are strings; the config expects integer class ids
    id2label = {int(k): v for k, v in id2label.items()}

    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    config.hidden_sizes = hidden_sizes
    config.depths = depths

    return config, expected_shape


def rename_key(name):
    """
    Translate a parameter name of the original ConvNeXt V2 checkpoint into the naming scheme used by this library.

    The substitutions are applied one after another, so later rules also see the output of earlier ones (for
    instance, "embeddings.norm" produced by the stem rule ends up as "embeddings.layernorm").

    Args:
        name (`str`): Original parameter name.

    Returns:
        `str`: The renamed parameter name.
    """
    # Stem layers first, then the downsampling layers of stages 1 to 3 (sub-layer 0, then sub-layer 1)
    downsample_rules = [
        ("downsample_layers.0.0", "embeddings.patch_embeddings"),
        ("downsample_layers.0.1", "embeddings.norm"),  # turned into "layernorm" by the generic rule below
    ]
    downsample_rules += [
        (f"downsample_layers.{stage}.{sub}", f"stages.{stage}.downsampling_layer.{sub}")
        for stage in (1, 2, 3)
        for sub in (0, 1)
    ]
    for old, new in downsample_rules:
        name = name.replace(old, new)

    # Regular stage blocks get an extra "layers" level, e.g. "stages.0.0" -> "stages.0.layers.0"
    if "stages" in name and "downsampling_layer" not in name:
        split_at = len("stages.0")
        name = f"{name[:split_at]}.layers{name[split_at:]}"

    # Generic renames; the order matters and matches the original sequence of substitutions
    generic_rules = (
        ("gamma", "weight"),
        ("beta", "bias"),
        ("stages", "encoder.stages"),
        ("norm", "layernorm"),
        ("head", "classifier"),
    )
    for old, new in generic_rules:
        name = name.replace(old, new)

    return name
# Fetch the sample image used to verify the converted model
def prepare_img():
    """Download a fixed COCO validation image and return it opened as a PIL image."""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Stream the response so PIL can read straight from the raw HTTP body
    response = requests.get(url, stream=True)
    return Image.open(response.raw)


# Build the image processor matching the resolution encoded in the checkpoint URL
def convert_preprocessor(checkpoint_url):
    """
    Create the [`ConvNextImageProcessor`] for a checkpoint.

    The resolution is read from `checkpoint_url`: "224" (checked first) selects size 224 with a crop percentage of
    224 / 256, "384" selects size 384 without cropping, and anything else falls back to size 512 without cropping.

    Args:
        checkpoint_url (`str`): URL of the original checkpoint.

    Returns:
        `ConvNextImageProcessor`: Processor using bicubic resampling and the mean/std values set below.
    """
    # Default used when neither resolution marker is present in the URL
    size, crop_pct = 512, None
    for marker, marker_size, marker_crop_pct in (("224", 224, 224 / 256), ("384", 384, None)):
        if marker in checkpoint_url:
            size, crop_pct = marker_size, marker_crop_pct
            break

    processor_kwargs = {
        "size": size,
        "crop_pct": crop_pct,
        "image_mean": [0.485, 0.456, 0.406],
        "image_std": [0.229, 0.224, 0.225],
        "resample": PILImageResampling.BICUBIC,
    }
    return ConvNextImageProcessor(**processor_kwargs)


def _expected_logits_for(checkpoint_url):
    """Return the reference values of the first five logits for a supported checkpoint URL (ValueError otherwise)."""
    url_prefix = "https://dl.fbaipublicfiles.com/convnext/convnextv2/"
    reference_logits = {
        "im1k/convnextv2_atto_1k_224_ema.pt": [-0.3930, 0.1747, -0.5246, 0.4177, 0.4295],
        "im1k/convnextv2_femto_1k_224_ema.pt": [-0.1727, -0.5341, -0.7818, -0.4745, -0.6566],
        "im1k/convnextv2_pico_1k_224_ema.pt": [-0.0333, 0.1563, -0.9137, 0.1054, 0.0381],
        "im1k/convnextv2_nano_1k_224_ema.pt": [-0.1744, -0.1555, -0.0713, 0.0950, -0.1431],
        "im1k/convnextv2_tiny_1k_224_ema.pt": [0.9996, 0.1966, -0.4386, -0.3472, 0.6661],
        "im1k/convnextv2_base_1k_224_ema.pt": [-0.2553, -0.6708, -0.1359, 0.2518, -0.2488],
        "im1k/convnextv2_large_1k_224_ema.pt": [-0.0673, -0.5627, -0.3753, -0.2722, 0.0178],
        "im1k/convnextv2_huge_1k_224_ema.pt": [-0.6377, -0.7458, -0.2150, 0.1184, -0.0597],
        "im22k/convnextv2_nano_22k_224_ema.pt": [1.0799, 0.2322, -0.8860, 1.0219, 0.6231],
        "im22k/convnextv2_nano_22k_384_ema.pt": [0.3766, 0.4917, -1.1426, 0.9942, 0.6024],
        "im22k/convnextv2_tiny_22k_224_ema.pt": [0.4220, -0.6919, -0.4317, -0.2881, -0.6609],
        "im22k/convnextv2_tiny_22k_384_ema.pt": [0.1082, -0.8286, -0.5095, 0.4681, -0.8085],
        "im22k/convnextv2_base_22k_224_ema.pt": [-0.2419, -0.6221, 0.2176, -0.0980, -0.7527],
        "im22k/convnextv2_base_22k_384_ema.pt": [0.0391, -0.4371, 0.3786, 0.1251, -0.2784],
        "im22k/convnextv2_large_22k_224_ema.pt": [-0.0504, 0.5636, -0.1729, -0.6507, -0.3949],
        "im22k/convnextv2_large_22k_384_ema.pt": [0.3560, 0.9486, 0.3149, -0.2667, -0.5138],
        "im22k/convnextv2_huge_22k_384_ema.pt": [-0.2469, -0.4550, -0.5853, -0.0810, 0.0309],
        "im22k/convnextv2_huge_22k_512_ema.pt": [-0.3090, 0.0802, -0.0682, -0.1979, -0.2826],
    }
    by_url = {url_prefix + suffix: values for suffix, values in reference_logits.items()}
    if checkpoint_url not in by_url:
        raise ValueError(f"Unknown URL: {checkpoint_url}")
    return by_url[checkpoint_url]


def _build_model_name(checkpoint_url):
    """Derive the hub model name ("convnextv2-<size>-<dataset>-<resolution>") from keywords in the checkpoint URL."""
    model_name = "convnextv2"

    # The three smallest sizes are tested independently of each other and of the larger sizes
    for size in ("atto", "femto", "pico"):
        if size in checkpoint_url:
            model_name += f"-{size}"
    # For the remaining sizes only the first matching keyword is used
    for size in ("nano", "tiny", "base", "large", "huge"):
        if size in checkpoint_url:
            model_name += f"-{size}"
            break

    # Dataset suffix
    has_22k = "22k" in checkpoint_url
    has_1k = "1k" in checkpoint_url
    if has_22k:
        model_name += "-22k-1k" if has_1k else "-22k"
    elif has_1k:
        model_name += "-1k"

    # Resolution suffix (first match only)
    for resolution in ("224", "384", "512"):
        if resolution in checkpoint_url:
            model_name += f"-{resolution}"
            break

    return model_name


# Gradients are not needed for a weight conversion
@torch.no_grad()
def convert_convnextv2_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub):
    """
    Copy/paste/tweak model's weights to our ConvNeXTV2 structure.

    The original checkpoint is downloaded, its parameter names are converted, the weights are loaded into a
    [`ConvNextV2ForImageClassification`] and the logits for a sample image are compared against reference values.

    Args:
        checkpoint_url (`str`):
            URL of the original ConvNeXTV2 checkpoint. Must be one of the supported checkpoint URLs.
        pytorch_dump_folder_path (`str`):
            Directory the model and image processor are written to when `save_model` is set.
        save_model (`bool`):
            Whether to save the converted model and image processor locally.
        push_to_hub (`bool`):
            Whether to push the converted model and image processor to the hub.

    Raises:
        ValueError: If `checkpoint_url` is not a supported checkpoint, or if the converted model's logits do not
            match the reference values or the expected shape.
    """
    # Fail fast: reject unsupported checkpoints before downloading or converting anything
    expected_logits = torch.tensor(_expected_logits_for(checkpoint_url))

    print("Downloading original model from checkpoint...")
    config, expected_shape = get_convnextv2_config(checkpoint_url)
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"]

    print("Converting model parameters...")
    # Rename the keys (iterate over a snapshot because the dict is modified in place)
    for key in list(state_dict.keys()):
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # Add the base-model prefix to every key except those of the classifier head
    for key in list(state_dict.keys()):
        val = state_dict.pop(key)
        if not key.startswith("classifier"):
            key = "convnextv2." + key
        state_dict[key] = val

    # Load the converted weights into the Hugging Face model
    model = ConvNextV2ForImageClassification(config)
    model.load_state_dict(state_dict)
    model.eval()

    # Run the model on the sample image
    preprocessor = convert_preprocessor(checkpoint_url)
    inputs = preprocessor(images=prepare_img(), return_tensors="pt")
    logits = model(**inputs).logits

    # Explicit checks instead of `assert`, so the verification is not skipped under `python -O`
    if not torch.allclose(logits[0, :5], expected_logits, atol=1e-3):
        raise ValueError(
            f"Converted model logits {logits[0, :5].tolist()} do not match the expected values "
            f"{expected_logits.tolist()}"
        )
    if logits.shape != expected_shape:
        raise ValueError(f"Unexpected logits shape {tuple(logits.shape)}, expected {expected_shape}")
    print("Model outputs match the original results!")

    if save_model:
        print("Saving model to local...")
        # Also creates missing parent directories and tolerates an already existing folder
        os.makedirs(pytorch_dump_folder_path, exist_ok=True)

        model.save_pretrained(pytorch_dump_folder_path)
        preprocessor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        model_name = _build_model_name(checkpoint_url)
        print(f"Pushing {model_name} to the hub...")
        model.push_to_hub(model_name)
        preprocessor.push_to_hub(model_name)
if __name__ == "__main__":
    # Command-line entry point of the conversion script
    cli = argparse.ArgumentParser()

    # URL of the original checkpoint to convert (defaults to the atto ImageNet-1k checkpoint)
    cli.add_argument(
        "--checkpoint_url",
        type=str,
        default="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt",
        help="URL of the original ConvNeXTV2 checkpoint you'd like to convert.",
    )

    # Output directory for the converted model (defaults to "model")
    cli.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default="model",
        help="Path to the output PyTorch model directory.",
    )

    # Optional flags: both are off unless given on the command line
    cli.add_argument("--save_model", action="store_true", help="Save model to local")
    cli.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub")

    options = cli.parse_args()

    # Run the conversion with the parsed options
    convert_convnextv2_checkpoint(
        checkpoint_url=options.checkpoint_url,
        pytorch_dump_folder_path=options.pytorch_dump_folder_path,
        save_model=options.save_model,
        push_to_hub=options.push_to_hub,
    )
posted @ 2024-06-29 15:50  绝不原创的飞龙  阅读(5)  评论(0编辑  收藏  举报