Transformers Source Code Analysis (70)

.\models\marian\tokenization_marian.py

# Import the required modules and libraries
import json  # JSON serialization/deserialization
import os  # operating-system utilities
import re  # regular expressions
import warnings  # warning handling
from pathlib import Path  # object-oriented filesystem paths
from shutil import copyfile  # file copying
from typing import Any, Dict, List, Optional, Tuple, Union  # type hints

import sentencepiece  # SentencePiece subword tokenization library

from ...tokenization_utils import PreTrainedTokenizer  # base class for pretrained tokenizers
from ...utils import logging  # logging utilities


logger = logging.get_logger(__name__)  # logger instance for this module

# Names of the vocabulary files saved by the tokenizer
VOCAB_FILES_NAMES = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "target_vocab_file": "target_vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

# Map from pretrained model identifiers to their vocabulary file URLs
PRETRAINED_VOCAB_FILES_MAP = {
    "source_spm": {
        "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/source.spm"
    },
    "target_spm": {
        "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/target.spm"
    },
    "vocab": {
        "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json"
    },
    "tokenizer_config_file": {
        "Helsinki-NLP/opus-mt-en-de": (
            "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/tokenizer_config.json"
        )
    },
}

# Maximum positional-embedding sizes of the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512}
PRETRAINED_INIT_CONFIGURATION = {}

SPIECE_UNDERLINE = "▁"

# MarianTokenizer, derived from PreTrainedTokenizer
class MarianTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
    """
    # Names of the vocabulary files required by the tokenizer
    vocab_files_names = VOCAB_FILES_NAMES
    # Map from pretrained model identifiers to their vocabulary file URLs
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Default initialization configuration of the pretrained models
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Maximum input sizes, taken from the positional-embedding sizes
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Names of the model inputs: "input_ids" and "attention_mask"
    model_input_names = ["input_ids", "attention_mask"]
    # Regular expression matching language codes of the form ">>.+<<"
    language_code_re = re.compile(">>.+<<")  # type: re.Pattern

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        # Default sp_model_kwargs to an empty dict if not provided
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Make sure the source SentencePiece model file exists
        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        # Whether separate source/target vocabularies are used
        self.separate_vocabs = separate_vocabs
        # Load the JSON vocabulary file into the encoder
        self.encoder = load_json(vocab)
        # Raise a KeyError if unk_token is missing from the encoder
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        # Make sure pad_token is present in the encoder
        assert str(pad_token) in self.encoder

        # With separate vocabularies
        if separate_vocabs:
            # Load the target-language vocabulary into target_encoder
            self.target_encoder = load_json(target_vocab_file)
            # Build the id-to-token mapping (decoder) from the target encoder
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            # No supported language codes in this mode
            self.supported_language_codes = []
        else:
            # Build the id-to-token mapping (decoder) from the shared encoder
            self.decoder = {v: k for k, v in self.encoder.items()}
            # Collect the language codes present in the vocabulary (tokens like ">>fr<<")
            self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        # Source and target languages
        self.source_lang = source_lang
        self.target_lang = target_lang
        # Paths of the source and target SentencePiece model files
        self.spm_files = [source_spm, target_spm]

        # Load the SentencePiece models used for preprocessing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        # The SentencePiece model currently in use; defaults to the source model
        self.current_spm = self.spm_source
        # The encoder currently in use; defaults to the shared encoder
        self.current_encoder = self.encoder

        # Set up the punctuation normalizer
        self._setup_normalizer()

        super().__init__(
            # bos_token=bos_token,  unused. Decoding starts with config.decoder_start_token_id
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            # Try to import MosesPunctNormalizer and bind its normalize method
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            # If the import fails, warn and fall back to an identity function
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Normalize the input string; an empty input returns an empty string."""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        """Convert a token to its id, falling back to the unknown token's id if the token is not in the current encoder."""
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before SentencePiece processing."""
        match = self.language_code_re.match(text)
        code: list = [match.group(0)] if match else []
        return code, self.language_code_re.sub("", text)

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize the text, keeping any leading language code as the first token."""
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an index (integer) to its token (string) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in
                sequence-to-sequence problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        """
        Converts a sequence of ids into a string using the tokenizer and vocabulary,
        with options to remove special tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        return super().decode(token_ids, **kwargs)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Uses the source SentencePiece model (`spm_source`) if `_decode_use_source_tokenizer` is True,
        otherwise uses the target model (`spm_target`) to convert tokens back into a string.

        Args:
            tokens (List[str]): List of tokens to be converted into a string.

        Returns:
            str: The reconstructed string from tokens.
        """
        # Determine whether to use the source or target tokenizer based on the flag `_decode_use_source_tokenizer`
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # Check if the token is a special token that should not be decoded using SentencePiece
            if token in self.all_special_tokens:
                # Decode accumulated sub-tokens and append the current special token with a space
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                # Accumulate tokens that are not special tokens
                current_sub_tokens.append(token)
        # Final decode of remaining sub-tokens and replace SentencePiece underline with a space
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """
        Builds model inputs from token ids by appending `eos_token_id` to token_ids_0 or to both token_ids_0 and token_ids_1.

        Args:
            token_ids_0 (List[int]): List of token ids for the first sequence.
            token_ids_1 (List[int], optional): List of token ids for the second sequence, if processing pairs.

        Returns:
            List[int]: Concatenated list of token ids with `eos_token_id` appended.
        """
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        else:
            # If processing pairs, concatenate token_ids_0 and token_ids_1 with eos_token_id appended
            return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        """
        Sets current SentencePiece model (`current_spm`) and encoder (`current_encoder`) to use the source versions.
        """
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        """
        Sets current SentencePiece model (`current_spm`) to use the target version (`spm_target`).
        Sets current encoder (`current_encoder`) to use `target_encoder` if `separate_vocabs` is True.
        """
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder

    # Size of the (source) vocabulary
    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    # Save the vocabulary files to the given directory
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # If the save directory does not exist, log an error and return
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        # With separate source/target vocabularies
        if self.separate_vocabs:
            # Build the source vocabulary file name
            out_src_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"],
            )
            # Build the target vocabulary file name
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            # Save the source and target encoders to their JSON files
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            # Build the vocabulary file name (shared vocabulary)
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            # Save the encoder to a JSON file
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        # Copy or serialize the SentencePiece model files
        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            # Build the destination path
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            # If source and destination differ and the source is a file, copy it
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            # Otherwise write the serialized model to the destination
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        # Return the saved file paths as a tuple
        return tuple(saved_files)

    # Return the encoder dict; equivalent to the source vocabulary
    def get_vocab(self) -> Dict:
        return self.get_src_vocab()

    # Return the source-language vocabulary
    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    # Return the target-language vocabulary
    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    # Define the object's pickled state
    def __getstate__(self) -> Dict:
        # Copy the instance __dict__
        state = self.__dict__.copy()
        # Set the unpicklable members to None so the object can be serialized
        state.update(
            {k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"]}
        )
        return state

    # Restore the object's state from the given dict
    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # For backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        # Reload the source and target SentencePiece models from their file paths
        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        # Set the current SentencePiece model to the source model
        self.current_spm = self.spm_source
        # Re-initialize the punctuation normalizer
        self._setup_normalizer()

    # Number of special tokens added (just EOS)
    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    # Build a mask marking special tokens ([eos] and [pad], but not <unk>)
    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call this once instead of inside the list comprehension
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    # Special-tokens mask: entries are 1 where a token is [eos] or [pad], 0 otherwise
    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]

# Load a SentencePiece model from the given path with the provided kwargs
def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    # Create a SentencePieceProcessor configured with the given kwargs
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    # Load the SentencePiece model file from the path
    spm.Load(path)
    return spm


# Save the given data as JSON to the specified path
def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        # Write the data as pretty-printed JSON (indent=2)
        json.dump(data, f, indent=2)


# Load a JSON file and return its content (a dict or a list)
def load_json(path: str) -> Union[Dict, List]:
    with open(path, "r") as f:
        return json.load(f)
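
To see the tokenizer in action, here is a small, hedged usage sketch (it assumes the `Helsinki-NLP/opus-mt-en-de` checkpoint plus the `transformers` and `sentencepiece` packages are available; exact ids vary by checkpoint):

```
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
# For multilingual checkpoints, _tokenize keeps a leading language code like ">>fr<<" as a single token
batch = tokenizer(["Hello world"], return_tensors="pt")
print(batch["input_ids"])  # id sequence ending with </s>, appended by build_inputs_with_special_tokens
print(tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True))  # roughly ['Hello world']
```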

.\models\marian\__init__.py

# Copyright notice and license: copyright the HuggingFace Team, licensed under the Apache License 2.0
#
# Import the required modules and functions; dependency checks and helpers come from the utils module
from typing import TYPE_CHECKING

from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_sentencepiece_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Import structure describing which submodule provides which public name (used for lazy loading)
_import_structure = {
    "configuration_marian": ["MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "MarianConfig", "MarianOnnxConfig"],
}

# Check whether sentencepiece is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add the tokenization_marian module to the import structure
    _import_structure["tokenization_marian"] = ["MarianTokenizer"]

# Check whether torch is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add the modeling_marian module (several classes and constants)
    _import_structure["modeling_marian"] = [
        "MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "MarianForCausalLM",
        "MarianModel",
        "MarianMTModel",
        "MarianPreTrainedModel",
    ]

# Check whether tensorflow is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add the modeling_tf_marian module (TensorFlow classes)
    _import_structure["modeling_tf_marian"] = ["TFMarianModel", "TFMarianMTModel", "TFMarianPreTrainedModel"]

# Check whether flax is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add the modeling_flax_marian module (Flax classes)
    _import_structure["modeling_flax_marian"] = ["FlaxMarianModel", "FlaxMarianMTModel", "FlaxMarianPreTrainedModel"]

# During type checking, perform the real imports
if TYPE_CHECKING:
    # Import the configuration classes and constants
    from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig, MarianOnnxConfig

    # If sentencepiece is available, import MarianTokenizer from tokenization_marian
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_marian import MarianTokenizer

    # If torch is available, import the PyTorch model classes from modeling_marian
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_marian import (
            MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST,
            MarianForCausalLM,
            MarianModel,
            MarianMTModel,
            MarianPreTrainedModel,
        )

    # If tensorflow is available, import the TensorFlow model classes
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_marian import TFMarianModel, TFMarianMTModel, TFMarianPreTrainedModel

    # If flax is available, import the Flax model classes
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
else:
    # At runtime, replace this module with a lazy module

    import sys

    # Replace the entry in sys.modules with a _LazyModule that resolves attributes on demand
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
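
The `_LazyModule` registered above defers the heavy framework imports until an attribute is first accessed. A minimal, self-contained sketch of the same idea (this is not the real `_LazyModule`, which handles many more cases):

```
import importlib
import types


class LazyModule(types.ModuleType):
    """Resolve public names to on-demand submodule imports (simplified sketch)."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each public name to the submodule that defines it
        self._name_to_module = {
            attr: submodule for submodule, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, name):
        if name not in self._name_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
        # Import the submodule only now, then cache the resolved attribute on the module
        module = importlib.import_module("." + self._name_to_module[name], self.__name__)
        value = getattr(module, name)
        setattr(self, name, value)
        return value
```

With this scheme, `from transformers.models.marian import MarianTokenizer` only imports `tokenization_marian` (and hence `sentencepiece`) at that moment, and a backend that is never touched never costs import time.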

.\models\markuplm\configuration_markuplm.py

# coding=utf-8
# Copyright 2021, The Microsoft Research Asia MarkupLM Team authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MarkupLM model configuration
"""

# Import the pretrained configuration base class
from ...configuration_utils import PretrainedConfig
# Import logging utilities
from ...utils import logging

# Logger for this module
logger = logging.get_logger(__name__)

# Map from pretrained model identifiers to their configuration file URLs
MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/config.json",
    "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/config.json",
}

# MarkupLMConfig, derived from PretrainedConfig
class MarkupLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MarkupLMModel`]. It is used to instantiate a
    MarkupLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MarkupLM
    [microsoft/markuplm-base](https://huggingface.co/microsoft/markuplm-base) architecture.

    Configuration objects inherit from [`BertConfig`] and can be used to control the model outputs. Read the
    documentation from [`BertConfig`] for more information.

    Examples:

    ```
    >>> from transformers import MarkupLMModel, MarkupLMConfig

    >>> # Initializing a MarkupLM microsoft/markuplm-base style configuration
    >>> configuration = MarkupLMConfig()

    >>> # Initializing a model from the microsoft/markuplm-base style configuration
    >>> model = MarkupLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # The model type is "markuplm"
    model_type = "markuplm"

    # Constructor setting the model configuration parameters
    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        bos_token_id=0,
        eos_token_id=2,
        max_xpath_tag_unit_embeddings=256,
        max_xpath_subs_unit_embeddings=1024,
        tag_pad_id=216,
        subs_pad_id=1001,
        xpath_unit_hidden_size=32,
        max_depth=50,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        **kwargs,
    ):
        # Call the parent constructor with the token ids and remaining kwargs
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        # Vocabulary size
        self.vocab_size = vocab_size
        # Hidden size
        self.hidden_size = hidden_size
        # Number of hidden layers
        self.num_hidden_layers = num_hidden_layers
        # Number of attention heads
        self.num_attention_heads = num_attention_heads
        # Hidden activation function
        self.hidden_act = hidden_act
        # Intermediate (feed-forward) size
        self.intermediate_size = intermediate_size
        # Hidden dropout probability
        self.hidden_dropout_prob = hidden_dropout_prob
        # Attention dropout probability
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # Maximum position-embedding length
        self.max_position_embeddings = max_position_embeddings
        # Token-type vocabulary size
        self.type_vocab_size = type_vocab_size
        # Weight-initializer range
        self.initializer_range = initializer_range
        # Layer-norm epsilon
        self.layer_norm_eps = layer_norm_eps
        # Position-embedding type
        self.position_embedding_type = position_embedding_type
        # Whether to use the key/value cache
        self.use_cache = use_cache
        # Classifier dropout
        self.classifier_dropout = classifier_dropout
        # Additional, MarkupLM-specific attributes
        # Maximum XPath depth
        self.max_depth = max_depth
        # Maximum number of XPath tag-unit embeddings
        self.max_xpath_tag_unit_embeddings = max_xpath_tag_unit_embeddings
        # Maximum number of XPath subscript-unit embeddings
        self.max_xpath_subs_unit_embeddings = max_xpath_subs_unit_embeddings
        # Padding id for XPath tags
        self.tag_pad_id = tag_pad_id
        # Padding id for XPath subscripts
        self.subs_pad_id = subs_pad_id
        # Hidden size of a single XPath unit
        self.xpath_unit_hidden_size = xpath_unit_hidden_size
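
A quick, hedged sketch of the XPath-specific configuration fields (plain `MarkupLMConfig` API; the values are chosen only for illustration):

```
from transformers import MarkupLMConfig

# Shrink the XPath depth and unit size; every other field keeps its default
config = MarkupLMConfig(max_depth=10, xpath_unit_hidden_size=16)
print(config.max_depth, config.xpath_unit_hidden_size)  # 10 16
print(config.tag_pad_id, config.subs_pad_id)  # 216 1001 (padding ids used for missing xpath units)
```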

.\models\markuplm\feature_extraction_markuplm.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for MarkupLM.
"""

import html

from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...utils import is_bs4_available, logging, requires_backends

# If BeautifulSoup is installed, import it
if is_bs4_available():
    import bs4
    from bs4 import BeautifulSoup

# Logger for this module
logger = logging.get_logger(__name__)


class MarkupLMFeatureExtractor(FeatureExtractionMixin):
    r"""
    Constructs a MarkupLM feature extractor. This can be used to get a list of nodes and corresponding xpaths from HTML
    strings.

    This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
    of the main methods. Users should refer to this superclass for more information regarding those methods.

    """

    def __init__(self, **kwargs):
        # Backend dependency check: BeautifulSoup must be installed
        requires_backends(self, ["bs4"])
        super().__init__(**kwargs)

    def xpath_soup(self, element):
        # Lists collecting the xpath tag names and subscripts
        xpath_tags = []
        xpath_subscripts = []
        child = element if element.name else element.parent
        # Walk up through the element's parents to collect the path
        for parent in child.parents:  # type: bs4.element.Tag
            siblings = parent.find_all(child.name, recursive=False)
            xpath_tags.append(child.name)
            # Position of the current node among its same-name siblings
            xpath_subscripts.append(
                0 if 1 == len(siblings) else next(i for i, s in enumerate(siblings, 1) if s is child)
            )
            child = parent
        # The path was collected leaf-to-root, so reverse both lists to get XPath order
        xpath_tags.reverse()
        xpath_subscripts.reverse()
        return xpath_tags, xpath_subscripts

    # Extract the text nodes and their xpath information from a single HTML string
    def get_three_from_single(self, html_string):
        # Parse the HTML string with BeautifulSoup
        html_code = BeautifulSoup(html_string, "html.parser")

        # Lists for all text nodes and their xpath tag/subscript sequences
        all_doc_strings = []  # all text contents
        string2xtag_seq = []  # xpath tag sequence of each text node
        string2xsubs_seq = []  # xpath subscript sequence of each text node

        # Walk over every node of the HTML document
        for element in html_code.descendants:
            # Only NavigableString nodes carry text
            if isinstance(element, bs4.element.NavigableString):
                # Skip nodes whose parent is not a Tag
                if type(element.parent) != bs4.element.Tag:
                    continue

                # Unescape the text and strip surrounding whitespace
                text_in_this_tag = html.unescape(element).strip()
                # Skip empty text
                if not text_in_this_tag:
                    continue

                # Record the text
                all_doc_strings.append(text_in_this_tag)

                # Get the node's xpath tags and subscripts via xpath_soup
                xpath_tags, xpath_subscripts = self.xpath_soup(element)
                # Record the tag and subscript sequences
                string2xtag_seq.append(xpath_tags)
                string2xsubs_seq.append(xpath_subscripts)

        # The three lists must have the same length; otherwise raise a ValueError
        if len(all_doc_strings) != len(string2xtag_seq):
            raise ValueError("Number of doc strings and xtags does not correspond")
        if len(all_doc_strings) != len(string2xsubs_seq):
            raise ValueError("Number of doc strings and xsubs does not correspond")

        # Return the text nodes, their xpath tag sequences, and their xpath subscript sequences
        return all_doc_strings, string2xtag_seq, string2xsubs_seq

    # Build an XPath expression from tag and subscript sequences
    def construct_xpath(self, xpath_tags, xpath_subscripts):
        # Start from an empty XPath string
        xpath = ""
        # Append one step per (tag, subscript) pair
        for tagname, subs in zip(xpath_tags, xpath_subscripts):
            xpath += f"/{tagname}"  # append the tag name
            if subs != 0:
                xpath += f"[{subs}]"  # append the subscript when it is nonzero
        # Return the finished XPath expression
        return xpath

    def __call__(self, html_strings) -> BatchFeature:
        """
        Main method to prepare for the model one or several HTML strings.

        Args:
            html_strings (`str`, `List[str]`):
                The HTML string or batch of HTML strings from which to extract nodes and corresponding xpaths.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **nodes** -- Nodes.
            - **xpaths** -- Corresponding xpaths.

        Examples:

        ```
        >>> from transformers import MarkupLMFeatureExtractor

        >>> page_name_1 = "page1.html"
        >>> page_name_2 = "page2.html"
        >>> page_name_3 = "page3.html"

        >>> with open(page_name_1) as f:
        ...     single_html_string = f.read()

        >>> feature_extractor = MarkupLMFeatureExtractor()

        >>> # single example
        >>> encoding = feature_extractor(single_html_string)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])

        >>> # batched example

        >>> multi_html_strings = []

        >>> with open(page_name_2) as f:
        ...     multi_html_strings.append(f.read())
        >>> with open(page_name_3) as f:
        ...     multi_html_strings.append(f.read())

        >>> encoding = feature_extractor(multi_html_strings)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])
        ```"""

        # Input type checking for clearer error messages
        valid_strings = False

        # Check that html_strings has a valid type
        if isinstance(html_strings, str):
            valid_strings = True
        elif isinstance(html_strings, (list, tuple)):
            if len(html_strings) == 0 or isinstance(html_strings[0], str):
                valid_strings = True

        if not valid_strings:
            raise ValueError(
                "HTML strings must of type `str`, `List[str]` (batch of examples), "
                f"but is of type {type(html_strings)}."
            )

        is_batched = bool(isinstance(html_strings, (list, tuple)) and (isinstance(html_strings[0], str)))

        if not is_batched:
            html_strings = [html_strings]

        # Get nodes + xpaths
        nodes = []
        xpaths = []
        for html_string in html_strings:
            # Extract the text nodes and their xpath tag/subscript sequences
            all_doc_strings, string2xtag_seq, string2xsubs_seq = self.get_three_from_single(html_string)
            nodes.append(all_doc_strings)
            xpath_strings = []
            for node, tag_list, sub_list in zip(all_doc_strings, string2xtag_seq, string2xsubs_seq):
                # Construct the XPath string from the tag and subscript lists
                xpath_string = self.construct_xpath(tag_list, sub_list)
                xpath_strings.append(xpath_string)
            xpaths.append(xpath_strings)

        # Return as a BatchFeature dict
        data = {"nodes": nodes, "xpaths": xpaths}
        encoded_inputs = BatchFeature(data=data, tensor_type=None)

        return encoded_inputs
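
A small, hedged usage sketch of the extractor on an inline HTML string (it assumes `bs4` is installed; the exact xpaths depend on the parser):

```
from transformers import MarkupLMFeatureExtractor

html_string = "<html><body><h1>Title</h1><p>First</p><p>Second</p></body></html>"
feature_extractor = MarkupLMFeatureExtractor()
encoding = feature_extractor(html_string)
print(encoding["nodes"])   # [['Title', 'First', 'Second']]
print(encoding["xpaths"])  # e.g. [['/html/body/h1', '/html/body/p[1]', '/html/body/p[2]']]
```

Note how a subscript of 0 (a tag with no same-name siblings) is omitted from the XPath step, while repeated tags get 1-based indices.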

.\models\markuplm\modeling_markuplm.py

# coding=utf-8
# Copyright 2022 Microsoft Research Asia and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch MarkupLM model."""

import math
import os
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# Import helper functions and classes from elsewhere in the library
from ...activations import ACT2FN
from ...file_utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import logging
from .configuration_markuplm import MarkupLMConfig

# Logger for this module
logger = logging.get_logger(__name__)

# Variables used in the documentation
_CHECKPOINT_FOR_DOC = "microsoft/markuplm-base"
_CONFIG_FOR_DOC = "MarkupLMConfig"

# List of pretrained model archives
MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/markuplm-base",
    "microsoft/markuplm-large",
]

# XPathEmbeddings builds the embeddings from xpath tags and subscripts
class XPathEmbeddings(nn.Module):
    """Construct the embeddings from xpath tags and subscripts.

    We drop tree-id in this version, as its info can be covered by xpath.
    """

    def __init__(self, config):
        super(XPathEmbeddings, self).__init__()
        # Maximum xpath depth
        self.max_depth = config.max_depth

        # Map the concatenated xpath unit sequence to the hidden size
        self.xpath_unitseq2_embeddings = nn.Linear(config.xpath_unit_hidden_size * self.max_depth, config.hidden_size)

        # Dropout layer
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # ReLU activation
        self.activation = nn.ReLU()

        # Linear layer mapping the xpath unit sequence to an inner size
        self.xpath_unitseq2_inner = nn.Linear(config.xpath_unit_hidden_size * self.max_depth, 4 * config.hidden_size)

        # Linear layer mapping the inner size back to the embedding size
        self.inner2emb = nn.Linear(4 * config.hidden_size, config.hidden_size)

        # One tag-unit embedding table per depth level
        self.xpath_tag_sub_embeddings = nn.ModuleList(
            [
                nn.Embedding(config.max_xpath_tag_unit_embeddings, config.xpath_unit_hidden_size)
                for _ in range(self.max_depth)
            ]
        )

        # One subscript-unit embedding table per depth level
        self.xpath_subs_sub_embeddings = nn.ModuleList(
            [
                nn.Embedding(config.max_xpath_subs_unit_embeddings, config.xpath_unit_hidden_size)
                for _ in range(self.max_depth)
            ]
        )

    # Forward pass taking two optional arguments: xpath_tags_seq and xpath_subs_seq
    def forward(self, xpath_tags_seq=None, xpath_subs_seq=None):
        # Per-depth embeddings of the tag path
        xpath_tags_embeddings = []
        # Per-depth embeddings of the subscript path
        xpath_subs_embeddings = []

        # Loop over every depth level (up to self.max_depth)
        for i in range(self.max_depth):
            # Embed the tag ids of the current depth level
            xpath_tags_embeddings.append(self.xpath_tag_sub_embeddings[i](xpath_tags_seq[:, :, i]))
            # Embed the subscript ids of the current depth level
            xpath_subs_embeddings.append(self.xpath_subs_sub_embeddings[i](xpath_subs_seq[:, :, i]))

        # Concatenate the per-depth tag embeddings along the last dimension
        xpath_tags_embeddings = torch.cat(xpath_tags_embeddings, dim=-1)
        # Concatenate the per-depth subscript embeddings along the last dimension
        xpath_subs_embeddings = torch.cat(xpath_subs_embeddings, dim=-1)

        # Add the tag and subscript embeddings element-wise to get the combined path embedding
        xpath_embeddings = xpath_tags_embeddings + xpath_subs_embeddings

        # Project through the inner layer, then activation, dropout, and the output projection
        xpath_embeddings = self.inner2emb(self.dropout(self.activation(self.xpath_unitseq2_inner(xpath_embeddings))))

        # Return the final xpath embedding
        return xpath_embeddings
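
As a quick shape check, here is a hedged standalone sketch of XPathEmbeddings (it uses a throwaway SimpleNamespace in place of the real MarkupLMConfig, carrying only the fields the module reads):

```
from types import SimpleNamespace

import torch

config = SimpleNamespace(
    max_depth=50, xpath_unit_hidden_size=32, hidden_size=768, hidden_dropout_prob=0.1,
    max_xpath_tag_unit_embeddings=256, max_xpath_subs_unit_embeddings=1024,
)
emb = XPathEmbeddings(config)
tags = torch.randint(0, 256, (2, 8, 50))   # (batch, seq_len, max_depth) tag ids
subs = torch.randint(0, 1024, (2, 8, 50))  # (batch, seq_len, max_depth) subscript ids
print(emb(tags, subs).shape)  # torch.Size([2, 8, 768])
```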
# Create position ids from `input_ids`, replacing non-padding symbols with their position numbers.
# Positions start at `padding_idx + 1`; padding symbols are ignored. Adapted from fairseq's
# `utils.make_positions`.
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # Mask marking non-padding positions with 1 and padding positions with 0
    mask = input_ids.ne(padding_idx).int()
    # Cumulative sum over the mask plus `past_key_values_length`, multiplied by the mask again
    # so that only non-padding positions are counted
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    # Add `padding_idx` back and cast to long
    return incremental_indices.long() + padding_idx
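
A tiny hedged sanity check of that arithmetic (using 1 as the padding id, the usual RoBERTa-style value):

```
import torch

input_ids = torch.tensor([[5, 7, 9, 1, 1]])  # 1 is the padding id
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]]) -- real tokens get positions 2, 3, 4; padding keeps padding_idx
```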


class MarkupLMEmbeddings(nn.Module):
    """Construct the embeddings from word, position, and token-type embeddings."""

    def __init__(self, config):
        super(MarkupLMEmbeddings, self).__init__()
        self.config = config
        # Word embeddings mapping the vocabulary to the hidden size, with `padding_idx` for the pad token
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # Position embeddings mapping the maximum number of positions to the hidden size
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.max_depth = config.max_depth

        # XPath embedding layer
        self.xpath_embeddings = XPathEmbeddings(config)

        # Token-type embeddings mapping the token-type vocabulary to the hidden size
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # LayerNorm over the hidden states
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout over the hidden states to prevent overfitting
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Non-persistent buffer holding the position ids, reused across batches
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        # Padding index
        self.padding_idx = config.pad_token_id
        # Re-create the position embeddings with the padding index set
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    # Create position ids from an `inputs_embeds` tensor
    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        Since embeddings are provided directly, we cannot infer which positions are padding,
        so sequential position ids are generated.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Positions run from `padding_idx + 1` to `sequence_length + padding_idx + 1`
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        # Expand the position ids to the shape of `inputs_embeds`
        return position_ids.unsqueeze(0).expand(input_shape)

    def forward(
        self,
        input_ids=None,
        xpath_tags_seq=None,
        xpath_subs_seq=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        past_key_values_length=0,
    ):
        # If input_ids is given, take its shape as input_shape
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # Otherwise use the shape of inputs_embeds without the last dimension
            input_shape = inputs_embeds.size()[:-1]

        # Device of input_ids if available, otherwise of inputs_embeds
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # If position_ids is not given, derive it from input_ids (keeping padding) or from inputs_embeds
        if position_ids is None:
            if input_ids is not None:
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        # If token_type_ids is not given, use a zero tensor of shape input_shape
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # If inputs_embeds is not given, look up the word embeddings for input_ids
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # Prepare xpath_tags_seq; if not given, fill with self.config.tag_pad_id
        if xpath_tags_seq is None:
            xpath_tags_seq = self.config.tag_pad_id * torch.ones(
                tuple(list(input_shape) + [self.max_depth]), dtype=torch.long, device=device
            )

        # Prepare xpath_subs_seq; if not given, fill with self.config.subs_pad_id
        if xpath_subs_seq is None:
            xpath_subs_seq = self.config.subs_pad_id * torch.ones(
                tuple(list(input_shape) + [self.max_depth]), dtype=torch.long, device=device
            )

        # Word embeddings
        words_embeddings = inputs_embeds

        # Position embeddings looked up from position_ids
        position_embeddings = self.position_embeddings(position_ids)

        # Token-type embeddings looked up from token_type_ids
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # XPath embeddings computed from xpath_tags_seq and xpath_subs_seq
        xpath_embeddings = self.xpath_embeddings(xpath_tags_seq, xpath_subs_seq)

        # Sum all embeddings into the final embedding
        embeddings = words_embeddings + position_embeddings + token_type_embeddings + xpath_embeddings

        # Apply LayerNorm
        embeddings = self.LayerNorm(embeddings)

        # Apply dropout
        embeddings = self.dropout(embeddings)

        # Return the final embeddings
        return embeddings


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->MarkupLM
class MarkupLMSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer with input and output dimension config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Layer normalization over the hidden states
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout to randomly drop connections and reduce overfitting
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Dropout
        hidden_states = self.dropout(hidden_states)
        # Layer normalization with a residual connection to the input tensor
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertIntermediate
class MarkupLMIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer from config.hidden_size to config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Intermediate activation function, selected from the config
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Apply the intermediate activation
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->MarkupLM
class MarkupLMOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer from config.intermediate_size back to config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # Layer normalization over the hidden states
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout to randomly drop connections and reduce overfitting
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Dropout
        hidden_states = self.dropout(hidden_states)
        # Layer normalization with a residual connection to the input tensor
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertPooler
class MarkupLMPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer with input and output dimension config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Tanh activation
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Pool by taking the hidden state of the first token
        first_token_tensor = hidden_states[:, 0]
        # Linear projection
        pooled_output = self.dense(first_token_tensor)
        # Tanh activation
        pooled_output = self.activation(pooled_output)
        return pooled_output


# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->MarkupLM
class MarkupLMPredictionHeadTransform(nn.Module):
    # Constructor taking a config object
    def __init__(self, config):
        # Call the parent constructor
        super().__init__()
        # Linear layer with input and output dimension config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Choose the activation function from the config, via the ACT2FN mapping if given as a string
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        # LayerNorm over the hidden states with the configured epsilon
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # Forward pass: tensor in, tensor out
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Nonlinear activation
        hidden_states = self.transform_act_fn(hidden_states)
        # Layer normalization
        hidden_states = self.LayerNorm(hidden_states)
        # Return the transformed hidden states
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->MarkupLM
class MarkupLMLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = MarkupLMPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two so the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->MarkupLM
class MarkupLMOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = MarkupLMLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->MarkupLM
class MarkupLMSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear projections for query, key, and value
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )

        # Relative position embeddings need an extra distance-embedding table
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        # Reshape the input tensor x for multi-head attention
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)

        # Permute the dimensions so that attention operates per head
        return x.permute(0, 2, 1, 3)

    # Forward pass of the self-attention layer
    def forward(
        self,
        hidden_states: torch.Tensor,  # input hidden states
        attention_mask: Optional[torch.FloatTensor] = None,  # attention mask, optional
        head_mask: Optional[torch.FloatTensor] = None,  # head mask, optional
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # encoder hidden states, optional
        encoder_attention_mask: Optional[torch.FloatTensor] = None,  # encoder attention mask, optional
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # cached key/value pairs, optional
        output_attentions: Optional[bool] = False,  # whether to return attention weights, optional


# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->MarkupLM
class MarkupLMAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # Self-attention layer with the given config and position-embedding type
        self.self = MarkupLMSelfAttention(config, position_embedding_type=position_embedding_type)
        # Output layer of the self-attention block
        self.output = MarkupLMSelfOutput(config)
        # Set holding the indices of pruned attention heads
        self.pruned_heads = set()

    # Prune attention heads
    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # Find the prunable heads and their indices
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune the linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update the hyperparameters and record the pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # Forward pass
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # Run the self-attention layer over the hidden states and optional arguments
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # Apply the output layer to the attention output and the residual hidden states
        attention_output = self.output(self_outputs[0], hidden_states)
        # Append the attention weights to the outputs if requested
        outputs = (attention_output,) + self_outputs[1:]
        return outputs


# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->MarkupLM
class MarkupLMLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Chunk size for the chunked feed-forward pass
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension holding the sequence length
        self.seq_len_dim = 1
        # Self-attention block
        self.attention = MarkupLMAttention(config)
        # Whether this layer is used as a decoder
        self.is_decoder = config.is_decoder
        # Whether cross attention is added
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            # Cross attention without being a decoder is a configuration error
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # Cross-attention block with absolute position embeddings
            self.crossattention = MarkupLMAttention(config, position_embedding_type="absolute")
        # Intermediate (feed-forward) block
        self.intermediate = MarkupLMIntermediate(config)
        # Output block
        self.output = MarkupLMOutput(config)
    # 定义一个方法 forward,用于处理模型的前向传播
    def forward(
        self,
        hidden_states: torch.Tensor,  # 输入的隐藏状态张量
        attention_mask: Optional[torch.FloatTensor] = None,  # 注意力掩码,可选参数
        head_mask: Optional[torch.FloatTensor] = None,  # 多头注意力的掩码,可选参数
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # 编码器的隐藏状态,可选参数
        encoder_attention_mask: Optional[torch.FloatTensor] = None,  # 编码器的注意力掩码,可选参数
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # 过去的键值对,可选参数
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重,可选参数,默认为 False
    ) -> Tuple[torch.Tensor]:  # 返回类型为包含张量的元组
        # 如果有过去的键值对,则提取自注意力的过去键值对的缓存,位置在1和2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # 使用自注意力层处理隐藏状态,返回自注意力的输出
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]  # 提取自注意力的输出

        # 如果当前模块是解码器,最后一个输出是自注意力的键值对缓存元组
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]  # 提取除了最后一个元素外的所有输出
            present_key_value = self_attention_outputs[-1]  # 提取最后一个元素作为当前的键值对
        else:
            outputs = self_attention_outputs[1:]  # 如果不是解码器,添加自注意力权重到输出中

        cross_attn_present_key_value = None
        # Decoder with encoder hidden states: run cross-attention
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # The last two entries of past_key_value cache the cross-attention keys/values
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # Run cross-attention over the self-attention output and the encoder hidden states
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]  # cross-attention output
            outputs = outputs + cross_attention_outputs[1:-1]  # append the cross-attention weights

            # Append the updated cross-attention cache to the present key/value cache
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Apply the feed-forward block to the attention output in chunks along the sequence dimension
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs  # prepend the layer output to the remaining outputs

        # For a decoder, return the key/value cache as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs
    # Feed-forward pass over one chunk of the attention output
    def feed_forward_chunk(self, attention_output):
        # Run self.intermediate on the attention output to get the intermediate representation
        intermediate_output = self.intermediate(attention_output)
        # Run self.output on the intermediate and attention outputs to get the layer output
        layer_output = self.output(intermediate_output, attention_output)
        # Return the final layer output
        return layer_output
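
# Illustrative sketch (not part of the library source): `apply_chunking_to_forward`
# splits its input along `chunk_dim` into chunks of `chunk_size`, applies the forward
# function to each chunk and concatenates the results, trading peak memory for extra
# calls. Assuming the standard helper from transformers.pytorch_utils:
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def toy_feed_forward(x):
    return x * 2.0  # stands in for the intermediate + output blocks

x = torch.randn(1, 8, 4)  # (batch, seq_len, hidden)
chunked = apply_chunking_to_forward(toy_feed_forward, 2, 1, x)  # chunks of 2 along dim 1
assert torch.allclose(chunked, toy_feed_forward(x))  # identical result, smaller peak memory
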
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->MarkupLM
class MarkupLMEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # Stack of MarkupLMLayer modules; depth is given by config.num_hidden_layers
        self.layer = nn.ModuleList([MarkupLMLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # Accumulators, allocated only when the corresponding output is requested
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # Gradient checkpointing during training is incompatible with use_cache
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Cache of key/value states for the next decoding step, if caching is enabled
        next_decoder_cache = () if use_cache else None
        # Iterate over the encoder layers
        for i, layer_module in enumerate(self.layer):
            # Record the hidden states entering this layer, if requested
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Per-layer head mask and cached key/value states, if provided
            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # With gradient checkpointing enabled during training, recompute activations in the backward pass
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # Otherwise call the layer module directly
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # The first element of the layer outputs is the new hidden states
            hidden_states = layer_outputs[0]
            # With caching enabled, the last element is the updated key/value cache
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # The second element holds the self-attention weights
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # The third element holds the cross-attention weights, if enabled
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # Record the final hidden states, if requested
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Without return_dict, return a tuple of the non-None values
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # Otherwise wrap everything in a model output object
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
    """
    This model is a PyTorch `torch.nn.Module` sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MarkupLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    """
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

        xpath_tags_seq (`torch.LongTensor` of shape `({0}, config.max_depth)`, *optional*):
            Tag IDs for each token in the input sequence, padded up to config.max_depth.

        xpath_subs_seq (`torch.LongTensor` of shape `({0}, config.max_depth)`, *optional*):
            Subscript IDs for each token in the input sequence, padded up to config.max_depth.

        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices: `1` for tokens that are not masked, `0` for
            masked tokens.

            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate the first and second portions of the inputs: `0` corresponds to a
            *sentence A* token, `1` corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)

        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of the position of each input sequence token in the position embeddings, selected in the range
            `[0, config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules: `1` indicates the head is not masked, `0`
            indicates the head is masked.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids`, you can directly pass an embedded representation. This is
            useful if you want more control over how to convert *input_ids* indices into associated vectors.

        output_attentions (`bool`, *optional*):
            If set to `True`, the attention tensors of all attention layers are returned. See `attentions` under the
            returned tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            If set to `True`, the hidden states of all layers are returned. See `hidden_states` under the returned
            tensors for more detail.

        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare MarkupLM Model transformer outputting raw hidden-states without any specific head on top.",
    MARKUPLM_START_DOCSTRING,
)
class MarkupLMModel(MarkupLMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->MarkupLM
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # Embedding layer
        self.embeddings = MarkupLMEmbeddings(config)

        # Encoder stack
        self.encoder = MarkupLMEncoder(config)

        # Optional pooling layer
        self.pooler = MarkupLMPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    # Return the input embedding layer
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # Set the input embedding layer
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    # Prune attention heads of the model
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
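
    # Illustrative example (not part of the library source): pruning heads 0 and 1 of
    # layer 0 and head 2 of layer 2 would go through the public PreTrainedModel method:
    #     model.prune_heads({0: [0, 1], 2: [2]})
    # which dispatches to the _prune_heads implementation above.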

    @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        xpath_tags_seq: Optional[torch.LongTensor] = None,
        xpath_subs_seq: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Model forward pass. See BaseModelOutputWithPoolingAndCrossAttentions for specific outputs.
        """
        ...

    # Copied from transformers.models.bert.modeling_bert.BertModel.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs
    ):
        # Shape of the input ids
        input_shape = input_ids.shape
        # If no attention mask is provided, create an all-ones mask
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # If past key/value states exist, trim the input ids accordingly
        if past_key_values is not None:
            # Length of the cached sequence
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to the old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            # Keep only the un-cached suffix of the input
            input_ids = input_ids[:, remove_prefix_length:]

        # Return the prepared generation inputs
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    # Copied from transformers.models.bert.modeling_bert.BertModel._reorder_cache
    def _reorder_cache(self, past_key_values, beam_idx):
        # Reorder the cached key/value states to match the new beam order
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                # For each layer, gather the cached states along the batch axis with beam_idx
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
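
# Illustrative sketch (not part of the library source): the gather performed by
# `_reorder_cache`. Cached states carry the beam dimension first; `index_select`
# along dim 0 re-maps each row to the beam it now continues (toy shapes below).
import torch

past_state = torch.arange(6.0).reshape(3, 2)  # 3 beams, a toy cached state per beam
beam_idx = torch.tensor([2, 0, 0])            # beam 0 continues old beam 2; beams 1-2 continue old beam 0
reordered = past_state.index_select(0, beam_idx)
# reordered[0] equals past_state[2]; reordered[1] and reordered[2] equal past_state[0]
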
@add_start_docstrings(
    """
    MarkupLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MARKUPLM_START_DOCSTRING,
)
class MarkupLMForQuestionAnswering(MarkupLMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with bert->markuplm, Bert->MarkupLM
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.markuplm = MarkupLMModel(config, add_pooling_layer=False)  # backbone MarkupLM model
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # linear layer producing the QA logits

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        xpath_tags_seq: Optional[torch.Tensor] = None,
        xpath_subs_seq: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        ...


@add_start_docstrings(
    """MarkupLM Model with a `token_classification` head on top.""",
    MARKUPLM_START_DOCSTRING
)
class MarkupLMForTokenClassification(MarkupLMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with bert->markuplm, Bert->MarkupLM
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.markuplm = MarkupLMModel(config, add_pooling_layer=False)  # backbone MarkupLM model
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)  # dropout layer
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # linear classification head

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    # Model forward pass
    def forward(
        self,
        # Input token ids
        input_ids: Optional[torch.Tensor] = None,
        # XPath tag sequences
        xpath_tags_seq: Optional[torch.Tensor] = None,
        # XPath subscript sequences
        xpath_subs_seq: Optional[torch.Tensor] = None,
        # Attention mask
        attention_mask: Optional[torch.Tensor] = None,
        # Token type ids
        token_type_ids: Optional[torch.Tensor] = None,
        # Position ids
        position_ids: Optional[torch.Tensor] = None,
        # Head mask
        head_mask: Optional[torch.Tensor] = None,
        # Pre-computed input embeddings
        inputs_embeds: Optional[torch.Tensor] = None,
        # Labels for computing the loss
        labels: Optional[torch.Tensor] = None,
        # Whether to return attention weights
        output_attentions: Optional[bool] = None,
        # Whether to return all hidden states
        output_hidden_states: Optional[bool] = None,
        # Whether to return a ModelOutput instead of a plain tuple
        return_dict: Optional[bool] = None,
    ):
        # Fall back to the config default when return_dict is not given
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the backbone MarkupLM model; the output format depends on return_dict
        outputs = self.markuplm(
            input_ids,
            xpath_tags_seq=xpath_tags_seq,
            xpath_subs_seq=xpath_subs_seq,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Sequence of hidden states from the backbone
        sequence_output = outputs[0]

        # Per-token classification logits, shape (batch_size, seq_length, node_type_size)
        prediction_scores = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Cross-entropy loss between the flattened logits and labels
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                prediction_scores.view(-1, self.config.num_labels),
                labels.view(-1),
            )

        if not return_dict:
            # Tuple output: logits plus any extra backbone outputs, with the loss prepended when available
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Dict-style output with loss, logits, hidden states and attentions
        return TokenClassifierOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
"""
在标记LM模型的基础上增加一个顶部的序列分类/回归头部(即在池化输出之上的线性层),例如用于GLUE任务。
"""
@add_start_docstrings(
    """
    在标记LM模型的基础上增加一个顶部的序列分类/回归头部(即在池化输出之上的线性层),例如用于GLUE任务。
    """,
    MARKUPLM_START_DOCSTRING,
)
class MarkupLMForSequenceClassification(MarkupLMPreTrainedModel):
    """
    从transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__复制而来,将bert->markuplm, Bert->MarkupLM。
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels  # number of classification labels from the config
        self.config = config  # keep the config around

        self.markuplm = MarkupLMModel(config)  # backbone MarkupLM model
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)  # dropout layer
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # linear classification head

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        xpath_tags_seq: Optional[torch.Tensor] = None,
        xpath_subs_seq: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> SequenceClassifierOutput:
        """
        Forward the inputs through the MarkupLM model; supports a wide range of input and output options.
        """
        pass  # the actual forward logic is not shown here

.\models\markuplm\processing_markuplm.py

# File encoding: UTF-8
# Copyright The HuggingFace Inc. team. Licensed under the Apache License, Version 2.0;
# this file may not be used except in compliance with the License, a copy of which is
# available at the link above. The software is distributed on an "AS IS" basis, without
# warranties or conditions of any kind, express or implied; see the License for the
# specific language governing permissions and limitations.
"""
Processor class for MarkupLM.
"""
# Typing helpers
from typing import Optional, Union

# Tensor type helper
from ...file_utils import TensorType
# Processor mixin base class
from ...processing_utils import ProcessorMixin
# Batch encoding plus padding/truncation strategies
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy


class MarkupLMProcessor(ProcessorMixin):
    r"""
    Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
    processor.

    [`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
    Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
    `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.

    Args:
        feature_extractor (`MarkupLMFeatureExtractor`):
            An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
            An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
        parse_html (`bool`, *optional*, defaults to `True`):
            Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
    """

    # Class names of the underlying feature extractor and tokenizer
    feature_extractor_class = "MarkupLMFeatureExtractor"
    tokenizer_class = ("MarkupLMTokenizer", "MarkupLMTokenizerFast")
    # Whether to parse HTML strings into nodes and corresponding xpaths; defaults to True
    parse_html = True

    def __call__(
        self,
        html_strings=None,
        nodes=None,
        xpaths=None,
        node_labels=None,
        questions=None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
        passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
        returns the output.

        Optionally, one can also provide a `text` argument which is passed along as first sequence.

        Please refer to the docstring of the above two methods for more information.
        """
        # First, create nodes and xpaths, either by parsing HTML or from the provided arguments
        if self.parse_html:
            # With parse_html=True, HTML strings must be provided
            if html_strings is None:
                raise ValueError("Make sure to pass HTML strings in case `parse_html` is set to `True`")

            # With parse_html=True, nodes, xpaths and node_labels must not be provided
            if nodes is not None or xpaths is not None or node_labels is not None:
                raise ValueError(
                    "Please don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`"
                )

            # Run the feature extractor on the HTML strings to obtain nodes and xpaths
            features = self.feature_extractor(html_strings)
            nodes = features["nodes"]
            xpaths = features["xpaths"]
        else:
            # With parse_html=False, HTML strings must not be provided
            if html_strings is not None:
                raise ValueError("You have passed HTML strings but `parse_html` is set to `False`.")
            # With parse_html=False, nodes and xpaths must be provided
            if nodes is None or xpaths is None:
                raise ValueError("Make sure to pass nodes and xpaths in case `parse_html` is set to `False`")

        # Second, apply the tokenizer
        if questions is not None and self.parse_html:
            # Wrap a single question in a list (the feature extractor always adds a batch dimension)
            if isinstance(questions, str):
                questions = [questions]

        # Run the tokenizer on the inputs and return the encoded result
        encoded_inputs = self.tokenizer(
            text=questions if questions is not None else nodes,
            text_pair=nodes if questions is not None else None,
            xpaths=xpaths,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )

        # Return the encoded inputs
        return encoded_inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to MarkupLMTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        # Forward all arguments to the tokenizer's batch_decode
        return self.tokenizer.batch_decode(*args, **kwargs)
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to MarkupLMTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        # Forward all arguments to the tokenizer's decode
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        # Expose the tokenizer's model input names
        tokenizer_input_names = self.tokenizer.model_input_names
        return tokenizer_input_names
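
# Illustrative usage sketch (not part of the library source); the checkpoint name and
# the HTML string below are example values.
from transformers import MarkupLMProcessor

processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
html = "<html><body><h1>Hello world</h1></body></html>"
encoding = processor(html, return_tensors="pt")
# encoding holds input_ids, attention_mask, token_type_ids,
# xpath_tags_seq and xpath_subs_seq for the parsed HTML nodes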

.\models\markuplm\tokenization_markuplm.py

# coding=utf-8
# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
# file except in compliance with the License. You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Tokenization classes for MarkupLM."""

import json  # JSON handling
import os    # filesystem helpers
from functools import lru_cache  # memoization decorator
from typing import Dict, List, Optional, Tuple, Union  # type hints

import regex as re  # regular expressions

from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings  # shared file utilities
from ...tokenization_utils import AddedToken, PreTrainedTokenizer  # tokenizer base classes
from ...tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)  # tokenization base utilities
from ...utils import logging  # logging helpers


logger = logging.get_logger(__name__)  # module logger

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/vocab.json",
        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/vocab.json",
    },
    "merges_file": {
        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/markuplm-base": 512,
    "microsoft/markuplm-large": 512,
}


@lru_cache()
def bytes_to_unicode():
    """
    Returns the list of utf-8 bytes and a mapping to unicode strings. Whitespace and control characters are
    deliberately avoided, since they break the BPE code. The reversible BPE codes work on unicode strings, so to
    avoid UNKs the vocab needs to contain a large number of unicode characters: for a dataset of roughly 10B tokens
    you would need around 5K characters for decent coverage, a significant fraction of a normal 32K BPE vocab. To
    avoid that, this function provides a lookup table between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
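
# Illustrative sketch (not part of the library source): printable bytes map to
# themselves, while e.g. the space byte (0x20) is remapped to a printable stand-in.
byte_to_unicode = bytes_to_unicode()
assert byte_to_unicode[ord("A")] == "A"       # printable bytes are unchanged
assert byte_to_unicode[ord(" ")] == "\u0120"  # space becomes "Ġ", the familiar GPT-2 marker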


def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word. Here a word is represented as a tuple of symbols (symbols
    being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]  # start from the first symbol
    for char in word[1:]:  # iterate from the second symbol onwards
        # Record the pair formed by the previous and the current symbol
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
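
# Illustrative sketch (not part of the library source): one BPE merge-selection step
# using get_pairs and a toy merge-rank table, mirroring the loop in MarkupLMTokenizer.bpe.
toy_ranks = {("l", "o"): 0, ("w", "e"): 1}  # lower rank = higher merge priority
word = ("h", "e", "l", "l", "o")
pairs = get_pairs(word)  # {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}
best = min(pairs, key=lambda pair: toy_ranks.get(pair, float("inf")))
assert best == ("l", "o")  # the only pair with a known rank, so it is merged first
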
class MarkupLMTokenizer(PreTrainedTokenizer):
    r"""
    Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE). [`MarkupLMTokenizer`] can be used to
    turn HTML strings into token-level `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and
    `xpath_subs_seq`. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
    """
    # Names of the vocabulary files
    vocab_files_names = VOCAB_FILES_NAMES
    # Map from model names to pretrained vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Map from model names to maximum model input sizes (positional embedding sizes)
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Constructor
    def __init__(
        self,
        vocab_file,
        merges_file,
        tags_dict,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        max_depth=50,
        max_width=1000,
        pad_width=1001,
        pad_token_label=-100,
        only_label_first_subword=True,
        **kwargs,
    ):
        # Set up the tokenizer attributes from the constructor arguments (body elided)
        ...

    def get_xpath_seq(self, xpath):
        """
        Given an xpath expression (such as "/html/body/div/li[1]/div/span[2]"), return the list of tag ids and
        corresponding subscripts, taking the maximum depth into account.
        """
        # Tag ids and subscripts collected from the xpath expression
        xpath_tags_list = []
        xpath_subs_list = []

        # Split the xpath expression on "/"
        xpath_units = xpath.split("/")
        for unit in xpath_units:
            # Skip empty units
            if not unit.strip():
                continue
            # Split the unit into the tag name and an optional subscript
            name_subs = unit.strip().split("[")
            tag_name = name_subs[0]
            # The subscript defaults to 0; otherwise parse the bracketed integer
            sub = 0 if len(name_subs) == 1 else int(name_subs[1][:-1])
            # Look up the tag id, falling back to the unknown-tag id
            xpath_tags_list.append(self.tags_dict.get(tag_name, self.unk_tag_id))
            # Clamp the subscript to the maximum width
            xpath_subs_list.append(min(self.max_width, sub))

        # Truncate both lists to the maximum depth
        xpath_tags_list = xpath_tags_list[: self.max_depth]
        xpath_subs_list = xpath_subs_list[: self.max_depth]
        # Pad both lists up to the maximum depth
        xpath_tags_list += [self.pad_tag_id] * (self.max_depth - len(xpath_tags_list))
        xpath_subs_list += [self.pad_width] * (self.max_depth - len(xpath_subs_list))

        return xpath_tags_list, xpath_subs_list
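
    # Illustrative example (not part of the library source): for the xpath
    # "/html/body/div/li[1]/div/span[2]", get_xpath_seq produces the tag ids of
    # [html, body, div, li, div, span] and the subscripts [0, 0, 0, 1, 0, 2], each
    # truncated and then padded (with pad_tag_id / pad_width) up to max_depth entries.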

    @property
    def vocab_size(self):
        # Size of the base vocabulary
        return len(self.encoder)

    def get_vocab(self):
        # Full vocabulary: the base encoder plus any added tokens
        vocab = self.encoder.copy()
        vocab.update(self.added_tokens_encoder)
        return vocab
    def bpe(self, token):
        # Return the cached result if this token was merged before
        if token in self.cache:
            return self.cache[token]
        # Represent the token as a tuple of single characters
        word = tuple(token)
        # Collect all adjacent symbol pairs
        pairs = get_pairs(word)

        # A token with no pairs (a single character) is returned unchanged
        if not pairs:
            return token

        # Keep merging until no further merge applies
        while True:
            # Pick the pair with the lowest BPE rank, i.e. the highest merge priority
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            # Stop when the best pair is not a known merge
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            # Walk through the word, merging every occurrence of (first, second)
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of first: keep the rest and stop
                    new_word.extend(word[i:])
                    break
                else:
                    # Keep everything before this occurrence of first
                    new_word.extend(word[i:j])
                    i = j

                # Merge first and second into a single symbol when they are adjacent
                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    # Otherwise keep the current symbol as is
                    new_word.append(word[i])
                    i += 1
            # Continue with the merged word
            new_word = tuple(new_word)
            word = new_word
            # A single remaining symbol cannot be merged further
            if len(word) == 1:
                break
            else:
                # Recompute the pairs for the next round of merging
                pairs = get_pairs(word)
        # Join the final symbols with spaces
        word = " ".join(word)
        # Cache the result for this token
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        # Split the text into tokens using the pre-tokenization pattern self.pat
        for token in re.findall(self.pat, text):
            # Map every utf-8 byte of the token to its unicode stand-in, so that BPE
            # never sees control characters (such as spaces in our case)
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            # Apply BPE and collect the resulting sub-tokens
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Fall back to the id of unk_token for out-of-vocabulary tokens
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Warn that decoding is experimental: MarkupLM does not support generative tasks
        logger.warning(
            "MarkupLM now does not support generative tasks, decoding is experimental and subject to change."
        )
        # Concatenate the tokens into a single string
        text = "".join(tokens)
        # Map the unicode stand-ins back to bytes, then decode them as utf-8
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # The save path must be an existing directory
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Path of the vocabulary file
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        # Path of the merges file
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        # Write the vocabulary as JSON
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        # Write the BPE merges, sorted by merge rank
        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        # Optionally prepend a space to the text, then forward the remaining kwargs
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
            text = " " + text
        return (text, kwargs)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A RoBERTa sequence has the following format:
        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            # Single sequence with special tokens
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        # Sequence pair with special tokens
        return cls + token_ids_0 + sep + token_ids_1 + sep
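
    # Illustrative example (not part of the library source): in terms of special tokens,
    # a single sequence A becomes `<s> A </s>` and a pair (A, B) becomes
    # `<s> A </s> B </s>`. Note that the code above inserts a single `</s>` between the
    # two sequences, whereas the RoBERTa-style docstring shows `</s></s>`.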

    def build_xpath_tags_with_special_tokens(
        self, xpath_tags_0: List[int], xpath_tags_1: Optional[List[int]] = None
    ) -> List[int]:
        # Special-token padding entry for xpath tag sequences
        pad = [self.pad_xpath_tags_seq]
        # Single sequence: pad on both sides
        if xpath_tags_1 is None or len(xpath_tags_1) == 0:
            return pad + xpath_tags_0 + pad
        # Sequence pair: pad around both sequences
        return pad + xpath_tags_0 + pad + xpath_tags_1 + pad

    def build_xpath_subs_with_special_tokens(
        self, xpath_subs_0: List[int], xpath_subs_1: Optional[List[int]] = None
    ) -> List[int]:
        # Special-token padding entry for xpath subscript sequences
        pad = [self.pad_xpath_subs_seq]
        # Single sequence: pad on both sides
        if xpath_subs_1 is None or len(xpath_subs_1) == 0:
            return pad + xpath_subs_0 + pad
        # Sequence pair: pad around both sequences
        return pad + xpath_subs_0 + pad + xpath_subs_1 + pad

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Args:
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Delegate to the superclass when the list already contains special tokens
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Single sequence: [1] for <s>, zeros for the sequence tokens, [1] for </s>
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        # Sequence pair: special tokens around and between the two sequences
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of zeros.
        """
        # Separator and classifier special tokens
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: zeros over cls + sequence + sep
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        # Sequence pair: zeros over the full pair layout
        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # Entry point: accepts several text input formats together with xpaths and node labels
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        xpaths: Union[List[List[int]], List[List[List[int]]]] = None,
        node_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Body elided; forwards all arguments to batch_encode_plus / encode_plus
        ...

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        xpaths: Optional[List[List[List[int]]]] = None,
        node_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Resolve padding and truncation strategies, with backward compatibility for the
        # legacy 'truncation_strategy' and 'pad_to_max_length' arguments
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate the actual batch encoding to _batch_encode_plus
        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            xpaths=xpaths,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        xpaths: Optional[List[List[List[int]]]] = None,
        node_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        if return_offsets_mapping:
            # Offset mappings are only available with fast (Rust-backed) tokenizers
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        # Prepare the whole batch for the model
        batch_outputs = self._batch_prepare_for_model(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            xpaths=xpaths,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        # Wrap the batch outputs in a BatchEncoding
        return BatchEncoding(batch_outputs)

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_text_or_text_pairs,
        is_pair: bool = None,
        xpaths: Optional[List[List[int]]] = None,
        node_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        # Initialize an empty dictionary to store batch outputs
        batch_outputs = {}

        # Iterate over each example in the batch, paired with xpaths
        for idx, example in enumerate(zip(batch_text_or_text_pairs, xpaths)):
            # Unpack the example into text or text pairs and xpaths
            batch_text_or_text_pair, xpaths_example = example

            # Call a method to prepare the inputs for the model
            outputs = self.prepare_for_model(
                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,  # First sequence or single sequence
                batch_text_or_text_pair[1] if is_pair else None,  # Second sequence (if pair) or None
                xpaths_example,  # XPath example for special handling
                node_labels=node_labels[idx] if node_labels is not None else None,  # Node labels if provided
                add_special_tokens=add_special_tokens,  # Whether to add special tokens
                padding=PaddingStrategy.DO_NOT_PAD.value,  # Padding strategy (no padding here)
                truncation=truncation_strategy.value,  # Truncation strategy for sequences
                max_length=max_length,  # Maximum length of sequences
                stride=stride,  # Stride for overflowing tokens
                pad_to_multiple_of=None,  # No padding to multiple of any specific number
                return_attention_mask=False,  # Do not return attention masks
                return_token_type_ids=return_token_type_ids,  # Whether to return token type IDs
                return_overflowing_tokens=return_overflowing_tokens,  # Whether to return overflowing tokens
                return_special_tokens_mask=return_special_tokens_mask,  # Whether to return special tokens mask
                return_length=return_length,  # Whether to return the length of sequences
                return_tensors=None,  # Do not convert batch to tensors immediately
                prepend_batch_axis=False,  # Do not prepend batch axis
                verbose=verbose,  # Verbosity level
            )

            # Aggregate outputs into batch_outputs dictionary
            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        # Pad the batch outputs according to specified padding strategy and maximum length
        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,  # Padding strategy enumeration value
            max_length=max_length,  # Maximum length of sequences for padding
            pad_to_multiple_of=pad_to_multiple_of,  # Pad to multiple of specified value
            return_attention_mask=return_attention_mask,  # Whether to return attention masks
        )

        # Convert batch_outputs into a BatchEncoding object with specified tensor type
        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        # Return the final prepared batch outputs
        return batch_outputs

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
    # Encode text (plus an optional text pair and xpaths) into a list of model-ready input IDs
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        xpaths: Optional[List[List[int]]] = None,
        node_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> List[int]:
        # Delegate to encode_plus and get back the full dictionary of encoded inputs
        encoded_inputs = self.encode_plus(
            text=text,
            text_pair=text_pair,
            xpaths=xpaths,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # Return only the list of input token IDs from the encoded inputs
        return encoded_inputs["input_ids"]
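
    # Usage sketch (not part of the original file; checkpoint name and exact IDs are assumptions):
    # >>> from transformers import MarkupLMTokenizer
    # >>> tokenizer = MarkupLMTokenizer.from_pretrained("microsoft/markuplm-base")
    # >>> nodes = ["hello", "world"]
    # >>> xpaths = ["/html/body/div/li[1]", "/html/body/div/li[2]"]
    # >>> tokenizer.encode(nodes, xpaths=xpaths)  # list of token IDs, starting with <s> and ending with </s>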

    # add_end_docstrings appends the shared encode kwargs documentation to encode_plus
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        xpaths: Optional[List[List[int]]] = None,
        node_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
        `__call__` should be used instead.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
            text_pair (`List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a list of strings (nodes of a single example) or a
                list of list of strings (nodes of a batch of examples).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # Resolve the padding and truncation strategies together with the related parameters
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Dispatch to _encode_plus for the actual encoding
        return self._encode_plus(
            text=text,
            xpaths=xpaths,
            text_pair=text_pair,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        xpaths: Optional[List[List[int]]] = None,
        node_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Offset mappings are only available with fast (Rust-backed) tokenizers
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        # Build the final model inputs via prepare_for_model()
        return self.prepare_for_model(
            text=text,  # main text input
            text_pair=text_pair,  # optional second text input (for pair inputs)
            xpaths=xpaths,  # xpath sequences
            node_labels=node_labels,  # node label sequences
            add_special_tokens=add_special_tokens,  # whether to add special tokens (e.g. [CLS], [SEP])
            padding=padding_strategy.value,  # padding strategy
            truncation=truncation_strategy.value,  # truncation strategy
            max_length=max_length,  # maximum length limit
            stride=stride,  # sliding-window stride
            pad_to_multiple_of=pad_to_multiple_of,  # pad to a multiple of this value
            return_tensors=return_tensors,  # tensor type to return
            prepend_batch_axis=True,  # prepend a batch axis
            return_attention_mask=return_attention_mask,  # whether to return the attention mask
            return_token_type_ids=return_token_type_ids,  # whether to return token type IDs
            return_overflowing_tokens=return_overflowing_tokens,  # whether to return overflowing tokens
            return_special_tokens_mask=return_special_tokens_mask,  # whether to return the special-tokens mask
            return_length=return_length,  # whether to return sequence lengths
            verbose=verbose,  # whether to print verbose information
        )

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        text: Union[TextInput, PreTokenizedInput],  # main text input or pre-tokenized input
        text_pair: Optional[PreTokenizedInput] = None,  # optional second text input (for pair inputs)
        xpaths: Optional[List[List[int]]] = None,  # list of xpath tag sequences
        node_labels: Optional[List[int]] = None,  # list of node labels
        add_special_tokens: bool = True,  # whether to add special tokens (e.g. [CLS], [SEP])
        padding: Union[bool, str, PaddingStrategy] = False,  # padding strategy
        truncation: Union[bool, str, TruncationStrategy] = None,  # truncation strategy
        max_length: Optional[int] = None,  # maximum length limit
        stride: int = 0,  # sliding-window stride
        pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
        return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type IDs
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return offset mappings
        return_length: bool = False,  # whether to return sequence lengths
        verbose: bool = True,  # whether to print verbose information
        prepend_batch_axis: bool = False,  # whether to prepend a batch axis
        **kwargs,  # additional keyword arguments
    def truncate_sequences(
        self,
        ids: List[int],  # token IDs of the first sequence
        xpath_tags_seq: List[List[int]],  # list of xpath tag sequences
        xpath_subs_seq: List[List[int]],  # list of xpath subscript sequences
        pair_ids: Optional[List[int]] = None,  # token IDs of the optional second sequence
        pair_xpath_tags_seq: Optional[List[List[int]]] = None,  # xpath tag sequences of the second sequence
        pair_xpath_subs_seq: Optional[List[List[int]]] = None,  # xpath subscript sequences of the second sequence
        labels: Optional[List[int]] = None,  # labels (e.g. for a classification task)
        num_tokens_to_remove: int = 0,  # number of tokens to remove
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",  # truncation strategy
        stride: int = 0,  # sliding-window stride
    # Private helper `_pad` that pads the encoded inputs
    def _pad(
        self,
        # Either a dictionary (single example) or a BatchEncoding (multiple examples)
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        # Maximum length to pad to
        max_length: Optional[int] = None,
        # Padding strategy, no padding by default
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        # If set, pad the length to a multiple of this value
        pad_to_multiple_of: Optional[int] = None,
        # Whether to return the attention mask; by default derived from the padding strategy
        return_attention_mask: Optional[bool] = None,

.\models\markuplm\tokenization_markuplm_fast.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fast tokenization class for MarkupLM. It overrides two methods of the slow tokenizer class, namely _batch_encode_plus
and _encode_plus, in which the Rust tokenizer is used.
"""

import json
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union

from tokenizers import pre_tokenizers, processors

# File utilities and shared constants
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
# Base tokenization utilities
from ...tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
# Fast tokenization base class
from ...tokenization_utils_fast import PreTrainedTokenizerFast
# Logging utilities
from ...utils import logging
# Companion slow tokenizer class and its extra docstring
from .tokenization_markuplm import MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, MarkupLMTokenizer

# Get the module logger
logger = logging.get_logger(__name__)

# Vocabulary file name mapping
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# Pretrained-model vocabulary file mapping
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/vocab.json",
        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/vocab.json",
    },
    "merges_file": {
        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
    },
}

# Pretrained positional-embedding size mapping
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/markuplm-base": 512,
    "microsoft/markuplm-large": 512,
}

@lru_cache()
def bytes_to_unicode():
    """
    Returns the list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace or
    control characters that the bpe code barfs on. The reversible bpe codes work on unicode strings, which means you
    need a large number of unicode characters in your vocab if you want to avoid UNKs. On something like a 10B-token
    dataset you end up needing around 5K characters for decent coverage, which is a significant fraction of a normal,
    say, 32K bpe vocab. To avoid that, we build lookup tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    # Iterate over every possible byte value (0..255)
    for b in range(2**8):
        # If the byte is not already in bs
        if b not in bs:
            # Add the byte to bs
            bs.append(b)
            # Map it to a codepoint above the byte range: 2**8 + n
            cs.append(2**8 + n)
            # Bump the counter
            n += 1

    # Convert every codepoint in cs to its unicode character
    cs = [chr(n) for n in cs]

    # Zip bytes with characters to build the lookup table
    return dict(zip(bs, cs))
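
# Sanity-check sketch for bytes_to_unicode (not part of the original file): the table is a
# bijection over all 256 byte values; printable bytes map to themselves and the remaining
# bytes are shifted above the byte range.
# >>> table = bytes_to_unicode()
# >>> len(table), len(set(table.values()))
# (256, 256)
# >>> table[ord("A")], table[0]
# ('A', 'Ā')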
# Return the set of adjacent symbol pairs in a word. A word is represented as a tuple of symbols, where each symbol
# is a variable-length string.
def get_pairs(word):
    """
    Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    # Set collecting the symbol pairs
    pairs = set()
    # Start with the first symbol as the "previous" one
    prev_char = word[0]
    # Walk over the remaining symbols of the word
    for char in word[1:]:
        # Record the (previous, current) pair
        pairs.add((prev_char, char))
        # The current symbol becomes the previous one for the next iteration
        prev_char = char
    # Return all collected pairs
    return pairs
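
# Quick example for get_pairs (not part of the original file; set order is arbitrary):
# >>> get_pairs(("h", "e", "l", "l", "o"))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}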


class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).

    [`MarkupLMTokenizerFast`] can be used to turn HTML strings into token-level `input_ids`, `attention_mask`,
    `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`. This tokenizer inherits from [`PreTrainedTokenizerFast`],
    which contains most of the main methods.

    Users should refer to this superclass for more information regarding those methods.
    """
    # 导入所需的库或模块
    Args:
        vocab_file (`str`):
            # 词汇表文件的路径。
        merges_file (`str`):
            # 合并文件的路径。
        errors (`str`, *optional*, defaults to `"replace"`):
            # 解码字节为 UTF-8 时遇到错误的处理方式。详见 Python 文档中的 bytes.decode 描述。
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            # 在预训练期间用作序列开头的特殊标记。也可用作序列分类器的标记。
            # <Tip>提示:在使用特殊标记构建序列时,并非使用此标记作为序列的开头标记。实际上使用的是 `cls_token`。</Tip>
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            # 序列结尾的特殊标记。
            # <Tip>提示:在使用特殊标记构建序列时,并非使用此标记作为序列的结尾标记。实际上使用的是 `sep_token`。</Tip>
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            # 分隔符标记,在构建来自多个序列的序列时使用,例如序列分类或问题回答中的文本和问题。同时也用作使用特殊标记构建序列的最后一个标记。
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            # 分类器标记,在进行序列分类(整个序列而不是每个标记的分类)时使用。在使用特殊标记构建序列时,它是序列的第一个标记。
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            # 未知标记。如果词汇表中不存在的标记,将无法将其转换为 ID,而会被设置为此标记。
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            # 用于填充的标记,例如在对不同长度的序列进行批处理时使用。
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            # 用于掩码值的标记。在使用掩码语言建模训练模型时使用,模型将尝试预测此标记。
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            # 是否在输入之前添加一个初始空格。这样可以像对待其他单词一样对待前导单词。(RoBERTa 分词器通过前导空格来检测单词的开头)。
    
    # 以下变量可能为预训练模型配置的文件和大小映射提供了默认值
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = MarkupLMTokenizer
    def __init__(
        self,
        vocab_file,
        merges_file,
        tags_dict,
        tokenizer_file=None,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        max_depth=50,
        max_width=1000,
        pad_width=1001,
        pad_token_label=-100,
        only_label_first_subword=True,
        trim_offsets=False,
        **kwargs,
    ):
        """
        Initialize the class with required and optional parameters for tokenization and tagging.

        Args:
            vocab_file (str): Path to vocabulary file.
            merges_file (str): Path to merges file for tokenization.
            tags_dict (dict): Dictionary mapping tag names to IDs.
            tokenizer_file (str, optional): Path to tokenizer file. Defaults to None.
            errors (str, optional): Error handling method during tokenization. Defaults to "replace".
            bos_token (str, optional): Beginning of sequence token. Defaults to "<s>".
            eos_token (str, optional): End of sequence token. Defaults to "</s>".
            sep_token (str, optional): Separator token. Defaults to "</s>".
            cls_token (str, optional): Classification token. Defaults to "<s>".
            unk_token (str, optional): Unknown token. Defaults to "<unk>".
            pad_token (str, optional): Padding token. Defaults to "<pad>".
            mask_token (str, optional): Mask token. Defaults to "<mask>".
            add_prefix_space (bool, optional): Whether to add prefix space during tokenization. Defaults to False.
            max_depth (int, optional): Maximum depth for XPath processing. Defaults to 50.
            max_width (int, optional): Maximum width for XPath processing. Defaults to 1000.
            pad_width (int, optional): Padding width for XPath processing. Defaults to 1001.
            pad_token_label (int, optional): Padding token label for subword tagging. Defaults to -100.
            only_label_first_subword (bool, optional): Whether to label only the first subword. Defaults to True.
            trim_offsets (bool, optional): Whether to trim offsets. Defaults to False.
            **kwargs: Additional keyword arguments.
        """
        pass

    def get_xpath_seq(self, xpath):
        """
        Given the xpath expression of one particular node (like "/html/body/div/li[1]/div/span[2]"), return a list of
        tag IDs and corresponding subscripts, taking into account max depth.
        """
        xpath_tags_list = []
        xpath_subs_list = []

        xpath_units = xpath.split("/")
        for unit in xpath_units:
            if not unit.strip():
                continue
            name_subs = unit.strip().split("[")
            tag_name = name_subs[0]
            sub = 0 if len(name_subs) == 1 else int(name_subs[1][:-1])
            xpath_tags_list.append(self.tags_dict.get(tag_name, self.unk_tag_id))
            xpath_subs_list.append(min(self.max_width, sub))

        xpath_tags_list = xpath_tags_list[: self.max_depth]
        xpath_subs_list = xpath_subs_list[: self.max_depth]
        xpath_tags_list += [self.pad_tag_id] * (self.max_depth - len(xpath_tags_list))
        xpath_subs_list += [self.pad_width] * (self.max_depth - len(xpath_subs_list))

        return xpath_tags_list, xpath_subs_list
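
    # Worked example (not part of the original file; assumes a tags_dict mapping
    # html=0, body=1, div=2, li=3, span=4 and a sufficiently large max_depth):
    # >>> tokenizer.get_xpath_seq("/html/body/div/li[1]/div/span[2]")
    # before padding to max_depth, the two lists start with:
    #   xpath_tags_list = [0, 1, 2, 3, 2, 4] (+ pad_tag_id entries)
    #   xpath_subs_list = [0, 0, 0, 1, 0, 2] (+ pad_width entries)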

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        xpaths: Union[List[List[int]], List[List[List[int]]]] = None,
        node_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """
        Encode the input text(s) along with associated parameters into token IDs, token type IDs, and attention masks.

        Args:
            text (Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]): Input text or texts.
            text_pair (Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]], optional): Second input text or texts. Defaults to None.
            xpaths (Union[List[List[int]], List[List[List[int]]]], optional): List of XPath sequences. Defaults to None.
            node_labels (Optional[Union[List[int], List[List[int]]]], optional): Node labels corresponding to XPaths. Defaults to None.
            add_special_tokens (bool, optional): Whether to add special tokens. Defaults to True.
            padding (Union[bool, str, PaddingStrategy], optional): Padding strategy or maximum length for padding. Defaults to False.
            truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy or maximum length for truncation. Defaults to None.
            max_length (Optional[int], optional): Maximum length of the returned sequences. Defaults to None.
            stride (int, optional): Stride for overflowing tokens. Defaults to 0.
            pad_to_multiple_of (Optional[int], optional): Pad to a multiple of specified value. Defaults to None.
            return_tensors (Optional[Union[str, TensorType]], optional): Type of tensors to return. Defaults to None.
            return_token_type_ids (Optional[bool], optional): Whether to return token type IDs. Defaults to None.
            return_attention_mask (Optional[bool], optional): Whether to return attention mask. Defaults to None.
            return_overflowing_tokens (bool, optional): Whether to return overflowing tokens. Defaults to False.
            return_special_tokens_mask (bool, optional): Whether to return special tokens mask. Defaults to False.
            return_offsets_mapping (bool, optional): Whether to return offsets mapping. Defaults to False.
            return_length (bool, optional): Whether to return length of the encoded sequence. Defaults to False.
            verbose (bool, optional): Whether to output verbose information. Defaults to True.
            **kwargs: Additional keyword arguments.
        """
        pass
    # Batch-encode texts or text pairs and return a BatchEncoding
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,  # whether the inputs are text pairs
        xpaths: Optional[List[List[List[int]]]] = None,  # xpath information for HTML/XML-style inputs
        node_labels: Optional[Union[List[int], List[List[int]]]] = None,  # node label information
        add_special_tokens: bool = True,  # whether to add special tokens (e.g. [CLS] and [SEP])
        padding: Union[bool, str, PaddingStrategy] = False,  # padding strategy: bool, string or PaddingStrategy
        truncation: Union[bool, str, TruncationStrategy] = None,  # truncation strategy: bool, string or TruncationStrategy
        max_length: Optional[int] = None,  # maximum length limit
        stride: int = 0,  # sliding-window stride
        pad_to_multiple_of: Optional[int] = None,  # pad sequences to a multiple of this value
        return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token_type_ids
        return_attention_mask: Optional[bool] = None,  # whether to return the attention_mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return offset mappings
        return_length: bool = False,  # whether to return sequence lengths
        verbose: bool = True,  # whether to print verbose information
        **kwargs,  # additional keyword arguments
    ) -> BatchEncoding:
        # Handle 'padding', 'truncation', 'max_length' and related arguments for backward compatibility
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the internal batch-encoding method and return its result
        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            xpaths=xpaths,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    # Tokenize a single text (or text pair) and return the list of tokens
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        batched_input = [(text, pair)] if pair else [text]
        # Batch-encode the text (or text pair) with the underlying Rust tokenizer
        encodings = self._tokenizer.encode_batch(
            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
        )

        # Return the token list of the first (and only) encoding
        return encodings[0].tokens
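
    # Usage sketch (not part of the original file; checkpoint name assumed, output indicative):
    # >>> tokenizer = MarkupLMTokenizerFast.from_pretrained("microsoft/markuplm-base")
    # >>> tokenizer.tokenize("hello world")
    # ['hello', 'Ġworld']  # byte-level BPE marks the leading space with 'Ġ'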

    # Decorator appending the shared docstrings to the function below
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        xpaths: Optional[List[List[int]]] = None,
        node_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
        `__call__` should be used instead.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
            text_pair (`List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                list of list of strings (words of a batch of examples).
        """

        # Resolve the padding and truncation strategies, handling deprecated arguments for backward compatibility
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the internal `_encode_plus` method for the actual encoding and preparation
        return self._encode_plus(
            text=text,
            xpaths=xpaths,
            text_pair=text_pair,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )
    # Batch-encode texts or text pairs, supporting several input types
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,  # whether the inputs are text pairs
        xpaths: Optional[List[List[List[int]]]] = None,  # xpath paths locating each node in the original markup
        node_labels: Optional[List[List[int]]] = None,  # labels identifying the node each text belongs to
        add_special_tokens: bool = True,  # whether to add special tokens (e.g. [CLS] and [SEP])
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, no truncation by default
        max_length: Optional[int] = None,  # maximum length limit
        stride: int = 0,  # sliding-window stride
        pad_to_multiple_of: Optional[int] = None,  # pad lengths to a multiple of this value
        return_tensors: Optional[str] = None,  # tensor type to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type IDs
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return offset mappings
        return_length: bool = False,  # whether to return sequence lengths
        verbose: bool = True,  # whether to print verbose information
    ):

    # Encode a single text or text pair by wrapping it into a batch of size one
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],  # the input text or pre-tokenized input
        text_pair: Optional[PreTokenizedInput] = None,  # optional text pair
        xpaths: Optional[List[List[int]]] = None,  # xpath paths locating the nodes in the original markup
        node_labels: Optional[List[int]] = None,  # labels identifying the node each text belongs to
        add_special_tokens: bool = True,  # whether to add special tokens (e.g. [CLS] and [SEP])
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, no truncation by default
        max_length: Optional[int] = None,  # maximum length limit
        stride: int = 0,  # sliding-window stride
        pad_to_multiple_of: Optional[int] = None,  # pad lengths to a multiple of this value
        return_tensors: Optional[bool] = None,  # tensor type to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type IDs
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return offset mappings
        return_length: bool = False,  # whether to return sequence lengths
        verbose: bool = True,  # whether to print verbose information
        **kwargs,  # additional keyword arguments
    ) -> BatchEncoding:
        # Wrap the input into a batched input.
        # Two options:
        # 1) only text: if the texts are a list of strings, `text` must be passed as a list
        # 2) text + text_pair: `text` is a string and `text_pair` is a list of strings
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_xpaths = [xpaths]
        batched_node_labels = [node_labels] if node_labels is not None else None
        # Delegate to _batch_encode_plus for the batched encoding
        batched_output = self._batch_encode_plus(
            batched_input,
            is_pair=bool(text_pair is not None),
            xpaths=batched_xpaths,
            node_labels=batched_node_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # If return_tensors is None and overflowing tokens are not requested, strip the leading batch dimension.
        # Overflowing tokens are returned as a batch of outputs, so in that case the batch dimension is kept.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        # Warn, if applicable, about sequences that are too long for the model
        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        # Return the batched output
        return batched_output

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ):
        # Pads the encoded inputs and returns them (body omitted in this walkthrough)
        pass

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A RoBERTa sequence has the following format:
        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens added.
        """
        if token_ids_1 is None:
            # Return a single sequence with special tokens `<s>` (CLS), sequence tokens, and `</s>` (SEP)
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        
        # For a pair of sequences, concatenate special tokens `<s>` (CLS), sequence 1 tokens, `</s>` (SEP),
        # sequence 2 tokens, and another `</s>` (SEP) at the end
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
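
    # Worked example (not part of the original file; assumes cls_token_id=0 and sep_token_id=2):
    # >>> tokenizer.build_inputs_with_special_tokens([100, 101])
    # [0, 100, 101, 2]
    # >>> tokenizer.build_inputs_with_special_tokens([100, 101], [200])
    # [0, 100, 101, 2, 200, 2]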

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of zeros representing token type ids (not used in RoBERTa).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            # Return zeros for token type ids for a single sequence with special tokens
            return len(cls + token_ids_0 + sep) * [0]
        
        # Return zeros for token type ids for a pair of sequences with special tokens
        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the tokenizer's vocabulary to a directory.

        Args:
            save_directory (str):
                Directory where the vocabulary files will be saved.
            filename_prefix (str, *optional*):
                Optional prefix for the saved files.

        Returns:
            Tuple[str]: Tuple containing the saved file paths.
        """
        # Save the model's vocabulary files to the specified directory with an optional filename prefix
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

.\models\markuplm\__init__.py

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

# Import the required dependencies and helpers from the utils module
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# Define the import structure
_import_structure = {
    "configuration_markuplm": ["MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "MarkupLMConfig"],
    "feature_extraction_markuplm": ["MarkupLMFeatureExtractor"],
    "processing_markuplm": ["MarkupLMProcessor"],
    "tokenization_markuplm": ["MarkupLMTokenizer"],
}

# Check that tokenizers is available; raise OptionalDependencyNotAvailable otherwise
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, register the fast tokenization import
    _import_structure["tokenization_markuplm_fast"] = ["MarkupLMTokenizerFast"]

# Check that torch is available; raise OptionalDependencyNotAvailable otherwise
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, register the modeling imports
    _import_structure["modeling_markuplm"] = [
        "MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "MarkupLMForQuestionAnswering",
        "MarkupLMForSequenceClassification",
        "MarkupLMForTokenClassification",
        "MarkupLMModel",
        "MarkupLMPreTrainedModel",
    ]

# During type checking, import the specific classes and constants from each module
if TYPE_CHECKING:
    from .configuration_markuplm import MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP, MarkupLMConfig
    from .feature_extraction_markuplm import MarkupLMFeatureExtractor
    from .processing_markuplm import MarkupLMProcessor
    from .tokenization_markuplm import MarkupLMTokenizer

    # Under type checking, import the fast tokenization if tokenizers is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_markuplm_fast import MarkupLMTokenizerFast

    # Under type checking, import the modeling modules if torch is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_markuplm import (
            MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            MarkupLMForQuestionAnswering,
            MarkupLMForSequenceClassification,
            MarkupLMForTokenClassification,
            MarkupLMModel,
            MarkupLMPreTrainedModel,
        )

# Outside type checking, install a _LazyModule for lazy imports
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

.\models\mask2former\configuration_mask2former.py

# coding=utf-8
# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
""" Mask2Former model configuration"""

# Type annotations used below
from typing import Dict, List, Optional

# Base class for pretrained configurations
from ...configuration_utils import PretrainedConfig
# Logging utilities
from ...utils import logging
# Configuration mapping from the auto module
from ..auto import CONFIG_MAPPING

# Mapping from pretrained Mask2Former checkpoints to their configuration files
MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/mask2former-swin-small-coco-instance": (
        "https://huggingface.co/facebook/mask2former-swin-small-coco-instance/blob/main/config.json"
    )
    # See all Mask2Former models at https://huggingface.co/models?filter=mask2former
}

# Get the module logger
logger = logging.get_logger(__name__)

# Mask2FormerConfig stores the configuration of a Mask2Former model and inherits from PretrainedConfig
class Mask2FormerConfig(PretrainedConfig):
    r"""
    这是用于存储 [`Mask2FormerModel`] 配置的配置类。根据指定的参数实例化 Mask2Former 模型,定义模型架构。
    使用默认参数实例化配置将生成类似于 Mask2Former [facebook/mask2former-swin-small-coco-instance] 
    (https://huggingface.co/facebook/mask2former-swin-small-coco-instance) 架构的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档以获取更多信息。

    当前,Mask2Former 仅支持 [Swin Transformer](swin) 作为主干。

    示例:

    ```
    >>> from transformers import Mask2FormerConfig, Mask2FormerModel

    >>> # 初始化 Mask2Former facebook/mask2former-swin-small-coco-instance 配置
    >>> configuration = Mask2FormerConfig()

    >>> # 使用配置初始化模型(带有随机权重),使用 facebook/mask2former-swin-small-coco-instance 风格的配置
    >>> model = Mask2FormerModel(configuration)

    >>> # 访问模型配置
    >>> configuration = model.config
    ```

    """
    # 模型类型为 "mask2former"
    model_type = "mask2former"
    # 支持的主干为 ["swin"]
    backbones_supported = ["swin"]
    # 属性映射表,将 "hidden_size" 映射到 "hidden_dim"
    attribute_map = {"hidden_size": "hidden_dim"}
    @classmethod
    def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
        """Instantiate a [`Mask2FormerConfig`] (or a derived class) from a pretrained backbone model configuration.

        Args:
            backbone_config ([`PretrainedConfig`]):
                The backbone configuration.

        Returns:
            [`Mask2FormerConfig`]: An instance of the configuration object
        """
        # Instantiate a new Mask2FormerConfig from the given backbone configuration, forwarding extra kwargs
        return cls(
            backbone_config=backbone_config,
            **kwargs,
        )
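
    # Usage sketch (not part of the original file):
    # >>> from transformers import Mask2FormerConfig, SwinConfig
    # >>> config = Mask2FormerConfig.from_backbone_config(SwinConfig())
    # >>> config.model_type
    # 'mask2former'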

.\models\mask2former\convert_mask2former_original_pytorch_checkpoint_to_pytorch.py

# coding=utf-8
# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import the required libraries and modules
import json
import sys
from argparse import ArgumentParser
from dataclasses import dataclass
from pathlib import Path
from pprint import pformat
from typing import Any, Dict, Iterator, List, Set, Tuple

import requests
import torch
import torchvision.transforms as T
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from huggingface_hub import hf_hub_download
from PIL import Image
from torch import Tensor, nn

# Import the transformers modules and classes used by the conversion
from transformers import (
    Mask2FormerConfig,
    Mask2FormerForUniversalSegmentation,
    Mask2FormerImageProcessor,
    Mask2FormerModel,
    SwinConfig,
)
from transformers.models.mask2former.modeling_mask2former import (
    Mask2FormerForUniversalSegmentationOutput,
    Mask2FormerModelOutput,
)
from transformers.utils import logging

# Set the logging verbosity to info
logging.set_verbosity_info()
# Get the logger
logger = logging.get_logger()

# Seed the RNG for reproducibility
torch.manual_seed(0)


class TrackedStateDict:
    def __init__(self, to_track: Dict):
        """This class "tracks" a python dictionary by keeping track of which item is accessed.

        Args:
            to_track (Dict): The dictionary we wish to track
        """
        self.to_track = to_track
        self._seen: Set[str] = set()

    def __getitem__(self, key: str) -> Any:
        return self.to_track[key]

    def __setitem__(self, key: str, item: Any):
        self._seen.add(key)
        self.to_track[key] = item

    def diff(self) -> List[str]:
        """This method returns a set difference between the keys in the tracked state dict and the one we have access so far.
        This is an effective method to check if we have update all the keys

        Returns:
            List[str]: List of keys not yet updated
        """
        return set(self.to_track.keys()) - self._seen

    def copy(self) -> Dict:
        # proxy the call to the internal dictionary
        return self.to_track.copy()
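
# Illustrative sketch of TrackedStateDict (not part of the original file; toy values):
# >>> tracked = TrackedStateDict({"backbone.weight": 1, "head.weight": 2})
# >>> tracked["backbone.weight"] = 10  # writing marks the key as seen
# >>> tracked.diff()
# {'head.weight'}  # keys that were never updated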


# Prepare an image used later to verify the conversion results
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Fetch the image as a raw data stream
    img_data = requests.get(url, stream=True).raw
    # Open and return the image object
    im = Image.open(img_data)
    return im


@dataclass
class Args:
    """Fake command line arguments needed by mask2former/detectron implementation"""

    config_file: str
# Build the detectron2 config from `args`, loading the config file and command-line options
def setup_cfg(args: Args):
    # Create the base config object via get_cfg()
    cfg = get_cfg()
    # Add the DeepLab-related options to cfg
    add_deeplab_config(cfg)
    # Add the MaskFormer2-related options to cfg
    add_maskformer2_config(cfg)
    # Merge the options from the config file args.config_file into cfg
    cfg.merge_from_file(args.config_file)
    # Freeze the config to prevent further modification
    cfg.freeze()
    # Return the config object
    return cfg


# Converts an original Mask2Former config into our Mask2FormerConfig (body elided in this walkthrough)
class OriginalMask2FormerConfigToOursConverter:

# Converts an original Mask2Former config into our image processor
class OriginalMask2FormerConfigToImageProcessorConverter:
    # Turn the original config object into a Mask2FormerImageProcessor instance
    def __call__(self, original_config: object) -> Mask2FormerImageProcessor:
        # Pull the model and input sections out of the original config
        model = original_config.MODEL
        model_input = original_config.INPUT

        # Build a Mask2FormerImageProcessor from the normalized pixel mean/std and the remaining settings
        return Mask2FormerImageProcessor(
            image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(),
            image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(),
            size=model_input.MIN_SIZE_TEST,
            max_size=model_input.MAX_SIZE_TEST,
            num_labels=model.SEM_SEG_HEAD.NUM_CLASSES,
            ignore_index=model.SEM_SEG_HEAD.IGNORE_VALUE,
            size_divisibility=32,
        )


# Converts an original Mask2Former checkpoint into our checkpoint format
class OriginalMask2FormerCheckpointToOursConverter:
    # Initialize the converter with the original model and the target config
    def __init__(self, original_model: nn.Module, config: Mask2FormerConfig):
        self.original_model = original_model
        self.config = config

    # Pop every key in `renamed_keys` from the source state dict and add it to the destination state dict
    def pop_all(self, renamed_keys: List[Tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict):
        for src_key, dst_key in renamed_keys:
            dst_state_dict[dst_key] = src_state_dict.pop(src_key)

    # Replace the MaskFormer Swin Transformer backbone part
    def replace_maskformer_swin_backbone(
        self, dst_state_dict: StateDict, src_state_dict: StateDict, config: Mask2FormerConfig
    ):
        # Declare the destination and source key prefixes
        dst_prefix: str = "transformer_module.decoder"
        src_prefix: str = "sem_seg_head.predictor"

        # Keys renamed between `dst_state_dict` and `src_state_dict`
        renamed_keys = self.rename_keys_in_masked_attention_decoder(dst_state_dict, src_state_dict)

        # Add more renamed keys
        renamed_keys.extend(
            [
                (f"{src_prefix}.decoder_norm.weight", f"{dst_prefix}.layernorm.weight"),
                (f"{src_prefix}.decoder_norm.bias", f"{dst_prefix}.layernorm.bias"),
            ]
        )

        mlp_len = 3
        # Walk over the MLP layers and add the corresponding renamed keys
        for i in range(mlp_len):
            renamed_keys.extend(
                [
                    (
                        f"{src_prefix}.mask_embed.layers.{i}.weight",
                        f"{dst_prefix}.mask_predictor.mask_embedder.{i}.0.weight",
                    ),
                    (
                        f"{src_prefix}.mask_embed.layers.{i}.bias",
                        f"{dst_prefix}.mask_predictor.mask_embedder.{i}.0.bias",
                    ),
                ]
            )

        # Pop all renamed keys from the source dict and move them into the destination state dict
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
    # Move the self-attention projection weights and biases of the transformer decoder from source to destination
    def replace_keys_qkv_transformer_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # Key prefix in the destination state dict
        dst_prefix: str = "transformer_module.decoder.layers"
        # Key prefix in the source state dict
        src_prefix: str = "sem_seg_head.predictor"

        # Iterate over the transformer decoder layers
        for i in range(self.config.decoder_layers - 1):
            # Read the fused input-projection weight and bias of the self-attention layer
            in_proj_weight = src_state_dict.pop(
                f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_weight"
            )
            in_proj_bias = src_state_dict.pop(
                f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_bias"
            )

            # Split the fused projection into separate query, key and value projections
            dst_state_dict[f"{dst_prefix}.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
            dst_state_dict[f"{dst_prefix}.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
            dst_state_dict[f"{dst_prefix}.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
            dst_state_dict[f"{dst_prefix}.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
            dst_state_dict[f"{dst_prefix}.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
            dst_state_dict[f"{dst_prefix}.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]

    # Rename the transformer module keys and move them from the source to the destination state dict
    def replace_transformer_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # Key prefix in the destination state dict
        dst_prefix: str = "transformer_module"
        # Key prefix in the source state dict
        src_prefix: str = "sem_seg_head.predictor"

        # Handle the masked-attention decoder first
        self.replace_masked_attention_decoder(dst_state_dict, src_state_dict)

        # Key pairs to rename
        renamed_keys = [
            (f"{src_prefix}.query_embed.weight", f"{dst_prefix}.queries_embedder.weight"),
            (f"{src_prefix}.query_feat.weight", f"{dst_prefix}.queries_features.weight"),
            (f"{src_prefix}.level_embed.weight", f"{dst_prefix}.level_embed.weight"),
        ]

        # Pop all renamed keys from the source dict and move them into the destination state dict
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)

        # Split the decoder's fused query/key/value projections
        self.replace_keys_qkv_transformer_decoder(dst_state_dict, src_state_dict)

    # Rename the universal segmentation module keys and move them from the source to the destination state dict
    def replace_universal_segmentation_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # Key prefix in the destination state dict (empty string: the keys are replaced at the top level)
        dst_prefix: str = ""
        # Key prefix in the source state dict
        src_prefix: str = "sem_seg_head.predictor"

        # Key pairs to rename
        renamed_keys = [
            (f"{src_prefix}.class_embed.weight", f"{dst_prefix}class_predictor.weight"),
            (f"{src_prefix}.class_embed.bias", f"{dst_prefix}class_predictor.bias"),
        ]

        # Log the keys being replaced
        logger.info(f"Replacing keys {pformat(renamed_keys)}")

        # Pop all renamed keys from the source dict and move them into the destination state dict
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
    def convert(self, mask2former: Mask2FormerModel) -> Mask2FormerModel:
        # Wrap the model's state dict in a TrackedStateDict so we can see which keys get written
        dst_state_dict = TrackedStateDict(mask2former.state_dict())
        # Grab the original model's state dict
        src_state_dict = self.original_model.state_dict()

        # Replace the pixel module of the target model
        self.replace_pixel_module(dst_state_dict, src_state_dict)
        # Replace the transformer module of the target model
        self.replace_transformer_module(dst_state_dict, src_state_dict)

        # Log the destination keys that were never written
        logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}")
        # Log the source keys that were never copied
        logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}")
        # Signal that the conversion is done
        logger.info("🙌 Done")

        # Gather the tracked keys into a fresh state dict
        state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track.keys()}
        # Load the new state dict into the mask2former model
        mask2former.load_state_dict(state_dict)
        # Return the updated mask2former model
        return mask2former

    def convert_universal_segmentation(
        self, mask2former: Mask2FormerForUniversalSegmentation
    ) -> Mask2FormerForUniversalSegmentation:
        # Wrap the model's state dict in a TrackedStateDict so we can see which keys get written
        dst_state_dict = TrackedStateDict(mask2former.state_dict())
        # Grab the original model's state dict
        src_state_dict = self.original_model.state_dict()

        # Replace the universal segmentation module
        self.replace_universal_segmentation_module(dst_state_dict, src_state_dict)

        # Gather the tracked keys into a fresh state dict
        state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track.keys()}
        # Load the new state dict into the mask2former model
        mask2former.load_state_dict(state_dict)

        # Return the updated mask2former model
        return mask2former

    @staticmethod
    def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object, Path, Path]]:
        # Collect every .pkl checkpoint file under checkpoints_dir
        checkpoints: List[Path] = checkpoints_dir.glob("**/*.pkl")

        # Walk over each checkpoint path
        for checkpoint in checkpoints:
            # Log which checkpoint (file name without suffix) is being converted
            logger.info(f"💪 Converting {checkpoint.stem}")

            # Look up the associated config file

            # Dataset name, e.g. 'coco'
            dataset_name = checkpoint.parents[2].stem
            # The "ade" directory maps to the "ade20k" config directory
            if dataset_name == "ade":
                dataset_name = dataset_name.replace("ade", "ade20k")

            # Segmentation task, e.g. 'instance-segmentation'
            segmentation_task = checkpoint.parents[1].stem

            # Name of the config file associated with this checkpoint
            config_file_name = f"{checkpoint.parents[0].stem}.yaml"

            # Build the full path of the config file
            config: Path = config_dir / dataset_name / segmentation_task / "swin" / config_file_name
            # Yield the config path together with the corresponding checkpoint path
            yield config, checkpoint
# Test function comparing the original model with our converted model
def test(
    original_model,  # the original model
    our_model: Mask2FormerForUniversalSegmentation,  # our converted model
    image_processor: Mask2FormerImageProcessor,  # image processor used to prepare the image data
    tolerance: float,  # allowed numerical error when comparing values
):
    with torch.no_grad():  # disable gradient tracking
        original_model = original_model.eval()  # put the original model in eval mode
        our_model = our_model.eval()  # put our model in eval mode

        im = prepare_img()  # prepare the test image
        x = image_processor(images=im, return_tensors="pt")["pixel_values"]  # preprocess the image into a pixel-value tensor

        original_model_backbone_features = original_model.backbone(x.clone())  # extract the original model's backbone features
        our_model_output: Mask2FormerModelOutput = our_model.model(x.clone(), output_hidden_states=True)  # run our model, requesting hidden states

        # Test the backbone features
        for original_model_feature, our_model_feature in zip(
            original_model_backbone_features.values(), our_model_output.encoder_hidden_states
        ):
            assert torch.allclose(
                original_model_feature, our_model_feature, atol=tolerance
            ), "The backbone features are not the same."

        # Test the pixel decoder
        mask_features, _, multi_scale_features = original_model.sem_seg_head.pixel_decoder.forward_features(
            original_model_backbone_features
        )

        for original_model_feature, our_model_feature in zip(
            multi_scale_features, our_model_output.pixel_decoder_hidden_states
        ):
            assert torch.allclose(
                original_model_feature, our_model_feature, atol=tolerance
            ), "The pixel decoder features are not the same"

        # Test the full model
        tr_complete = T.Compose(
            [T.Resize((384, 384)), T.ToTensor()],
        )
        y = (tr_complete(im) * 255.0).to(torch.int).float()  # convert the image to the expected dtype and value range

        # The original Mask2Former code was modified to return the mask and class logits
        original_class_logits, original_mask_logits = original_model([{"image": y.clone().squeeze(0)}])

        our_model_out: Mask2FormerForUniversalSegmentationOutput = our_model(x.clone())
        our_mask_logits = our_model_out.masks_queries_logits  # our model's mask logits
        our_class_logits = our_model_out.class_queries_logits  # our model's class logits

        assert original_mask_logits.shape == our_mask_logits.shape, "Output masks shapes are not matching."
        assert original_class_logits.shape == our_class_logits.shape, "Output class logits shapes are not matching."
        assert torch.allclose(
            original_class_logits, our_class_logits, atol=tolerance
        ), "The class logits are not the same."
        assert torch.allclose(
            original_mask_logits, our_mask_logits, atol=tolerance
        ), "The predicted masks are not the same."

        logger.info("✅ Test passed!")  # 记录测试通过信息


# Derive the model name from a checkpoint file path
def get_model_name(checkpoint_file: Path):
    # model_name_raw is the name of the checkpoint file's parent directory
    model_name_raw: str = checkpoint_file.parents[0].stem

    # segmentation_task_name must be one of: instance-segmentation, panoptic-segmentation, semantic-segmentation
    segmentation_task_name: str = checkpoint_file.parents[1].stem
    # Raise a ValueError if the segmentation task name is not in the accepted list
    if segmentation_task_name not in ["instance-segmentation", "panoptic-segmentation", "semantic-segmentation"]:
        raise ValueError(
            f"{segmentation_task_name} must be wrong since acceptable values are: instance-segmentation,"
            " panoptic-segmentation, semantic-segmentation."
        )

    # 提取数据集名称,应为以下之一:`coco`, `ade`, `cityscapes`, `mapillary-vistas`
    dataset_name: str = checkpoint_file.parents[2].stem
    if dataset_name not in ["coco", "ade", "cityscapes", "mapillary-vistas"]:
        raise ValueError(
            f"{dataset_name} must be wrong since we didn't find 'coco' or 'ade' or 'cityscapes' or 'mapillary-vistas'"
            " in it "
        )

    # 设置模型的骨干网络类型为 "swin"
    backbone = "swin"

    # 定义可接受的骨干网络类型列表
    backbone_types = ["tiny", "small", "base_IN21k", "base", "large"]

    # 从模型名称中筛选出存在于骨干网络类型列表中的类型,并用连字符替换下划线
    backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0].replace("_", "-")

    # 组装模型名称,格式为 "mask2former-{backbone}-{backbone_type}-{dataset_name}-{segmentation_task_name.split('-')[0]}"
    model_name = f"mask2former-{backbone}-{backbone_type}-{dataset_name}-{segmentation_task_name.split('-')[0]}"

    # 返回生成的模型名称
    return model_name
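
# --- Illustrative sketch (not part of the original file) ---
# For a hypothetical checkpoint path
#   coco/instance-segmentation/maskformer2_swin_small_bs16_50ep/model_final.pkl
# parents[0].stem is "maskformer2_swin_small_bs16_50ep", parents[1].stem is
# "instance-segmentation" and parents[2].stem is "coco", so get_model_name returns
# "mask2former-swin-small-coco-instance".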


if __name__ == "__main__":
    # Build the command line parser
    parser = ArgumentParser(
        description="Command line to convert the original mask2formers (with swin backbone) to our implementations."
    )

    # --checkpoints_dir: directory containing the model checkpoints
    parser.add_argument(
        "--checkpoints_dir",
        type=Path,
        help=(
            "A directory containing the model's checkpoints. The directory has to have the following structure:"
            " <DIR_NAME>/<DATASET_NAME>/<SEGMENTATION_TASK_NAME>/<CONFIG_NAME>.pkl"
        ),
    )

    # --configs_dir: directory containing the model configs
    parser.add_argument(
        "--configs_dir",
        type=Path,
        help=(
            "A directory containing the model's configs, see detectron2 doc. The directory has to have the following"
            " structure: <DIR_NAME>/<DATASET_NAME>/<SEGMENTATION_TASK_NAME>/<CONFIG_NAME>.yaml"
        ),
    )

    # --mask2former_dir (required): path to the original Mask2Former implementation
    parser.add_argument(
        "--mask2former_dir",
        required=True,
        type=Path,
        help=(
            "A path to Mask2Former's original implementation directory. You can download from here:"
            " https://github.com/facebookresearch/Mask2Former"
        ),
    )

    # Parse the command line arguments
    args = parser.parse_args()

    # Bind the arguments to local variables
    checkpoints_dir: Path = args.checkpoints_dir
    config_dir: Path = args.configs_dir
    mask2former_dir: Path = args.mask2former_dir

    # Add Mask2Former's parent directory to sys.path so the original configs and models can be imported
    sys.path.append(str(mask2former_dir.parent))

    # Import the original Mask2Former config and model from the original source repository
    from Mask2Former.mask2former.config import add_maskformer2_config
    from Mask2Former.mask2former.maskformer_model import MaskFormer as OriginalMask2Former

    # Convert every (config, checkpoint) pair into our implementation
    for config_file, checkpoint_file in OriginalMask2FormerCheckpointToOursConverter.using_dirs(
        checkpoints_dir, config_dir
    ):
        # Derive the model name from the checkpoint file
        model_name = get_model_name(checkpoint_file)
        # Build the image processor from the original config
        image_processor = OriginalMask2FormerConfigToImageProcessorConverter()(
            setup_cfg(Args(config_file=config_file))
        )
        # Pin the image processor to a fixed size
        image_processor.size = {"height": 384, "width": 384}

        # Build the original config object from the config file
        original_config = setup_cfg(Args(config_file=config_file))
        # Derive the original Mask2Former constructor kwargs from the config
        mask2former_kwargs = OriginalMask2Former.from_config(original_config)
        # Instantiate the original Mask2Former model in eval mode
        original_model = OriginalMask2Former(**mask2former_kwargs).eval()

        # Load the checkpoint into the original model
        DetectionCheckpointer(original_model).load(str(checkpoint_file))

        # Convert the original config into our Mask2Former config
        config: Mask2FormerConfig = OriginalMask2FormerConfigToOursConverter()(original_config)
        # Instantiate our Mask2Former model in eval mode
        mask2former = Mask2FormerModel(config=config).eval()

        # Copy the original weights into our model
        converter = OriginalMask2FormerCheckpointToOursConverter(original_model, config)
        mask2former = converter.convert(mask2former)

        # Build the universal segmentation model and plug in the converted base model
        mask2former_for_segmentation = Mask2FormerForUniversalSegmentation(config=config).eval()
        mask2former_for_segmentation.model = mask2former

        # Convert the segmentation-specific weights as well
        mask2former_for_segmentation = converter.convert_universal_segmentation(mask2former_for_segmentation)

        # Tolerance used when comparing outputs
        tolerance = 3e-1
        # Models known to need a larger tolerance
        high_tolerance_models = [
            "mask2former-swin-base-IN21k-coco-instance",
            "mask2former-swin-base-coco-instance",
            "mask2former-swin-small-cityscapes-semantic",
        ]

        # Keep the tolerance at 3e-1 for these models (as written this branch is a
        # no-op, since the default above is already 3e-1)
        if model_name in high_tolerance_models:
            tolerance = 3e-1

        # Log which model is being tested
        logger.info(f"🪄 Testing {model_name}...")
        # Run the comparison test
        test(original_model, mask2former_for_segmentation, image_processor, tolerance)
        # Log which model is being pushed
        logger.info(f"🪄 Pushing {model_name} to hub...")

        # Push the image processor to the hub
        image_processor.push_to_hub(model_name)
        # Push the segmentation model to the hub
        mask2former_for_segmentation.push_to_hub(model_name)

.\models\mask2former\image_processing_mask2former.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Image processor class for Mask2Former."""

import math
import warnings
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union

import numpy as np

# Image processing utilities
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
# Image transform functions
from ...image_transforms import (
    PaddingMode,
    get_resize_output_image_size,
    pad,
    rescale,
    resize,
    to_channel_dimension_format,
)
# Image helper functions
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_batched,
    is_scaled_image,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# General utilities
from ...utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    TensorType,
    is_torch_available,
    is_torch_tensor,
    logging,
)

# Logger for this module
logger = logging.get_logger(__name__)

# Import torch-related modules if torch is available
if is_torch_available():
    import torch
    from torch import nn

# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return the maximum value across all indices of an iterable of values.
    """
    return [max(values_i) for values_i in zip(*values)]

# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    Get the maximum height and width across all images in a batch.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    if input_data_format == ChannelDimension.FIRST:
        # Channel dimension comes first
        _, max_height, max_width = max_across_indices([img.shape for img in images])
    elif input_data_format == ChannelDimension.LAST:
        # Channel dimension comes last
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    return (max_height, max_width)
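
# --- Illustrative sketch (not part of the original file) ---
# For two channels-first images of shapes (3, 480, 640) and (3, 512, 600),
# max_across_indices([(3, 480, 640), (3, 512, 600)]) returns [3, 512, 640], so
# get_max_height_width returns (512, 640) -- the padding target for the whole batch.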

# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.

    Args:
        image (`np.ndarray`):
            Image to make the mask for.
        output_size (`Tuple[int, int]`):
            Output size of the mask.
    """
    # Get the image height and width, honoring the channel dimension of the input format
    input_height, input_width = get_image_size(image, channel_dim=input_data_format)

    # Start from an all-zero mask of the requested output size (int64)
    mask = np.zeros(output_size, dtype=np.int64)

    # Mark the region covered by the actual (unpadded) image with ones
    mask[:input_height, :input_width] = 1

    return mask
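
# --- Illustrative sketch (not part of the original file) ---
# A channels-first image of shape (3, 2, 4) padded to output_size (3, 5) yields
#   [[1, 1, 1, 1, 0],
#    [1, 1, 1, 1, 0],
#    [0, 0, 0, 0, 0]]
# i.e. ones over the original 2x4 area and zeros over the padded region.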
# Copied from transformers.models.detr.image_processing_detr.check_segment_validity
def check_segment_validity(segmentation, area_threshold=2):
    """
    Checks the validity of each segment in the segmentation map based on area threshold.

    Args:
        segmentation (`torch.Tensor` or `numpy.array`):
            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
        area_threshold (`int`, optional):
            Minimum area threshold for valid segments. Defaults to 2.
    Returns:
        `List[int]`: List of valid segment indices.
    """
    segment_ids = torch.unique(segmentation)

    valid_segments = []
    for idx in segment_ids:
        mask = torch.where(segmentation == idx, 1, 0)
        area = mask.sum().item()
        if area >= area_threshold:
            valid_segments.append(idx.item())

    return valid_segments


# Convert a segmentation map into per-instance binary masks and their class labels
def convert_segmentation_map_to_binary_masks(
    segmentation_map: "np.ndarray",
    instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
    ignore_index: Optional[int] = None,
    reduce_labels: bool = False,
):
    # If reduce_labels is requested, an ignore_index must be provided
    if reduce_labels and ignore_index is None:
        raise ValueError("If `reduce_labels` is True, `ignore_index` must be provided.")

    # When reducing labels, map 0 (background) to ignore_index and shift every other id down by one
    if reduce_labels:
        segmentation_map = np.where(segmentation_map == 0, ignore_index, segmentation_map - 1)

    # Collect all unique labels present in the segmentation map
    all_labels = np.unique(segmentation_map)

    # Drop the background label when an ignore_index is given
    if ignore_index is not None:
        all_labels = all_labels[all_labels != ignore_index]

    # Generate one binary mask per object instance
    binary_masks = [(segmentation_map == i) for i in all_labels]
    binary_masks = np.stack(binary_masks, axis=0)  # (num_labels, height, width)

    # If an instance-id-to-semantic-id mapping is given, convert instance ids to class ids
    if instance_id_to_semantic_id is not None:
        labels = np.zeros(all_labels.shape[0])

        # Look up the class id for each label, undoing the reduce_labels shift for the lookup
        for label in all_labels:
            class_id = instance_id_to_semantic_id[label + 1 if reduce_labels else label]
            labels[all_labels == label] = class_id - 1 if reduce_labels else class_id
    else:
        # Otherwise the labels are used directly as class labels
        labels = all_labels

    # Return the binary masks as float32 and the class labels as int64
    return binary_masks.astype(np.float32), labels.astype(np.int64)
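
# --- Illustrative sketch (not part of the original file) ---
# For seg = np.array([[0, 0], [1, 2]]) with ignore_index=None and reduce_labels=False,
# all_labels is [0, 1, 2], so the result is three 2x2 binary masks (shape (3, 2, 2))
# and labels [0, 1, 2]. With reduce_labels=True and ignore_index=255, id 0 becomes 255
# and is dropped, ids 1 and 2 shift down to 0 and 1, giving two masks and labels [0, 1].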


# Copied from transformers.models.maskformer.image_processing_maskformer.get_maskformer_resize_output_image_size with maskformer->mask2former
def get_mask2former_resize_output_image_size(
    image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
    max_size: Optional[int] = None,
    size_divisor: int = 0,
    default_to_square: bool = True,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Computes the output image size given the input image and the desired size.

    Args:
        image (`np.ndarray`):
            The input image.
        size (`int` or `Tuple[int, int]` or `List[int]` or `Tuple[int]`):
            The size of the output image.
        max_size (`int`, *optional*):
            The maximum size of the output image.
        size_divisor (`int`, *optional*, defaults to 0):
            If `size_divisor` is given, the output image size will be divisible by the number.
        default_to_square (`bool`, *optional*, defaults to `True`):
            Whether to default to square if no size is provided.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If unset, the format is inferred from the input.

    Returns:
        `Tuple[int, int]`: The output size of the image.
    """
    # Compute the base output size
    output_size = get_resize_output_image_size(
        input_image=image,
        size=size,
        default_to_square=default_to_square,
        max_size=max_size,
        input_data_format=input_data_format,
    )

    # If size_divisor > 0, round each side up so it is divisible by size_divisor
    if size_divisor > 0:
        height, width = output_size
        height = int(math.ceil(height / size_divisor) * size_divisor)
        width = int(math.ceil(width / size_divisor) * size_divisor)
        output_size = (height, width)

    return output_size
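
# --- Illustrative sketch (not part of the original file) ---
# With size_divisor=32, a computed output size of (481, 640) is rounded up to
# (math.ceil(481 / 32) * 32, math.ceil(640 / 32) * 32) = (512, 640), keeping both
# sides divisible by 32.
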
class Mask2FormerImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Mask2Former image processor. The image processor can be used to prepare image(s) and optional targets
    for the model.

    This image processor inherits from [`BaseImageProcessor`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """
    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to a certain `size`.
        size (`int`, *optional*, defaults to 800):
            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
            sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
            the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
            height / width, size)`.
        size_divisor (`int`, *optional*, defaults to 32):
            Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in
            Swin Transformer.
        resample (`int`, *optional*, defaults to `Resampling.BILINEAR`):
            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
            to `True`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the input to a certain `scale`.
        rescale_factor (`float`, *optional*, defaults to `1/ 255`):
            Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with mean and standard deviation.
        image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
            The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
        image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
            The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
            ImageNet std.
        ignore_index (`int`, *optional*):
            Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
            denoted with 0 (background) will be replaced with `ignore_index`.
        reduce_labels (`bool`, *optional*, defaults to `False`):
            Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
            is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
            The background label will be replaced by `ignore_index`.



    """

    # Names used to identify the inputs expected by the model
    model_input_names = ["pixel_values", "pixel_mask"]
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        size_divisor: int = 32,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        ignore_index: Optional[int] = None,
        reduce_labels: bool = False,
        **kwargs,
    ):
        # If the deprecated "size_divisibility" kwarg is passed, warn and map it to size_divisor
        if "size_divisibility" in kwargs:
            warnings.warn(
                "The `size_divisibility` argument is deprecated and will be removed in v4.27. Please use "
                "`size_divisor` instead.",
                FutureWarning,
            )
            size_divisor = kwargs.pop("size_divisibility")

        # If the deprecated "max_size" kwarg is passed, warn and store it on a private attribute
        if "max_size" in kwargs:
            warnings.warn(
                "The `max_size` argument is deprecated and will be removed in v4.27. Please use size['longest_edge']"
                " instead.",
                FutureWarning,
            )
            # Store max_size privately so it can serve as a default in the preprocess method
            # while still allowing size to be passed as an int
            self._max_size = kwargs.pop("max_size")
        else:
            # Otherwise default _max_size to 1333
            self._max_size = 1333

        # If size is None, fall back to a dict with "shortest_edge" and "longest_edge" keys
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": self._max_size}
        # Resolve the final size dict from size and _max_size
        size = get_size_dict(size, max_size=self._max_size, default_to_square=False)

        # Initialize the base class with the remaining keyword arguments
        super().__init__(**kwargs)

        # Store the configuration on the instance
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.size_divisor = size_divisor
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.ignore_index = ignore_index
        self.reduce_labels = reduce_labels
        # Keys accepted by the preprocessing entry points
        self._valid_processor_keys = [
            "images",
            "segmentation_maps",
            "instance_id_to_semantic_id",
            "do_resize",
            "size",
            "size_divisor",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "ignore_index",
            "reduce_labels",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    @classmethod
    # Overrides the base class method so that parameters can be updated when an image processor is
    # created via `from_dict` with kwargs, e.g. `Mask2FormerImageProcessor.from_pretrained(checkpoint, max_size=800)`
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        # Copy the input dict so the original is not modified
        image_processor_dict = image_processor_dict.copy()
        # If "max_size" is in kwargs, move it into the image processor dict
        if "max_size" in kwargs:
            image_processor_dict["max_size"] = kwargs.pop("max_size")
        # If "size_divisibility" is in kwargs, move it into the image processor dict
        if "size_divisibility" in kwargs:
            image_processor_dict["size_divisibility"] = kwargs.pop("size_divisibility")
        # Delegate to the base class with the updated dict and remaining kwargs
        return super().from_dict(image_processor_dict, **kwargs)
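
    # --- Illustrative sketch (not part of the original file) ---
    # This override is what lets deprecated kwargs flow through `from_pretrained`, e.g.:
    #   processor = Mask2FormerImageProcessor.from_pretrained(
    #       "facebook/mask2former-swin-small-coco-instance", max_size=800
    #   )
    # `from_pretrained` calls `from_dict` under the hood, which folds `max_size` into the
    # config dict so that `__init__` can turn it into `size["longest_edge"]`.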

    # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.resize with
    # get_maskformer_resize_output_image_size -> get_mask2former_resize_output_image_size
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        size_divisor: int = 0,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format=None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an
        int, smaller edge of the image will be matched to this number.
    
        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                The size of the output image.
            size_divisor (`int`, *optional*, defaults to 0):
                If `size_divisor` is given, the output image size will be divisible by the number.
            resample (`PILImageResampling` resampling filter, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use when resizing the image.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
    
        # Check if deprecated `max_size` parameter is present and issue a warning
        if "max_size" in kwargs:
            warnings.warn(
                "The `max_size` parameter is deprecated and will be removed in v4.27. "
                "Please specify in `size['longest_edge'] instead`.",
                FutureWarning,
            )
            max_size = kwargs.pop("max_size")
        else:
            max_size = None
    
        # Adjust `size` using utility function `get_size_dict`
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
    
        # Determine whether `size` contains `shortest_edge` and `longest_edge` or `height` and `width`
        if "shortest_edge" in size and "longest_edge" in size:
            size, max_size = size["shortest_edge"], size["longest_edge"]
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
            max_size = None
        else:
            # Raise ValueError if `size` does not contain necessary keys
            raise ValueError(
                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                f" {size.keys()}."
            )
    
        # Adjust `size` using another utility function `get_mask2former_resize_output_image_size`
        size = get_mask2former_resize_output_image_size(
            image=image,
            size=size,
            max_size=max_size,
            size_divisor=size_divisor,
            default_to_square=False,
            input_data_format=input_data_format,
        )
    
        # Resize the `image` using specified parameters and return the resized image
        image = resize(
            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
        )
        return image
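
    # --- Illustrative sketch (not part of the original file) ---
    # With size={"shortest_edge": 800, "longest_edge": 1333} and a 480x640 (HxW) image,
    # the shorter edge is scaled toward 800; 640 * (800 / 480) ~= 1066 stays below
    # longest_edge, so the output is roughly (800, 1066) before any size_divisor rounding.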
    
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The value to use for rescaling.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
                one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)

    # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.convert_segmentation_map_to_binary_masks
    def convert_segmentation_map_to_binary_masks(
        self,
        segmentation_map: "np.ndarray",
        instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
        ignore_index: Optional[int] = None,
        reduce_labels: bool = False,
    ):
        """
        Convert a segmentation map to binary masks based on instance IDs and semantic IDs.

        Args:
            segmentation_map (`np.ndarray`):
                The input segmentation map to be converted.
            instance_id_to_semantic_id (Optional[Dict[int, int]], *optional*):
                Mapping from instance IDs to semantic IDs.
            ignore_index (Optional[int], *optional*):
                Index to ignore in the segmentation map.
            reduce_labels (`bool`, *optional*):
                Whether to decrement all label values by 1, replacing the background label with `ignore_index`.

        Returns:
            `Tuple[np.ndarray, np.ndarray]`: The binary masks and their corresponding class labels.
        """
        reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels
        ignore_index = ignore_index if ignore_index is not None else self.ignore_index
        return convert_segmentation_map_to_binary_masks(
            segmentation_map=segmentation_map,
            instance_id_to_semantic_id=instance_id_to_semantic_id,
            ignore_index=ignore_index,
            reduce_labels=reduce_labels,
        )

    def __call__(self, images, segmentation_maps=None, **kwargs) -> BatchFeature:
        """
        Perform preprocessing on images and segmentation maps.

        Args:
            images (ImageInput):
                Input images to preprocess.
            segmentation_maps (Optional[np.ndarray], *optional*):
                Optional segmentation maps corresponding to the images.
            **kwargs:
                Additional keyword arguments for preprocessing.

        Returns:
            BatchFeature: Preprocessed batch of images and segmentation maps.
        """
        return self.preprocess(images, segmentation_maps=segmentation_maps, **kwargs)

    def _preprocess(
        self,
        image: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        size_divisor: int = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        # Resize the image if requested
        if do_resize:
            image = self.resize(
                image, size=size, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format
            )
        # Rescale the image if requested
        if do_rescale:
            image = self.rescale(image, rescale_factor=rescale_factor, input_data_format=input_data_format)
        # Normalize the image if requested
        if do_normalize:
            image = self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format)
        # Return the preprocessed image
        return image

    def _preprocess_image(
        self,
        image: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        size_divisor: int = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """Preprocesses a single image."""
        # 将输入的图像转换为 numpy 数组
        image = to_numpy_array(image)
        # 如果图像已经是缩放过的并且需要进行再次缩放,则记录警告信息
        if is_scaled_image(image) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        # 推断输入数据的通道格式
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        # 调用 _preprocess 方法进行实际的图像预处理
        image = self._preprocess(
            image=image,
            do_resize=do_resize,
            size=size,
            size_divisor=size_divisor,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
        )
        # 如果指定了输出数据的通道格式,则进行格式转换
        if data_format is not None:
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
        # 返回预处理后的图像数据
        return image

    def _preprocess_mask(
        self,
        segmentation_map: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        size_divisor: int = 0,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """Preprocesses a single mask."""
        # 将分割地图转换为 NumPy 数组
        segmentation_map = to_numpy_array(segmentation_map)
        
        # 如果分割地图的维度是 2,则添加一个通道维度,这在某些转换中是必要的
        if segmentation_map.ndim == 2:
            added_channel_dim = True
            segmentation_map = segmentation_map[None, ...]  # 在第一维度上添加一个维度
            input_data_format = ChannelDimension.FIRST  # 设置数据格式为第一通道优先
        else:
            added_channel_dim = False
            if input_data_format is None:
                # 推断通道维度的格式
                input_data_format = infer_channel_dimension_format(segmentation_map)
        
        # TODO: (Amy)
        # 重构分割地图处理流程,包括减少标签和调整大小,以便不丢弃大于 255 的段ID。
        segmentation_map = self._preprocess(
            image=segmentation_map,
            do_resize=do_resize,
            resample=PILImageResampling.NEAREST,
            size=size,
            size_divisor=size_divisor,
            do_rescale=False,
            do_normalize=False,
            input_data_format=input_data_format,
        )
        
        # 如果为了处理而添加了额外的通道维度,则移除它
        if added_channel_dim:
            segmentation_map = segmentation_map.squeeze(0)
        
        # 返回预处理后的分割地图
        return segmentation_map

    def preprocess(
        self,
        images: ImageInput,
        segmentation_maps: Optional[ImageInput] = None,
        instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        size_divisor: Optional[int] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        ignore_index: Optional[int] = None,
        reduce_labels: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        # The full body of this method is not reproduced in this excerpt
        ...

    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
        """
        # Get the input image height and width, honoring the channel dimension format
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # Target output height and width
        output_height, output_width = output_size

        # Compute how many pixels to pad on the bottom and right
        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        # Pad only on the bottom and right; top and left get no padding
        padding = ((0, pad_bottom), (0, pad_right))
        # Pad the image with a constant value, honoring the data formats
        padded_image = pad(
            image,
            padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        # Return the padded image
        return padded_image

    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.

        Args:
            images (`List[np.ndarray]`):
                Images to pad.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Determine the padding size from the maximum height and width in the batch
        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        # Pad every image up to the largest height and width in the batch
        padded_images = [
            self._pad_image(
                image,
                pad_size,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        # Collect the padded images in the output dict
        data = {"pixel_values": padded_images}

        # Optionally build pixel masks
        if return_pixel_mask:
            # Generate a pixel mask for each image
            masks = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                for image in images
            ]
            # Add the masks to the output dict
            data["pixel_mask"] = masks

        # Return the batch as a BatchFeature with the requested tensor type
        return BatchFeature(data=data, tensor_type=return_tensors)
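
    # --- Illustrative sketch (not part of the original file) ---
    # Padding two channels-first images of shapes (3, 480, 640) and (3, 512, 600) pads
    # both to (3, 512, 640); with return_tensors="pt" the returned BatchFeature carries
    # "pixel_values" of shape (2, 3, 512, 640) and "pixel_mask" of shape (2, 512, 640),
    # with ones over each image's valid (unpadded) area.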

    def encode_inputs(
        self,
        pixel_values_list: List[ImageInput],
        segmentation_maps: ImageInput = None,
        instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
        ignore_index: Optional[int] = None,
        reduce_labels: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Encodes input images and optional segmentation maps into a format suitable for model input.

        Args:
            pixel_values_list (`List[ImageInput]`):
                List of input images to encode.
            segmentation_maps (`ImageInput`, *optional*):
                Optional segmentation maps associated with input images.
            instance_id_to_semantic_id (`Optional[Union[List[Dict[int, int]], Dict[int, int]]]`, *optional*):
                Mapping from instance IDs to semantic IDs, if applicable.
            ignore_index (`Optional[int]`, *optional*):
                Index to ignore in the encoding process.
            reduce_labels (`bool`, *optional*, defaults to `False`):
                Whether to reduce the number of unique labels in the segmentation maps.
            return_tensors (`Optional[Union[str, TensorType]]`, *optional*):
                The type of tensors to return.
            input_data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                The channel dimension format of the input images.

        Returns:
            Encoded inputs suitable for model processing.
        """
        # Encode the input images and related annotations into a model-ready format;
        # the full body is not reproduced in this excerpt
        ...

    def post_process_semantic_segmentation(
        self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None
    ) -> "torch.Tensor":
        """
        Converts the output of [`Mask2FormerForUniversalSegmentation`] into semantic segmentation maps. Only supports
        PyTorch.

        Args:
            outputs ([`Mask2FormerForUniversalSegmentation`]):
                Raw outputs of the model.
            target_sizes (`List[Tuple[int, int]]`, *optional*):
                List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
                final size (height, width) of each prediction. If left to None, predictions will not be resized.
        Returns:
            `List[torch.Tensor]`:
                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
                `torch.Tensor` corresponds to a semantic class id.
        """
        # Extract logits for class queries and mask queries from model outputs
        class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
        masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]

        # Scale masks_queries_logits back to preprocessed image size (384, 384)
        masks_queries_logits = torch.nn.functional.interpolate(
            masks_queries_logits, size=(384, 384), mode="bilinear", align_corners=False
        )

        # Remove the null class from class_queries_logits (`[..., :-1]`)
        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]

        # Compute semantic segmentation logits of shape (batch_size, num_classes, height, width)
        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
        batch_size = class_queries_logits.shape[0]

        # Resize logits and compute semantic segmentation maps if target_sizes is provided
        if target_sizes is not None:
            if batch_size != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )

            semantic_segmentation = []
            for idx in range(batch_size):
                resized_logits = torch.nn.functional.interpolate(
                    segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
                )
                semantic_map = resized_logits[0].argmax(dim=0)
                semantic_segmentation.append(semantic_map)
        else:
            # If target_sizes is None, compute semantic segmentation maps for each batch item
            semantic_segmentation = segmentation.argmax(dim=1)
            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]

        return semantic_segmentation
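
    # --- Illustrative sketch (not part of the original file) ---
    # A hedged usage example after a forward pass (variable names are assumptions):
    #   inputs = image_processor(images=image, return_tensors="pt")
    #   outputs = model(**inputs)
    #   maps = image_processor.post_process_semantic_segmentation(
    #       outputs, target_sizes=[image.size[::-1]]  # PIL size is (W, H) -> (H, W)
    #   )
    #   semantic_map = maps[0]  # (H, W) tensor of class ids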
    # Post-process instance segmentation outputs: thresholding, merging overlapping instance masks, etc.
    def post_process_instance_segmentation(
        self,
        outputs,
        threshold: float = 0.5,  # confidence threshold used to filter instances
        mask_threshold: float = 0.5,  # threshold used to binarize the masks
        overlap_mask_area_threshold: float = 0.8,  # overlap-area threshold used when merging overlapping instance masks
        target_sizes: Optional[List[Tuple[int, int]]] = None,  # target sizes for resizing
        return_coco_annotation: Optional[bool] = False,  # whether to return COCO-format annotations
        return_binary_maps: Optional[bool] = False,  # whether to return binary maps
    ):
        # Body not reproduced in this excerpt
        ...

    # Post-process panoptic segmentation outputs: thresholding, fusing overlapping segment labels, etc.
    def post_process_panoptic_segmentation(
        self,
        outputs,
        threshold: float = 0.5,  # confidence threshold used to filter segments
        mask_threshold: float = 0.5,  # threshold used to binarize the masks
        overlap_mask_area_threshold: float = 0.8,  # overlap-area threshold used when merging overlapping segment masks
        label_ids_to_fuse: Optional[Set[int]] = None,  # set of label ids to fuse
        target_sizes: Optional[List[Tuple[int, int]]] = None,  # target sizes for resizing
    ):
        # Body not reproduced in this excerpt
        ...