Transformers-源码解析-四十一-

Transformers 源码解析（四十一）

`.\models\distilbert\tokenization_distilbert.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""

import collections
import os
import unicodedata
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 定义词汇文件名字典
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# 预训练模型对应的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
        "distilbert-base-uncased-distilled-squad": (
            "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt"
        ),
        "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
        "distilbert-base-cased-distilled-squad": (
            "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt"
        ),
        "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
        "distilbert-base-multilingual-cased": (
            "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt"
        ),
    }
}

# 预训练模型对应的位置编码大小
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "distilbert-base-uncased": 512,
    "distilbert-base-uncased-distilled-squad": 512,
    "distilbert-base-cased": 512,
    "distilbert-base-cased-distilled-squad": 512,
    "distilbert-base-german-cased": 512,
    "distilbert-base-multilingual-cased": 512,
}

# 预训练模型初始化配置
PRETRAINED_INIT_CONFIGURATION = {
    "distilbert-base-uncased": {"do_lower_case": True},
    "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
    "distilbert-base-cased": {"do_lower_case": False},
    "distilbert-base-cased-distilled-squad": {"do_lower_case": False},
    "distilbert-base-german-cased": {"do_lower_case": False},
    "distilbert-base-multilingual-cased": {"do_lower_case": False},
}

# 从 transformers.models.bert.tokenization_bert.load_vocab 复制而来的函数
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    # 使用 OrderedDict 来存储词汇表
    vocab = collections.OrderedDict()
    # 以 UTF-8 编码读取词汇文件
    with open(vocab_file, "r", encoding="utf-8") as reader:
        # 逐行读取词汇文件内容
        tokens = reader.readlines()
    # 将每个词汇添加到 vocab 字典中，并用其在文件中的顺序作为值
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    # 去除文本两端的空白字符
    text = text.strip()
    # 如果处理后文本为空，则返回空列表
    if not text:
        return []
    # 使用空格分割文本，得到 token 列表
    tokens = text.split()
    # 返回分割后的 token 列表
    return tokens


class DistilBertTokenizer(PreTrainedTokenizer):
    r"""
    Construct a DistilBERT tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    # 加载预训练模型的词汇文件映射
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 加载预训练模型的初始化配置
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # 加载预训练模型的最大输入大小配置
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 定义模型的输入名称列表
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # 检查给定的词汇文件是否存在，否则抛出值错误
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # 加载词汇表
        self.vocab = load_vocab(vocab_file)
        # 构建从标识符到词汇的有序字典
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # 设置是否进行基本的分词操作
        self.do_basic_tokenize = do_basic_tokenize
        # 如果需要进行基本分词，则初始化基本分词器
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        # 初始化基于词汇表的 WordPiece 分词器
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # 调用父类的初始化方法，传递相同的参数
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    # 返回基本分词器是否进行小写处理的属性
    # 来自 transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
    def do_lower_case(self):
        return self.basic_tokenizer.do_lower_case

    @property
    # 返回词汇表的大小
    # 来自 transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
    def vocab_size(self):
        return len(self.vocab)

    # 返回词汇表及其附加编码器的字典
    # 来自 transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    # 将文本标记化为子词的方法，这个方法会被具体的子类实现
    # 来自 transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
    def _tokenize(self, text, split_special_tokens=False):
        # 初始化空列表，用于存储分词后的 tokens
        split_tokens = []
        # 如果需要进行基本的分词处理
        if self.do_basic_tokenize:
            # 使用 basic_tokenizer 对文本进行分词
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # 如果 token 在 never_split 集合中
                if token in self.basic_tokenizer.never_split:
                    # 直接添加到 split_tokens 中
                    split_tokens.append(token)
                else:
                    # 否则，使用 wordpiece_tokenizer 进一步分词，并将结果合并到 split_tokens 中
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            # 如果不需要基本的分词处理，直接使用 wordpiece_tokenizer 进行分词
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        # 返回分词后的 tokens 列表
        return split_tokens

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # 使用 vocab 字典将 token 转换为对应的 id，如果 token 不在 vocab 中，则使用 unk_token 对应的 id
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # 使用 ids_to_tokens 字典将 index 转换为对应的 token，如果 index 不在 ids_to_tokens 中，则返回 unk_token
        return self.ids_to_tokens.get(index, self.unk_token)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # 将 tokens 列表中的 token 连接成一个字符串，并移除特殊标记 ' ##'，最后去除首尾的空格
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # 如果没有提供 token_ids_1，则构建单个序列的输入列表，包括特殊 token `[CLS]` 和 `[SEP]`
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        # 否则，构建序列对的输入列表，包括两个序列的特殊 token `[CLS]`、`[SEP]` 以及分隔符 `[SEP]`
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ):
        """
        Retrieve sequence ids from a sequence of tokens that should not be masked.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens.

        Returns:
            `List[int]`: List of indices indicating which tokens are special tokens.
        """
        # 如果已经有特殊 token，则直接返回全为 1 的掩码列表，长度为 token_ids_0 的长度
        if already_has_special_tokens:
            return [1] * len(token_ids_0)
        # 否则，构建一个掩码列表，长度为 token_ids_0 的长度加上特殊 token `[CLS]` 和 `[SEP]` 的长度
        # 并设置特殊 token 对应位置为 1，其余位置为 0
        cls_sep = [self.cls_token_id, self.sep_token_id]
        return list(map(lambda x: 1 if x in cls_sep else 0, token_ids_0))
    # 从不包含特殊token的token列表中提取序列id。当使用tokenizer的`prepare_for_model`方法添加特殊token时调用此方法。
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            # 如果token列表已包含特殊token，则调用父类的方法获取特殊token的掩码
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            # 如果有第二个token列表，则返回一个包含特殊token的掩码列表
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        # 如果只有一个token列表，则返回一个包含特殊token的掩码列表
        return [1] + ([0] * len(token_ids_0)) + [1]

    # 从给定的序列创建token类型ID的方法，用于序列对分类任务
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]  # 分隔token的ID列表
        cls = [self.cls_token_id]  # 类别开始token的ID列表
        if token_ids_1 is None:
            # 如果没有第二个token列表，只返回第一个序列部分的token类型ID列表（全为0）
            return len(cls + token_ids_0 + sep) * [0]
        # 如果有第二个token列表，返回两个序列的token类型ID列表，第一个序列为0，第二个序列为1
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
    # 定义保存词汇表的方法，接受保存目录和可选的文件名前缀作为参数，并返回保存的文件名元组
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 初始化索引
        index = 0
        # 检查保存目录是否存在，构建词汇表文件路径
        if os.path.isdir(save_directory):
            # 如果保存目录是一个目录，则在该目录下创建词汇表文件路径
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            # 如果保存目录是一个文件路径，则直接使用该路径作为词汇表文件路径
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        # 打开词汇表文件，准备写入
        with open(vocab_file, "w", encoding="utf-8") as writer:
            # 遍历词汇表中的每个词汇及其索引，按索引排序
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # 如果当前索引不等于预期的索引值，记录警告日志
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                # 写入词汇到文件，并添加换行符
                writer.write(token + "\n")
                # 更新索引值
                index += 1
        # 返回保存的词汇表文件路径的元组
        return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        # 如果 `never_split` 参数未提供，将其设为空列表
        if never_split is None:
            never_split = []
        # 设定是否将输入内容全部转换为小写
        self.do_lower_case = do_lower_case
        # 将 `never_split` 转换为集合，这些标记在分词时不会被分开
        self.never_split = set(never_split)
        # 设定是否对中文字符进行分词处理
        self.tokenize_chinese_chars = tokenize_chinese_chars
        # 设定是否去除所有的重音符号
        self.strip_accents = strip_accents
        # 设定是否在标点符号处进行基础分词
        self.do_split_on_punc = do_split_on_punc
    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        # 如果给定了 `never_split` 参数，则将其转换为集合并与 `self.never_split` 取并集，否则直接使用 `self.never_split`
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        
        # 清理文本，去除不必要的字符
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        # 如果开启了 tokenize_chinese_chars 参数，则对文本中的中文字符进行特殊处理
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        
        # 将文本进行 Unicode 规范化为 NFC 格式，确保字符的一致性
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        
        # 将规范化后的文本按空白字符进行分词
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        
        # 对每个 token 进行处理
        for token in orig_tokens:
            # 如果 token 不在 never_split 中，则继续处理
            if token not in never_split:
                # 如果开启了小写化处理，则将 token 转换为小写
                if self.do_lower_case:
                    token = token.lower()
                    # 如果开启了去除重音处理，则去除 token 的重音
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                # 如果开启了去除重音处理，则去除 token 的重音
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            
            # 将处理后的 token 再进行标点符号分割处理，并加入到 split_tokens 中
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        # 将处理后的分词按空白字符再次分割，并返回最终的输出 tokens
        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 将文本进行 Unicode 规范化为 NFD 格式，分解字符为基字符和附加记号
        text = unicodedata.normalize("NFD", text)
        output = []
        
        # 遍历文本中的每个字符
        for char in text:
            # 获取字符的 Unicode 分类
            cat = unicodedata.category(char)
            # 如果字符的分类是 Mark, Nonspacing，则跳过该字符，不加入输出
            if cat == "Mn":
                continue
            # 否则将字符加入输出列表
            output.append(char)
        
        # 将列表中的字符连接成字符串并返回
        return "".join(output)
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        # 如果不需要在标点处分割或者文本在never_split列表中，直接返回文本列表
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        # 将文本转换为字符列表
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            # 如果是标点符号，则将其作为单独的列表项加入output，并标记可以开始一个新单词
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                # 如果不是标点符号，则将字符添加到当前列表项中
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        # 将列表中的子列表合并为字符串，并返回分割后的文本列表
        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            # 如果是中文字符，则在其前后添加空格，并加入到输出列表中
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                # 如果不是中文字符，则直接添加到输出列表中
                output.append(char)
        # 将输出列表转换为字符串，并返回
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # 检查Unicode码点是否位于CJK统一表意文字区块中，返回是否是中文字符的布尔值
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            # 如果字符为无效字符或控制字符，跳过
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            # 如果是空白字符，则替换为单个空格；否则保留原字符
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        # 将输出列表转换为字符串，并返回清理后的文本
        return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # 初始化 WordpieceTokenizer 类的实例
        self.vocab = vocab  # 词汇表，用于词片段(token)的匹配
        self.unk_token = unk_token  # 未知标记，用于替换无法识别的词片段
        self.max_input_chars_per_word = max_input_chars_per_word  # 单个词的最大字符数

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        output_tokens = []  # 存储最终的词片段(token)列表
        for token in whitespace_tokenize(text):  # 对输入文本进行空白字符分割，并遍历每个分割后的单词
            chars = list(token)  # 将单词转换为字符列表
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)  # 如果单词字符数超过最大限制，则使用未知标记替代
                continue

            is_bad = False  # 标记是否出现无法识别的子词
            start = 0  # 初始化子词起始位置
            sub_tokens = []  # 存储当前单词分割后的词片段
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])  # 获取当前起始位置到结束位置的子字符串
                    if start > 0:
                        substr = "##" + substr  # 对非初始子词添加 ## 前缀
                    if substr in self.vocab:  # 如果找到匹配的词片段在词汇表中
                        cur_substr = substr  # 记录当前词片段
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True  # 如果找不到匹配的词片段，则标记为无法识别
                    break
                sub_tokens.append(cur_substr)  # 将找到的词片段添加到子词列表中
                start = end  # 更新起始位置为当前结束位置

            if is_bad:
                output_tokens.append(self.unk_token)  # 如果整个单词无法识别，则使用未知标记替代
            else:
                output_tokens.extend(sub_tokens)  # 将识别出的词片段添加到最终结果中
        return output_tokens  # 返回最终的词片段列表

`.\models\distilbert\tokenization_distilbert_fast.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""

import json
from typing import List, Optional, Tuple

from tokenizers import normalizers

# 导入必要的日志记录模块
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
# 导入DistilBERT的标记器
from .tokenization_distilbert import DistilBertTokenizer

# 获取全局日志记录器
logger = logging.get_logger(__name__)

# 定义词汇和标记器文件的名称映射
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# 定义预训练模型的词汇和标记器文件的URL映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
        "distilbert-base-uncased-distilled-squad": (
            "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt"
        ),
        "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
        "distilbert-base-cased-distilled-squad": (
            "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt"
        ),
        "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
        "distilbert-base-multilingual-cased": (
            "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt"
        ),
    },
    "tokenizer_file": {
        "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json",
        "distilbert-base-uncased-distilled-squad": (
            "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json"
        ),
        "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json",
        "distilbert-base-cased-distilled-squad": (
            "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json"
        ),
        "distilbert-base-german-cased": (
            "https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json"
        ),
        "distilbert-base-multilingual-cased": (
            "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json"
        ),
    },
}

# 定义预训练模型的位置嵌入大小映射
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "distilbert-base-uncased": 512,
    "distilbert-base-uncased-distilled-squad": 512,
    "distilbert-base-cased": 512,
    "distilbert-base-cased-distilled-squad": 512,
}
    # 定义模型名称为键，对应的最大输入长度为值的字典条目
    "distilbert-base-german-cased": 512,
    "distilbert-base-multilingual-cased": 512,
}

# 预训练模型的初始化配置，包含了不同模型的配置信息
PRETRAINED_INIT_CONFIGURATION = {
    "distilbert-base-uncased": {"do_lower_case": True},  # 使用小写字符
    "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},  # 使用小写字符
    "distilbert-base-cased": {"do_lower_case": False},  # 区分大小写
    "distilbert-base-cased-distilled-squad": {"do_lower_case": False},  # 区分大小写
    "distilbert-base-german-cased": {"do_lower_case": False},  # 区分大小写，适用于德语
    "distilbert-base-multilingual-cased": {"do_lower_case": False},  # 区分大小写，适用于多语言
}


class DistilBertTokenizerFast(PreTrainedTokenizerFast):
    r"""
    构建一个“快速”的 DistilBERT 分词器（基于 HuggingFace 的 *tokenizers* 库）。基于 WordPiece。

    此分词器继承自 [`PreTrainedTokenizerFast`]，其中包含大多数主要方法。用户应参考此超类以获取有关这些方法的更多信息。
    """
    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        clean_text (`bool`, *optional*, defaults to `True`):
            Whether or not to clean the text before tokenization by removing any control characters and replacing all
            whitespaces by the classic one.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
            issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
            The prefix for subwords.
    """
    # 定义一些常量和映射
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # 指定模型输入的名称列表
    model_input_names = ["input_ids", "attention_mask"]
    # 指定慢速分词器的类，这里使用的是 DistilBertTokenizer
    slow_tokenizer_class = DistilBertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # 调用父类的初始化方法，设置词汇文件、分词器文件、大小写敏感、未知标记、分隔标记、填充标记、类别标记、掩码标记、处理中文字符等参数
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # 获取当前后端分词器的规范化器状态并转换为字典
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # 检查规范化器状态中的属性是否与当前初始化参数一致，若不一致则更新
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            # 获取当前规范化器的类，并更新相关属性
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        # 设置对象的大小写敏感属性
        self.do_lower_case = do_lower_case

    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # 构建带有特殊标记的输入序列，根据是否提供第二个序列决定是否添加第二个分隔符和第二个序列的 token IDs
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ):
        """
        Create token type IDs tensor from given sequences.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs corresponding to the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional list of IDs corresponding to the second sequence for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs.
        """
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Define special tokens for separation and classification
        sep = [self.sep_token_id]  # List containing the separator token ID
        cls = [self.cls_token_id]  # List containing the classification token ID
        
        # If only one sequence is provided (token_ids_1 is None), return a mask with 0s
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]  # Return a list of zeros representing token type IDs
        
        # If two sequences are provided, concatenate their lengths and return a mask with 0s for the first sequence and 1s for the second sequence
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary files to the specified directory.

        Args:
            save_directory (str):
                Directory path where the vocabulary files will be saved.
            filename_prefix (Optional[str]):
                Optional prefix for the vocabulary filenames.

        Returns:
            Tuple[str]: Tuple containing the paths of the saved vocabulary files.
        """
        # Call the internal tokenizer's model save method to save vocabulary files
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        # Return the paths of the saved files as a tuple
        return tuple(files)

`.\models\distilbert\init.py`

# 导入`
# 版权声明和许可证信息，指明代码版权和使用许可
# 详细描述了此代码的版权所有者和许可证（Apache License, Version 2.0）
# 提供了 Apache License, Version 2.0 的网址链接，以便查阅
# 如果符合许可证的条件，允许按“原样”分发和使用此代码
# 详细说明了在适用法律或书面同意的情况下，此软件是按“原样”分发的
# 详细说明了此软件是按“原样”分发，不带任何明示或暗示的担保或条件
# 提供了 Apache License, Version 2.0 的网址链接，以便查阅

from typing import TYPE_CHECKING

# 导入 LazyModule、检查各种库是否可用等工具函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义导入结构，列出了各模块所需的配置、类和函数
_import_structure = {
    "configuration_distilbert": [
        "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "DistilBertConfig",
        "DistilBertOnnxConfig",
    ],
    "tokenization_distilbert": ["DistilBertTokenizer"],
}

# 检查 tokenizers 库是否可用，不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加 tokenization_distilbert_fast 到导入结构
    _import_structure["tokenization_distilbert_fast"] = ["DistilBertTokenizerFast"]

# 检查 torch 库是否可用，不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加 modeling_distilbert 到导入结构
    _import_structure["modeling_distilbert"] = [
        "DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DistilBertForMaskedLM",
        "DistilBertForMultipleChoice",
        "DistilBertForQuestionAnswering",
        "DistilBertForSequenceClassification",
        "DistilBertForTokenClassification",
        "DistilBertModel",
        "DistilBertPreTrainedModel",
    ]

# 检查 tensorflow 库是否可用，不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加 modeling_tf_distilbert 到导入结构
    _import_structure["modeling_tf_distilbert"] = [
        "TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFDistilBertForMaskedLM",
        "TFDistilBertForMultipleChoice",
        "TFDistilBertForQuestionAnswering",
        "TFDistilBertForSequenceClassification",
        "TFDistilBertForTokenClassification",
        "TFDistilBertMainLayer",
        "TFDistilBertModel",
        "TFDistilBertPreTrainedModel",
    ]

# 检查 flax 库是否可用，不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加 modeling_flax_distilbert 到导入结构
    _import_structure["modeling_flax_distilbert"] = [
        "FlaxDistilBertForMaskedLM",
        "FlaxDistilBertForMultipleChoice",
        "FlaxDistilBertForQuestionAnswering",
        "FlaxDistilBertForSequenceClassification",
        "FlaxDistilBertForTokenClassification",
        "FlaxDistilBertModel",
        "FlaxDistilBertPreTrainedModel",
    ]


if TYPE_CHECKING:
    # 如果是类型检查阶段，这里可能会有类型相关的导入或代码
    # 导入DistilBERT预训练模型的配置映射、配置类和ONNX配置类
    from .configuration_distilbert import (
        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DistilBertConfig,
        DistilBertOnnxConfig,
    )
    
    # 导入DistilBERT的标记器
    from .tokenization_distilbert import DistilBertTokenizer
    
    # 检查是否安装了tokenizers库，若未安装则抛出OptionalDependencyNotAvailable异常
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果tokenizers库可用，则导入DistilBERT的快速标记器
        from .tokenization_distilbert_fast import DistilBertTokenizerFast
    
    # 检查是否安装了torch库，若未安装则抛出OptionalDependencyNotAvailable异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果torch库可用，则导入DistilBERT的模型相关模块
        from .modeling_distilbert import (
            DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            DistilBertForMaskedLM,
            DistilBertForMultipleChoice,
            DistilBertForQuestionAnswering,
            DistilBertForSequenceClassification,
            DistilBertForTokenClassification,
            DistilBertModel,
            DistilBertPreTrainedModel,
        )
    
    # 检查是否安装了tensorflow库，若未安装则抛出OptionalDependencyNotAvailable异常
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果tensorflow库可用，则导入TF版本的DistilBERT模型相关模块
        from .modeling_tf_distilbert import (
            TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFDistilBertForMaskedLM,
            TFDistilBertForMultipleChoice,
            TFDistilBertForQuestionAnswering,
            TFDistilBertForSequenceClassification,
            TFDistilBertForTokenClassification,
            TFDistilBertMainLayer,
            TFDistilBertModel,
            TFDistilBertPreTrainedModel,
        )
    
    # 检查是否安装了flax库，若未安装则抛出OptionalDependencyNotAvailable异常
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果flax库可用，则导入Flax版本的DistilBERT模型相关模块
        from .modeling_flax_distilbert import (
            FlaxDistilBertForMaskedLM,
            FlaxDistilBertForMultipleChoice,
            FlaxDistilBertForQuestionAnswering,
            FlaxDistilBertForSequenceClassification,
            FlaxDistilBertForTokenClassification,
            FlaxDistilBertModel,
            FlaxDistilBertPreTrainedModel,
        )
else:
    # 如果不是以上情况，即需要延迟加载模块
    import sys
    # 导入系统模块 sys

    # 将当前模块注册为一个延迟加载模块，并将其设置为当前模块的引用
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\dit\convert_dit_unilm_to_pytorch.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DiT checkpoints from the unilm repository."""

# 导入必要的库
import argparse  # 导入命令行参数解析模块
import json  # 导入处理 JSON 数据的模块
from pathlib import Path  # 导入处理文件路径的模块

import requests  # 导入处理 HTTP 请求的模块
import torch  # 导入 PyTorch 深度学习框架
from huggingface_hub import hf_hub_download  # 导入从 Hugging Face Hub 下载模型的功能
from PIL import Image  # 导入处理图像的 PIL 库

# 导入转换相关的类和函数
from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor
from transformers.image_utils import PILImageResampling  # 导入图像处理相关的功能
from transformers.utils import logging  # 导入日志记录功能

# 设置日志输出级别为信息级别
logging.set_verbosity_info()
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# here we list all keys to be renamed (original name on the left, our name on the right)
# 定义函数用于创建需要重命名的键值对列表
def create_rename_keys(config, has_lm_head=False, is_semantic=False):
    # 如果是语义模型，则前缀为"backbone."，否则为空
    prefix = "backbone." if is_semantic else ""

    rename_keys = []
    # 遍历模型的隐藏层，生成需要重命名的键值对
    for i in range(config.num_hidden_layers):
        # 第一层归一化层的权重和偏置
        rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight"))
        rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias"))
        # 注意力机制输出的权重和偏置
        rename_keys.append(
            (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight")
        )
        rename_keys.append(
            (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias")
        )
        # 第二层归一化层的权重和偏置
        rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight"))
        rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias"))
        # 中间层的全连接层1的权重和偏置
        rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight"))
        rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias"))
        # 中间层的全连接层2的权重和偏置
        rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight"))
        rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias"))

    # projection layer + position embeddings
    # 扩展 rename_keys 列表，添加模型参数重命名对应关系
    rename_keys.extend(
        [
            (f"{prefix}cls_token", "beit.embeddings.cls_token"),  # 将 "{prefix}cls_token" 重命名为 "beit.embeddings.cls_token"
            (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"),  # 将 "{prefix}patch_embed.proj.weight" 重命名为 "beit.embeddings.patch_embeddings.projection.weight"
            (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"),  # 将 "{prefix}patch_embed.proj.bias" 重命名为 "beit.embeddings.patch_embeddings.projection.bias"
            (f"{prefix}pos_embed", "beit.embeddings.position_embeddings"),  # 将 "{prefix}pos_embed" 重命名为 "beit.embeddings.position_embeddings"
        ]
    )

    if has_lm_head:
        # 如果模型具有语言模型头部，则继续添加重命名对应关系
        rename_keys.extend(
            [
                ("mask_token", "beit.embeddings.mask_token"),  # 将 "mask_token" 重命名为 "beit.embeddings.mask_token"
                ("norm.weight", "layernorm.weight"),  # 将 "norm.weight" 重命名为 "layernorm.weight"
                ("norm.bias", "layernorm.bias"),  # 将 "norm.bias" 重命名为 "layernorm.bias"
            ]
        )
    else:
        # 如果模型没有语言模型头部，则添加分类头部的重命名对应关系
        rename_keys.extend(
            [
                ("fc_norm.weight", "beit.pooler.layernorm.weight"),  # 将 "fc_norm.weight" 重命名为 "beit.pooler.layernorm.weight"
                ("fc_norm.bias", "beit.pooler.layernorm.bias"),  # 将 "fc_norm.bias" 重命名为 "beit.pooler.layernorm.bias"
                ("head.weight", "classifier.weight"),  # 将 "head.weight" 重命名为 "classifier.weight"
                ("head.bias", "classifier.bias"),  # 将 "head.bias" 重命名为 "classifier.bias"
            ]
        )

    return rename_keys
# 将每个编码器层的权重矩阵分解为查询（queries）、键（keys）和值（values）
def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False):
    # 遍历编码器层的数量
    for i in range(config.num_hidden_layers):
        # 如果是语义模型，添加前缀 "backbone."
        prefix = "backbone." if is_semantic else ""

        # 获取查询、键和值的权重矩阵
        in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight")
        # 获取查询的偏置
        q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias")
        # 获取值的偏置
        v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias")

        # 将查询权重放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
            : config.hidden_size, :
        ]
        # 将查询偏置放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias
        # 将键的权重放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
            config.hidden_size : config.hidden_size * 2, :
        ]
        # 将值的权重放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
            -config.hidden_size :, :
        ]
        # 将值的偏置放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias

        # 获取 gamma_1 和 gamma_2 的值
        gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1")
        gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2")

        # 将 gamma_1 重命名为 lambda_1，并放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1
        # 将 gamma_2 重命名为 lambda_2，并放入新的键值对中
        state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2


# 重命名字典中的键
def rename_key(dct, old, new):
    # 弹出旧键的值
    val = dct.pop(old)
    # 将值用新键重新放入字典中
    dct[new] = val


# 准备图片数据，在线获取一张可爱猫咪的图片
def prepare_img():
    # 图片的 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 通过 URL 获取图片对象
    im = Image.open(requests.get(url, stream=True).raw)
    # 返回获取的图片对象
    return im


# 使用无梯度计算的上下文环境，将某个检查点文件的权重转换到我们的 BEiT 结构中
@torch.no_grad()
def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub=False):
    """
    Copy/paste/tweak model's weights to our BEiT structure.
    """

    # 定义默认的 BEiT 配置
    # 根据检查点 URL 判断是否有语言模型头部
    has_lm_head = False if "rvlcdip" in checkpoint_url else True
    # 根据是否使用绝对位置嵌入和是否有语言模型头部来配置 BEiT
    config = BeitConfig(use_absolute_position_embeddings=True, use_mask_token=has_lm_head)

    # 根据检查点 URL 中是否包含 "large" 或 "dit-l" 来配置 BEiT 的架构大小
    if "large" in checkpoint_url or "dit-l" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16

    # 如果检查点 URL 中包含 "rvlcdip"，配置 BEiT 的标签相关信息
    if "rvlcdip" in checkpoint_url:
        config.num_labels = 16
        # 设置用于加载 id2label 映射的存储库和文件名
        repo_id = "huggingface/label-files"
        filename = "rvlcdip-id2label.json"
        # 通过 Hugging Face Hub 下载并加载 id2label 映射
        id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    # 加载原始模型的 state_dict，并移除和重命名一些键
    # 从指定的 URL 加载模型的状态字典，并选择在 CPU 上加载
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]

    # 根据配置创建重命名键列表，用于在加载状态字典时重命名模型参数
    rename_keys = create_rename_keys(config, has_lm_head=has_lm_head)
    for src, dest in rename_keys:
        # 重命名状态字典中的键名，根据预定义的映射关系进行修改
        rename_key(state_dict, src, dest)
    
    # 根据状态字典读取并初始化 Q、K、V（查询、键、值）的权重
    read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head)

    # 根据是否有语言模型头部选择加载不同类型的 Beit 模型
    model = BeitForMaskedImageModeling(config) if has_lm_head else BeitForImageClassification(config)
    # 设置模型为评估模式
    model.eval()
    # 加载预训练模型的状态字典
    model.load_state_dict(state_dict)

    # 创建 Beit 图像处理器，用于预处理图像
    image_processor = BeitImageProcessor(
        size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False
    )
    # 准备图像数据
    image = prepare_img()

    # 使用图像处理器对图像进行编码，并返回 PyTorch 张量表示
    encoding = image_processor(images=image, return_tensors="pt")
    # 提取像素值张量
    pixel_values = encoding["pixel_values"]

    # 使用加载的模型进行图像处理，获取输出结果
    outputs = model(pixel_values)
    # 提取模型的预测 logits（对数概率）
    logits = outputs.logits

    # 验证 logits 的形状是否符合预期
    expected_shape = [1, 16] if "rvlcdip" in checkpoint_url else [1, 196, 8192]
    assert logits.shape == torch.Size(expected_shape), "Shape of logits not as expected"

    # 确保保存模型的文件夹存在，如果不存在则创建
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    # 打印信息，指示正在保存模型到指定路径
    print(f"Saving model to {pytorch_dump_folder_path}")
    # 将模型保存到指定路径
    model.save_pretrained(pytorch_dump_folder_path)
    # 打印信息，指示正在保存图像处理器到指定路径
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    # 将图像处理器保存到指定路径
    image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要推送到模型 hub
    if push_to_hub:
        # 根据模型的具体配置选择模型名称
        if has_lm_head:
            model_name = "dit-base" if "base" in checkpoint_url else "dit-large"
        else:
            model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip"
        
        # 将图像处理器推送到指定的 hub 仓库
        image_processor.push_to_hub(
            repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
            organization="nielsr",
            commit_message="Add image processor",
            use_temp_dir=True,
        )
        # 将模型推送到指定的 hub 仓库
        model.push_to_hub(
            repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
            organization="nielsr",
            commit_message="Add model",
            use_temp_dir=True,
        )
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行，则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    parser.add_argument(
        "--checkpoint_url",
        default="https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth",
        type=str,
        help="URL to the original PyTorch checkpoint (.pth file).",
    )
    # 添加名为--checkpoint_url的命令行参数，用于指定原始PyTorch检查点文件的URL

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
    )
    # 添加名为--pytorch_dump_folder_path的命令行参数，用于指定输出PyTorch模型的文件夹路径

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
    )
    # 添加名为--push_to_hub的命令行参数，如果设置则将其设为True

    args = parser.parse_args()
    # 解析命令行参数并将其存储在args变量中

    convert_dit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)
    # 调用convert_dit_checkpoint函数，传递解析后的命令行参数作为参数

`.\models\dit\init.py`

# 导入标准库中的datetime模块，用于处理日期和时间相关的操作
import datetime

# 定义一个名为format_date的函数，接受一个datetime对象作为参数，返回格式化后的日期字符串
def format_date(dt):
    # 使用datetime对象的strftime方法格式化日期，"%Y-%m-%d"表示年-月-日的格式
    return dt.strftime("%Y-%m-%d")

# 创建一个datetime对象，表示当前日期和时间
current_date = datetime.datetime.now()

# 调用format_date函数，将current_date格式化为字符串并赋值给formatted_date变量
formatted_date = format_date(current_date)

# 打印格式化后的日期字符串
print(formatted_date)

`.\models\donut\configuration_donut_swin.py`

# coding=utf-8
# 定义编码格式为 UTF-8

# 导入所需的模块和类
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义预训练配置文件的映射字典，将模型名称映射到其预训练配置文件的 URL
DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "naver-clova-ix/donut-base": "https://huggingface.co/naver-clova-ix/donut-base/resolve/main/config.json",
    # 查看所有 Donut 模型的列表：https://huggingface.co/models?filter=donut-swin
}


class DonutSwinConfig(PretrainedConfig):
    r"""
    这是用于存储 [`DonutSwinModel`] 配置信息的配置类。它用于根据指定的参数实例化 Donut 模型，定义模型架构。
    使用默认配置实例化将产生类似于 Donut [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base)
    架构的配置。

    配置对象继承自 [`PretrainedConfig`]，可用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档以获取更多信息。
    """
    pass  # 这里只是一个占位符，暂时没有其他配置信息需要添加
    # 定义模型类型为 "donut-swin"
    model_type = "donut-swin"
    
    # 定义一个映射字典，将 "num_attention_heads" 映射到 "num_heads"，"num_hidden_layers" 映射到 "num_layers"
    attribute_map = {
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
    }
    # 定义一个初始化方法，初始化一个自定义的神经网络模型
    def __init__(
        self,
        image_size=224,  # 图像大小，默认为224像素
        patch_size=4,  # 图像块大小，默认为4像素
        num_channels=3,  # 图像通道数，默认为3（RGB）
        embed_dim=96,  # 嵌入维度，默认为96
        depths=[2, 2, 6, 2],  # 不同阶段的层数列表
        num_heads=[3, 6, 12, 24],  # 不同阶段的注意力头数列表
        window_size=7,  # 窗口大小，默认为7
        mlp_ratio=4.0,  # MLP扩展比率，默认为4.0
        qkv_bias=True,  # 是否在注意力层中使用QKV偏置，默认为True
        hidden_dropout_prob=0.0,  # 隐藏层dropout概率，默认为0.0
        attention_probs_dropout_prob=0.0,  # 注意力层dropout概率，默认为0.0
        drop_path_rate=0.1,  # DropPath率，默认为0.1
        hidden_act="gelu",  # 隐藏层激活函数，默认为GELU
        use_absolute_embeddings=False,  # 是否使用绝对位置嵌入，默认为False
        initializer_range=0.02,  # 初始化范围，默认为0.02
        layer_norm_eps=1e-5,  # LayerNorm的epsilon，默认为1e-5
        **kwargs,  # 其他关键字参数，用于灵活扩展
    ):
        # 调用父类的初始化方法
        super().__init__(**kwargs)
    
        # 设置各个参数到对象的属性中
        self.image_size = image_size  # 图像大小
        self.patch_size = patch_size  # 图像块大小
        self.num_channels = num_channels  # 图像通道数
        self.embed_dim = embed_dim  # 嵌入维度
        self.depths = depths  # 不同阶段的层数列表
        self.num_layers = len(depths)  # 网络的总层数
        self.num_heads = num_heads  # 不同阶段的注意力头数列表
        self.window_size = window_size  # 窗口大小
        self.mlp_ratio = mlp_ratio  # MLP扩展比率
        self.qkv_bias = qkv_bias  # 是否使用QKV偏置
        self.hidden_dropout_prob = hidden_dropout_prob  # 隐藏层dropout概率
        self.attention_probs_dropout_prob = attention_probs_dropout_prob  # 注意力层dropout概率
        self.drop_path_rate = drop_path_rate  # DropPath率
        self.hidden_act = hidden_act  # 隐藏层激活函数
        self.use_absolute_embeddings = use_absolute_embeddings  # 是否使用绝对位置嵌入
        self.layer_norm_eps = layer_norm_eps  # LayerNorm的epsilon
        self.initializer_range = initializer_range  # 初始化范围
    
        # 设置隐藏层大小属性，以使Swin模型与VisionEncoderDecoderModel兼容
        # 这表示模型最后一个阶段之后的通道维度
        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))

`.\models\donut\convert_donut_to_pytorch.py`

# 设置脚本的编码格式为UTF-8，确保支持中文等Unicode字符
# 版权声明，声明使用Apache License Version 2.0许可证
# 可以在遵守许可证条件的前提下使用此文件
"""Convert Donut checkpoints using the original `donut-python` library. URL: https://github.com/clovaai/donut"""

# 导入命令行参数解析模块
import argparse

# 导入PyTorch库
import torch
# 导入datasets模块中的load_dataset函数，用于加载数据集
from datasets import load_dataset
# 导入donut模块中的DonutModel类
from donut import DonutModel

# 导入transformers库中的多个模块和类
from transformers import (
    DonutImageProcessor,
    DonutProcessor,
    DonutSwinConfig,
    DonutSwinModel,
    MBartConfig,
    MBartForCausalLM,
    VisionEncoderDecoderModel,
    XLMRobertaTokenizerFast,
)


# 定义函数，根据给定模型获取相关的配置信息
def get_configs(model):
    # 获取原始模型的配置信息
    original_config = model.config

    # 创建编码器的配置信息对象DonutSwinConfig
    encoder_config = DonutSwinConfig(
        image_size=original_config.input_size,  # 使用原始模型的输入尺寸作为图像尺寸
        patch_size=4,  # 指定图像块的大小为4
        depths=original_config.encoder_layer,  # 使用原始模型的编码器层数
        num_heads=[4, 8, 16, 32],  # 设定多头注意力机制的头数分别为4, 8, 16, 32
        window_size=original_config.window_size,  # 使用原始模型的窗口大小
        embed_dim=128,  # 设定嵌入维度为128
    )

    # 创建解码器的配置信息对象MBartConfig
    decoder_config = MBartConfig(
        is_decoder=True,  # 设置为解码器
        is_encoder_decoder=False,  # 不是编码器-解码器模型
        add_cross_attention=True,  # 添加交叉注意力
        decoder_layers=original_config.decoder_layer,  # 使用原始模型的解码器层数
        max_position_embeddings=original_config.max_position_embeddings,  # 使用原始模型的最大位置嵌入数
        vocab_size=len(
            model.decoder.tokenizer
        ),  # 设定词汇表大小为解码器的词汇量，XLMRobertaTokenizer添加了一些特殊标记，请查看hub上的repo（added_tokens.json）
        scale_embedding=True,  # 缩放嵌入
        add_final_layer_norm=True,  # 添加最终的层归一化
    )

    # 返回编码器和解码器的配置信息
    return encoder_config, decoder_config


# 定义函数，根据给定的名字对模型的键进行重命名处理
def rename_key(name):
    if "encoder.model" in name:
        name = name.replace("encoder.model", "encoder")  # 将名字中的"encoder.model"替换为"encoder"
    if "decoder.model" in name:
        name = name.replace("decoder.model", "decoder")  # 将名字中的"decoder.model"替换为"decoder"
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")  # 将名字中的"patch_embed.proj"替换为"embeddings.patch_embeddings.projection"
    if "patch_embed.norm" in name:
        name = name.replace("patch_embed.norm", "embeddings.norm")  # 将名字中的"patch_embed.norm"替换为"embeddings.norm"
    # 如果名称以 "encoder" 开头，则进行以下操作
    if name.startswith("encoder"):
        # 如果名称中包含 "layers"
        if "layers" in name:
            # 在名称前添加 "encoder."
            name = "encoder." + name
        # 如果名称中包含 "attn.proj"
        if "attn.proj" in name:
            # 将 "attn.proj" 替换为 "attention.output.dense"
            name = name.replace("attn.proj", "attention.output.dense")
        # 如果名称中包含 "attn" 且不包含 "mask"
        if "attn" in name and "mask" not in name:
            # 将 "attn" 替换为 "attention.self"
            name = name.replace("attn", "attention.self")
        # 如果名称中包含 "norm1"
        if "norm1" in name:
            # 将 "norm1" 替换为 "layernorm_before"
            name = name.replace("norm1", "layernorm_before")
        # 如果名称中包含 "norm2"
        if "norm2" in name:
            # 将 "norm2" 替换为 "layernorm_after"
            name = name.replace("norm2", "layernorm_after")
        # 如果名称中包含 "mlp.fc1"
        if "mlp.fc1" in name:
            # 将 "mlp.fc1" 替换为 "intermediate.dense"
            name = name.replace("mlp.fc1", "intermediate.dense")
        # 如果名称中包含 "mlp.fc2"
        if "mlp.fc2" in name:
            # 将 "mlp.fc2" 替换为 "output.dense"
            name = name.replace("mlp.fc2", "output.dense")

        # 如果名称是 "encoder.norm.weight"
        if name == "encoder.norm.weight":
            # 将名称替换为 "encoder.layernorm.weight"
            name = "encoder.layernorm.weight"
        # 如果名称是 "encoder.norm.bias"
        if name == "encoder.norm.bias":
            # 将名称替换为 "encoder.layernorm.bias"
            name = "encoder.layernorm.bias"

    # 返回修改后的名称
    return name
# 将给定的原始状态字典按键值进行迭代复制，以避免在迭代时修改字典结构
def convert_state_dict(orig_state_dict, model):
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值
        val = orig_state_dict.pop(key)

        # 如果键名中包含 "qkv"
        if "qkv" in key:
            # 根据 "." 分割键名
            key_split = key.split(".")
            # 解析层号和块号
            layer_num = int(key_split[3])
            block_num = int(key_split[5])
            # 计算注意力机制的维度
            dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size

            # 根据键名中是否包含 "weight" 分别处理权重和偏置
            if "weight" in key:
                # 更新 query、key、value 的权重
                orig_state_dict[
                    f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
                ] = val[:dim, :]
                orig_state_dict[
                    f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
                ] = val[dim : dim * 2, :]
                orig_state_dict[
                    f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
                ] = val[-dim:, :]
            else:
                # 更新 query、key、value 的偏置
                orig_state_dict[
                    f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
                ] = val[:dim]
                orig_state_dict[
                    f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
                ] = val[dim : dim * 2]
                orig_state_dict[
                    f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
                ] = val[-dim:]
        # 如果键名中包含 "attn_mask" 或者是指定的其他键名
        elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]:
            # HuggingFace 实现中不使用 attn_mask 缓冲区
            # 模型不使用编码器的最终 LayerNorms
            pass
        else:
            # 对于其余的键名，应用重命名函数，并保留其原始值
            orig_state_dict[rename_key(key)] = val

    # 返回处理后的原始状态字典
    return orig_state_dict


# 将 Donut 模型检查点转换为 HuggingFace 模型
def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
    # 加载原始模型
    original_model = DonutModel.from_pretrained(model_name).eval()

    # 加载 HuggingFace 模型
    encoder_config, decoder_config = get_configs(original_model)
    encoder = DonutSwinModel(encoder_config)
    decoder = MBartForCausalLM(decoder_config)
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.eval()

    # 获取原始模型的状态字典
    state_dict = original_model.state_dict()
    # 转换状态字典中的键值结构
    new_state_dict = convert_state_dict(state_dict, model)
    # 加载转换后的状态字典到 HuggingFace 模型
    model.load_state_dict(new_state_dict)

    # 在扫描文档上验证结果
    dataset = load_dataset("hf-internal-testing/example-documents")
    image = dataset["test"][0]["image"].convert("RGB")

    # 从模型名称加载 XLM-Roberta 分词器
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
    # 创建 Donut 图像处理器，根据原始模型配置设定
    image_processor = DonutImageProcessor(
        do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
    )
    # 创建 Donut 处理器，整合图像处理器和分词器
    processor = DonutProcessor(image_processor, tokenizer)
    # 处理图像并获取像素值张量
    pixel_values = processor(image, return_tensors="pt").pixel_values
    # 检查模型名称是否为 "naver-clova-ix/donut-base-finetuned-docvqa"
    if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
        # 设置任务提示，包含用户输入的问题
        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
        # 设置问题内容为 "When is the coffee break?"
        question = "When is the coffee break?"
        # 替换任务提示中的占位符 {user_input} 为实际问题内容
        task_prompt = task_prompt.replace("{user_input}", question)
    
    # 检查模型名称是否为 "naver-clova-ix/donut-base-finetuned-rvlcdip"
    elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip":
        # 设置任务提示为 "<s_rvlcdip>"
        task_prompt = "<s_rvlcdip>"
    
    # 检查模型名称是否为以下任一
    elif model_name in [
        "naver-clova-ix/donut-base-finetuned-cord-v1",
        "naver-clova-ix/donut-base-finetuned-cord-v1-2560",
    ]:
        # 设置任务提示为 "<s_cord>"
        task_prompt = "<s_cord>"
    
    # 检查模型名称是否为 "naver-clova-ix/donut-base-finetuned-cord-v2"
    elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
        # 设置任务提示为 "s_cord-v2>"
        task_prompt = "s_cord-v2>"
    
    # 检查模型名称是否为 "naver-clova-ix/donut-base-finetuned-zhtrainticket"
    elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket":
        # 设置任务提示为 "<s_zhtrainticket>"
        task_prompt = "<s_zhtrainticket>"
    
    # 检查模型名称是否为以下任一
    elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]:
        # 如果以上条件均不满足，使用随机任务提示 "hello world"
        task_prompt = "hello world"
    
    else:
        # 如果模型名称不在支持列表中，抛出数值错误
        raise ValueError("Model name not supported")
    
    # 使用原始模型的解码器的标记器(tokenizer)处理任务提示，返回输入 ID（input_ids）张量
    prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

    # 使用原始模型的编码器的模型.patch_embed方法对像素值进行嵌入处理，获取原始补丁嵌入
    original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
    # 使用当前模型的编码器的嵌入方法对像素值进行嵌入处理，获取当前模型的补丁嵌入和其他信息
    patch_embeddings, _ = model.encoder.embeddings(pixel_values)
    # 断言原始补丁嵌入与当前模型的补丁嵌入在指定的误差范围内相似
    assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3)

    # 验证编码器的隐藏状态是否相似
    original_last_hidden_state = original_model.encoder(pixel_values)
    last_hidden_state = model.encoder(pixel_values).last_hidden_state
    # 断言原始模型的最后隐藏状态与当前模型的最后隐藏状态在指定的误差范围内相似
    assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)

    # 验证解码器的隐藏状态是否相似
    original_logits = original_model(pixel_values, prompt_tensors, None).logits
    logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits
    # 断言原始模型的输出 logits 与当前模型的输出 logits 在指定的误差范围内相似
    assert torch.allclose(original_logits, logits, atol=1e-3)
    
    # 如果指定了 PyTorch 导出文件夹路径，则保存模型和处理器到该路径
    if pytorch_dump_folder_path is not None:
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)
    
    # 如果指定了推送到 Hub，则将模型和处理器推送到 Hub
    if push_to_hub:
        # 使用模型名称的最后一部分推送到 Hub
        model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
        processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
if __name__ == "__main__":
    # 如果这个模块被直接运行而非导入，则执行以下代码块

    # 创建参数解析器对象
    parser = argparse.ArgumentParser()

    # 添加必需的参数
    parser.add_argument(
        "--model_name",
        default="naver-clova-ix/donut-base-finetuned-docvqa",
        required=False,
        type=str,
        help="Name of the original model you'd like to convert.",
    )

    # 添加可选的参数
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        required=False,
        type=str,
        help="Path to the output PyTorch model directory.",
    )

    # 添加标志参数
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the converted model and processor to the 🤗 hub.",
    )

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数进行模型转换，传入解析后的参数
    convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

`.\models\donut\feature_extraction_donut.py`

# 设置文件编码为UTF-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Feature extractor class for Donut."""

# 导入警告模块
import warnings

# 导入日志记录工具
from ...utils import logging
# 导入图像处理模块
from .image_processing_donut import DonutImageProcessor

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义 DonutFeatureExtractor 类，继承自 DonutImageProcessor 类
class DonutFeatureExtractor(DonutImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # 发出警告，提醒该类将在 Transformers 版本 5 中被移除，建议使用 DonutImageProcessor 代替
        warnings.warn(
            "The class DonutFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use DonutImageProcessor instead.",
            FutureWarning,
        )
        # 调用父类的初始化方法
        super().__init__(*args, **kwargs)

`.\models\donut\image_processing_donut.py`

# 如果视觉处理库可用，则导入PIL库
if is_vision_available():
    import PIL

# 定义一个名为DonutImageProcessor的类，继承自BaseImageProcessor类
class DonutImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Donut image processor.
    """
    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_thumbnail (`bool`, *optional*, defaults to `True`):
            Whether to resize the image using thumbnail method.
        do_align_long_axis (`bool`, *optional*, defaults to `False`):
            Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
            random amount of padding on each side, up to the largest image size in the batch. Otherwise, all images are
            padded to the largest image size in the batch.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Image standard deviation.
    """
    # 定义模型输入的名称列表，仅包含像素值
    model_input_names = ["pixel_values"]
    # 初始化函数，用于设置图像处理的各项参数和默认值
    def __init__(
        self,
        do_resize: bool = True,  # 是否进行图像尺寸调整，默认为True
        size: Dict[str, int] = None,  # 图像的目标尺寸，字典形式表示，包含高度和宽度，默认为None
        resample: PILImageResampling = PILImageResampling.BILINEAR,  # 图像调整大小时的重采样方法，默认为双线性插值
        do_thumbnail: bool = True,  # 是否生成缩略图，默认为True
        do_align_long_axis: bool = False,  # 是否在长轴上对齐图像，默认为False
        do_pad: bool = True,  # 是否进行图像填充，默认为True
        do_rescale: bool = True,  # 是否对图像进行重新缩放，默认为True
        rescale_factor: Union[int, float] = 1 / 255,  # 图像重新缩放的因子，默认为1/255
        do_normalize: bool = True,  # 是否对图像进行归一化，默认为True
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像的均值用于归一化，默认为None
        image_std: Optional[Union[float, List[float]]] = None,  # 图像的标准差用于归一化，默认为None
        **kwargs,  # 其他可选的关键字参数
    ) -> None:
        # 调用父类的初始化方法，传入其他的关键字参数
        super().__init__(**kwargs)

        # 如果size为None，则设定默认的高度和宽度
        size = size if size is not None else {"height": 2560, "width": 1920}
        # 如果size是元组或列表形式，则转换为字典形式，表示高度和宽度
        if isinstance(size, (tuple, list)):
            # The previous feature extractor size parameter was in (width, height) format
            size = size[::-1]
        # 使用函数get_size_dict处理size，确保返回的是一个标准化的尺寸字典
        size = get_size_dict(size)

        # 设置对象的属性值，将初始化函数的参数赋值给对象的属性
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_thumbnail = do_thumbnail
        self.do_align_long_axis = do_align_long_axis
        self.do_pad = do_pad
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN  # 如果image_mean为None，则使用IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD  # 如果image_std为None，则使用IMAGENET_STANDARD_STD
        # 验证处理器的关键字列表，用于后续处理
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_thumbnail",
            "do_align_long_axis",
            "do_pad",
            "random_padding",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def align_long_axis(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Align the long axis of the image to the longest axis of the specified size.

        Args:
            image (`np.ndarray`):
                The image to be aligned.
            size (`Dict[str, int]`):
                The size `{"height": h, "width": w}` to align the long axis to.
            data_format (`str` or `ChannelDimension`, *optional*):
                The data format of the output image. If unset, the same format as the input image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The aligned image.
        """

        # 获取输入图像的高度和宽度
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # 获取输出图像的高度和宽度
        output_height, output_width = size["height"], size["width"]

        # 如果输出宽度小于高度且输入宽度大于高度，或者输出宽度大于高度且输入宽度小于高度，则须旋转图像
        if (output_width < output_height and input_width > input_height) or (
            output_width > output_height and input_width < input_height
        ):
            image = np.rot90(image, 3)

        # 如果指定了输出数据格式，则转换图像数据格式
        if data_format is not None:
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)

        # 返回对齐后的图像
        return image

    def pad_image(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        random_padding: bool = False,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad the image to the specified size.

        Args:
            image (`np.ndarray`):
                The image to be padded.
            size (`Dict[str, int]`):
                The size `{"height": h, "width": w}` to pad the image to.
            random_padding (`bool`, *optional*, defaults to `False`):
                Whether to use random padding or not.
            data_format (`str` or `ChannelDimension`, *optional*):
                The data format of the output image. If unset, the same format as the input image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Extract output height and width from the size dictionary
        output_height, output_width = size["height"], size["width"]
        
        # Obtain input height and width from the input image
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)

        # Calculate the difference between output and input dimensions
        delta_width = output_width - input_width
        delta_height = output_height - input_height

        # Determine padding amounts based on random_padding flag
        if random_padding:
            pad_top = np.random.randint(low=0, high=delta_height + 1)
            pad_left = np.random.randint(low=0, high=delta_width + 1)
        else:
            pad_top = delta_height // 2
            pad_left = delta_width // 2

        # Calculate remaining padding amounts to complete the pad
        pad_bottom = delta_height - pad_top
        pad_right = delta_width - pad_left

        # Construct the padding tuple for np.pad function
        padding = ((pad_top, pad_bottom), (pad_left, pad_right))
        
        # Apply padding to the image using np.pad
        return pad(image, padding, data_format=data_format, input_data_format=input_data_format)

    def pad(self, *args, **kwargs):
        # Log a deprecation warning for the `pad` method
        logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.")
        # Redirect to `pad_image` method
        return self.pad_image(*args, **kwargs)

    def thumbnail(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
        corresponding dimension of the specified size.

        Args:
            image (`np.ndarray`):
                The image to be resized.
            size (`Dict[str, int]`):
                The size `{"height": h, "width": w}` to resize the image to.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                The resampling filter to use.
            data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                The data format of the output image. If unset, the same format as the input image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # 获取输入图像的高度和宽度
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        
        # 获取输出图像的目标高度和宽度
        output_height, output_width = size["height"], size["width"]

        # 始终调整图像大小为输入或输出大小中较小的那一个
        height = min(input_height, output_height)
        width = min(input_width, output_width)

        # 如果输入图像已经符合要求的大小，则直接返回原图像
        if height == input_height and width == input_width:
            return image

        # 根据输入图像的长宽比例调整目标高度或宽度
        if input_height > input_width:
            width = int(input_width * height / input_height)
        elif input_width > input_height:
            height = int(input_height * width / input_width)

        # 调用 resize 函数，进行图像的实际调整
        return resize(
            image,
            size=(height, width),
            resample=resample,
            reducing_gap=2.0,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
        ):
        """
        Resize the input image to the specified size.

        Args:
            image (`np.ndarray`):
                The image to be resized.
            size (`Dict[str, int]`):
                The target size `{"height": h, "width": w}` to resize the image to.
            resample (`PILImageResampling`, *optional*):
                The resampling filter to use.
            data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                The data format of the output image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image.

        Returns:
            np.ndarray: The resized image.
        """
    ) -> np.ndarray:
        """
        Resizes `image` to `(height, width)` specified by `size` using the PIL library.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # 调整 `size` 参数，确保其为大小字典
        size = get_size_dict(size)
        # 计算 `size` 中较短的边长
        shortest_edge = min(size["height"], size["width"])
        # 获取调整大小后的输出图像尺寸
        output_size = get_resize_output_image_size(
            image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
        )
        # 调整图像大小并返回调整后的图像
        resized_image = resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
        return resized_image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_thumbnail: bool = None,
        do_align_long_axis: bool = None,
        do_pad: bool = None,
        random_padding: bool = False,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,

`.\models\donut\modeling_donut_swin.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_donut_swin import DonutSwinConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DonutSwinConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]

DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "naver-clova-ix/donut-base",
    # See all Donut Swin models at https://huggingface.co/models?filter=donut
]


@dataclass
# 数据类，定义了一个用于存储数据的类
# 从transformers.models.swin.modeling_swin.SwinEncoderOutput复制而来，仅将Swin替换为DonutSwin
class DonutSwinEncoderOutput(ModelOutput):
    """
    DonutSwin encoder's outputs, with potential hidden states and attentions.
    """
    # DonutSwin编码器的输出，可能包含隐藏状态和注意力
    # 最后一层模型的隐藏状态，形状为 `(batch_size, sequence_length, hidden_size)`。
    last_hidden_state: torch.FloatTensor = None
    
    # 模型每层的隐藏状态的元组，如果设置了 `output_hidden_states=True` 或者 `config.output_hidden_states=True` 则返回。
    # 元组中包含 `torch.FloatTensor`，形状为 `(batch_size, sequence_length, hidden_size)`。
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    
    # 注意力权重的元组，如果设置了 `output_attentions=True` 或者 `config.output_attentions=True` 则返回。
    # 元组中包含 `torch.FloatTensor`，形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    
    # 模型每层的隐藏状态的元组，如果设置了 `output_hidden_states=True` 或者 `config.output_hidden_states=True` 则返回。
    # 元组中包含 `torch.FloatTensor`，形状为 `(batch_size, hidden_size, height, width)`，表示包括空间维度在内的每层的隐藏状态。
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 通过 @dataclass 装饰器定义一个数据类 DonutSwinModelOutput，继承自 ModelOutput
# 从 transformers.models.swin.modeling_swin.SwinModelOutput 复制，将 Swin 替换为 DonutSwin
@dataclass
class DonutSwinModelOutput(ModelOutput):
    """
    DonutSwin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # 定义类的成员变量，用于存储模型输出的不同部分
    last_hidden_state: torch.FloatTensor = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


# 从 transformers.models.swin.modeling_swin.window_partition 复制的函数
def window_partition(input_feature, window_size):
    """
    Partitions the given input into windows.
    """
    # 获取输入特征的形状信息
    batch_size, height, width, num_channels = input_feature.shape
    # 将输入特征按窗口大小分割成小窗口，存储在 input_feature 中
    input_feature = input_feature.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    # 调整分割后的窗口顺序，并重新整理为一个扁平化的张量
    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    # 返回分割后的窗口张量
    return windows


# 从 transformers.models.swin.modeling_swin.window_reverse 复制的函数
# 定义一个函数window_reverse，用于合并窗口以生成更高分辨率的特征
def window_reverse(windows, window_size, height, width):
    # 获取窗口的通道数
    num_channels = windows.shape[-1]
    # 重塑窗口张量的形状，以便进行后续操作
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    # 调整张量的维度顺序，以便后续操作的连续性
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    # 返回处理后的窗口张量
    return windows


# 从transformers.models.swin.modeling_swin.SwinEmbeddings复制并修改为DonutSwinEmbeddings
class DonutSwinEmbeddings(nn.Module):
    """
    构建补丁和位置嵌入。可选择添加掩码令牌。
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        # 初始化补丁嵌入
        self.patch_embeddings = DonutSwinPatchEmbeddings(config)
        # 获取补丁数量和网格大小
        num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        # 如果需要使用掩码令牌，则初始化掩码令牌
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        # 根据配置决定是否使用绝对位置嵌入
        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
        else:
            self.position_embeddings = None

        # 初始化LayerNorm和Dropout
        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        # 获取补丁嵌入和输出维度
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        # 对嵌入进行LayerNorm
        embeddings = self.norm(embeddings)
        # 获取批量大小、序列长度和嵌入维度
        batch_size, seq_len, _ = embeddings.size()

        # 如果存在掩码位置信息，则用掩码令牌替换掩码的视觉令牌
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # 将掩码应用到嵌入张量中
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # 如果存在位置嵌入，则添加到嵌入张量中
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        # 应用Dropout到嵌入张量
        embeddings = self.dropout(embeddings)

        # 返回处理后的嵌入张量和输出维度
        return embeddings, output_dimensions


# 从transformers.models.swin.modeling_swin.SwinPatchEmbeddings复制并修改为DonutSwinPatchEmbeddings
class DonutSwinPatchEmbeddings(nn.Module):
    """
    将形状为(batch_size, num_channels, height, width)的像素值转换为Transformer可消耗的初始隐藏状态（补丁嵌入），
    形状为(batch_size, seq_length, hidden_size)。
    """
    # 初始化函数，接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 从配置对象中获取图像大小和patch大小
        image_size, patch_size = config.image_size, config.patch_size
        # 从配置对象中获取通道数和嵌入维度大小
        num_channels, hidden_size = config.num_channels, config.embed_dim
        # 如果图像大小和patch大小不是可迭代对象，则转换为元组
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # 计算图像中patch的数量
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        # 设置对象的属性
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        # 计算图像网格大小
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        # 创建一个卷积层，用于将输入的通道转换为隐藏维度的输出
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    # 对输入的像素值进行可能的填充，使其能够被patch大小整除
    def maybe_pad(self, pixel_values, height, width):
        # 如果宽度不能被patch的宽度整除，则进行填充
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        # 如果高度不能被patch的高度整除，则进行填充
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        # 返回填充后的像素值
        return pixel_values

    # 前向传播函数，接受一个可选的torch.FloatTensor类型的像素值作为输入，返回嵌入向量和输出维度
    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        # 获取输入张量的形状信息
        _, num_channels, height, width = pixel_values.shape
        # 如果通道数不等于设定的通道数，则抛出值错误异常
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # 将输入进行填充，使其能够被patch大小整除
        pixel_values = self.maybe_pad(pixel_values, height, width)
        # 将填充后的像素值通过投影卷积层转换为嵌入向量
        embeddings = self.projection(pixel_values)
        # 获取嵌入向量的形状信息
        _, _, height, width = embeddings.shape
        # 计算输出的维度信息
        output_dimensions = (height, width)
        # 将嵌入向量展平，并在特定维度上转置
        embeddings = embeddings.flatten(2).transpose(1, 2)

        # 返回嵌入向量和输出维度
        return embeddings, output_dimensions
# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging
class DonutSwinPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        # 初始化 Patch Merging 层，保存输入分辨率和通道数
        self.input_resolution = input_resolution
        self.dim = dim
        # 创建线性层，用于降维操作，从4倍的通道数到2倍的通道数，无偏置
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        # 初始化归一化层，对4倍的通道数进行归一化处理
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        # 判断是否需要对输入特征进行填充，使得高度和宽度均为偶数
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # 获取输入特征的批大小、通道数以及特征图的数量
        batch_size, dim, num_channels = input_feature.shape

        # 将输入特征重塑为四维张量 [batch_size, height, width, num_channels]
        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # 如果需要，对输入特征进行填充，使得高度和宽度均为偶数
        input_feature = self.maybe_pad(input_feature, height, width)

        # 下采样操作，将特征图划分成四个区域，分别对应输入特征的四分之一大小
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        
        # 将四个区域的特征按通道拼接起来，形成新的特征图
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        # 将新的特征图重塑为三维张量 [batch_size, height/2*width/2, 4*num_channels]
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        # 对新的特征图进行归一化处理
        input_feature = self.norm(input_feature)
        # 使用线性层进行降维操作，输出特征维度为 [batch_size, height/2*width/2, 2*dim]
        input_feature = self.reduction(input_feature)

        return input_feature


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    """

    # 实现样本级别的路径丢弃（随机深度），在残差块的主路径中应用
    # 输入参数包括：input - 输入张量，drop_prob - 丢弃概率（默认为0.0），training - 是否处于训练模式（默认为False）
    # 详细讨论可以参考 Ross Wightman 的评论和链接中的讨论
    # 如果 drop_prob 为 0.0 或者不处于训练模式，则直接返回输入
    if drop_prob == 0.0 or not training:
        return input
    # 计算保留的概率
    keep_prob = 1 - drop_prob
    # 确定输出张量的形状，适用于不同维度的张量，不仅限于二维卷积网络
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    # 生成与输入张量相同设备和数据类型的随机张量，值在 [keep_prob, 1.0) 之间
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    # 将随机张量向下取整，将其二值化
    random_tensor.floor_()
    # 计算输出，将输入张量除以保留概率，再乘以随机张量
    output = input.div(keep_prob) * random_tensor
    # 返回处理后的输出张量
    return output
# 从transformers.models.swin.modeling_swin.SwinDropPath复制而来，定义了DonutSwinDropPath类，用于实现每个样本的Drop Path（随机深度）机制。
class DonutSwinDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob  # 初始化Drop Path的概率

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)  # 执行Drop Path操作

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)  # 返回Drop Path概率的描述字符串


# 从transformers.models.swin.modeling_swin.SwinSelfAttention复制而来，定义了DonutSwinSelfAttention类，实现Swin Transformer的自注意力机制，被修改为适应新的Donut模型。
class DonutSwinSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads  # 注意力头的数量
        self.attention_head_size = int(dim / num_heads)  # 每个注意力头的大小
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # 所有注意力头的总大小
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )  # 窗口大小，用于相对位置编码

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )  # 相对位置偏置表格，用作注意力矩阵的偏置

        # 获取窗口内每个标记的成对相对位置索引
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))  # 构建网格坐标
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 计算成对相对坐标
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)  # 注册成对相对位置索引为模型的缓冲区

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)  # 查询变换器
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)  # 键变换器
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)  # 值变换器

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)  # 注意力概率的dropout机制

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)  # 转置矩阵，以适应多头注意力的计算
    # 定义前向传播方法，用于处理输入隐藏状态和注意力掩码等参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 获取输入隐藏状态的维度信息
        batch_size, dim, num_channels = hidden_states.shape
        # 通过 self.query 对隐藏状态进行查询操作，生成混合的查询层
        mixed_query_layer = self.query(hidden_states)

        # 使用 self.key 对隐藏状态进行键操作，并转换维度以便进行注意力计算
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        # 使用 self.value 对隐藏状态进行值操作，并转换维度以便进行注意力计算
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        # 对混合的查询层进行维度转换以便进行注意力计算
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # 计算 "查询" 和 "键" 之间的点积，得到原始的注意力分数
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        # 将注意力分数除以 sqrt(注意力头的大小)，以归一化
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # 获取相对位置偏置，并按照特定方式重塑其形状以便加到注意力分数上
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        # 如果提供了注意力掩码，则应用掩码
        if attention_mask is not None:
            # 将注意力掩码重塑为适合注意力分数张量的形状
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # 对注意力分数进行 softmax 归一化，得到注意力概率
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 使用 dropout 随机丢弃整个令牌的注意力概率
        attention_probs = self.dropout(attention_probs)

        # 如果提供了头部掩码，则应用头部掩码
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # 计算上下文层，将注意力概率乘以值层，并重塑其形状
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        # 根据输出标志决定返回的输出，可能包括注意力概率
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput
class DonutSwinSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 初始化一个线性层，输入和输出维度均为 dim
        self.dense = nn.Linear(dim, dim)
        # 初始化一个 dropout 层，使用 config 中的 dropout 概率
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 通过线性层 self.dense 进行变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的 hidden_states 使用 dropout 进行随机置零
        hidden_states = self.dropout(hidden_states)
        # 返回处理后的 hidden_states
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin
class DonutSwinAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        # 初始化 DonutSwinSelfAttention 层
        self.self = DonutSwinSelfAttention(config, dim, num_heads, window_size)
        # 初始化 DonutSwinSelfOutput 层
        self.output = DonutSwinSelfOutput(config, dim)
        # 初始化一个空集合，用于存储被修剪的注意力头
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 根据给定的 heads 进行注意力头修剪，并返回修剪后的 heads 和索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 对线性层进行修剪
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储被修剪的注意力头
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self 层进行自注意力计算
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        # 将 self 层的输出经过 output 层处理
        attention_output = self.output(self_outputs[0], hidden_states)
        # 将输出打包成元组，如果需要输出注意力权重，则加入输出中
        outputs = (attention_output,) + self_outputs[1:]
        # 返回处理后的输出
        return outputs


# Copied from transformers.models.swin.modeling_swin.SwinIntermediate
class DonutSwinIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 初始化一个线性层，输入维度为 dim，输出维度为 config.mlp_ratio * dim
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        # 根据 config 中的 hidden_act 初始化激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 通过线性层 self.dense 进行变换
        hidden_states = self.dense(hidden_states)
        # 使用预定义的激活函数对 hidden_states 进行非线性变换
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 返回处理后的 hidden_states
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinOutput
# 此处代码被省略，需要在此处补充相关注释
# 定义名为 DonutSwinOutput 的神经网络模块，继承自 nn.Module 类
class DonutSwinOutput(nn.Module):
    # 初始化函数，接收 config 和 dim 两个参数
    def __init__(self, config, dim):
        # 调用父类的初始化函数
        super().__init__()
        # 创建一个线性层，输入大小为 config.mlp_ratio * dim，输出大小为 dim
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        # 创建一个 Dropout 层，使用配置中的 hidden_dropout_prob 参数
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数，接收输入 hidden_states，返回经过处理的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入 hidden_states 通过 self.dense 线性层处理
        hidden_states = self.dense(hidden_states)
        # 对处理后的张量应用 dropout 操作
        hidden_states = self.dropout(hidden_states)
        # 返回处理后的张量作为输出
        return hidden_states


# 从 transformers.models.swin.modeling_swin.SwinLayer 复制代码，将 Swin 改为 DonutSwin
class DonutSwinLayer(nn.Module):
    # 初始化函数，接收 config、dim、input_resolution、num_heads 和可选的 shift_size 参数
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        # 调用父类的初始化函数
        super().__init__()
        # 设置分块大小为 config.chunk_size_feed_forward
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 设置 shift_size
        self.shift_size = shift_size
        # 设置窗口大小为 config.window_size
        self.window_size = config.window_size
        # 设置输入分辨率
        self.input_resolution = input_resolution
        # 在层归一化之前应用层归一化，设置归一化的 epsilon 为 config.layer_norm_eps
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # 创建自注意力机制（Attention）层 DonutSwinAttention 对象
        self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
        # 如果 drop_path_rate 大于 0.0，则创建 DropPath 层 DonutSwinDropPath 对象，否则创建 Identity 层
        self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        # 在层归一化之后应用层归一化，设置归一化的 epsilon 为 config.layer_norm_eps
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # 创建中间层 DonutSwinIntermediate 对象
        self.intermediate = DonutSwinIntermediate(config, dim)
        # 创建输出层 DonutSwinOutput 对象
        self.output = DonutSwinOutput(config, dim)

    # 设置 shift_size 和 window_size 的函数，根据输入分辨率 input_resolution 进行调整
    def set_shift_and_window_size(self, input_resolution):
        # 如果输入分辨率中最小的尺寸小于等于窗口大小 window_size，则不分割窗口
        if min(input_resolution) <= self.window_size:
            self.shift_size = 0
            self.window_size = min(input_resolution)

    # 根据给定的高度和宽度生成注意力掩码的函数
    def get_attn_mask(self, height, width, dtype):
        # 如果 shift_size 大于 0，则计算 SW-MSA 的注意力掩码
        if self.shift_size > 0:
            # 创建一个高度为 1，宽度为 height 和 width，通道数为 1 的零张量 img_mask
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            # 定义高度和宽度的切片
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            # 遍历高度和宽度切片，并在 img_mask 上进行相应的标记
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            # 将 img_mask 分割为窗口并展平成二维张量
            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            # 构建注意力掩码，使对角线上的元素为 0，其他位置为 -100.0
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        # 返回生成的注意力掩码
        return attn_mask
    # 在可能的情况下，对隐藏状态进行填充，以保证其尺寸能够被窗口大小整除
    def maybe_pad(self, hidden_states, height, width):
        # 计算右边和底部需要填充的像素数，确保能够被窗口大小整除
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        # 定义填充值的元组：(前, 后, 上, 右, 下, 左)，这里只在右边和底部进行填充
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        # 对隐藏状态进行填充操作
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        # 返回填充后的隐藏状态和填充值的元组
        return hidden_states, pad_values

    # 前向传播函数
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 如果不总是分区，设置偏移量和窗口大小
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            # 否则，什么也不做
            pass
        # 获取输入维度的高度和宽度
        height, width = input_dimensions
        # 获取隐藏状态的批量大小、通道数和维度
        batch_size, _, channels = hidden_states.size()
        # 备份隐藏状态
        shortcut = hidden_states

        # 在层归一化之前应用层归一化
        hidden_states = self.layernorm_before(hidden_states)

        # 将隐藏状态重塑为四维张量 (batch_size, height, width, channels)
        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # 使用 maybe_pad 方法对隐藏状态进行填充，使其大小为窗口大小的倍数
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        # 获取填充后的高度和宽度
        _, height_pad, width_pad, _ = hidden_states.shape

        # 如果有循环偏移量，将隐藏状态进行循环移位
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # 分区窗口，将移位后的隐藏状态分割成窗口
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)

        # 获取注意力遮罩，以排除填充区域的影响
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        # 使用注意力机制处理窗口
        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        # 获取注意力机制的输出
        attention_output = attention_outputs[0]

        # 将注意力输出重塑为四维张量 (batch_size, height, width, channels)
        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)

        # 反向操作，将窗口重排成原始形状
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # 如果有循环偏移量，对注意力窗口进行反向循环移位
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        # 如果存在填充，则裁剪注意力窗口以匹配原始图像尺寸
        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        # 将注意力窗口重塑为三维张量 (batch_size, height*width, channels)
        attention_windows = attention_windows.view(batch_size, height * width, channels)

        # 将快捷连接和注意力窗口加和，并应用 drop_path
        hidden_states = shortcut + self.drop_path(attention_windows)

        # 在层归一化之后应用层归一化
        layer_output = self.layernorm_after(hidden_states)

        # 应用中间层
        layer_output = self.intermediate(layer_output)

        # 应用输出层
        layer_output = hidden_states + self.output(layer_output)

        # 返回层输出，如果需要输出注意力权重则包含在内
        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs
# 从 transformers.models.swin.modeling_swin.SwinStage 复制而来，将 Swin 替换为 DonutSwin
class DonutSwinStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        # 创建包含多个 DonutSwinLayer 的模块列表，根据给定的深度
        self.blocks = nn.ModuleList(
            [
                DonutSwinLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    # 根据奇偶性确定 shift_size 的值
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # 如果有 downsample 参数，创建 patch merging 层；否则为 None
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        # 遍历每个 DonutSwinLayer 模块进行前向传播计算
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # 调用当前层的前向传播方法，得到该层的输出
            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            # 更新隐藏状态为当前层的输出
            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        # 如果存在 downsample 层，对隐藏状态进行下采样处理
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        # 生成 stage 的输出，包括隐藏状态、下采样前的隐藏状态和输出维度信息
        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        # 如果开启了输出注意力机制信息的选项，则将每个层的注意力信息添加到输出中
        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


# 从 transformers.models.swin.modeling_swin.SwinEncoder 复制而来，将 Swin 替换为 DonutSwin
class DonutSwinEncoder(nn.Module):
    # 初始化函数，用于初始化一个 DonutSwin 模型实例
    def __init__(self, config, grid_size):
        # 调用父类（nn.Module）的初始化方法
        super().__init__()
        # 计算模型层数
        self.num_layers = len(config.depths)
        # 保存配置对象
        self.config = config
        # 计算每层的 drop path rate
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        # 创建模型的层列表，每层是一个 DonutSwinStage 实例
        self.layers = nn.ModuleList(
            [
                DonutSwinStage(
                    config=config,
                    # 设置每层的输入维度
                    dim=int(config.embed_dim * 2**i_layer),
                    # 设置输入分辨率
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                    # 设置层的深度（即重复次数）
                    depth=config.depths[i_layer],
                    # 设置注意力头的数量
                    num_heads=config.num_heads[i_layer],
                    # 设置当前层的 drop path rates
                    drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    # 如果不是最后一层，则使用 DonutSwinPatchMerging 进行下采样
                    downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                # 循环创建每一层的实例
                for i_layer in range(self.num_layers)
            ]
        )

        # 是否使用梯度检查点，默认为 False
        self.gradient_checkpointing = False

    # 前向传播函数，计算模型的输出
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
# 从 transformers.models.swin.modeling_swin.SwinPreTrainedModel 复制并修改为 DonutSwinPreTrainedModel 类
class DonutSwinPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定配置类为 DonutSwinConfig
    config_class = DonutSwinConfig
    # 基础模型的前缀为 "swin"
    base_model_prefix = "swin"
    # 主输入名称为 "pixel_values"
    main_input_name = "pixel_values"
    # 支持梯度检查点
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # 对线性层和卷积层使用正态分布初始化权重
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                # 如果有偏置，则将偏置初始化为零
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            # 对 LayerNorm 层，初始化偏置为零，权重为1
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SWIN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

SWIN_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`DonutImageProcessor.__call__`] for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top.",
    SWIN_START_DOCSTRING,
)
# 定义 DonutSwinModel 类，继承自 DonutSwinPreTrainedModel
class DonutSwinModel(DonutSwinPreTrainedModel):
    pass  # 这里省略了具体实现，仅作为示例展示类的定义和继承关系
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        # 调用父类的初始化方法，传入配置信息
        super().__init__(config)
        # 将配置信息保存到实例变量中
        self.config = config
        # 计算编码器层数量
        self.num_layers = len(config.depths)
        # 计算特征数量
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        # 初始化嵌入层对象
        self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token)
        # 初始化编码器对象，传入嵌入层的补丁网格
        self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid)

        # 如果需要添加池化层，则初始化自适应平均池化层，否则设为 None
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # 调用初始化权重和应用最终处理的方法
        self.post_init()

    def get_input_embeddings(self):
        # 返回嵌入层的补丁嵌入
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历需要剪枝的层和对应的注意力头信息
        for layer, heads in heads_to_prune.items():
            # 调用编码器对象中相应层的注意力模块的剪枝方法
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=DonutSwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # 此处应该包含模型前向传播的详细说明文档和示例代码，但在注释中无法展示具体内容
        pass
        ) -> Union[Tuple, DonutSwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # 设置输出是否包含注意力权重，默认与模型配置一致
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 设置输出是否包含隐藏状态，默认与模型配置一致
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 设置返回对象类型，默认与模型配置一致
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果未提供像素值，则抛出数值错误
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 准备头部掩码（如果需要）
        # 头部掩码中的 1 表示保留该头部
        # attention_probs 的形状为 bsz x n_heads x N x N
        # 输入的 head_mask 形状为 [num_heads] 或 [num_hidden_layers x num_heads]
        # 将 head_mask 转换为形状 [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        # 将像素值和布尔掩码位置作为输入，传递给嵌入层获取嵌入输出和输入维度
        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        # 将嵌入输出传递给编码器，获取编码器的输出
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取编码器的序列输出
        sequence_output = encoder_outputs[0]

        # 如果存在池化器，则对序列输出进行池化和扁平化处理
        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        # 如果不返回字典，则返回元组格式的输出
        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        # 如果返回字典，则返回特定的模型输出对象
        return DonutSwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )

`.\models\donut\processing_donut.py`

# 设置编码格式为 UTF-8
# 版权声明：2022 年由 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本，只有在符合许可证的情况下才能使用此文件
# 您可以从以下链接获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件根据“原样”分发，不附带任何明示或暗示的保证或条件。
# 有关详细信息，请参阅许可证。
"""
Donut 的处理器类。
"""
import re  # 导入正则表达式模块
import warnings  # 导入警告模块
from contextlib import contextmanager  # 导入上下文管理器

from ...processing_utils import ProcessorMixin  # 导入处理器混合类


class DonutProcessor(ProcessorMixin):
    r"""
    构造一个 Donut 处理器，将 Donut 图像处理器和 XLMRoBERTa 分词器封装成一个单一处理器。

    [`DonutProcessor`] 提供 [`DonutImageProcessor`] 和 [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] 的所有功能。
    详见 [`~DonutProcessor.__call__`] 和 [`~DonutProcessor.decode`] 获取更多信息。

    Args:
        image_processor ([`DonutImageProcessor`], *可选*):
            [`DonutImageProcessor`] 的实例。图像处理器是必需的输入。
        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *可选*):
            [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] 的实例。分词器是必需的输入。
    """

    attributes = ["image_processor", "tokenizer"]  # 类属性列表
    image_processor_class = "AutoImageProcessor"  # 图像处理器类名
    tokenizer_class = "AutoTokenizer"  # 分词器类名

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = None
        if "feature_extractor" in kwargs:
            # 警告：`feature_extractor` 参数已弃用，并将在 v5 中删除，请使用 `image_processor` 替代。
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        # 如果 kwargs 中包含 `feature_extractor`，则将其赋给 feature_extractor 变量
        image_processor = image_processor if image_processor is not None else feature_extractor
        # 如果未指定 image_processor，则引发 ValueError 异常
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        # 如果未指定 tokenizer，则引发 ValueError 异常
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        # 调用父类 ProcessorMixin 的构造函数，传入 image_processor 和 tokenizer
        super().__init__(image_processor, tokenizer)
        # 设置当前处理器为 image_processor
        self.current_processor = self.image_processor
        # 标记目标上下文管理器未启动
        self._in_target_context_manager = False
    def __call__(self, *args, **kwargs):
        """
        当在正常模式下使用时，该方法将所有参数转发给 AutoImageProcessor 的 [`~AutoImageProcessor.__call__`] 并返回其输出。
        如果在上下文 [`~DonutProcessor.as_target_processor`] 中使用，则将所有参数转发给 DonutTokenizer 的 [`~DonutTokenizer.__call__`]。
        请参阅上述两个方法的文档了解更多信息。
        """
        # 对于向后兼容性
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        images = kwargs.pop("images", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            images = args[0]
            args = args[1:]

        if images is None and text is None:
            raise ValueError("You need to specify either an `images` or `text` input to process.")

        if images is not None:
            # 使用图像处理器处理图像和其他参数
            inputs = self.image_processor(images, *args, **kwargs)
        if text is not None:
            # 使用分词器处理文本和其他参数
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            return inputs
        elif images is None:
            return encodings
        else:
            # 将标签添加到输入字典中
            inputs["labels"] = encodings["input_ids"]
            return inputs

    def batch_decode(self, *args, **kwargs):
        """
        将所有参数转发给 DonutTokenizer 的 [`~PreTrainedTokenizer.batch_decode`] 方法。请参阅该方法的文档了解更多信息。
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        将所有参数转发给 DonutTokenizer 的 [`~PreTrainedTokenizer.decode`] 方法。请参阅该方法的文档了解更多信息。
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        临时设置处理输入的分词器。用于在微调 TrOCR 时对标签进行编码。
        """
        warnings.warn(
            "`as_target_processor` 已弃用，并将在 Transformers 的 v5 中移除。您可以通过在常规 `__call__` 方法的参数 `text` 中处理您的标签（在与图像输入相同的调用中或在单独的调用中）。"
        )
        self._in_target_context_manager = True
        self.current_processor = self.tokenizer
        yield
        self.current_processor = self.image_processor
        self._in_target_context_manager = False
    def token2json(self, tokens, is_inner_value=False, added_vocab=None):
        """
        Convert a (generated) token sequence into an ordered JSON format.

        Args:
            tokens (str): The token sequence to convert into JSON format.
            is_inner_value (bool, optional): Indicates if the function is processing inner values. Defaults to False.
            added_vocab (list, optional): List of added vocabulary tokens. Defaults to None.

        Returns:
            dict or list: Ordered JSON format representing the token sequence.

        Converts a sequence of tokens into a structured JSON format. Handles both leaf and non-leaf nodes
        in the token sequence recursively.
        """
        if added_vocab is None:
            added_vocab = self.tokenizer.get_added_vocab()

        output = {}

        while tokens:
            # Locate the start token in the token sequence
            start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
            if start_token is None:
                break
            key = start_token.group(1)
            key_escaped = re.escape(key)

            # Find the corresponding end token for the current start token
            end_token = re.search(rf"</s_{key_escaped}>", tokens, re.IGNORECASE)
            start_token = start_token.group()
            if end_token is None:
                tokens = tokens.replace(start_token, "")
            else:
                end_token = end_token.group()
                start_token_escaped = re.escape(start_token)
                end_token_escaped = re.escape(end_token)
                # Extract content between start and end tokens
                content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
                if content is not None:
                    content = content.group(1).strip()
                    if r"<s_" in content and r"</s_" in content:  # non-leaf node
                        # Recursively convert inner token sequence to JSON
                        value = self.token2json(content, is_inner_value=True, added_vocab=added_vocab)
                        if value:
                            if len(value) == 1:
                                value = value[0]
                            output[key] = value
                    else:  # leaf nodes
                        output[key] = []
                        # Split content into leaf nodes based on separator "<sep/>"
                        for leaf in content.split(r"<sep/>"):
                            leaf = leaf.strip()
                            if leaf in added_vocab and leaf[0] == "<" and leaf[-2:] == "/>":
                                leaf = leaf[1:-2]  # for categorical special tokens
                            output[key].append(leaf)
                        if len(output[key]) == 1:
                            output[key] = output[key][0]

                # Remove processed tokens from the sequence
                tokens = tokens[tokens.find(end_token) + len(end_token):].strip()
                if tokens[:6] == r"<sep/>":  # non-leaf nodes
                    # Return a list with current output and recursively processed tokens
                    return [output] + self.token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)

        # Handle cases where no output is generated
        if len(output):
            return [output] if is_inner_value else output
        else:
            return [] if is_inner_value else {"text_sequence": tokens}

    @property
    def feature_extractor_class(self):
        """
        Property accessor for deprecated feature_extractor_class.

        Returns:
            class: The image processor class.

        Warns:
            FutureWarning: This property is deprecated and will be removed in v5.
                           Use `image_processor_class` instead.
        """
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """
        Property accessor for deprecated feature_extractor.

        Returns:
            object: The image processor instance.

        Warns:
            FutureWarning: This property is deprecated and will be removed in v5.
                           Use `image_processor` instead.
        """
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor

`.\models\donut\init.py`

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 导入类型检查模块
from typing import TYPE_CHECKING

# 导入自定义异常和LazyModule类、检查是否有torch和vision库可用的函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块导入结构字典
_import_structure = {
    "configuration_donut_swin": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutSwinConfig"],
    "processing_donut": ["DonutProcessor"],
}

# 检查是否有torch库可用，如果不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加相关模块到_import_structure字典中
    _import_structure["modeling_donut_swin"] = [
        "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DonutSwinModel",
        "DonutSwinPreTrainedModel",
    ]

# 检查是否有vision库可用，如果不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加相关模块到_import_structure字典中
    _import_structure["feature_extraction_donut"] = ["DonutFeatureExtractor"]
    _import_structure["image_processing_donut"] = ["DonutImageProcessor"]

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 从相应模块导入特定类和变量
    from .configuration_donut_swin import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutSwinConfig
    from .processing_donut import DonutProcessor

    try:
        # 检查是否有torch库可用，如果不可用则抛出OptionalDependencyNotAvailable异常
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用，从相应模块导入特定类和变量
        from .modeling_donut_swin import (
            DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
            DonutSwinModel,
            DonutSwinPreTrainedModel,
        )

    try:
        # 检查是否有vision库可用，如果不可用则抛出OptionalDependencyNotAvailable异常
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用，从相应模块导入特定类和变量
        from .feature_extraction_donut import DonutFeatureExtractor
        from .image_processing_donut import DonutImageProcessor

# 如果不是在类型检查模式下
else:
    # 导入sys模块
    import sys

    # 将当前模块替换为_LazyModule的实例，通过LazyModule进行按需导入
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\dpr\configuration_dpr.py`

# 设置文件编码为UTF-8
# 版权所有 2010 年，DPR 作者，The Hugging Face Team.
#
# 根据 Apache 许可证 2.0 版本授权，除非符合许可证的要求，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按"原样"分发的软件，
# 没有任何明示或暗示的担保或条件。
# 请查看许可证以获取详细的权利和限制信息。
""" DPR 模型配置"""

from ...configuration_utils import PretrainedConfig  # 导入预训练模型的配置类
from ...utils import logging  # 导入日志工具


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

# 定义 DPR 预训练模型与配置文件的映射关系
DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/dpr-ctx_encoder-single-nq-base": (
        "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/config.json"
    ),
    "facebook/dpr-question_encoder-single-nq-base": (
        "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/config.json"
    ),
    "facebook/dpr-reader-single-nq-base": (
        "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/config.json"
    ),
    "facebook/dpr-ctx_encoder-multiset-base": (
        "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/config.json"
    ),
    "facebook/dpr-question_encoder-multiset-base": (
        "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/config.json"
    ),
    "facebook/dpr-reader-multiset-base": (
        "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/config.json"
    ),
}


class DPRConfig(PretrainedConfig):
    r"""
    [`DPRConfig`] 是用于存储 *DPRModel* 配置的配置类。

    这是用于存储 [`DPRContextEncoder`], [`DPRQuestionEncoder`] 或 [`DPRReader`] 的配置类。根据指定的参数实例化 DPR 模型组件，
    定义模型组件的架构。使用默认值实例化配置将产生类似于 DPRContextEncoder
    [facebook/dpr-ctx_encoder-single-nq-base](https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base)
    架构的配置。

    该类是 [`BertConfig`] 的子类。请查看超类以获取所有 kwargs 的文档。

    示例:

    ```
    >>> from transformers import DPRConfig, DPRContextEncoder

    >>> # 初始化 DPR facebook/dpr-ctx_encoder-single-nq-base 风格的配置
    >>> configuration = DPRConfig()

    >>> # 使用配置初始化一个模型（随机权重）来自 facebook/dpr-ctx_encoder-single-nq-base 风格的配置
    >>> model = DPRContextEncoder(configuration)

    >>> # 访问模型配置
    >>> configuration = model.config
    ```
    # 设定模型类型为 "dpr"
    model_type = "dpr"
    
    # 定义一个初始化方法，用于初始化模型参数和配置
    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        projection_dim: int = 0,
        **kwargs,
    ):
        # 调用父类的初始化方法，传入 pad_token_id 和其他关键字参数
        super().__init__(pad_token_id=pad_token_id, **kwargs)
    
        # 设置模型的各种参数
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.projection_dim = projection_dim
        self.position_embedding_type = position_embedding_type

`.\models\dpr\convert_dpr_original_checkpoint_to_pytorch.py`

# 导入必要的库
import argparse  # 用于解析命令行参数
import collections  # 提供额外的数据结构
from pathlib import Path  # 处理路径相关操作

import torch  # PyTorch深度学习库
from torch.serialization import default_restore_location  # 默认的模型加载位置

# 导入transformers库中的相关组件
from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader

# 定义一个命名元组CheckpointState，用于保存模型加载状态
CheckpointState = collections.namedtuple(
    "CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"]
)

# 从文件中加载模型状态
def load_states_from_checkpoint(model_file: str) -> CheckpointState:
    # 打印正在读取的模型文件信息
    print(f"Reading saved model from {model_file}")
    # 使用torch.load加载模型文件，并映射到CPU上
    state_dict = torch.load(model_file, map_location=lambda s, l: default_restore_location(s, "cpu"))
    # 返回包含模型状态的CheckpointState命名元组
    return CheckpointState(**state_dict)


# 定义一个DPRState类，用于处理DPR相关的状态
class DPRState:
    def __init__(self, src_file: Path):
        self.src_file = src_file  # 初始化源文件路径

    def load_dpr_model(self):
        raise NotImplementedError  # 抽象方法，子类需实现具体功能

    # 静态方法，根据组件类型创建对应的DPRState子类实例
    @staticmethod
    def from_type(comp_type: str, *args, **kwargs) -> "DPRState":
        # 根据组件类型选择相应的子类实例化
        if comp_type.startswith("c"):
            return DPRContextEncoderState(*args, **kwargs)
        if comp_type.startswith("q"):
            return DPRQuestionEncoderState(*args, **kwargs)
        if comp_type.startswith("r"):
            return DPRReaderState(*args, **kwargs)
        else:
            raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.")


# DPRContextEncoderState类继承自DPRState，处理上下文编码器相关状态
class DPRContextEncoderState(DPRState):
    def load_dpr_model(self):
        # 创建DPRContextEncoder模型实例，基于指定的配置
        model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
        # 打印正在加载的DPR双编码器信息
        print(f"Loading DPR biencoder from {self.src_file}")
        # 从检查点文件中加载保存的模型状态
        saved_state = load_states_from_checkpoint(self.src_file)
        encoder, prefix = model.ctx_encoder, "ctx_model."

        # 修复自GitHub提交中的更改，更新模型状态字典
        state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids}
        for key, value in saved_state.model_dict.items():
            if key.startswith(prefix):
                key = key[len(prefix) :]
                if not key.startswith("encode_proj."):
                    key = "bert_model." + key
                state_dict[key] = value
        
        # 加载更新后的状态字典到编码器模型中
        encoder.load_state_dict(state_dict)
        return model


class DPRQuestionEncoderState(DPRState):
    # 加载 DPR 模型
    def load_dpr_model(self):
        # 创建 DPRQuestionEncoder 对象，使用指定的 DPRConfig 和 BertConfig
        model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
        # 打印正在加载的 DPR biencoder 的来源文件路径
        print(f"Loading DPR biencoder from {self.src_file}")
        # 从指定的文件中加载模型状态
        saved_state = load_states_from_checkpoint(self.src_file)
        # 获取模型的 encoder 部分和前缀字符串
        encoder, prefix = model.question_encoder, "question_model."
        # 修复来自特定提交的更改，更新状态字典以适应新版本的模型
        state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids}
        # 遍历加载的模型状态字典的每个键值对
        for key, value in saved_state.model_dict.items():
            # 如果键以指定前缀开头
            if key.startswith(prefix):
                # 去掉前缀
                key = key[len(prefix) :]
                # 如果不是以 "encode_proj." 开头的键，则加上 "bert_model." 前缀
                if not key.startswith("encode_proj."):
                    key = "bert_model." + key
                # 更新状态字典
                state_dict[key] = value
        # 使用更新后的状态字典加载 encoder 的状态
        encoder.load_state_dict(state_dict)
        # 返回加载后的模型对象
        return model
class DPRReaderState(DPRState):
    # 继承自 DPRState 类的 DPRReaderState 类

    def load_dpr_model(self):
        # 加载 DPR 模型
        model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
        # 打印加载的 DPR 读取器模型的信息，显示源文件路径
        print(f"Loading DPR reader from {self.src_file}")
        # 从检查点文件加载保存的模型状态
        saved_state = load_states_from_checkpoint(self.src_file)
        
        # 修复自 https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 的更改
        # 准备状态字典，映射加载的模型状态到正确的位置
        state_dict = {
            "encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids
        }
        
        # 遍历保存的模型字典的键值对
        for key, value in saved_state.model_dict.items():
            # 如果键以 "encoder." 开头但不以 "encoder.encode_proj" 开头，修正为 "encoder.bert_model."
            if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"):
                key = "encoder.bert_model." + key[len("encoder.") :]
            # 将修正后的键值对加入到状态字典中
            state_dict[key] = value
        
        # 加载状态字典到 DPR 读取器模型的 span_predictor 部分
        model.span_predictor.load_state_dict(state_dict)
        # 返回加载完成的 DPR 读取器模型
        return model


def convert(comp_type: str, src_file: Path, dest_dir: Path):
    # 转换函数，将指定类型的 DPR 模型检查点文件转换为 PyTorch 模型

    # 确保输出目录存在或创建它
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(exist_ok=True)

    # 根据组件类型和源文件路径创建 DPRState 对象
    dpr_state = DPRState.from_type(comp_type, src_file=src_file)
    # 加载 DPR 模型
    model = dpr_state.load_dpr_model()
    # 将模型保存到指定的输出目录
    model.save_pretrained(dest_dir)
    # 从输出目录重新加载模型进行验证
    model.from_pretrained(dest_dir)  # sanity check


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # 必需参数
    parser.add_argument(
        "--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
    )
    parser.add_argument(
        "--src",
        type=str,
        help=(
            "Path to the dpr checkpoint file. They can be downloaded from the official DPR repo"
            " https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the"
            " 'retriever' checkpoints."
        ),
    )
    parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.")
    args = parser.parse_args()

    # 获取源文件路径和输出目录路径
    src_file = Path(args.src)
    dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest
    dest_dir = Path(dest_dir)
    
    # 确保指定的源文件存在
    assert src_file.exists()
    # 确保指定了 DPR 模型的组件类型
    assert (
        args.type is not None
    ), "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
    
    # 执行转换操作，将指定类型的 DPR 模型转换为 PyTorch 模型
    convert(args.type, src_file, dest_dir)

`.\models\dpr\modeling_dpr.py`

# 设置代码文件的编码格式为 UTF-8
# 版权声明，指出代码的版权信息和使用许可
# 依照 Apache License 2.0 许可证使用本代码
# 获取许可证的详细信息请访问指定网址
# 根据适用法律或书面同意，本软件是基于"原样"提供的，没有任何形式的担保或条件
# 请查阅许可证获取更多信息
""" PyTorch DPR model for Open Domain Question Answering. """

# 导入必要的模块和类
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
from torch import Tensor, nn

# 导入模型输出相关的基类
from ...modeling_outputs import BaseModelOutputWithPooling
# 导入预训练模型相关的基类
from ...modeling_utils import PreTrainedModel
# 导入日志相关的模块
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 导入 BERT 模型
from ..bert.modeling_bert import BertModel
# 导入 DPR 配置
from .configuration_dpr import DPRConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 用于文档的配置信息
_CONFIG_FOR_DOC = "DPRConfig"
_CHECKPOINT_FOR_DOC = "facebook/dpr-ctx_encoder-single-nq-base"

# 预训练模型的存档列表
DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-ctx_encoder-single-nq-base",
    "facebook/dpr-ctx_encoder-multiset-base",
]
DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-question_encoder-single-nq-base",
    "facebook/dpr-question_encoder-multiset-base",
]
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-reader-single-nq-base",
    "facebook/dpr-reader-multiset-base",
]


##########
# Outputs
##########

# DPRContextEncoderOutput 类，用于保存 DPRContextEncoder 的输出结果
@dataclass
class DPRContextEncoderOutput(ModelOutput):
    """
    Class for outputs of [`DPRQuestionEncoder`].
    """
    # `pooler_output` 是一个 `torch.FloatTensor` 张量，形状为 `(batch_size, embeddings_size)`，
    # 表示 DPR 编码器输出的池化器输出，对应于上下文表示。这是通过线性层进一步处理的序列第一个标记（分类标记）的最后一层隐藏状态。
    # 此输出用于将上下文嵌入以便与问题嵌入一起进行最近邻查询。

    pooler_output: torch.FloatTensor

    # `hidden_states` 是一个可选的元组，类型为 `tuple(torch.FloatTensor)`。
    # 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
    # 它包含两个张量，第一个是嵌入的输出，第二个是每个层的输出。
    # 形状为 `(batch_size, sequence_length, hidden_size)`。

    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None

    # `attentions` 是一个可选的元组，类型为 `tuple(torch.FloatTensor)`。
    # 当 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
    # 它包含每个层的注意力张量，形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
    # 这些是经过注意力 softmax 后的注意力权重，用于计算自注意力头中的加权平均值。

    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 使用 @dataclass 装饰器定义数据类 DPRQuestionEncoderOutput，继承自 ModelOutput
@dataclass
class DPRQuestionEncoderOutput(ModelOutput):
    """
    Class for outputs of [`DPRQuestionEncoder`].

    Args:
        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
            The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
            This output is to be used to embed questions for nearest neighbors queries with context embeddings.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
    # 定义 pooler_output 字段，类型为 torch.FloatTensor，用于存储 DPR 编码器输出的池化表示
    pooler_output: torch.FloatTensor
    # 可选字段 hidden_states，存储 DPR 模型各层的隐藏状态，类型为 Tuple[torch.FloatTensor, ...]
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # 可选字段 attentions，存储注意力权重，类型为 Tuple[torch.FloatTensor, ...]
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


# 使用 @dataclass 装饰器定义数据类 DPRReaderOutput，继承自 ModelOutput
@dataclass
class DPRReaderOutput(ModelOutput):
    """
    Class for outputs of [`DPRQuestionEncoder`].
    """
    # 定义函数参数及其类型注解：起始位置的预测概率张量，形状为 (段落数, 序列长度)
    start_logits: torch.FloatTensor
    # 可选参数：结束位置的预测概率张量，形状同上，默认为 None
    end_logits: torch.FloatTensor = None
    # 可选参数：DPRReader QA 分类器的输出，表示每个段落回答问题的相关性分数，形状为 (段落数, )
    relevance_logits: torch.FloatTensor = None
    # 可选参数：隐藏状态元组，包含模型每层的隐藏状态张量，形状为 (批大小, 序列长度, 隐藏大小)
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # 可选参数：注意力权重元组，包含每层注意力权重张量，形状为 (批大小, 注意力头数, 序列长度, 序列长度)
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class DPRPreTrainedModel(PreTrainedModel):
    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果模块是线性层，则使用正态分布初始化权重，标准差为配置文件中的初始化范围
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在偏置项，则将其初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果模块是嵌入层，则同样使用正态分布初始化权重，标准差为配置文件中的初始化范围
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果指定了填充索引，则将对应位置的权重初始化为零
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # 如果模块是层归一化层，则初始化偏置为零，权重为全1
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class DPREncoder(DPRPreTrainedModel):
    base_model_prefix = "bert_model"

    def __init__(self, config: DPRConfig):
        super().__init__(config)
        # 初始化BERT模型，不包含池化层
        self.bert_model = BertModel(config, add_pooling_layer=False)
        # 检查隐藏层大小是否合理
        if self.bert_model.config.hidden_size <= 0:
            raise ValueError("Encoder hidden_size can't be zero")
        # 设定投影维度
        self.projection_dim = config.projection_dim
        # 如果投影维度大于零，则增加一个线性层进行投影
        if self.projection_dim > 0:
            self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
        # 执行初始化权重和最终处理
        self.post_init()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
    ) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
        # 调用BERT模型的前向传播
        outputs = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 获取序列输出和汇聚输出
        sequence_output = outputs[0]
        pooled_output = sequence_output[:, 0, :]

        # 如果设定了投影维度，则对汇聚输出进行投影
        if self.projection_dim > 0:
            pooled_output = self.encode_proj(pooled_output)

        # 如果不要求返回字典，则返回序列输出、汇聚输出以及其他隐藏状态
        if not return_dict:
            return (sequence_output, pooled_output) + outputs[2:]

        # 如果要求返回字典，则返回一个包含汇聚输出、最后隐藏状态、隐藏状态和注意力的模型输出对象
        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @property
    def embeddings_size(self) -> int:
        # 如果设定了投影维度，则返回投影层的输出维度
        if self.projection_dim > 0:
            return self.encode_proj.out_features
        # 否则返回BERT模型的隐藏层大小
        return self.bert_model.config.hidden_size


class DPRSpanPredictor(DPRPreTrainedModel):
    base_model_prefix = "encoder"
    # 初始化方法，接受一个DPRConfig对象作为参数
    def __init__(self, config: DPRConfig):
        # 调用父类构造方法，初始化模型
        super().__init__(config)
        # 创建一个DPREncoder对象并赋值给self.encoder
        self.encoder = DPREncoder(config)
        # 创建一个线性层，用于生成问题答案的logits，输入大小为self.encoder.embeddings_size，输出大小为2
        self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
        # 创建一个线性层，用于生成问题相关性的logits，输入大小为self.encoder.embeddings_size，输出大小为1
        self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
        # 调用自定义的post_init方法，用于初始化权重和应用最终处理
        self.post_init()

    # 前向传播方法
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
    ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
        # 计算输入的数量和序列长度
        n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
        
        # 将输入传递给编码器进行处理
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 获取编码器输出的序列张量
        sequence_output = outputs[0]

        # 计算问题答案的logits
        logits = self.qa_outputs(sequence_output)
        # 将logits沿着最后一个维度分割成start_logits和end_logits
        start_logits, end_logits = logits.split(1, dim=-1)
        # 去除不必要的维度并保证内存连续性
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        # 计算问题相关性的logits，仅使用序列输出的第一个向量
        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])

        # 调整张量的大小以匹配预期的输出形状
        start_logits = start_logits.view(n_passages, sequence_length)
        end_logits = end_logits.view(n_passages, sequence_length)
        relevance_logits = relevance_logits.view(n_passages)

        # 如果return_dict为False，则返回一个元组
        if not return_dict:
            return (start_logits, end_logits, relevance_logits) + outputs[2:]

        # 如果return_dict为True，则返回一个DPRReaderOutput对象
        return DPRReaderOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            relevance_logits=relevance_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
"""
    Contains the docstring for input specifications to DPR encoders.
"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            # 输入序列标记的索引，表示在词汇表中的位置。为了与预训练匹配，DPR输入序列应按以下方式格式化，包括[CLS]和[SEP]标记：

            (a) 对于序列对（例如标题+文本对）：

            ```
            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            ```

            (b) 对于单个序列（例如问题）：

            ```
            tokens:         [CLS] the dog is hairy . [SEP]
            token_type_ids:   0   0   0   0  0     0   0
            ```

            DPR是一个具有绝对位置嵌入的模型，因此通常建议在右侧而不是左侧填充输入。

            可以使用[`AutoTokenizer`]获取索引。详见[`PreTrainedTokenizer.encode`]和[`PreTrainedTokenizer.__call__`]的详细说明。

            [什么是输入ID？](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 避免在填充标记索引上执行注意力的掩码。掩码值选择在 `[0, 1]` 范围内：

            - 1 表示**未掩码**的标记，
            - 0 表示**已掩码**的标记。

            [什么是注意力掩码？](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 段标记索引，指示输入的第一部分和第二部分。索引选择在 `[0, 1]` 范围内：

            - 0 对应于*句子A*标记，
            - 1 对应于*句子B*标记。

            [什么是标记类型ID？](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            # 可选，可以直接传递嵌入表示而不是传递`input_ids`。如果需要比模型内部嵌入查找矩阵更多控制权，这将很有用。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。查看返回的张量中的 `attentions` 以获取更多详细信息。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。查看返回的张量中的 `hidden_states` 以获取更多详细信息。

        return_dict (`bool`, *optional*):
            # 是否返回 [`~utils.ModelOutput`] 而不是普通元组。
"""
DPR_READER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`Tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
            and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence should
            be formatted with [CLS] and [SEP] with the format:

                `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `(n_passages, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
    DPR_START_DOCSTRING,
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
    """
    DPRContextEncoder extends DPRPretrainedContextEncoder to encode context using DPR models.

    Args:
        config (DPRConfig): Configuration object specifying the model configuration.

    Attributes:
        config (DPRConfig): The configuration object used to initialize the model.
        ctx_encoder (DPREncoder): Encoder instance responsible for encoding contexts.

    Methods:
        post_init(): Initializes weights and applies final processing after initialization.
    """

    def __init__(self, config: DPRConfig):
        """
        Initializes a DPRContextEncoder instance.

        Args:
            config (DPRConfig): Configuration object specifying the model configuration.
        """
        super().__init__(config)
        self.config = config
        self.ctx_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Performs forward pass of the DPRContextEncoder model.

        Args:
            input_ids (Tuple[torch.LongTensor] of shapes `(n_passages, sequence_length)`, optional):
                Indices of input sequence tokens in the vocabulary.
            attention_mask (torch.FloatTensor of shape `(n_passages, sequence_length)`, optional):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (torch.FloatTensor of shape `(n_passages, sequence_length, hidden_size)`, optional):
                Optionally, directly pass an embedded representation instead of `input_ids`.
            output_attentions (bool, optional):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (bool, optional):
                Whether or not to return the hidden states of all layers.
            return_dict (bool, optional):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            DPRContextEncoderOutput or tuple:
                The output of the DPRContextEncoder model.

        Raises:
            ValueError: If `input_ids` and `inputs_embeds` are both provided.
        """
        # Implementation of forward pass is handled by the superclass DPRPretrainedContextEncoder
        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
        r"""
        此方法定义了模型的前向传播逻辑，接受多个输入参数并返回相应的输出。

        Return:
            返回一个包含池化输出、隐藏状态、注意力权重的对象，具体取决于return_dict参数设置。

        Examples:

        ```
        >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```"""

        # 确定是否输出注意力权重，默认使用配置中的设置
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 确定是否输出隐藏状态，默认使用配置中的设置
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 确定是否使用返回字典形式的输出，默认使用配置中的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 检查输入参数的一致性
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # 确定设备位置，根据输入参数选择
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # 如果未提供注意力掩码，则根据输入数据是否为填充令牌来生成
        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        
        # 如果未提供token_type_ids，则默认为全零向量
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # 调用上下文编码器模型的前向传播
        outputs = self.ctx_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 如果不要求返回字典形式的输出，则返回元组形式的结果
        if not return_dict:
            return outputs[1:]
        # 否则返回自定义输出对象，包含池化输出、隐藏状态和注意力权重
        return DPRContextEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
# 为 DPRQuestionEncoder 类添加文档字符串，描述其作为问题表示的池化输出的基本功能
@add_start_docstrings(
    "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
    DPR_START_DOCSTRING,
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
    def __init__(self, config: DPRConfig):
        # 调用父类的初始化方法，传入配置对象
        super().__init__(config)
        # 存储配置对象到实例中
        self.config = config
        # 创建一个 DPREncoder 对象作为问题编码器
        self.question_encoder = DPREncoder(config)
        # 初始化权重并进行最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
        r"""
        定义函数的返回类型，可以是 DPRQuestionEncoderOutput 类型或者 Tensor 的元组类型

        Examples:

        ```
        >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果未指定 output_attentions，则使用 self.config 中的默认值

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果未指定 output_hidden_states，则使用 self.config 中的默认值

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 如果未指定 return_dict，则使用 self.config 中的 use_return_dict 的值

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        # 根据输入的 input_ids 或 inputs_embeds 确定 input_shape，如果都未指定则报错

        device = input_ids.device if input_ids is not None else inputs_embeds.device
        # 根据 input_ids 或 inputs_embeds 确定设备类型

        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        # 如果未指定 attention_mask，则根据 input_ids 是否为 None 来确定是否为 pad_token_id

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # 如果未指定 token_type_ids，则创建全零张量，形状与 input_shape 相同，数据类型为 long

        outputs = self.question_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 调用 question_encoder 方法进行编码，根据参数进行不同的处理

        if not return_dict:
            return outputs[1:]
        # 如果 return_dict 为 False，则返回 outputs 的第二个元素之后的内容

        return DPRQuestionEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
        # 否则，返回一个包含 pooler_output、hidden_states 和 attentions 的 DPRQuestionEncoderOutput 对象
# 使用装饰器为类添加文档字符串，描述了该类的基本功能和用途
@add_start_docstrings(
    "The bare DPRReader transformer outputting span predictions.",
    DPR_START_DOCSTRING,
)
# 定义 DPRReader 类，继承自 DPRPretrainedReader 类
class DPRReader(DPRPretrainedReader):
    
    # 初始化方法，接受一个 DPRConfig 类型的参数 config
    def __init__(self, config: DPRConfig):
        # 调用父类的初始化方法，传入 config 参数
        super().__init__(config)
        # 将传入的 config 参数保存为类的属性
        self.config = config
        # 初始化一个 DPRSpanPredictor 实例作为类的属性 span_predictor
        self.span_predictor = DPRSpanPredictor(config)
        # 调用类的 post_init 方法，用于初始化权重和应用最终处理
        self.post_init()

    # 前向传播方法，接受多个输入参数并返回输出结果
    @add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 方法接受的输入参数详细文档，包括输入类型和说明
        # 输出返回值类型的文档替换为 DPRReaderOutput 类型，使用 _CONFIG_FOR_DOC 配置类
        ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
        r"""
        返回预测结果或包含多个张量的元组。

        Examples:

        ```
        >>> from transformers import DPRReader, DPRReaderTokenizer

        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> encoded_inputs = tokenizer(
        ...     questions=["What is love ?"],
        ...     titles=["Haddaway"],
        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        ...     return_tensors="pt",
        ... )
        >>> outputs = model(**encoded_inputs)
        >>> start_logits = outputs.start_logits
        >>> end_logits = outputs.end_logits
        >>> relevance_logits = outputs.relevance_logits
        ```
        """
        # 根据输入的输出参数设置是否返回注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 根据输入的输出参数设置是否返回隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 根据输入的输出参数设置是否返回字典形式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果同时指定了 input_ids 和 inputs_embeds，则抛出错误
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        # 如果只指定了 input_ids，则检查是否需要警告没有指定 attention_mask
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        # 如果只指定了 inputs_embeds，则获取其形状
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        # 如果既没有指定 input_ids 也没有指定 inputs_embeds，则抛出错误
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # 确定使用的设备，根据是否有 input_ids 来决定
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # 如果没有提供 attention_mask，则创建一个全为1的张量作为默认的 attention_mask
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)

        # 调用 span_predictor 方法进行预测并返回结果
        return self.span_predictor(
            input_ids,
            attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

`.\models\dpr\modeling_tf_dpr.py`

# coding=utf-8
# 定义文件编码格式和版权信息

# 导入必要的模块和库
from __future__ import annotations
from dataclasses import dataclass
from typing import Tuple, Union

import tensorflow as tf

# 导入相关的模型输出类和工具函数
from ...modeling_tf_outputs import TFBaseModelOutputWithPooling
from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, get_initializer, keras, shape_list, unpack_inputs
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ..bert.modeling_tf_bert import TFBertMainLayer
from .configuration_dpr import DPRConfig

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 预定义一些用于文档生成的配置常量
_CONFIG_FOR_DOC = "DPRConfig"

# 预训练模型的存档列表
TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-ctx_encoder-single-nq-base",
    "facebook/dpr-ctx_encoder-multiset-base",
]
TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-question_encoder-single-nq-base",
    "facebook/dpr-question_encoder-multiset-base",
]
TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-reader-single-nq-base",
    "facebook/dpr-reader-multiset-base",
]

##########
# Outputs
##########

# 数据类，用于封装`TFDPRContextEncoder`的输出
@dataclass
class TFDPRContextEncoderOutput(ModelOutput):
    r"""
    Class for outputs of [`TFDPRContextEncoder`].
    """
    """
    Args:
        pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
            DPR编码器的输出，对应于上下文表示。是序列中第一个标记（分类标记）的最后一层隐藏状态，
            进一步由线性层处理。此输出用于嵌入上下文，以便使用问题嵌入进行最近邻查询。

        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            `tf.Tensor`元组（当传递`output_hidden_states=True`或`config.output_hidden_states=True`时返回），
            包含形状为`(batch_size, sequence_length, hidden_size)`的张量。

            模型在每个层的输出隐藏状态，以及初始嵌入输出。

        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            `tf.Tensor`元组（当传递`output_attentions=True`或`config.output_attentions=True`时返回），
            包含形状为`(batch_size, num_heads, sequence_length, sequence_length)`的张量。

            注意力 softmax 后的注意力权重，用于计算自注意力头中的加权平均值。

    """

    pooler_output: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None
# 使用 `dataclass` 装饰器定义一个数据类，用于存储 `TFDPRQuestionEncoder` 的输出结果
@dataclass
class TFDPRQuestionEncoderOutput(ModelOutput):
    """
    Class for outputs of [`TFDPRQuestionEncoder`].

    Args:
        pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
            The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
            This output is to be used to embed questions for nearest neighbors queries with context embeddings.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # 定义属性 `pooler_output`，类型为 `tf.Tensor`，用于存储 DPR 编码器对问题的池化输出
    pooler_output: tf.Tensor = None
    # 定义属性 `hidden_states`，类型为元组 `Tuple[tf.Tensor, ...]` 或 `None`，存储模型每一层的隐藏状态
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    # 定义属性 `attentions`，类型为元组 `Tuple[tf.Tensor, ...]` 或 `None`，存储模型每一层的注意力权重
    attentions: Tuple[tf.Tensor, ...] | None = None


# 使用 `dataclass` 装饰器定义一个数据类，用于存储 `TFDPRReaderEncoder` 的输出结果
@dataclass
class TFDPRReaderOutput(ModelOutput):
    """
    Class for outputs of [`TFDPRReaderEncoder`].
    """

    # 这里省略了具体的属性定义，根据文档需求，应包含与 `TFDPRQuestionEncoderOutput` 类似的属性定义
    # `start_logits` 是一个 Tensor，形状为 `(n_passages, sequence_length)`，包含每个段落中起始索引的预测值
    start_logits: tf.Tensor = None
    # `end_logits` 是一个 Tensor，形状为 `(n_passages, sequence_length)`，包含每个段落中结束索引的预测值
    end_logits: tf.Tensor = None
    # `relevance_logits` 是一个 Tensor，形状为 `(n_passages, )`，包含每个段落对于问题的相关性预测分数
    relevance_logits: tf.Tensor = None
    # `hidden_states` 是一个可选的元组，包含多个 Tensor，形状为 `(batch_size, sequence_length, hidden_size)`，表示模型在每一层输出的隐藏状态以及初始嵌入输出
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    # `attentions` 是一个可选的元组，包含多个 Tensor，形状为 `(batch_size, num_heads, sequence_length, sequence_length)`，表示每层的注意力权重
    attentions: Tuple[tf.Tensor, ...] | None = None
# 定义 TF-DPR 编码器层
class TFDPREncoderLayer(keras.layers.Layer):
    # 基础模型前缀设定为 "bert_model"
    base_model_prefix = "bert_model"

    def __init__(self, config: DPRConfig, **kwargs):
        super().__init__(**kwargs)

        # 解决与 TFBertMainLayer 的名称冲突，使用 TFBertMainLayer 而不是 TFBertModel
        self.bert_model = TFBertMainLayer(config, add_pooling_layer=False, name="bert_model")
        self.config = config

        # 检查隐藏层大小是否为非正数，如果是则抛出 ValueError
        if self.config.hidden_size <= 0:
            raise ValueError("Encoder hidden_size can't be zero")
        
        # 设置投影维度为配置中的投影维度
        self.projection_dim = config.projection_dim
        
        # 如果投影维度大于 0，则创建投影层 Dense
        if self.projection_dim > 0:
            self.encode_proj = keras.layers.Dense(
                config.projection_dim, kernel_initializer=get_initializer(config.initializer_range), name="encode_proj"
            )

    # 解包输入参数装饰器，定义 call 方法
    @unpack_inputs
    def call(
        self,
        input_ids: tf.Tensor = None,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool = None,
        output_hidden_states: bool = None,
        return_dict: bool = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor, ...]]:
        # 调用 bert_model 的 call 方法，传入参数并获取输出
        outputs = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 提取序列输出和池化输出
        sequence_output = outputs[0]
        pooled_output = sequence_output[:, 0, :]  # 取序列输出的第一个 token 的输出作为池化输出
        
        # 如果有投影维度，则应用投影层到池化输出上
        if self.projection_dim > 0:
            pooled_output = self.encode_proj(pooled_output)

        # 如果不返回字典，则返回序列输出、池化输出和其他输出
        if not return_dict:
            return (sequence_output, pooled_output) + outputs[1:]

        # 返回 TFBaseModelOutputWithPooling 对象，包含最后的隐藏状态、池化输出、隐藏状态和注意力权重
        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # 嵌入大小的属性方法，返回投影维度或者 bert_model 配置中的隐藏大小
    @property
    def embeddings_size(self) -> int:
        if self.projection_dim > 0:
            return self.projection_dim
        return self.bert_model.config.hidden_size

    # 构建方法，构建 bert_model 和 encode_proj 层
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        
        # 如果存在 bert_model，则在 bert_model 名称空间下构建
        if getattr(self, "bert_model", None) is not None:
            with tf.name_scope(self.bert_model.name):
                self.bert_model.build(None)
        
        # 如果存在 encode_proj，则在 encode_proj 名称空间下构建
        if getattr(self, "encode_proj", None) is not None:
            with tf.name_scope(self.encode_proj.name):
                self.encode_proj.build(None)


# 定义 TF-DPR 跨度预测器层
class TFDPRSpanPredictorLayer(keras.layers.Layer):
    # 基础模型前缀设定为 "encoder"
    base_model_prefix = "encoder"
    # 初始化函数，用于创建一个新的DPRReader对象
    def __init__(self, config: DPRConfig, **kwargs):
        # 调用父类的初始化函数，传递任何额外的关键字参数
        super().__init__(**kwargs)
        # 存储传入的配置对象
        self.config = config
        # 创建一个TFDPREncoderLayer对象作为编码器，并命名为"encoder"
        self.encoder = TFDPREncoderLayer(config, name="encoder")

        # 创建一个全连接层用于生成答案的起始和结束logits
        self.qa_outputs = keras.layers.Dense(
            2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        # 创建一个全连接层用于生成问题与文本段落相关性的logits
        self.qa_classifier = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="qa_classifier"
        )

    # 使用装饰器unpack_inputs，对输入进行解包
    def call(
        self,
        input_ids: tf.Tensor = None,
        attention_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
        training: bool = False,
    ) -> Union[TFDPRReaderOutput, Tuple[tf.Tensor, ...]]:
        # 获取输入张量input_ids的形状，n_passages表示问题批次中的段落数，sequence_length表示序列的长度
        n_passages, sequence_length = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:2]
        
        # 将输入传递给编码器进行处理
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从编码器的输出中获取序列输出
        sequence_output = outputs[0]

        # 计算起始和结束logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        
        # 计算问题与文本段落相关性的logits
        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])

        # 调整logits的形状
        start_logits = tf.reshape(start_logits, [n_passages, sequence_length])
        end_logits = tf.reshape(end_logits, [n_passages, sequence_length])
        relevance_logits = tf.reshape(relevance_logits, [n_passages])

        # 如果return_dict为False，则返回元组形式的输出
        if not return_dict:
            return (start_logits, end_logits, relevance_logits) + outputs[2:]

        # 如果return_dict为True，则返回TFDPRReaderOutput对象的形式
        return TFDPRReaderOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            relevance_logits=relevance_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    # 构建方法用于构建模型的层和变量，如果已经构建过，则直接返回
    def build(self, input_shape=None):
        if self.built:
            return
        # 设置标志表示模型已构建
        self.built = True
        # 如果存在编码器(encoder)，则构建编码器的层和变量
        if getattr(self, "encoder", None) is not None:
            # 在命名作用域下构建编码器
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        # 如果存在问答输出(qa_outputs)，则构建其层和变量
        if getattr(self, "qa_outputs", None) is not None:
            # 在命名作用域下构建问答输出
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.encoder.embeddings_size])
        # 如果存在问答分类器(qa_classifier)，则构建其层和变量
        if getattr(self, "qa_classifier", None) is not None:
            # 在命名作用域下构建问答分类器
            with tf.name_scope(self.qa_classifier.name):
                self.qa_classifier.build([None, None, self.encoder.embeddings_size])
class TFDPRSpanPredictor(TFPreTrainedModel):
    base_model_prefix = "encoder"

    def __init__(self, config: DPRConfig, **kwargs):
        super().__init__(config, **kwargs)
        # 初始化编码器层，使用给定的配置参数
        self.encoder = TFDPRSpanPredictorLayer(config)

    @unpack_inputs
    def call(
        self,
        input_ids: tf.Tensor = None,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
        training: bool = False,
    ) -> Union[TFDPRReaderOutput, Tuple[tf.Tensor, ...]]:
        # 调用编码器层的call方法，传递参数并获取输出
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs


class TFDPREncoder(TFPreTrainedModel):
    base_model_prefix = "encoder"

    def __init__(self, config: DPRConfig, **kwargs):
        super().__init__(config, **kwargs)
        # 初始化编码器层，使用给定的配置参数
        self.encoder = TFDPREncoderLayer(config)

    @unpack_inputs
    def call(
        self,
        input_ids: tf.Tensor = None,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
        training: bool = False,
    ) -> Union[TFDPRReaderOutput, Tuple[tf.Tensor, ...]]:
        # 调用编码器层的call方法，传递参数并获取输出
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs


##################
# PreTrainedModel
##################


class TFDPRPretrainedContextEncoder(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
    base_model_prefix = "ctx_encoder"


class TFDPRPretrainedQuestionEncoder(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
    base_model_prefix = "question_encoder"


class TFDPRPretrainedReader(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
    base_model_prefix = "reader"


###############
# Actual Models
###############


TF_DPR_START_DOCSTRING = r"""
    # 此模型继承自 `TFPreTrainedModel`。请查看超类文档，了解库实现的通用方法，如下载或保存模型、调整输入嵌入大小、修剪头等。
    
    # 此模型还是一个 Tensorflow 的 `keras.Model` 子类。您可以将其用作常规的 TF 2.0 Keras 模型，并参考 TF 2.0 文档了解所有与一般用法和行为相关的事项。
    
    # <Tip>
    
    # 在 `transformers` 中，TensorFlow 模型和层接受两种输入格式：
    
    # - 将所有输入作为关键字参数（类似于 PyTorch 模型），
    # - 将所有输入作为列表、元组或字典传递给第一个位置参数。
    
    # 支持第二种格式的原因是，Keras 方法在将输入传递给模型和层时更倾向于此格式。因此，在使用 `model.fit()` 等方法时，只需将输入和标签以 `model.fit()` 支持的任何格式传递即可！但是，如果您想在 Keras `Functional` API 中创建自己的层或模型时使用第二种格式，比如在创建自己的层或模型时，可以使用以下三种可能性将所有输入张量收集到第一个位置参数中：
    
    # - 只有 `input_ids` 的单个张量：`model(input_ids)`
    # - 长度不同的列表，按照文档字符串中给定的顺序包含一个或多个输入张量：`model([input_ids, attention_mask])` 或 `model([input_ids, attention_mask, token_type_ids])`
    # - 一个字典，其中包含一个或多个输入张量，与文档字符串中给定的输入名称关联：`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
    
    # 注意，如果使用 [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) 创建模型和层，则无需担心这些问题，因为可以像对任何其他 Python 函数一样传递输入！
    
    # </Tip>
    
    # 参数:
    #     config ([`DPRConfig`]): 包含模型所有参数的模型配置类。
    #         使用配置文件初始化不会加载与模型关联的权重，仅加载配置。查看 [`~TFPreTrainedModel.from_pretrained`] 方法以加载模型权重。
"""

TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
"""

TF_DPR_READER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`Numpy array` or `tf.Tensor` of shapes `(n_passages, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
            and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence should
            be formatted with [CLS] and [SEP] with the format:

                `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.
        attention_mask (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


@add_start_docstrings(
    "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
    TF_DPR_START_DOCSTRING,
)
class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
    def __init__(self, config: DPRConfig, *args, **kwargs):
        # 调用父类构造函数，传递配置和其他可选参数
        super().__init__(config, *args, **kwargs)
        # 创建一个上下文编码器层对象，使用给定的配置，命名为"ctx_encoder"
        self.ctx_encoder = TFDPREncoderLayer(config, name="ctx_encoder")
    # 尝试获取上下文编码器（Context Encoder）中 BERT 模型的输入嵌入
    try:
        return self.ctx_encoder.bert_model.get_input_embeddings()
    except AttributeError:
        # 如果属性错误，则调用 build 方法重新构建模型
        self.build()
        # 返回重新构建后的 BERT 模型的输入嵌入
        return self.ctx_encoder.bert_model.get_input_embeddings()

    # 调用方法的装饰器：解压输入
    @unpack_inputs
    # 调用方法的装饰器：将模型前向方法的文档字符串添加到模型中
    @add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
    # 调用方法的装饰器：替换返回值的文档字符串为指定的类型和配置类
    @replace_return_docstrings(output_type=TFDPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
    # 定义模型的前向传播方法
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool = False,
    ) -> TFDPRContextEncoderOutput | Tuple[tf.Tensor, ...]:
        r"""
        返回模型的输出：

        Examples:

        ```
        >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer

        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> model = TFDPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", from_pt=True)
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```
        """
        # 如果同时指定了 input_ids 和 inputs_embeds，则引发 ValueError
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        # 如果指定了 input_ids，则获取其形状
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        # 如果指定了 inputs_embeds，则获取其形状，去掉最后一个维度
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            # 如果既未指定 input_ids 也未指定 inputs_embeds，则引发 ValueError
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # 如果 attention_mask 为 None，则根据 input_ids 是否为 None，选择性地创建 attention_mask
        if attention_mask is None:
            attention_mask = (
                tf.ones(input_shape, dtype=tf.dtypes.int32)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        # 如果 token_type_ids 为 None，则创建与 input_shape 相同形状的全零 tensor
        if token_type_ids is None:
            token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32)

        # 调用上下文编码器（Context Encoder）的前向传播方法
        outputs = self.ctx_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 如果 return_dict 为 False，则返回 outputs 的所有元素，除去第一个元素
        if not return_dict:
            return outputs[1:]

        # 如果 return_dict 为 True，则构建 TFDPRContextEncoderOutput 对象，并返回
        return TFDPRContextEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
    # 定义一个方法 `build`，用于构建模型的层次结构
    def build(self, input_shape=None):
        # 如果模型已经构建完成，直接返回，不再重复构建
        if self.built:
            return
        # 将模型标记为已构建状态
        self.built = True
        # 如果存在上下文编码器 `ctx_encoder`
        if getattr(self, "ctx_encoder", None) is not None:
            # 在 TensorFlow 的命名作用域下，构建上下文编码器
            with tf.name_scope(self.ctx_encoder.name):
                # 调用上下文编码器的 `build` 方法，并传入 `None` 作为输入形状
                self.ctx_encoder.build(None)
# 使用装饰器添加文档字符串，描述了该类的基本信息以及继承的文档字符串内容
@add_start_docstrings(
    "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
    TF_DPR_START_DOCSTRING,
)
# 定义一个 TF 版本的 DPRQuestionEncoder 类，继承自 TFDPRPretrainedQuestionEncoder 类
class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
    
    # 初始化方法，接受一个 DPRConfig 类型的配置对象以及其他参数和关键字参数
    def __init__(self, config: DPRConfig, *args, **kwargs):
        # 调用父类的初始化方法
        super().__init__(config, *args, **kwargs)
        # 创建一个 TFDPREncoderLayer 类的实例作为 question_encoder 属性
        self.question_encoder = TFDPREncoderLayer(config, name="question_encoder")

    # 获取输入嵌入的方法
    def get_input_embeddings(self):
        try:
            # 尝试获取 question_encoder 属性中 bert_model 的输入嵌入
            return self.question_encoder.bert_model.get_input_embeddings()
        except AttributeError:
            # 如果属性错误（即不存在），则调用 build 方法重新构建
            self.build()
            return self.question_encoder.bert_model.get_input_embeddings()

    # 使用装饰器为 call 方法添加文档字符串，描述输入参数和返回输出
    @unpack_inputs
    @add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFDPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool = False,
        # 函数签名中省略部分，以保持完整性
    ) -> TFDPRQuestionEncoderOutput | Tuple[tf.Tensor, ...]:
        r"""
        指定该方法的返回类型为 TFDPRQuestionEncoderOutput 或包含 tf.Tensor 的元组

        Examples:

        ```
        >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer

        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", from_pt=True)
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            # 如果未提供 attention_mask，则根据输入的 shape 自动生成全为 1 的 attention_mask
            attention_mask = (
                tf.ones(input_shape, dtype=tf.dtypes.int32)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            # 如果未提供 token_type_ids，则生成一个全为 0 的 token_type_ids，与输入的 shape 相同
            token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32)

        outputs = self.question_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            # 如果 return_dict 为 False，则返回输出中除第一个元素外的所有元素
            return outputs[1:]
        # 如果 return_dict 为 True，则以 TFDPRQuestionEncoderOutput 形式返回指定的输出
        return TFDPRQuestionEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "question_encoder", None) is not None:
            with tf.name_scope(self.question_encoder.name):
                # 构建 question_encoder 模型的网络结构
                self.question_encoder.build(None)
@add_start_docstrings(
    "The bare DPRReader transformer outputting span predictions.",
    TF_DPR_START_DOCSTRING,
)
class TFDPRReader(TFDPRPretrainedReader):
    def __init__(self, config: DPRConfig, *args, **kwargs):
        super().__init__(config, *args, **kwargs)
        # 初始化span预测器，使用给定的配置
        self.span_predictor = TFDPRSpanPredictorLayer(config, name="span_predictor")

    def get_input_embeddings(self):
        try:
            # 尝试获取输入嵌入层
            return self.span_predictor.encoder.bert_model.get_input_embeddings()
        except AttributeError:
            # 如果属性错误，重新构建模型并返回输入嵌入层
            self.build()
            return self.span_predictor.encoder.bert_model.get_input_embeddings()

    @unpack_inputs
    @add_start_docstrings_to_model_forward(TF_DPR_READER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFDPRReaderOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool = False,
    ) -> TFDPRReaderOutput | Tuple[tf.Tensor, ...]:
        r"""
        模型前向传播函数，接受多种输入参数，返回预测结果。

        Return:
            TFDPRReaderOutput或者一个元组包含tf.Tensor

        Examples:

        ```
        >>> from transformers import TFDPRReader, DPRReaderTokenizer

        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> model = TFDPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True)
        >>> encoded_inputs = tokenizer(
        ...     questions=["What is love ?"],
        ...     titles=["Haddaway"],
        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        ...     return_tensors="tf",
        ... )
        >>> outputs = model(encoded_inputs)
        >>> start_logits = outputs.start_logits
        >>> end_logits = outputs.end_logits
        >>> relevance_logits = outputs.relevance_logits
        ```
        """
        if input_ids is not None and inputs_embeds is not None:
            # 如果同时指定了input_ids和inputs_embeds，则引发值错误异常
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            # 获取input_ids的形状列表
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            # 获取inputs_embeds的形状列表，去掉最后一个维度
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            # 如果既没有指定input_ids也没有指定inputs_embeds，则引发值错误异常
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            # 如果attention_mask为None，则创建全1的张量作为attention_mask
            attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32)

        # 调用span_predictor的前向传播方法，传递所有参数并返回结果
        return self.span_predictor(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
    # 定义模型的构建方法，参数 input_shape 可选，默认为 None
    def build(self, input_shape=None):
        # 如果模型已经构建过，则直接返回，避免重复构建
        if self.built:
            return
        # 标记模型为已构建状态
        self.built = True
        
        # 检查模型是否具有 span_predictor 属性，并且该属性不为 None
        if getattr(self, "span_predictor", None) is not None:
            # 在 TensorFlow 中使用命名空间来组织模型的组件，这里将使用 span_predictor 的名字空间
            with tf.name_scope(self.span_predictor.name):
                # 构建 span_predictor 组件，input_shape 参数这里传入 None
                self.span_predictor.build(None)

posted @ 2024-06-30 15:37 绝不原创的飞龙阅读(26) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

龙哥盟

掠夺·扩张·投机·博弈

Transformers-源码解析-四十一-

Transformers 源码解析（四十一）

`.\models\distilbert\tokenization_distilbert.py`

`.\models\distilbert\tokenization_distilbert_fast.py`

`.\models\distilbert\init.py`

`.\models\dit\convert_dit_unilm_to_pytorch.py`

`.\models\dit\init.py`

`.\models\donut\configuration_donut_swin.py`

`.\models\donut\convert_donut_to_pytorch.py`

`.\models\donut\feature_extraction_donut.py`

`.\models\donut\image_processing_donut.py`

`.\models\donut\modeling_donut_swin.py`

`.\models\donut\processing_donut.py`

`.\models\donut\init.py`

`.\models\dpr\configuration_dpr.py`

`.\models\dpr\convert_dpr_original_checkpoint_to_pytorch.py`

`.\models\dpr\modeling_dpr.py`

`.\models\dpr\modeling_tf_dpr.py`

公告

龙哥盟

掠夺·扩张·投机·博弈

Transformers-源码解析-四十一-

Transformers 源码解析（四十一）

.\models\distilbert\tokenization_distilbert.py

.\models\distilbert\tokenization_distilbert_fast.py

.\models\distilbert\__init__.py

.\models\dit\convert_dit_unilm_to_pytorch.py

.\models\dit\__init__.py

.\models\donut\configuration_donut_swin.py

.\models\donut\convert_donut_to_pytorch.py

.\models\donut\feature_extraction_donut.py

.\models\donut\image_processing_donut.py

.\models\donut\modeling_donut_swin.py

.\models\donut\processing_donut.py

.\models\donut\__init__.py

.\models\dpr\configuration_dpr.py

.\models\dpr\convert_dpr_original_checkpoint_to_pytorch.py

.\models\dpr\modeling_dpr.py

.\models\dpr\modeling_tf_dpr.py

公告

`.\models\distilbert\tokenization_distilbert.py`

`.\models\distilbert\tokenization_distilbert_fast.py`

`.\models\distilbert\init.py`

`.\models\dit\convert_dit_unilm_to_pytorch.py`

`.\models\dit\init.py`

`.\models\donut\configuration_donut_swin.py`

`.\models\donut\convert_donut_to_pytorch.py`

`.\models\donut\feature_extraction_donut.py`

`.\models\donut\image_processing_donut.py`

`.\models\donut\modeling_donut_swin.py`

`.\models\donut\processing_donut.py`

`.\models\donut\init.py`

`.\models\dpr\configuration_dpr.py`

`.\models\dpr\convert_dpr_original_checkpoint_to_pytorch.py`

`.\models\dpr\modeling_dpr.py`

`.\models\dpr\modeling_tf_dpr.py`