Transformers-Source-Code-Walkthrough-34

Transformers Source Code Walkthrough (Part 34)

.\models\deberta_v2\tokenization_deberta_v2.py

# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenization class for model DeBERTa.
"""

import os  # Standard library for filesystem and OS helpers
import unicodedata  # Access to the Unicode character database
from typing import Any, Dict, List, Optional, Tuple  # Type hints

import sentencepiece as sp  # SentencePiece library used for subword tokenization

from ...tokenization_utils import AddedToken, PreTrainedTokenizer  # Base tokenizer utilities
from ...utils import logging  # Library logging helpers

logger = logging.get_logger(__name__)  # Module-level logger

# Map from pretrained model names to the download URLs of their SentencePiece model files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model"
        ),
        "microsoft/deberta-v2-xxlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model"
        ),
    }
}

# Map from pretrained model names to their maximum position embedding sizes
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/deberta-v2-xlarge": 512,
    "microsoft/deberta-v2-xxlarge": 512,
    "microsoft/deberta-v2-xlarge-mnli": 512,
    "microsoft/deberta-v2-xxlarge-mnli": 512,
}

# Map from pretrained model names to their default initialization configuration
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
}

# Name of the vocabulary (SentencePiece model) file
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}


class DebertaV2Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    """

    vocab_files_names = VOCAB_FILES_NAMES  # Names of the vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # URLs of pretrained vocabulary files
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION  # Default init configuration per checkpoint
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # Maximum model input sizes

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        split_by_punct=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        """
        Initialize a DebertaV2Tokenizer.

        Args:
            vocab_file (str): Path to the SentencePiece vocabulary file.
            do_lower_case (bool): Whether to lowercase the input when tokenizing.
            split_by_punct (bool): Whether to split the input on punctuation before SentencePiece encoding.
            bos_token (str): Beginning-of-sequence token.
            eos_token (str): End-of-sequence token.
            unk_token (str): Token used for unknown tokens.
            sep_token (str): Separator token.
            pad_token (str): Token used for padding sequences.
            cls_token (str): Classification token.
            mask_token (str): Mask token for masked language modeling.
            sp_model_kwargs (Optional[Dict[str, Any]]): Extra keyword arguments passed to the SentencePiece processor.
            **kwargs: Additional keyword arguments forwarded to `PreTrainedTokenizer`.
        """
        # Use an empty dict for the SentencePiece processor arguments when none are given
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Make sure the given vocabulary path points to an actual file
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )

        # Whether to lowercase the input text
        self.do_lower_case = do_lower_case
        # Whether to split the input on punctuation
        self.split_by_punct = split_by_punct
        # Path to the vocabulary file
        self.vocab_file = vocab_file

        # Build the underlying SPMTokenizer from the vocabulary file, the punctuation-splitting flag
        # and the SentencePiece processor arguments
        self._tokenizer = SPMTokenizer(
            vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
        )

        # If unk_token is a plain string, wrap it in an AddedToken marked as special and normalized
        unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token

        # Call the parent constructor with the special tokens and remaining keyword arguments
        super().__init__(
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        # Expose all special tokens to the underlying SPM tokenizer
        self._tokenizer.special_tokens = self.all_special_tokens

    @property
    def vocab_size(self):
        # Size of the tokenizer vocabulary
        return len(self.vocab)

    @property
    def vocab(self):
        # The tokenizer vocabulary (token -> id mapping)
        return self._tokenizer.vocab

    def get_vocab(self):
        # Return the full vocabulary, including any added tokens
        vocab = self.vocab.copy()
        vocab.update(self.get_added_vocab())
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        # Lowercase the input text if requested
        if self.do_lower_case:
            text = text.lower()
        # Delegate to the underlying SPM tokenizer
        return self._tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Map the token string to its id with the SentencePiece model
        return self._tokenizer.spm.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Map the id back to its token string; out-of-range ids fall back to the unknown token
        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Use the SPM tokenizer's decode method to join the token pieces into a string
        return self._tokenizer.decode(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """

        if token_ids_1 is None:
            # Single sequence: wrap it with [CLS] ... [SEP]
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        # Sequence pair: [CLS] A [SEP] B [SEP]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            # Mask for a sequence pair: 1 for [CLS] and each [SEP], 0 for sequence tokens
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        # Mask for a single sequence
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Define separator and classification token IDs
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        
        # If only one sequence is provided
        if token_ids_1 is None:
            # Return token type IDs for single sequence (all 0s)
            return len(cls + token_ids_0 + sep) * [0]
        
        # Return token type IDs for two sequences (0s for first sequence, 1s for second sequence)
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        # Extract 'add_prefix_space' from kwargs
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        
        # Add prefix space if required
        if is_split_into_words or add_prefix_space:
            text = " " + text
        
        # Return text and remaining kwargs
        return (text, kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Save vocabulary using the underlying tokenizer's method
        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
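
# Illustrative sketch of the input layout produced by build_inputs_with_special_tokens and
# create_token_type_ids_from_sequences above, reproduced with made-up token ids
# (cls_id=1 and sep_id=2 are placeholders, not the ids of a real vocabulary).
def _deberta_pair_layout_example():
    cls_id, sep_id = 1, 2
    ids_a = [11, 12, 13]  # pretend ids for sequence A
    ids_b = [21, 22]  # pretend ids for sequence B

    # [CLS] A [SEP] B [SEP]
    input_ids = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]
    # 0s over "[CLS] A [SEP]", 1s over "B [SEP]"
    token_type_ids = [0] * (1 + len(ids_a) + 1) + [1] * (len(ids_b) + 1)
    # 1 for special tokens, 0 for sequence tokens
    special_tokens_mask = [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]

    assert len(input_ids) == len(token_type_ids) == len(special_tokens_mask)
    return input_ids, token_type_ids, special_tokens_mask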


class SPMTokenizer:
    r"""
    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """

    def __init__(
        self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None
    ):
        # Whether to split the input on punctuation
        self.split_by_punct = split_by_punct
        # Path to the vocabulary file
        self.vocab_file = vocab_file
        # SentencePiece processor arguments, defaulting to an empty dict
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Instantiate the SentencePiece processor with the given arguments
        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        # Raise if the vocabulary file does not exist
        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f"{vocab_file} does not exist!")
        # Load the vocabulary file into the SentencePiece processor
        spm.load(vocab_file)
        # Size of the BPE vocabulary
        bpe_vocab_size = spm.GetPieceSize()
        # Token -> id mapping
        self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
        # Id -> token mapping
        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
        # Special token ids are left as produced by SentencePiece (unused overrides kept for reference)
        # self.vocab['[PAD]'] = 0
        # self.vocab['[CLS]'] = 1
        # self.vocab['[SEP]'] = 2
        # self.vocab['[UNK]'] = 3

        # Keep the SentencePiece processor and the list of special tokens
        self.spm = spm
        self.special_tokens = special_tokens

    def __getstate__(self):
        # Copy the object state but drop the (unpicklable) SentencePiece processor
        state = self.__dict__.copy()
        state["spm"] = None
        return state

    def __setstate__(self, d):
        # Restore the object state
        self.__dict__ = d

        # For backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        # Re-create the SentencePiece processor and reload the vocabulary file
        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        self.spm.Load(self.vocab_file)

    def tokenize(self, text):
        # Tokenize the text into SentencePiece pieces
        return self._encode_as_pieces(text)

    def convert_ids_to_tokens(self, ids):
        # Map each id back to its token string
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    def decode(self, tokens, start=-1, end=-1, raw_text=None):
        # Decode a sequence of tokens back to text. If raw_text is None, decode from the tokens
        # themselves; otherwise, recover the corresponding span from the original raw text.
        if raw_text is None:
            current_sub_tokens = []  # Sub-tokens accumulated for the current span
            out_string = ""  # Final decoded string
            prev_is_special = False  # Whether the previous token was a special token
            for token in tokens:
                # Special tokens are not decoded with the SentencePiece model
                if token in self.special_tokens:
                    if not prev_is_special:
                        out_string += " "  # Separate from the preceding text with a space
                    out_string += self.spm.decode_pieces(current_sub_tokens) + token  # Flush pending pieces, then append the special token
                    prev_is_special = True
                    current_sub_tokens = []  # Reset the buffer for the next span
                else:
                    current_sub_tokens.append(token)  # Accumulate regular sub-tokens
                    prev_is_special = False
            out_string += self.spm.decode_pieces(current_sub_tokens)  # Decode any remaining sub-tokens
            return out_string.strip()  # Strip leading/trailing whitespace
        else:
            words = self.split_to_words(raw_text)  # Split the raw text into words
            word_tokens = [self.tokenize(w) for w in words]  # Tokenize each word
            token2words = [0] * len(tokens)  # Map each token index to its word index
            tid = 0
            for i, w in enumerate(word_tokens):
                for k, t in enumerate(w):
                    token2words[tid] = i  # Record which word this token belongs to
                    tid += 1
            word_start = token2words[start]  # Word index of the start token
            word_end = token2words[end] if end < len(tokens) else len(words)  # Word index of the end token (clamped to the last word)
            text = "".join(words[word_start:word_end])  # Join the selected words back into text
            return text

    # Add a special token to the tokenizer: register it in the special-token list and, if needed,
    # extend the vocabulary and the id-to-token mapping
    def add_special_token(self, token):
        if token not in self.special_tokens:
            self.special_tokens.append(token)  # Record the new special token
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab) - 1  # Assign it an id in the vocabulary
                self.ids_to_tokens.append(token)  # Keep the id-to-token mapping in sync
        return self.id(token)  # Return the id of the special token

    # Return whether a token is part of a whole word. If is_bos is True, always return True;
    # otherwise decide based on the token's characters and the word-start marker.
    def part_of_whole_word(self, token, is_bos=False):
        logger.warning_once(
            "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
        )
        if is_bos:
            return True
        if (
            len(token) == 1
            and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))
        ) or token in self.special_tokens:
            return False

        word_start = b"\xe2\x96\x81".decode("utf-8")
        return not token.startswith(word_start)  # Tokens starting with the "▁" marker begin a new word

    # Padding token
    def pad(self):
        return "[PAD]"

    # Beginning-of-sequence token
    def bos(self):
        return "[CLS]"

    # End-of-sequence token
    def eos(self):
        return "[SEP]"

    # Unknown token
    def unk(self):
        return "[UNK]"

    # Mask token
    def mask(self):
        return "[MASK]"

    # Return the token for a given id
    def sym(self, id):
        return self.ids_to_tokens[id]

    # Return the id for a given token, falling back to id 1 for out-of-vocabulary tokens
    def id(self, sym):
        logger.warning_once(
            "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
        )
        return self.vocab[sym] if sym in self.vocab else 1

    def _encode_as_pieces(self, text):
        # Normalize the input to a unicode string
        text = convert_to_unicode(text)

        # If punctuation splitting is enabled, split the text on punctuation first
        if self.split_by_punct:
            words = self._run_split_on_punc(text)
            # Encode each chunk with SentencePiece, returning string pieces
            pieces = [self.spm.encode(w, out_type=str) for w in words]
            # Flatten the nested list of pieces
            return [p for w in pieces for p in w]
        else:
            # Otherwise encode the whole text at once
            return self.spm.encode(text, out_type=str)

    # Split the text into words using the SentencePiece word-start marker
    def split_to_words(self, text):
        pieces = self._encode_as_pieces(text)
        # The "▁" character marks the beginning of a word in SentencePiece output
        word_start = b"\xe2\x96\x81".decode("utf-8")
        words = []
        offset = 0
        prev_end = 0

        # Walk over the encoded pieces
        for i, p in enumerate(pieces):
            # A piece that starts with the word-start marker begins a new word
            if p.startswith(word_start):
                # If the current offset has moved past the previous word boundary
                if offset > prev_end:
                    # Emit the previous word
                    words.append(text[prev_end:offset])
                prev_end = offset
                # Strip the word-start marker to get the actual word content
                w = p.replace(word_start, "")
            else:
                w = p

            try:
                # Locate the current piece in the original text
                s = text.index(w, offset)
                pn = ""
                k = i + 1
                # Find the next non-empty piece
                while k < len(pieces):
                    pn = pieces[k].replace(word_start, "")
                    if len(pn) > 0:
                        break
                    k += 1

                # If the next piece also occurs before the located position, only advance by one character
                if len(pn) > 0 and pn in text[offset:s]:
                    offset = offset + 1
                else:
                    offset = s + len(w)
            except Exception:
                offset = offset + 1

        # Emit the final word
        if prev_end < offset:
            words.append(text[prev_end:offset])

        return words

    # Split a piece of text on punctuation characters
    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []

        # Walk over every character in the text
        while i < len(chars):
            char = chars[i]
            # A punctuation character becomes its own chunk and starts a new word
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                # Otherwise append the character to the current chunk
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        # Join each chunk back into a string
        return ["".join(x) for x in output]

    # Save the SentencePiece model to the given directory
    def save_pretrained(self, path: str, filename_prefix: str = None):
        # Determine the output file name
        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
        if filename_prefix is not None:
            filename = filename_prefix + "-" + filename

        # Build the full output path
        full_path = os.path.join(path, filename)

        # Write the serialized SentencePiece model to the file
        with open(full_path, "wb") as fs:
            fs.write(self.spm.serialized_model_proto())

        # Return the path of the saved file
        return (full_path,)
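
# Illustrative sketch of the decode() logic above: special tokens are passed through verbatim
# while runs of SentencePiece pieces are joined. The join_pieces helper is a stand-in for
# `spm.decode_pieces` that simply strips the "▁" word markers; this is an assumption for
# demonstration only, not the exact SentencePiece behaviour in every case.
def _decode_sketch(tokens, special_tokens=("[CLS]", "[SEP]", "[MASK]")):
    def join_pieces(pieces):
        # Stand-in for SentencePieceProcessor.decode_pieces
        return "".join(pieces).replace("\u2581", " ").strip()

    out, buf, prev_special = "", [], False
    for tok in tokens:
        if tok in special_tokens:
            if not prev_special:
                out += " "
            out += join_pieces(buf) + tok
            prev_special, buf = True, []
        else:
            buf.append(tok)
            prev_special = False
    return (out + join_pieces(buf)).strip()


# Example: _decode_sketch(["[CLS]", "▁hello", "▁world", "[SEP]"]) -> "[CLS] hello world[SEP]"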


# Return True if the character is a whitespace character
def _is_whitespace(char):
    # Space, tab, newline and carriage return are treated as whitespace
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    # Fall back to the Unicode category
    cat = unicodedata.category(char)
    # "Zs" is the category for space separators
    if cat == "Zs":
        return True
    return False


# Return True if the character is a control character
def _is_control(char):
    # Tab, newline and carriage return are treated as whitespace, not control characters
    if char == "\t" or char == "\n" or char == "\r":
        return False
    # Fall back to the Unicode category
    cat = unicodedata.category(char)
    # Categories starting with "C" are control characters
    if cat.startswith("C"):
        return True
    return False


# Return True if the character is a punctuation character
def _is_punctuation(char):
    cp = ord(char)
    # Treat all non-letter/number ASCII characters as punctuation
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    # Fall back to the Unicode category
    cat = unicodedata.category(char)
    # Categories starting with "P" are punctuation
    if cat.startswith("P"):
        return True
    return False
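
# Quick illustration of the three character classifiers above on a few sample characters.
# The expected values follow directly from the Unicode categories; this is an illustrative
# check, not part of the library's test suite.
def _char_helpers_example():
    samples = {
        " ": (_is_whitespace(" "), _is_control(" "), _is_punctuation(" ")),  # (True, False, False)
        "\t": (_is_whitespace("\t"), _is_control("\t"), _is_punctuation("\t")),  # (True, False, False)
        ",": (_is_whitespace(","), _is_control(","), _is_punctuation(",")),  # (False, False, True)
        "a": (_is_whitespace("a"), _is_control("a"), _is_punctuation("a")),  # (False, False, False)
    }
    return samples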


# Convert the input to a unicode string if it is not one already
def convert_to_unicode(text):
    # Strings are returned unchanged
    if isinstance(text, str):
        return text
    # Bytes are decoded as UTF-8, ignoring errors
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    # Anything else is an error
    else:
        raise ValueError(f"Unsupported string type: {type(text)}")
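
# Usage sketch for DebertaV2Tokenizer. This assumes network access to download the
# "microsoft/deberta-v2-xlarge" checkpoint (the name is taken from PRETRAINED_VOCAB_FILES_MAP
# above) and that the sentencepiece package is installed.
def _deberta_v2_tokenizer_usage_example():
    tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
    enc = tokenizer("The quick brown fox", "jumps over the lazy dog")
    # input_ids follow the [CLS] A [SEP] B [SEP] layout built by build_inputs_with_special_tokens,
    # and token_type_ids are 0 over "[CLS] A [SEP]" and 1 over "B [SEP]".
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
    return enc, tokens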

.\models\deberta_v2\tokenization_deberta_v2_fast.py

# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization class for model DeBERTa."""

import os
from shutil import copyfile
from typing import Optional, Tuple

from ...file_utils import is_sentencepiece_available
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging

# Check whether the SentencePiece library is available
if is_sentencepiece_available():
    # If so, import the slow DebertaV2Tokenizer
    from .tokenization_deberta_v2 import DebertaV2Tokenizer
else:
    # Otherwise the slow tokenizer class is not available
    DebertaV2Tokenizer = None

# Module-level logger
logger = logging.get_logger(__name__)

# Names of the vocabulary files
VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}

# Map from pretrained model names to the download URLs of their vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model"
        ),
        "microsoft/deberta-v2-xxlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model"
        ),
    }
}

# Map from pretrained model names to their maximum position embedding sizes
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/deberta-v2-xlarge": 512,
    "microsoft/deberta-v2-xxlarge": 512,
    "microsoft/deberta-v2-xlarge-mnli": 512,
    "microsoft/deberta-v2-xxlarge-mnli": 512,
}

# Map from pretrained model names to their default initialization configuration
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
}


class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
    r"""
    Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    """

    # Names of the vocabulary files
    vocab_files_names = VOCAB_FILES_NAMES
    # URLs of pretrained vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Default initialization configuration per checkpoint
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Maximum model input sizes
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Corresponding slow tokenizer class
    slow_tokenizer_class = DebertaV2Tokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        split_by_punct=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ) -> None:
        # Call the parent constructor with the given arguments
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            **kwargs,
        )

        # Store the initialization parameters on the instance
        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # The slow tokenizer can only be saved if the vocabulary file exists
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """

        if token_ids_1 is None:
            # Single sequence: wrap it with [CLS] ... [SEP]
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        # Sequence pair: [CLS] A [SEP] B [SEP]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # If the tokens already have special tokens, delegate to the superclass method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # If token_ids_1 is provided, create a mask with special tokens for sequence pairs
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        # Otherwise, create a mask with special tokens for a single sequence
        return [1] + ([0] * len(token_ids_0)) + [1]


    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """

        # Define special tokens for separation and classification
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # If token_ids_1 is None, return a mask with only the first sequence
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Otherwise, return a mask with special tokens for both sequences
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    # Save the vocabulary to a file in the given directory and return the path of the saved file
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Saving the slow tokenizer's vocabulary requires the original SentencePiece file
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # The save path must be an existing directory
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Build the output vocabulary file path, optionally with a filename prefix
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Copy the current vocabulary file to the output path if they differ
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        # Return the path of the saved vocabulary file as a tuple
        return (out_vocab_file,)
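
# Usage sketch for the fast tokenizer. AutoTokenizer returns DebertaV2TokenizerFast when the
# tokenizers library is installed (use_fast=True is the default); the checkpoint name is taken
# from PRETRAINED_VOCAB_FILES_MAP above and downloading it requires network access.
def _deberta_v2_fast_tokenizer_usage_example():
    from transformers import AutoTokenizer

    fast_tok = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge", use_fast=True)
    enc = fast_tok("Paris is the capital of France.")
    # Fast tokenizers can also expose character offsets back into the original string.
    enc_with_offsets = fast_tok("Paris is the capital of France.", return_offsets_mapping=True)
    return enc, enc_with_offsets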

.\models\deberta_v2\__init__.py

# Copyright and license notice
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

# Import lazy-loading utilities and dependency availability checks
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Define the base import structure of the module
_import_structure = {
    "configuration_deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config", "DebertaV2OnnxConfig"],
    "tokenization_deberta_v2": ["DebertaV2Tokenizer"],
}

# If the tokenizers library is not available, raise OptionalDependencyNotAvailable
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Otherwise register the fast tokenizer module in the import structure
    _import_structure["tokenization_deberta_v2_fast"] = ["DebertaV2TokenizerFast"]

# If TensorFlow is not available, raise OptionalDependencyNotAvailable
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Otherwise register the TensorFlow modeling module in the import structure
    _import_structure["modeling_tf_deberta_v2"] = [
        "TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFDebertaV2ForMaskedLM",
        "TFDebertaV2ForQuestionAnswering",
        "TFDebertaV2ForMultipleChoice",
        "TFDebertaV2ForSequenceClassification",
        "TFDebertaV2ForTokenClassification",
        "TFDebertaV2Model",
        "TFDebertaV2PreTrainedModel",
    ]

# If PyTorch is not available, raise OptionalDependencyNotAvailable
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Otherwise register the PyTorch modeling module in the import structure
    _import_structure["modeling_deberta_v2"] = [
        "DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DebertaV2ForMaskedLM",
        "DebertaV2ForMultipleChoice",
        "DebertaV2ForQuestionAnswering",
        "DebertaV2ForSequenceClassification",
        "DebertaV2ForTokenClassification",
        "DebertaV2Model",
        "DebertaV2PreTrainedModel",
    ]

# During type checking, import the symbols directly so static analyzers can resolve them
if TYPE_CHECKING:
    # Configuration classes and constants
    from .configuration_deberta_v2 import (
        DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DebertaV2Config,
        DebertaV2OnnxConfig,
    )
    # Slow tokenizer
    from .tokenization_deberta_v2 import DebertaV2Tokenizer

    # Only import the fast tokenizer if the tokenizers library is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast

    # Only import the TensorFlow models if TensorFlow is available
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_deberta_v2 import (
            TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFDebertaV2ForMaskedLM,
            TFDebertaV2ForMultipleChoice,
            TFDebertaV2ForQuestionAnswering,
            TFDebertaV2ForSequenceClassification,
            TFDebertaV2ForTokenClassification,
            TFDebertaV2Model,
            TFDebertaV2PreTrainedModel,
        )

    # Only import the PyTorch models if PyTorch is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_deberta_v2 import (
            DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
            DebertaV2ForMaskedLM,
            DebertaV2ForMultipleChoice,
            DebertaV2ForQuestionAnswering,
            DebertaV2ForSequenceClassification,
            DebertaV2ForTokenClassification,
            DebertaV2Model,
            DebertaV2PreTrainedModel,
        )

else:
    # Import sys so the entry for this module in sys.modules can be replaced
    import sys

    # Register a _LazyModule for this module so submodules are only imported on first access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
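
# Sketch of what the lazy module set-up above means in practice: importing the package is cheap,
# and heavy submodules (for example the PyTorch modeling code) are only imported when one of the
# names registered in _import_structure is first accessed.
def _lazy_import_example():
    # This does not yet import the TensorFlow or PyTorch modeling submodules:
    import transformers.models.deberta_v2 as deberta_v2_pkg

    # Accessing an attribute triggers the actual submodule import via _LazyModule:
    config_cls = deberta_v2_pkg.DebertaV2Config
    return config_cls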

.\models\decision_transformer\configuration_decision_transformer.py

# coding=utf-8
# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Decision Transformer model configuration"""

# Import the configuration base class and logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Map from pretrained model names to the URLs of their configuration files
DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "edbeeching/decision-transformer-gym-hopper-medium": (
        "https://huggingface.co/edbeeching/decision-transformer-gym-hopper-medium/resolve/main/config.json"
    ),
    # See all DecisionTransformer models at https://huggingface.co/models?filter=decision_transformer
}

# DecisionTransformerConfig stores the configuration of a DecisionTransformerModel
class DecisionTransformerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to
    instantiate a Decision Transformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the standard
    DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as
    part of the architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import DecisionTransformerConfig, DecisionTransformerModel

    >>> # Initializing a DecisionTransformer configuration
    >>> configuration = DecisionTransformerConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DecisionTransformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Model type identifier
    model_type = "decision_transformer"
    # Keys ignored at inference time
    keys_to_ignore_at_inference = ["past_key_values"]
    # Attribute map translating common config names to the GPT-2 style names
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    # Constructor setting up the model hyper-parameters
    def __init__(
        self,
        state_dim=17,  # Dimension of the state vector (default 17)
        act_dim=4,  # Dimension of the action vector (default 4)
        hidden_size=128,  # Hidden size (default 128)
        max_ep_len=4096,  # Maximum episode length (default 4096)
        action_tanh=True,  # Whether to apply tanh to predicted actions (default True)
        vocab_size=1,  # Vocabulary size (default 1)
        n_positions=1024,  # Maximum sequence length for position embeddings (default 1024)
        n_layer=3,  # Number of Transformer layers (default 3)
        n_head=1,  # Number of attention heads (default 1)
        n_inner=None,  # Inner dimension of the feed-forward layers (default None)
        activation_function="relu",  # Activation function (default "relu")
        resid_pdrop=0.1,  # Dropout probability on residual connections (default 0.1)
        embd_pdrop=0.1,  # Dropout probability on embeddings (default 0.1)
        attn_pdrop=0.1,  # Dropout probability in attention (default 0.1)
        layer_norm_epsilon=1e-5,  # Epsilon for layer normalization (default 1e-5)
        initializer_range=0.02,  # Standard deviation for weight initialization (default 0.02)
        scale_attn_weights=True,  # Whether to scale attention weights (default True)
        use_cache=True,  # Whether to use the key/value cache (default True)
        bos_token_id=50256,  # Id of the beginning-of-sequence token (default 50256)
        eos_token_id=50256,  # Id of the end-of-sequence token (default 50256)
        scale_attn_by_inverse_layer_idx=False,  # Whether to additionally scale attention by 1 / (layer_idx + 1)
        reorder_and_upcast_attn=False,  # Whether to reorder and upcast the attention computation
        **kwargs,
    ):
        self.state_dim = state_dim  # State dimension
        self.act_dim = act_dim  # Action dimension
        self.hidden_size = hidden_size  # Hidden size
        self.max_ep_len = max_ep_len  # Maximum episode length
        self.action_tanh = action_tanh  # Whether actions are passed through tanh
        self.vocab_size = vocab_size  # Vocabulary size
        self.n_positions = n_positions  # Maximum sequence length for position embeddings
        self.n_layer = n_layer  # Number of Transformer layers
        self.n_head = n_head  # Number of attention heads
        self.n_inner = n_inner  # Inner feed-forward dimension
        self.activation_function = activation_function  # Activation function
        self.resid_pdrop = resid_pdrop  # Residual dropout probability
        self.embd_pdrop = embd_pdrop  # Embedding dropout probability
        self.attn_pdrop = attn_pdrop  # Attention dropout probability
        self.layer_norm_epsilon = layer_norm_epsilon  # Layer norm epsilon
        self.initializer_range = initializer_range  # Weight initialization range
        self.scale_attn_weights = scale_attn_weights  # Whether to scale attention weights
        self.use_cache = use_cache  # Whether to use the key/value cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx  # Extra scaling by inverse layer index
        self.reorder_and_upcast_attn = reorder_and_upcast_attn  # Reorder and upcast attention

        self.bos_token_id = bos_token_id  # Beginning-of-sequence token id
        self.eos_token_id = eos_token_id  # End-of-sequence token id

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)  # Initialize the parent PretrainedConfig
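
# Usage sketch: building a DecisionTransformerConfig for a hypothetical environment with an
# 11-dimensional state and a 3-dimensional action space (the numbers are illustrative only).
def _decision_transformer_config_example():
    config = DecisionTransformerConfig(
        state_dim=11,
        act_dim=3,
        hidden_size=128,
        n_layer=3,
        n_head=1,
        max_ep_len=1000,
    )
    # attribute_map makes the GPT-2 style names available under the common names:
    assert config.num_hidden_layers == config.n_layer
    assert config.num_attention_heads == config.n_head
    return config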

.\models\decision_transformer\modeling_decision_transformer.py

# coding=utf-8
# Copyright 2022 The HuggingFace Team The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DecisionTransformer model."""

import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.cuda.amp import autocast

# Activation function mapping
from ...activations import ACT2FN
# Base model output class with past key/values and cross attentions
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
# Base class for pretrained models
from ...modeling_utils import PreTrainedModel
# PyTorch utilities (the GPT-2 style Conv1D layer and head-pruning helpers)
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
# Generic model output and documentation utilities
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# Configuration class for the Decision Transformer
from .configuration_decision_transformer import DecisionTransformerConfig

# Module-level logger
logger = logging.get_logger(__name__)

# Checkpoint name used in the documentation
_CHECKPOINT_FOR_DOC = "edbeeching/decision-transformer-gym-hopper-medium"
# Configuration name used in the documentation
_CONFIG_FOR_DOC = "DecisionTransformerConfig"

# List of pretrained Decision Transformer checkpoints
DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "edbeeching/decision-transformer-gym-hopper-medium",
    # See all Decision Transformer models at
    # https://huggingface.co/models?filter=decision_transformer
]


# Copied from transformers.models.gpt2.modeling_gpt2.load_tf_weights_in_gpt2
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """Load tf checkpoints in a pytorch model"""
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    # Resolve the absolute path of the TensorFlow checkpoint
    tf_path = os.path.abspath(gpt2_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load the weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())
    # Map every TF variable onto the corresponding PyTorch parameter
    for name, array in zip(names, arrays):
        # Drop the "model/" prefix
        name = name[6:]  # skip "model/"
        # Split the variable name on "/"
        name = name.split("/")
        # Start from the model itself and walk down to the target parameter
        pointer = model
        for m_name in name:
            # Names like "h0" are split into a letter part and a number part
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                # Otherwise keep the whole name as a single element
                scope_names = [m_name]
            # Choose the attribute to follow based on the first part of the name
            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                # Position and token embeddings: follow the attribute, then its weight
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            # If a numeric index is present, select the corresponding sub-module
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        try:
            # The PyTorch parameter and the TF array must have the same shape
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            # Attach the shapes to the exception and re-raise
            e.args += (pointer.shape, array.shape)
            raise
        # Copy the TF weights into the PyTorch parameter
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    # Return the populated model
    return model
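
# Small illustration of the name handling above: a TF variable scope like "h0" is split into
# its letter and number parts, which is how the loop selects e.g. block index 0.
def _tf_name_split_example():
    import re

    assert re.fullmatch(r"[A-Za-z]+\d+", "h0") is not None
    # re.split with a capturing group keeps the number: ['h', '0', '']
    assert re.split(r"(\d+)", "h0") == ["h", "0", ""]
    # Names without a trailing number are kept whole
    assert re.fullmatch(r"[A-Za-z]+\d+", "wte") is None
    return re.split(r"(\d+)", "h12")  # -> ['h', '12', '']
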
# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Attention with GPT2->DecisionTransformerGPT2
class DecisionTransformerGPT2Attention(nn.Module):
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        super().__init__()

        max_positions = config.max_position_embeddings
        # Register a lower-triangular boolean buffer used as the causal attention mask
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )
        # Register the value used to mask out attention scores
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

        self.embed_dim = config.hidden_size  # Embedding dimension
        self.num_heads = config.num_attention_heads  # Number of attention heads
        self.head_dim = self.embed_dim // self.num_heads  # Dimension of each attention head
        self.split_size = self.embed_dim  # Size used when splitting the fused QKV projection
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights  # Whether to scale the attention scores
        self.is_cross_attention = is_cross_attention  # Whether this module is used as cross attention

        # Optional per-layer scaling, reordering and upcasting of the attention computation
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        if self.is_cross_attention:
            # Cross attention uses a fused key/value projection plus a separate query projection
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            # Self attention uses a single fused query/key/value projection
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)  # Output projection

        self.attn_dropout = nn.Dropout(config.attn_pdrop)  # Dropout on the attention weights
        self.resid_dropout = nn.Dropout(config.resid_pdrop)  # Dropout on the residual branch

        self.pruned_heads = set()  # Set of attention heads that have been pruned

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune the fused QKV projection and the output projection
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update the hyper-parameters to reflect the pruned heads
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)
    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Raw attention scores: the matrix product of queries and keys
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        # Scale the scores by 1/sqrt(head_dim) if requested
        if self.scale_attn_weights:
            attn_weights = attn_weights / torch.full(
                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
            )

        # Optionally also scale by the inverse of the layer index
        if self.scale_attn_by_inverse_layer_idx:
            attn_weights = attn_weights / float(self.layer_idx + 1)

        # Apply the causal mask unless this is cross attention
        if not self.is_cross_attention:
            # Lengths of the query and key sequences
            query_length, key_length = query.size(-2), key.size(-2)
            # Slice the precomputed lower-triangular mask to the current lengths
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            # Use the smallest representable value to mask out future positions
            mask_value = torch.finfo(attn_weights.dtype).min
            # Build a mask tensor with the same dtype and device as the attention scores
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            # Keep allowed positions, replace masked positions with the mask value
            attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)

        # Apply the additive attention mask if one is given
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # Normalize the scores with softmax
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Cast back to the dtype of the value tensor (relevant under mixed precision)
        attn_weights = attn_weights.type(value.dtype)

        # Apply attention dropout
        attn_weights = self.attn_dropout(attn_weights)

        # Apply the head mask if one is given
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # Weighted sum of the values
        attn_output = torch.matmul(attn_weights, value)

        # Return the attention output and the attention weights
        return attn_output, attn_weights
    # Upcast to float32 and reorder the batched matrix multiplication, then compute attention
    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Query dimensions: batch size, number of heads, query sequence length, head dimension
        bsz, num_heads, q_seq_len, dk = query.size()
        # Key sequence length
        _, _, k_seq_len, _ = key.size()

        # Preallocate the attention-weight tensor for `baddbmm`
        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

        # Compute the scaling factor for the attention scores
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5

        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)

        # Disable autocast, upcast to float32 and reorder (scale K by 1 / sqrt(dk))
        with autocast(enabled=False):
            # Reshape the query to (batch * heads, q_seq_len, dk)
            q = query.reshape(-1, q_seq_len, dk)
            # Transpose and reshape the key to (batch * heads, dk, k_seq_len)
            k = key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            # Batched matrix multiply with the scale factor applied via `alpha`
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            # Reshape back to (batch, heads, q_seq_len, k_seq_len)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

        # Apply the causal mask unless this is cross attention
        if not self.is_cross_attention:
            # Lengths of the query and key sequences
            query_length, key_length = query.size(-2), key.size(-2)
            # Slice the precomputed lower-triangular mask so only past positions are visible
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            # Use the smallest representable value for masked positions, matching dtype and device
            mask_value = torch.finfo(attn_weights.dtype).min
            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
            # Apply the causal mask
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        # Apply the additive attention mask if one is given
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # Normalize the scores with softmax
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # The scores must still be float32 here; cast back to the value dtype afterwards
        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        attn_weights = attn_weights.type(value.dtype)

        # Apply attention dropout
        attn_weights = self.attn_dropout(attn_weights)

        # Apply the head mask if one is given
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # Weighted sum of the values
        attn_output = torch.matmul(attn_weights, value)

        # Return the attention output and the attention weights
        return attn_output, attn_weights

    # Split the last dimension into (num_heads, attn_head_size)
    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        # New shape: replace hidden_size by (num_heads, attn_head_size)
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        # Reshape and move the head dimension in front of the sequence dimension
        tensor = tensor.view(new_shape)
        tensor = tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
        return tensor

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        # Move the head dimension back next to the head features
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        # New shape: merge (num_heads, attn_head_size) back into hidden_size
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        # Reshape to the merged layout
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                # 如果作为跨注意力使用,则必须定义权重 `q_attn`,否则引发错误
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`."
                )

            # 使用 self.q_attn 处理隐藏状态以生成查询张量
            query = self.q_attn(hidden_states)
            # 使用 self.c_attn 处理编码器隐藏状态以生成键和值张量,并按指定维度分割
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            # 使用编码器的注意力掩码
            attention_mask = encoder_attention_mask
        else:
            # 使用 self.c_attn 处理隐藏状态以生成查询、键和值张量,并按指定维度分割
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        # 将查询张量按头数和头维度分割
        query = self._split_heads(query, self.num_heads, self.head_dim)
        # 将键张量按头数和头维度分割
        key = self._split_heads(key, self.num_heads, self.head_dim)
        # 将值张量按头数和头维度分割
        value = self._split_heads(value, self.num_heads, self.head_dim)

        # 如果存在过去的层状态,将过去的键和值与当前的键和值拼接在一起
        if layer_past is not None:
            past_key, past_value = layer_past
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        # 如果使用缓存,保存当前的键和值
        if use_cache is True:
            present = (key, value)
        else:
            present = None

        # 如果需要重新排序和向上转型的注意力机制
        if self.reorder_and_upcast_attn:
            # 使用特定方法处理注意力机制,得到注意力输出和注意力权重
            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
        else:
            # 使用默认的注意力方法处理注意力机制,得到注意力输出和注意力权重
            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        # 将注意力输出按头数和头维度合并成隐藏层维度
        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        # 使用投影层处理注意力输出
        attn_output = self.c_proj(attn_output)
        # 应用残差连接和dropout到注意力输出
        attn_output = self.resid_dropout(attn_output)

        # 输出包括注意力输出和可能的 present
        outputs = (attn_output, present)
        # 如果需要输出注意力权重,也将其加入到输出中
        if output_attentions:
            outputs += (attn_weights,)

        # 返回最终的输出
        return outputs  # a, present, (attentions)
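
关于 layer_past / present 的键值缓存,下面给出一段示意代码(张量形状为假设值),说明增量解码时键和值如何在序列维度上不断拼接:

```
import torch

batch, num_heads, head_dim = 1, 12, 64
past_key = torch.randn(batch, num_heads, 10, head_dim)    # 之前 10 步缓存的键
past_value = torch.randn(batch, num_heads, 10, head_dim)
new_key = torch.randn(batch, num_heads, 1, head_dim)      # 当前步新计算的键
new_value = torch.randn(batch, num_heads, 1, head_dim)

key = torch.cat((past_key, new_key), dim=-2)               # (1, 12, 11, 64)
value = torch.cat((past_value, new_value), dim=-2)
present = (key, value)  # use_cache=True 时作为输出返回,供下一步继续拼接
print(key.shape, value.shape)
```
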
# 从transformers.models.gpt2.modeling_gpt2.GPT2MLP复制代码,将GPT2改为DecisionTransformerGPT2
class DecisionTransformerGPT2MLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        # 定义一个一维卷积层,输入维度为embed_dim,输出维度为intermediate_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        # 定义另一个一维卷积层,输入维度为intermediate_size,输出维度为embed_dim
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        # 设置激活函数为配置中指定的激活函数类型对应的函数
        self.act = ACT2FN[config.activation_function]
        # 设置dropout层,丢弃概率为config.resid_pdrop
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        # 应用第一个卷积层
        hidden_states = self.c_fc(hidden_states)
        # 应用激活函数
        hidden_states = self.act(hidden_states)
        # 应用第二个卷积层
        hidden_states = self.c_proj(hidden_states)
        # 应用dropout层
        hidden_states = self.dropout(hidden_states)
        return hidden_states
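
Conv1D(nf, nx) 在 transformers 中实际等价于一个输入维度为 nx、输出维度为 nf 的全连接层(权重按转置方式存储)。下面是一个验证这一点的小例子(维度为假设值,并假设可以从 transformers.pytorch_utils 导入 Conv1D):

```
import torch
from transformers.pytorch_utils import Conv1D

nx, nf = 256, 1024
conv = Conv1D(nf, nx)             # 权重形状为 (nx, nf)

linear = torch.nn.Linear(nx, nf)  # Linear 的权重形状为 (nf, nx)
with torch.no_grad():
    linear.weight.copy_(conv.weight.t())
    linear.bias.copy_(conv.bias)

x = torch.randn(2, 7, nx)
assert torch.allclose(conv(x), linear(x), atol=1e-5)
```
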


# 从transformers.models.gpt2.modeling_gpt2.GPT2Block复制代码,将GPT2改为DecisionTransformerGPT2
class DecisionTransformerGPT2Block(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        # 内部前馈层的维度:若配置了 n_inner 则使用该值,否则取 4 * hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
        # 初始化LayerNorm层,输入维度为hidden_size,eps为config.layer_norm_epsilon
        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # 初始化DecisionTransformerGPT2Attention层
        self.attn = DecisionTransformerGPT2Attention(config, layer_idx=layer_idx)
        # 初始化LayerNorm层,输入维度为hidden_size,eps为config.layer_norm_epsilon
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        if config.add_cross_attention:
            # 如果配置中指定添加跨注意力层,则初始化DecisionTransformerGPT2Attention层作为跨注意力层
            self.crossattention = DecisionTransformerGPT2Attention(
                config, is_cross_attention=True, layer_idx=layer_idx
            )
            # 初始化LayerNorm层,输入维度为hidden_size,eps为config.layer_norm_epsilon
            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        # 初始化DecisionTransformerGPT2MLP层
        self.mlp = DecisionTransformerGPT2MLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        # 保存原始隐藏状态,供第一处残差连接使用
        residual = hidden_states
        # 应用 LayerNormalization,归一化隐藏状态
        hidden_states = self.ln_1(hidden_states)
        # 使用 self.attn 处理注意力机制
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # 提取注意力输出的第一个元素,即注意力的输出
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        # 提取除了第一个元素外的所有输出,作为其他输出
        outputs = attn_outputs[1:]
        # 残差连接,将注意力输出与原始隐藏状态相加
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # 如果传入了 encoder_hidden_states,则进行交叉注意力处理
            if not hasattr(self, "crossattention"):
                # 如果模型未配置交叉注意力层,则引发错误
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            # 应用交叉注意力层前的 LayerNormalization
            hidden_states = self.ln_cross_attn(hidden_states)
            # 使用 self.crossattention 进行交叉注意力计算
            cross_attn_outputs = self.crossattention(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            # 提取交叉注意力输出的第一个元素
            attn_output = cross_attn_outputs[0]
            # 残差连接,将交叉注意力输出与之前的隐藏状态相加
            hidden_states = residual + attn_output
            # 将交叉注意力输出的其他部分添加到已有的 outputs 中,如果输出了注意力权重
            outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights

        residual = hidden_states
        # 应用 LayerNormalization
        hidden_states = self.ln_2(hidden_states)
        # 应用 MLP(Feed Forward)层
        feed_forward_hidden_states = self.mlp(hidden_states)
        # 残差连接,将 MLP 层的输出与原始隐藏状态相加
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            # 如果需要缓存,则将隐藏状态和其他输出组成一个元组返回
            outputs = (hidden_states,) + outputs
        else:
            # 否则,只返回隐藏状态和除第一个元素外的其他输出
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # 返回隐藏状态、present、(attentions, cross_attentions)
class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 使用 DecisionTransformerConfig 作为配置类
    config_class = DecisionTransformerConfig
    # 使用 load_tf_weights_in_gpt2 函数加载 TensorFlow 权重
    load_tf_weights = load_tf_weights_in_gpt2
    # 基础模型前缀
    base_model_prefix = "transformer"
    # 可并行化处理
    is_parallelizable = True
    # 支持梯度检查点
    supports_gradient_checkpointing = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, Conv1D)):
            # 初始化线性层和一维卷积层的权重
            # 与 TF 版本略有不同,TF 版本使用截断正态分布进行初始化
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            # 初始化嵌入层的权重
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # 初始化 LayerNorm 层的偏置和权重
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # 根据 OpenAI GPT-2 论文的方案重新初始化选定的权重:
        #   > 修改的初始化方法考虑到了模型深度中残差路径的累积。在初始化时,通过因子 1/√N 缩放残差层的权重,
        #   > 其中 N 是残差层数量。
        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # 参考 (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if "c_proj" in name and "weight" in name:
                # 特殊的缩放初始化 --> 每个 Transformer 块中有 2 个 Layer Norm
                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
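
按照上面的缩放初始化方案,c_proj 权重的标准差会除以 sqrt(2 * n_layer)。下面是一个简单的数值示例(initializer_range=0.02、n_layer=12 均为假设值):

```
import math

initializer_range = 0.02
n_layer = 12
scaled_std = initializer_range / math.sqrt(2 * n_layer)
print(scaled_std)  # 约 0.0041,明显小于普通层使用的 0.02
```
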


class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        # 词嵌入层和位置编码层的初始化
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        # Dropout 层的初始化
        self.drop = nn.Dropout(config.embd_pdrop)

        # Transformer 块的初始化
        self.h = nn.ModuleList(
            [DecisionTransformerGPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )

        # 最终的 LayerNorm 层的初始化
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # 模型并行
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

        # 初始化权重并应用最终处理
        self.post_init()
    # 获取输入的词嵌入(词向量)矩阵
    def get_input_embeddings(self):
        return self.wte

    # 设置输入的词嵌入(词向量)矩阵为新的嵌入矩阵
    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    # 从transformers库中GPT2Model类的forward方法复制而来
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# 为决策变换器模型输出定义一个数据类,继承自模型输出基类
@dataclass
class DecisionTransformerOutput(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.
    
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
            Environment state predictions
        action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
            Model action predictions
        return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
            Predicted returns for each state
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
    
    # 环境状态预测
    state_preds: torch.FloatTensor = None
    # 模型动作预测
    action_preds: torch.FloatTensor = None
    # 对每个状态的预测返回
    return_preds: torch.FloatTensor = None
    # 模型隐藏状态
    hidden_states: torch.FloatTensor = None
    # 注意力权重
    attentions: torch.FloatTensor = None
    # 最后一层隐藏状态
    last_hidden_state: torch.FloatTensor = None


# 决策变换器预训练模型的抽象类,处理权重初始化、预训练模型下载和加载的简单接口
class DecisionTransformerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    
    # 决策变换器配置类
    config_class = DecisionTransformerConfig
    # 基础模型前缀
    base_model_prefix = "decision_transformer"
    # 主输入名称
    main_input_name = "states"
    # 是否支持梯度检查点
    supports_gradient_checkpointing = False
    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果是线性层
        if isinstance(module, nn.Linear):
            # 使用正态分布初始化权重,均值为0,标准差为配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果有偏置项,将其初始化为0
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果是嵌入层
        elif isinstance(module, nn.Embedding):
            # 使用正态分布初始化权重,均值为0,标准差为配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果定义了填充索引,将填充索引位置的权重初始化为0
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # 如果是层归一化层
        elif isinstance(module, nn.LayerNorm):
            # 将偏置项初始化为0
            module.bias.data.zero_()
            # 将权重初始化为1
            module.weight.data.fill_(1.0)
# 决策变换器模型的文档字符串,描述了这是一个 PyTorch 的子类模块,可作为常规的 PyTorch 模块使用。建议参考 PyTorch 文档以获取有关通用用法和行为的详细信息。
DECISION_TRANSFORMER_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`~DecisionTransformerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# 决策变换器模型的输入文档字符串,描述了模型的输入参数及其形状。
DECISION_TRANSFORMER_INPUTS_DOCSTRING = r"""
    Args:
        states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`):
            The states for each step in the trajectory
        actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`):
            The actions taken by the "expert" policy for the current state, these are masked for auto regressive
            prediction
        rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
            The rewards for each state, action
        returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
            The returns for each state in the trajectory
        timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`):
            The timestep for each step in the trajectory
        attention_mask (`torch.FloatTensor` of shape `(batch_size, episode_length)`):
            Masking, used to mask the actions when performing autoregressive prediction
"""

# 通过装饰器 @add_start_docstrings 将决策变换器模型的文档字符串和起始描述串联接起来,用以说明决策变换器模型的作用和功能。
@add_start_docstrings("The Decision Transformer Model", DECISION_TRANSFORMER_START_DOCSTRING)
class DecisionTransformerModel(DecisionTransformerPreTrainedModel):
    """
    The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL
    setting. Refer to the paper for more details: https://arxiv.org/abs/2106.01345
    """
    # 初始化函数,接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法,传入配置对象
        super().__init__(config)
        # 将配置对象保存在实例中
        self.config = config
        # 设置隐藏层大小为配置对象中指定的隐藏层大小
        self.hidden_size = config.hidden_size

        # 创建一个 DecisionTransformerGPT2Model 实例作为编码器
        # 注意:与 Huggingface 默认版本唯一的区别是移除了位置嵌入(因为我们将自己添加)
        self.encoder = DecisionTransformerGPT2Model(config)

        # 创建嵌入层,用于不同类型的输入
        self.embed_timestep = nn.Embedding(config.max_ep_len, config.hidden_size)
        self.embed_return = torch.nn.Linear(1, config.hidden_size)
        self.embed_state = torch.nn.Linear(config.state_dim, config.hidden_size)
        self.embed_action = torch.nn.Linear(config.act_dim, config.hidden_size)

        # LayerNorm 层,用于标准化隐藏层表示
        self.embed_ln = nn.LayerNorm(config.hidden_size)

        # 注意:按照论文的设定,模型并不使用状态或回报的预测结果(仅使用动作预测),但这里仍然定义了相应的预测头
        
        # 线性层,用于预测状态
        self.predict_state = torch.nn.Linear(config.hidden_size, config.state_dim)
        # 序列模块,用于预测动作
        self.predict_action = nn.Sequential(
            *([nn.Linear(config.hidden_size, config.act_dim)] + ([nn.Tanh()] if config.action_tanh else []))
        )
        # 线性层,用于预测回报值
        self.predict_return = torch.nn.Linear(config.hidden_size, 1)

        # 初始化权重并应用最终处理
        self.post_init()

    # 前向传播函数,接受多个输入参数并返回一个输出
    @add_start_docstrings_to_model_forward(DECISION_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=DecisionTransformerOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        states: Optional[torch.FloatTensor] = None,
        actions: Optional[torch.FloatTensor] = None,
        rewards: Optional[torch.FloatTensor] = None,
        returns_to_go: Optional[torch.FloatTensor] = None,
        timesteps: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,

.\models\decision_transformer\__init__.py

# 版权声明和许可证信息,说明此文件的版权归HuggingFace团队所有,并遵循Apache License 2.0许可
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 引入TYPE_CHECKING用于静态类型检查
from typing import TYPE_CHECKING

# 从utils模块导入OptionalDependencyNotAvailable、_LazyModule和is_torch_available函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义模块的导入结构,包括configuration_decision_transformer模块的部分内容
_import_structure = {
    "configuration_decision_transformer": [
        "DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "DecisionTransformerConfig",
    ],
}

# 检查是否torch可用,如果不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果torch可用,则扩展_import_structure添加modeling_decision_transformer模块的内容
    _import_structure["modeling_decision_transformer"] = [
        "DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DecisionTransformerGPT2Model",
        "DecisionTransformerGPT2PreTrainedModel",
        "DecisionTransformerModel",
        "DecisionTransformerPreTrainedModel",
    ]

# 如果正在进行类型检查
if TYPE_CHECKING:
    # 从configuration_decision_transformer模块导入特定内容,包括DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP和DecisionTransformerConfig
    from .configuration_decision_transformer import (
        DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DecisionTransformerConfig,
    )

    # 再次检查torch是否可用,如果不可用则跳过
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从modeling_decision_transformer模块导入特定内容,包括DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST和多个DecisionTransformer类
        from .modeling_decision_transformer import (
            DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DecisionTransformerGPT2Model,
            DecisionTransformerGPT2PreTrainedModel,
            DecisionTransformerModel,
            DecisionTransformerPreTrainedModel,
        )

# 如果不是在进行类型检查
else:
    # 导入sys模块
    import sys

    # 将当前模块设置为_LazyModule,使用_LazyModule延迟加载模块内容
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
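
_LazyModule 使得只有在真正访问属性时才会导入对应子模块。下面是一个简单的使用示例(以已安装的 transformers 包为准):

```
# 延迟加载:这行导入本身很轻量,只有访问 DecisionTransformerConfig 时才真正加载配置子模块
from transformers.models.decision_transformer import DecisionTransformerConfig

config = DecisionTransformerConfig()
print(config.model_type)  # "decision_transformer"
```
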

.\models\deformable_detr\configuration_deformable_detr.py

# 设置文件编码为 UTF-8
# 版权声明,声明版权及许可条款
# 根据 Apache 许可证 2.0 版本,除非符合许可证,否则不得使用此文件
# 可以在以下网址获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0
# 如果适用法律要求或书面同意,软件按“原样”分发,不提供任何形式的担保或条件
# 请查看许可证了解特定语言下的权限和限制
""" Deformable DETR 模型配置 """

# 导入必要的配置和日志模块
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 预训练模型配置映射字典,将模型名称映射到其预训练配置文件的 URL
DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "SenseTime/deformable-detr": "https://huggingface.co/sensetime/deformable-detr/resolve/main/config.json",
    # 查看所有 Deformable DETR 模型,请访问 https://huggingface.co/models?filter=deformable-detr
}

# DeformableDetrConfig 类,继承自 PretrainedConfig 类
class DeformableDetrConfig(PretrainedConfig):
    r"""
    这是用于存储 [`DeformableDetrModel`] 配置的类。它用于根据指定参数实例化 Deformable DETR 模型,定义模型架构。
    使用默认配置来实例化对象将会生成类似于 Deformable DETR [SenseTime/deformable-detr]
    (https://huggingface.co/SenseTime/deformable-detr) 架构的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。更多信息请参阅 [`PretrainedConfig`] 的文档。

    Examples:

    ```
    >>> from transformers import DeformableDetrConfig, DeformableDetrModel

    >>> # 初始化一个 Deformable DETR SenseTime/deformable-detr 风格的配置
    >>> configuration = DeformableDetrConfig()

    >>> # 从指定配置文件初始化一个(带有随机权重)SenseTime/deformable-detr 风格的模型
    >>> model = DeformableDetrModel(configuration)

    >>> # 访问模型的配置
    >>> configuration = model.config
    ```
    """

    # 模型类型
    model_type = "deformable_detr"

    # 属性映射字典,将配置文件中的属性名称映射到 Deformable DETR 模型中相应的属性名称
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }
    # 初始化函数,用于创建一个新的对象实例,初始化各种参数和属性
    def __init__(
        self,
        use_timm_backbone=True,  # 是否使用timm库中的backbone模型作为特征提取器,默认为True
        backbone_config=None,  # backbone模型的配置参数,默认为None
        num_channels=3,  # 输入图像的通道数,默认为3(RGB图像)
        num_queries=300,  # 查询数量,用于查询Transformer解码器输出的对象位置,默认为300
        max_position_embeddings=1024,  # 最大位置编码数,默认为1024
        encoder_layers=6,  # Transformer编码器层数,默认为6
        encoder_ffn_dim=1024,  # Transformer编码器中FeedForward层的维度,默认为1024
        encoder_attention_heads=8,  # Transformer编码器中注意力头的数量,默认为8
        decoder_layers=6,  # Transformer解码器层数,默认为6
        decoder_ffn_dim=1024,  # Transformer解码器中FeedForward层的维度,默认为1024
        decoder_attention_heads=8,  # Transformer解码器中注意力头的数量,默认为8
        encoder_layerdrop=0.0,  # Transformer编码器中每层dropout的比例,默认为0.0(不使用dropout)
        is_encoder_decoder=True,  # 是否使用编码-解码结构,默认为True
        activation_function="relu",  # 激活函数的类型,默认为ReLU
        d_model=256,  # Transformer模型中的隐藏层维度,默认为256
        dropout=0.1,  # 模型中的普通dropout比例,默认为0.1
        attention_dropout=0.0,  # 注意力机制中的dropout比例,默认为0.0(不使用dropout)
        activation_dropout=0.0,  # 激活函数中的dropout比例,默认为0.0(不使用dropout)
        init_std=0.02,  # 初始化模型参数的标准差,默认为0.02
        init_xavier_std=1.0,  # Xavier初始化中的标准差,默认为1.0
        return_intermediate=True,  # 是否返回中间层的输出,默认为True
        auxiliary_loss=False,  # 是否使用辅助损失,默认为False
        position_embedding_type="sine",  # 位置编码的类型,默认为"sine"(正弦位置编码)
        backbone="resnet50",  # 使用的backbone模型,默认为"resnet50"
        use_pretrained_backbone=True,  # 是否使用预训练的backbone模型,默认为True
        backbone_kwargs=None,  # backbone模型的其他参数,默认为None
        dilation=False,  # 是否使用空洞卷积(dilation convolution),默认为False
        num_feature_levels=4,  # 特征级别的数量,默认为4
        encoder_n_points=4,  # 编码器中位置嵌入的点数,默认为4
        decoder_n_points=4,  # 解码器中位置嵌入的点数,默认为4
        two_stage=False,  # 是否使用两阶段检测器,默认为False
        two_stage_num_proposals=300,  # 第二阶段的提议数量,默认为300
        with_box_refine=False,  # 是否使用边界框细化,默认为False
        class_cost=1,  # 类别损失的系数,默认为1
        bbox_cost=5,  # 边界框损失的系数,默认为5
        giou_cost=2,  # GIoU损失的系数,默认为2
        mask_loss_coefficient=1,  # 掩膜损失的系数,默认为1
        dice_loss_coefficient=1,  # Dice损失的系数,默认为1
        bbox_loss_coefficient=5,  # 边界框损失的系数,默认为5
        giou_loss_coefficient=2,  # GIoU损失的系数,默认为2
        eos_coefficient=0.1,  # EOS(结束符)损失的系数,默认为0.1
        focal_alpha=0.25,  # Focal损失的alpha参数,默认为0.25
        disable_custom_kernels=False,  # 是否禁用自定义内核,默认为False
        **kwargs,  # 其他未列出的关键字参数
    ):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

    @property
    def num_attention_heads(self) -> int:
        # 返回编码器中的注意力头数量
        return self.encoder_attention_heads

    @property
    def hidden_size(self) -> int:
        # 返回模型中的隐藏层维度
        return self.d_model

.\models\deformable_detr\convert_deformable_detr_to_pytorch.py

# 从状态字典中重命名键,根据特定规则进行替换
def rename_key(orig_key):
    if "backbone.0.body" in orig_key:
        orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model")
    if "transformer" in orig_key:
        orig_key = orig_key.replace("transformer.", "")
    if "norm1" in orig_key:
        # 根据上下文替换层归一化(LayerNorm)的键名,区分编码器和解码器的情况
        if "encoder" in orig_key:
            orig_key = orig_key.replace("norm1", "self_attn_layer_norm")
        else:
            orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm")
    if "norm2" in orig_key:
        # 根据上下文替换层归一化(LayerNorm)的键名,区分编码器和解码器的情况
        if "encoder" in orig_key:
            orig_key = orig_key.replace("norm2", "final_layer_norm")
        else:
            orig_key = orig_key.replace("norm2", "self_attn_layer_norm")
    if "norm3" in orig_key:
        # 替换最终层归一化层(norm3)的键名
        orig_key = orig_key.replace("norm3", "final_layer_norm")
    if "linear1" in orig_key:
        # 替换第一个线性层的键名
        orig_key = orig_key.replace("linear1", "fc1")
    if "linear2" in orig_key:
        # 替换第二个线性层的键名
        orig_key = orig_key.replace("linear2", "fc2")
    if "query_embed" in orig_key:
        # 替换查询位置嵌入的键名
        orig_key = orig_key.replace("query_embed", "query_position_embeddings")
    if "cross_attn" in orig_key:
        # 替换交叉注意力的键名
        orig_key = orig_key.replace("cross_attn", "encoder_attn")

    return orig_key
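
下面用两个假设的键名演示 rename_key 的替换效果(直接调用上面定义的函数):

```
# 编码器中的 norm1 -> self_attn_layer_norm,并去掉 "transformer." 前缀
print(rename_key("transformer.encoder.layers.0.norm1.weight"))
# 输出: encoder.layers.0.self_attn_layer_norm.weight

# 解码器中的 linear1 -> fc1
print(rename_key("transformer.decoder.layers.0.linear1.bias"))
# 输出: decoder.layers.0.fc1.bias
```
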


# 从状态字典中读取查询、键和值
def read_in_q_k_v(state_dict):
    # 循环遍历范围为0到5,共6次,处理每个自注意力层的权重和偏置
    for i in range(6):
        # 从状态字典中弹出当前自注意力层输入投影层的权重和偏置
        in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias")
        
        # 将权重切片分配给查询、键和值投影层的权重
        state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
        # 将偏置切片分配给查询投影层的偏置
        state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
        # 将权重切片分配给键投影层的权重
        state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
        # 将偏置切片分配给键投影层的偏置
        state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
        # 将权重切片分配给值投影层的权重
        state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
        # 将偏置切片分配给值投影层的偏置
        state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
# 我们将在一张可爱猫咪的图片上验证我们的结果
def prepare_img():
    # 图片的 URL 地址
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 通过请求获取图片的原始二进制数据流,并用 PIL 打开这个图片
    im = Image.open(requests.get(url, stream=True).raw)

    return im


@torch.no_grad()
def convert_deformable_detr_checkpoint(
    checkpoint_path,
    single_scale,
    dilation,
    with_box_refine,
    two_stage,
    pytorch_dump_folder_path,
    push_to_hub,
):
    """
    复制/粘贴/调整模型的权重以适应我们的 Deformable DETR 结构。
    """

    # 加载默认配置
    config = DeformableDetrConfig()
    # 设置配置属性
    if single_scale:
        config.num_feature_levels = 1  # 设置特征层级数为1
    config.dilation = dilation  # 设置膨胀参数
    config.with_box_refine = with_box_refine  # 设置是否进行框调整
    config.two_stage = two_stage  # 设置是否为两阶段模型
    # 设置标签数目
    config.num_labels = 91
    repo_id = "huggingface/label-files"
    filename = "coco-detection-id2label.json"
    # 从 HuggingFace Hub 下载并加载 COCO 检测标签映射文件
    id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label  # 设置 ID 到标签的映射
    config.label2id = {v: k for k, v in id2label.items()}  # 设置标签到 ID 的映射

    # 加载图像处理器
    image_processor = DeformableDetrImageProcessor(format="coco_detection")

    # 准备图片
    img = prepare_img()  # 调用准备图片函数获取图片对象
    encoding = image_processor(images=img, return_tensors="pt")  # 对图片进行编码处理
    pixel_values = encoding["pixel_values"]  # 获取像素数值

    logger.info("Converting model...")  # 记录日志,表示正在转换模型

    # 加载原始的状态字典
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    # 重命名键名
    for key in state_dict.copy().keys():
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # 查询、键、值矩阵需要特殊处理
    read_in_q_k_v(state_dict)
    # 重要:需要在每个基础模型键名前添加前缀,因为头部模型使用不同的属性
    prefix = "model."
    for key in state_dict.copy().keys():
        if not key.startswith("class_embed") and not key.startswith("bbox_embed"):
            val = state_dict.pop(key)
            state_dict[prefix + key] = val
    # 最后,创建 HuggingFace 模型并加载状态字典
    model = DeformableDetrForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()

    device = "cuda" if torch.cuda.is_available() else "cpu"  # 检测设备是否支持 CUDA
    model.to(device)  # 将模型移动到指定设备
    # 验证转换结果
    outputs = model(pixel_values.to(device))

    expected_logits = torch.tensor(
        [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]]
    )
    expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]])

    if single_scale:
        expected_logits = torch.tensor(
            [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]]
        )
        expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]])
    # 如果选择了单尺度和扩张操作,则设定预期的分类 logits 和边界框
    if single_scale and dilation:
        expected_logits = torch.tensor(
            [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]]
        )
        expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]])

    # 如果需要进行边界框细化,则设定预期的分类 logits 和边界框
    if with_box_refine:
        expected_logits = torch.tensor(
            [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]]
        )
        expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]])

    # 如果同时需要边界框细化和两阶段操作,则设定预期的分类 logits 和边界框
    if with_box_refine and two_stage:
        expected_logits = torch.tensor(
            [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]]
        )
        expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]])

    # 打印模型输出的前三行三列的 logits
    print("Logits:", outputs.logits[0, :3, :3])

    # 断言模型输出的前三行三列的 logits 和预期的 logits 在给定的误差范围内相似
    assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
    # 断言模型输出的前三行三列的预测边界框和预期的边界框在给定的误差范围内相似
    assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)

    # 打印信息,表明一切正常
    print("Everything ok!")

    # 保存 PyTorch 模型和图像处理器到指定路径
    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
    # 确保保存模型和处理器的文件夹存在
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    # 调用模型的保存方法和图像处理器的保存方法
    model.save_pretrained(pytorch_dump_folder_path)
    image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要将模型推送到 Hub 上,则进行相应操作
    if push_to_hub:
        # 构造模型的名称,根据选择的参数添加后缀
        model_name = "deformable-detr"
        model_name += "-single-scale" if single_scale else ""
        model_name += "-dc5" if dilation else ""
        model_name += "-with-box-refine" if with_box_refine else ""
        model_name += "-two-stage" if two_stage else ""
        # 打印提示信息,表明正在将模型推送到 Hub 上
        print("Pushing model to hub...")
        # 调用模型对象的推送到 Hub 的方法
        model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model")
# 如果当前脚本作为主程序执行(而不是被导入为模块),则执行以下代码块
if __name__ == "__main__":
    # 创建参数解析器对象
    parser = argparse.ArgumentParser()

    # 添加命令行参数:checkpoint_path,用于指定 PyTorch checkpoint 文件的路径
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth",
        help="Path to Pytorch checkpoint (.pth file) you'd like to convert.",
    )

    # 添加命令行参数:single_scale,设置为 True 则设置 config.num_features_levels = 1
    parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.")

    # 添加命令行参数:dilation,设置为 True 则设置 config.dilation=True
    parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.")

    # 添加命令行参数:with_box_refine,设置为 True 则设置 config.with_box_refine=True
    parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.")

    # 添加命令行参数:two_stage,设置为 True 则设置 config.two_stage=True
    parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.")

    # 添加命令行参数:pytorch_dump_folder_path,必需的参数,指定输出 PyTorch 模型的文件夹路径
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=True,
        help="Path to the folder to output PyTorch model.",
    )

    # 添加命令行参数:push_to_hub,设置为 True 则表示要将转换后的模型推送到 🤗 hub
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    # 解析命令行参数并将其保存到 args 变量中
    args = parser.parse_args()

    # 调用函数 convert_deformable_detr_checkpoint,并传入命令行参数中的相应值
    convert_deformable_detr_checkpoint(
        args.checkpoint_path,
        args.single_scale,
        args.dilation,
        args.with_box_refine,
        args.two_stage,
        args.pytorch_dump_folder_path,
        args.push_to_hub,
    )

.\models\deformable_detr\feature_extraction_deformable_detr.py

# 设置 Python 文件的编码格式为 UTF-8
# 版权声明,声明此代码版权归 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本,除非符合许可证的规定,否则不得使用此文件
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则本软件是基于"原样"提供的,不提供任何形式的担保或条件,
# 无论是明示的还是暗示的。有关详细信息,请参阅许可证。
"""Deformable DETR 的特征提取器类。"""

# 导入警告模块
import warnings

# 从本地模块中导入 rgb_to_id 函数,并重命名为 _rgb_to_id
from ...image_transforms import rgb_to_id as _rgb_to_id
# 导入日志记录工具
from ...utils import logging
# 从本地模块中导入 DeformableDetrImageProcessor 类
from .image_processing_deformable_detr import DeformableDetrImageProcessor

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


def rgb_to_id(x):
    # 发出警告,提醒用户从版本 5 开始,不再从当前模块中导入 rgb_to_id 函数
    warnings.warn(
        "rgb_to_id has moved and will not be importable from this module from v5. "
        "Please import from transformers.image_transforms instead.",
        FutureWarning,
    )
    # 调用本地模块中的 _rgb_to_id 函数,执行颜色转换操作
    return _rgb_to_id(x)


class DeformableDetrFeatureExtractor(DeformableDetrImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # 发出警告,提示 DeformableDetrFeatureExtractor 类在 Transformers 版本 5 中将被移除
        # 建议使用 DeformableDetrImageProcessor 类代替
        warnings.warn(
            "The class DeformableDetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use DeformableDetrImageProcessor instead.",
            FutureWarning,
        )
        # 调用父类的初始化方法,传入所有位置参数和关键字参数
        super().__init__(*args, **kwargs)

.\models\deformable_detr\image_processing_deformable_detr.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Deformable DETR."""

import io
import pathlib
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
    PaddingMode,
    center_to_corners_format,
    corners_to_center_format,
    id_to_rgb,
    pad,
    rescale,
    resize,
    rgb_to_id,
    to_channel_dimension_format,
)
from ...image_utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    AnnotationFormat,
    AnnotationType,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_annotations,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import (
    TensorType,
    is_flax_available,
    is_jax_tensor,
    is_scipy_available,
    is_tf_available,
    is_tf_tensor,
    is_torch_available,
    is_torch_tensor,
    is_vision_available,
    logging,
)


if is_torch_available():
    import torch
    from torch import nn


if is_vision_available():
    import PIL

if is_scipy_available():
    import scipy.special
    import scipy.stats


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
    """
    Computes the output image size given the input image size and the desired output size.

    Args:
        image_size (`Tuple[int, int]`):
            The input image size.
        size (`int`):
            The desired output size.
        max_size (`int`, *optional*):
            The maximum allowed output size.
    """
    # 解构输入的图片尺寸元组,分别取出高度和宽度
    height, width = image_size
    # 如果指定了最大尺寸限制
    if max_size is not None:
        # 计算原始尺寸中的最小值
        min_original_size = float(min((height, width)))
        # 计算原始尺寸中的最大值
        max_original_size = float(max((height, width)))
        # 如果根据最大原始尺寸调整后的尺寸超过了最大限制,则重新调整尺寸
        if max_original_size / min_original_size * size > max_size:
            size = int(round(max_size * min_original_size / max_original_size))
    
    # 如果高度小于等于宽度且高度等于目标尺寸,或者宽度小于等于高度且宽度等于目标尺寸,则直接返回原始高度和宽度
    if (height <= width and height == size) or (width <= height and width == size):
        return height, width
    
    # 根据图像的宽高比例调整输出的宽高
    if width < height:
        ow = size
        oh = int(size * height / width)
    else:
        oh = size
        ow = int(size * width / height)
    
    # 返回调整后的输出高度和宽度
    return (oh, ow)
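
一个具体的数值例子(输入高宽为 480×640、目标短边 800、max_size=1333,均为假设值),说明上述保持宽高比的尺寸计算:

```
# min_original_size = 480, max_original_size = 640
# 640 / 480 * 800 ≈ 1066.7,未超过 max_size=1333,因此 size 保持为 800
# 高度是较短边,被缩放到 800;宽度按比例放大为 int(800 * 640 / 480) = 1066
print(get_size_with_aspect_ratio((480, 640), 800, max_size=1333))  # (800, 1066)
```
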
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int]],
    max_size: Optional[int] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Computes the output image size given the input image size and the desired output size. If the desired output size
    is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
    image size is computed by keeping the aspect ratio of the input image size.

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        size (`int` or `Tuple[int, int]` or `List[int]`):
            The desired output size.
        max_size (`int`, *optional*):
            The maximum allowed output size.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
    """
    # 获取输入图像的尺寸
    image_size = get_image_size(input_image, input_data_format)
    
    # 如果输出尺寸是元组或列表,则直接返回
    if isinstance(size, (list, tuple)):
        return size
    
    # 否则根据输入图像的尺寸和输出的单一尺寸计算具有保持宽高比的输出尺寸
    return get_size_with_aspect_ratio(image_size, size, max_size)


# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
    """
    Returns a function that converts a numpy array to the framework of the input array.

    Args:
        arr (`np.ndarray`): The array to convert.
    """
    # 如果输入数组是 numpy 数组,则返回 numpy 的 array 函数
    if isinstance(arr, np.ndarray):
        return np.array
    
    # 如果 TensorFlow 可用且输入数组是 TensorFlow 张量,则返回 TensorFlow 的 convert_to_tensor 函数
    if is_tf_available() and is_tf_tensor(arr):
        import tensorflow as tf
        return tf.convert_to_tensor
    
    # 如果 PyTorch 可用且输入数组是 PyTorch 张量,则返回 PyTorch 的 tensor 函数
    if is_torch_available() and is_torch_tensor(arr):
        import torch
        return torch.tensor
    
    # 如果 Flax 可用且输入数组是 JAX 张量,则返回 JAX 的 array 函数
    if is_flax_available() and is_jax_tensor(arr):
        import jax.numpy as jnp
        return jnp.array
    
    # 如果无法识别输入数组的类型,则抛出 ValueError
    raise ValueError(f"Cannot convert arrays of type {type(arr)}")


# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
    """
    Squeezes an array, but only if the axis specified has dim 1.
    """
    # 如果未指定轴,则按默认行为挤压数组
    if axis is None:
        return arr.squeeze()
    
    # 否则尝试按指定轴挤压数组,若失败则返回原数组
    try:
        return arr.squeeze(axis=axis)
    except ValueError:
        return arr


# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    # 从图像尺寸元组中获取高度和宽度
    image_height, image_width = image_size
    
    # 初始化归一化后的注释字典
    norm_annotation = {}
    # 遍历注释字典中的每个键值对
    for key, value in annotation.items():
        # 如果键是 "boxes"
        if key == "boxes":
            # 将值赋给变量 boxes
            boxes = value
            # 转换边界框格式为中心点表示,并归一化到图像尺寸
            boxes = corners_to_center_format(boxes)
            boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
            # 将归一化后的边界框数据存入 norm_annotation 字典
            norm_annotation[key] = boxes
        else:
            # 对于其他键直接存入 norm_annotation 字典
            norm_annotation[key] = value
    # 返回归一化后的注释字典
    return norm_annotation
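
下面用一个小的数值例子(图像 100×200、一个角点格式的框,均为假设值)手动演示角点到中心格式的转换与归一化,对应 normalize_annotation 对 "boxes" 的处理:

```
import numpy as np

image_height, image_width = 100, 200
box = np.array([[20.0, 10.0, 60.0, 50.0]])  # (x_min, y_min, x_max, y_max)

# 角点 -> 中心格式 (center_x, center_y, width, height)
center = np.concatenate([(box[:, :2] + box[:, 2:]) / 2, box[:, 2:] - box[:, :2]], axis=-1)
# 按 (w, h, w, h) 归一化到 [0, 1]
normalized = center / np.array([image_width, image_height, image_width, image_height], dtype=np.float32)
print(normalized)  # [[0.2 0.3 0.2 0.4]]
```
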
# 从 `transformers.models.detr.image_processing_detr.max_across_indices` 模块中复制的函数,用于返回可迭代值中每个索引的最大值列表。
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    返回一个可迭代值中所有索引的最大值列表。
    """
    return [max(values_i) for values_i in zip(*values)]


# 从 `transformers.models.detr.image_processing_detr.get_max_height_width` 模块中复制的函数,用于获取批次中所有图像的最大高度和宽度。
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    获取批次中所有图像的最大高度和宽度。
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    if input_data_format == ChannelDimension.FIRST:
        _, max_height, max_width = max_across_indices([img.shape for img in images])
    elif input_data_format == ChannelDimension.LAST:
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    return (max_height, max_width)


# 从 `transformers.models.detr.image_processing_detr.make_pixel_mask` 模块中复制的函数,用于生成图像的像素掩码。
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    生成图像的像素掩码,其中 1 表示有效像素,0 表示填充像素。

    Args:
        image (`np.ndarray`):
            要生成像素掩码的图像。
        output_size (`Tuple[int, int]`):
            掩码的输出尺寸。
    """
    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
    mask = np.zeros(output_size, dtype=np.int64)
    mask[:input_height, :input_width] = 1
    return mask


# 从 `transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask` 模块中复制的函数,用于将 COCO 多边形注释转换为掩码。
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
    """
    将 COCO 多边形注释转换为掩码。

    Args:
        segmentations (`List[List[float]]`):
            多边形列表,每个多边形由一组 x-y 坐标表示。
        height (`int`):
            掩码的高度。
        width (`int`):
            掩码的宽度。
    """
    try:
        from pycocotools import mask as coco_mask
    except ImportError:
        raise ImportError("Pycocotools is not installed in your environment.")

    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]
        mask = np.asarray(mask, dtype=np.uint8)
        mask = np.any(mask, axis=2)
        masks.append(mask)
    if masks:
        masks = np.stack(masks, axis=0)
    else:
        masks = np.zeros((0, height, width), dtype=np.uint8)

    return masks
# 从transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation复制并将DETR更改为DeformableDetr
def prepare_coco_detection_annotation(
    image,
    target,
    return_segmentation_masks: bool = False,
    input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
    """
    将COCO格式的目标转换为DeformableDetr期望的格式。
    """
    # 获取图像的高度和宽度
    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    # 提取图像ID
    image_id = target["image_id"]
    image_id = np.asarray([image_id], dtype=np.int64)

    # 获取给定图像的所有COCO注释
    annotations = target["annotations"]
    # 过滤掉“iscrowd”属性为1的对象
    annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]

    # 提取类别ID
    classes = [obj["category_id"] for obj in annotations]
    classes = np.asarray(classes, dtype=np.int64)

    # 为了转换为COCO API格式
    area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
    iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)

    # 提取边界框信息
    boxes = [obj["bbox"] for obj in annotations]
    # 处理无边界框的情况
    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
    boxes[:, 2:] += boxes[:, :2]  # 将(x_min, y_min, width, height)转换为(x_min, y_min, x_max, y_max)
    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)   # 裁剪边界框的x坐标
    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)  # 裁剪边界框的y坐标

    # 保留有效的边界框
    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

    # 创建新的目标字典
    new_target = {}
    new_target["image_id"] = image_id
    new_target["class_labels"] = classes[keep]
    new_target["boxes"] = boxes[keep]
    new_target["area"] = area[keep]
    new_target["iscrowd"] = iscrowd[keep]
    new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)

    # 如果注释中包含关键点信息,则提取并添加到新的目标字典中
    if annotations and "keypoints" in annotations[0]:
        keypoints = [obj["keypoints"] for obj in annotations]
        keypoints = np.asarray(keypoints, dtype=np.float32)
        keypoints = keypoints[keep]
        num_keypoints = keypoints.shape[0]
        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
        new_target["keypoints"] = keypoints

    # 如果需要返回分割掩码,则提取分割掩码并添加到新的目标字典中
    if return_segmentation_masks:
        segmentation_masks = [obj["segmentation"] for obj in annotations]
        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
        new_target["masks"] = masks[keep]

    return new_target
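
下面用一个假设的标注演示 COCO 的 (x_min, y_min, width, height) 框如何转换为 (x_min, y_min, x_max, y_max) 并裁剪到图像范围内,对应上面对 boxes 的处理:

```
import numpy as np

image_height, image_width = 480, 640
# 第二个框故意越界,用于演示裁剪
boxes = np.array([[100.0, 50.0, 200.0, 300.0], [600.0, 400.0, 100.0, 200.0]], dtype=np.float32)

boxes[:, 2:] += boxes[:, :2]                                   # (x, y, w, h) -> (x_min, y_min, x_max, y_max)
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)   # 裁剪 x 坐标
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)  # 裁剪 y 坐标
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

print(boxes)  # [[100.  50. 300. 350.] [600. 400. 640. 480.]]
print(keep)   # [ True  True]
```
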


# 从transformers.models.detr.image_processing_detr.masks_to_boxes复制
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
    """
    计算提供的全景分割掩码周围的边界框。

    Args:
        masks: 格式为`[number_masks, height, width]`的掩码,其中N是掩码的数量

    Returns:
        boxes: 格式为`[number_masks, 4]`的边界框,xyxy格式
    """
    # 如果掩码数组为空,则返回一个形状为 (0, 4) 的零数组
    if masks.size == 0:
        return np.zeros((0, 4))

    # 获取掩码数组的高度 h 和宽度 w
    h, w = masks.shape[-2:]

    # 创建一维数组 y 和 x,分别表示高度和宽度范围,数据类型为 np.float32
    y = np.arange(0, h, dtype=np.float32)
    x = np.arange(0, w, dtype=np.float32)

    # 创建二维网格,用 y 和 x 数组作为坐标,并按照 'ij' 索引顺序
    y, x = np.meshgrid(y, x, indexing="ij")

    # 将掩码数组与 x 数组进行逐元素相乘,得到 x_mask
    x_mask = masks * np.expand_dims(x, axis=0)

    # 对 x_mask 进行重塑和最大值计算,得到 x_max
    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)

    # 使用掩码创建一个掩码数组的掩码对象,并填充未掩码部分为 1e8
    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))

    # 对填充后的 x_min 进行重塑和最小值计算,得到 x_min
    x_min = x.filled(fill_value=1e8)
    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)

    # 将掩码数组与 y 数组进行逐元素相乘,得到 y_mask
    y_mask = masks * np.expand_dims(y, axis=0)

    # 对 y_mask 进行重塑和最大值计算,得到 y_max
    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)

    # 使用掩码创建一个掩码数组的掩码对象,并填充未掩码部分为 1e8
    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))

    # 对填充后的 y_min 进行重塑和最小值计算,得到 y_min
    y_min = y.filled(fill_value=1e8)
    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)

    # 返回堆叠的 x_min, y_min, x_max, y_max 数组,形状为 (N, 4),其中 N 是掩码数量
    return np.stack([x_min, y_min, x_max, y_max], 1)
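
一个最小示例(假设的 1×4×6 掩码),验证 masks_to_boxes 得到的 xyxy 边界框:

```
import numpy as np

mask = np.zeros((1, 4, 6), dtype=np.uint8)
mask[0, 1:3, 2:5] = 1   # 前景覆盖第 1~2 行、第 2~4 列

print(masks_to_boxes(mask))  # [[2. 1. 4. 2.]]
```
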
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr
def prepare_coco_panoptic_annotation(
    image: np.ndarray,
    target: Dict,
    masks_path: Union[str, pathlib.Path],
    return_masks: bool = True,
    input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
    """
    Prepare a coco panoptic annotation for DeformableDetr.
    """
    # 获取图像的高度和宽度
    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
    # 构建注释文件的路径
    annotation_path = pathlib.Path(masks_path) / target["file_name"]

    # 初始化新的目标字典
    new_target = {}
    # 将图像ID转换为numpy数组形式存储在新的目标字典中
    new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
    # 将图像尺寸存储在新的目标字典中
    new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
    # 将原始图像尺寸存储在新的目标字典中
    new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)

    # 如果目标字典包含分段信息
    if "segments_info" in target:
        # 从注释文件中读取掩码信息并转换为numpy数组
        masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
        # 将RGB格式的掩码转换为类别ID格式的掩码
        masks = rgb_to_id(masks)

        # 从segments_info中提取分段信息中的ID
        ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
        # 使用类别ID掩码创建掩码数组
        masks = masks == ids[:, None, None]
        masks = masks.astype(np.uint8)
        
        # 如果需要返回掩码,则存储在新的目标字典中
        if return_masks:
            new_target["masks"] = masks
        
        # 将掩码转换为边界框格式并存储在新的目标字典中
        new_target["boxes"] = masks_to_boxes(masks)
        
        # 提取分段信息中的类别ID并存储在新的目标字典中
        new_target["class_labels"] = np.array(
            [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
        )
        
        # 提取分段信息中的iscrowd标志并存储在新的目标字典中
        new_target["iscrowd"] = np.asarray(
            [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
        )
        
        # 提取分段信息中的区域面积并存储在新的目标字典中
        new_target["area"] = np.asarray(
            [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
        )

    # 返回处理后的新的目标字典
    return new_target


# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image
def get_segmentation_image(
    masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
    # 提取输入图像的高度和宽度
    h, w = input_size
    # 提取目标图像的最终高度和宽度
    final_h, final_w = target_size

    # 对掩码执行softmax操作,以获得每个像素最可能的类别ID
    m_id = scipy.special.softmax(masks.transpose(0, 1), -1)

    # 如果掩码的类别ID数量为0,则创建全零矩阵
    if m_id.shape[-1] == 0:
        m_id = np.zeros((h, w), dtype=np.int64)
    else:
        # 取最大概率类别ID,并将其重新形状为原始图像尺寸
        m_id = m_id.argmax(-1).reshape(h, w)

    # 如果需要去重复
    if deduplicate:
        # 合并具有相同类别的掩码
        for equiv in stuff_equiv_classes.values():
            for eq_id in equiv:
                m_id[m_id == eq_id] = equiv[0]

    # 将类别ID图像转换为RGB图像
    seg_img = id_to_rgb(m_id)
    # 将图像大小调整为目标尺寸并使用最近邻插值
    seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
    # 返回分割图像
    return seg_img


# Copied from transformers.models.detr.image_processing_detr.get_mask_area
def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
    # 提取目标图像的最终高度和宽度
    final_h, final_w = target_size
    # 将分割图像转换为numpy数组,并将其形状调整为最终图像尺寸
    np_seg_img = seg_img.astype(np.uint8)
    np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
    # 将RGB格式的图像转换为类别ID格式的图像
    m_id = rgb_to_id(np_seg_img)
    # 返回类别ID图像
    # 统计每个类别在分割图中占据的像素数(即面积),返回一个列表,列表索引对应类别编号
    area = [(m_id == i).sum() for i in range(n_classes)]
    # 返回计算出的各类别样本数列表作为结果
    return area
# 定义函数,从类别概率的对数输出中计算标签和分数
def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    # 对类别概率进行 softmax 处理,使其变成概率分布
    probs = scipy.special.softmax(logits, axis=-1)
    # 获取每个样本中概率最高的类别标签
    labels = probs.argmax(-1, keepdims=True)
    # 根据标签取出对应的概率作为分数
    scores = np.take_along_axis(probs, labels, axis=-1)
    # 去除多余的维度,使得 scores 和 labels 变为一维数组
    scores, labels = scores.squeeze(-1), labels.squeeze(-1)
    return scores, labels
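
一个简单的调用示例(两个查询、三个类别的假设 logits),展示该函数返回的标签和对应概率:

```
import numpy as np

logits = np.array([[2.0, 0.5, 0.1], [0.0, 0.0, 3.0]])
scores, labels = score_labels_from_class_probabilities(logits)
print(labels)  # [0 2]
print(scores)  # 约 [0.73 0.91]
```
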


# 定义函数,处理单个样本的 Panoptic 分割输出
def post_process_panoptic_sample(
    out_logits: np.ndarray,
    masks: np.ndarray,
    boxes: np.ndarray,
    processed_size: Tuple[int, int],
    target_size: Tuple[int, int],
    is_thing_map: Dict,
    threshold=0.85,
) -> Dict:
    """
    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.

    Args:
        out_logits (`torch.Tensor`):
            The logits for this sample.
        masks (`torch.Tensor`):
            The predicted segmentation masks for this sample.
        boxes (`torch.Tensor`):
            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
            width, height)` and values between `[0, 1]`, relative to the size of the image (disregarding padding).
        processed_size (`Tuple[int, int]`):
            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
            after data augmentation but before batching.
        target_size (`Tuple[int, int]`):
            The target size of the image, `(height, width)` corresponding to the requested final size of the
            prediction.
        is_thing_map (`Dict`):
            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
        threshold (`float`, *optional*, defaults to 0.85):
            The threshold used to binarize the segmentation masks.
    """
    # 根据类别概率计算标签和分数
    scores, labels = score_labels_from_class_probabilities(out_logits)
    # 筛选出有效预测结果,去除空查询和低于阈值的检测结果
    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)

    # 从筛选后的结果中取出分数、类别和边界框
    cur_scores = scores[keep]
    cur_classes = labels[keep]
    cur_boxes = center_to_corners_format(boxes[keep])

    # 检查每个类别是否都有相应的边界框
    if len(cur_boxes) != len(cur_classes):
        raise ValueError("Not as many boxes as there are classes")

    # 取出当前有效预测的掩膜,并调整大小以匹配预处理后的图像尺寸
    cur_masks = masks[keep]
    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
    cur_masks = safe_squeeze(cur_masks, 1)
    b, h, w = cur_masks.shape

    # 将掩膜展平,以便后续合并同一类别的多个掩膜
    cur_masks = cur_masks.reshape(b, -1)
    # 创建一个 defaultdict,用于记录每个 stuff(非物体)类别对应的掩膜索引(后续会把同一类别的多个掩膜合并)
    stuff_equiv_classes = defaultdict(list)
    # 遍历当前类别列表,并使用枚举函数获取索引和标签
    for k, label in enumerate(cur_classes):
        # 如果当前标签对应的不是物体类别,则将索引添加到对应的“stuff”等价类别列表中
        if not is_thing_map[label]:
            stuff_equiv_classes[label].append(k)

    # 生成分割图像,传入当前掩膜、处理后的大小、目标大小、等价类别映射和去重标志
    seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
    
    # 获取掩膜的面积,传入当前掩膜、处理后的大小以及当前类别数
    area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))

    # 如果当前仍保留有预测(注意 cur_classes 是 numpy 数组,size 是属性而非方法)
    if cur_classes.size > 0:
        # 过滤面积小于等于4的掩膜
        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
        # 只要还有被过滤的掩膜存在就继续循环
        while filtered_small.any():
            # 从当前掩膜、分数和类别中移除面积小于等于4的掩膜
            cur_masks = cur_masks[~filtered_small]
            cur_scores = cur_scores[~filtered_small]
            cur_classes = cur_classes[~filtered_small]
            # 重新生成分割图像,传入处理后的大小、目标大小、等价类别映射和去重标志
            seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
            # 获取更新后的掩膜的面积
            area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
            # 重新过滤面积小于等于4的掩膜
            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
    else:
        # 如果当前类别列表为空,则创建一个包含一个元素的numpy数组,元素为1,数据类型为int64
        cur_classes = np.ones((1, 1), dtype=np.int64)

    # 创建segments_info列表,每个元素是一个字典,包含id、是否物体、类别id和面积信息
    segments_info = [
        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
        for i, (cat, a) in enumerate(zip(cur_classes, area))
    ]
    # 删除cur_classes变量
    del cur_classes

    # 使用io.BytesIO创建一个字节流对象out
    with io.BytesIO() as out:
        # 将seg_img转换为PIL图像格式,并保存到out字节流中,格式为PNG
        PIL.Image.fromarray(seg_img).save(out, format="PNG")
        # 构建预测结果字典,包含PNG图像字符串和segments_info列表
        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}

    # 返回预测结果字典
    return predictions
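
# ---- 补充示例(非库源码):is_thing_map 通常由数据集的类别元数据构造 ----
# 这里用一个假设的迷你 categories 列表说明,字段名沿用 COCO panoptic 的约定("id"、"isthing"):
categories = [{"id": 1, "isthing": 1}, {"id": 184, "isthing": 0}]
is_thing_map = {cat["id"]: bool(cat["isthing"]) for cat in categories}
print(is_thing_map)  # {1: True, 184: False}
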
# Copied from transformers.models.detr.image_processing_detr.resize_annotation
def resize_annotation(
    annotation: Dict[str, Any],
    orig_size: Tuple[int, int],
    target_size: Tuple[int, int],
    threshold: float = 0.5,
    resample: PILImageResampling = PILImageResampling.NEAREST,
):
    """
    Resizes an annotation to a target size.

    Args:
        annotation (`Dict[str, Any]`):
            The annotation dictionary.
        orig_size (`Tuple[int, int]`):
            The original size of the input image.
        target_size (`Tuple[int, int]`):
            The target size of the image, as returned by the preprocessing `resize` step.
        threshold (`float`, *optional*, defaults to 0.5):
            The threshold used to binarize the segmentation masks.
        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
            The resampling filter to use when resizing the masks.
    """
    # 计算目标尺寸与原始尺寸的比率
    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
    ratio_height, ratio_width = ratios

    # 创建新的注释字典,设定大小为目标尺寸
    new_annotation = {}
    new_annotation["size"] = target_size

    # 遍历原始注释的键值对
    for key, value in annotation.items():
        # 如果键是"boxes",则将边界框按比例缩放
        if key == "boxes":
            boxes = value
            scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
            new_annotation["boxes"] = scaled_boxes
        # 如果键是"area",则将面积按比例缩放
        elif key == "area":
            area = value
            scaled_area = area * (ratio_width * ratio_height)
            new_annotation["area"] = scaled_area
        # 如果键是"masks",则按目标尺寸和指定的重采样方法调整掩码
        elif key == "masks":
            masks = value[:, None]
            masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
            masks = masks.astype(np.float32)
            masks = masks[:, 0] > threshold  # 使用阈值二值化掩码
            new_annotation["masks"] = masks
        # 如果键是"size",则直接设定大小为目标尺寸
        elif key == "size":
            new_annotation["size"] = target_size
        # 其他情况下直接复制原始注释的键值对
        else:
            new_annotation[key] = value

    # 返回调整后的新注释字典
    return new_annotation
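
# ---- 演示示例(非库源码):resize_annotation 对 boxes 与 area 的缩放 ----
# 原图 (100, 200) 调整为 (200, 400),高、宽方向的比例均为 2:
import numpy as np

annotation = {
    "size": (100, 200),
    "boxes": np.array([[10.0, 20.0, 30.0, 40.0]], dtype=np.float32),
    "area": np.array([600.0], dtype=np.float32),
}
resized = resize_annotation(annotation, orig_size=(100, 200), target_size=(200, 400))
print(resized["boxes"])  # [[20. 40. 60. 80.]]
print(resized["area"])   # [2400.],面积按 2 * 2 缩放
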


# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle
def binary_mask_to_rle(mask):
    """
    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        mask (`torch.Tensor` or `numpy.array`):
            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
            segment_id or class_id.
    Returns:
        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
        format.
    """
    # 如果输入的掩码是 PyTorch 张量,则转换为 NumPy 数组
    if is_torch_tensor(mask):
        mask = mask.numpy()

    # 将掩码展平为一维数组
    pixels = mask.flatten()
    # 在数组两端各添加一个零,以处理掩码边界
    pixels = np.concatenate([[0], pixels, [0]])
    # 找到连续不同像素值的起始和结束索引,构建 RLE 编码
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return list(runs)
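
# ---- 演示示例(非库源码):用一个小掩码验证 RLE 编码 ----
import numpy as np

mask = np.array([[0, 1, 1],
                 [0, 1, 0]], dtype=np.uint8)
print(binary_mask_to_rle(mask))  # [2, 2, 5, 1]
# 含义:展平后从第 2 个像素起有 2 个连续前景像素,从第 5 个像素起有 1 个前景像素
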


# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
def convert_segmentation_to_rle(segmentation):
    """
    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        segmentation (`torch.Tensor` or `numpy.array`):
            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
    Returns:
        `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
    """
    # 获取唯一的分割标识符列表,即所有不同的分割或类别标识符
    segment_ids = torch.unique(segmentation)

    # 初始化存储所有分割标识符的运行长度编码列表
    run_length_encodings = []
    # 遍历每个分割标识符
    for idx in segment_ids:
        # 创建一个二进制掩码,其中分割标识符对应的位置为1,其它位置为0
        mask = torch.where(segmentation == idx, 1, 0)
        # 将二进制掩码转换为运行长度编码(RLE)
        rle = binary_mask_to_rle(mask)
        # 将当前分割标识符的运行长度编码添加到列表中
        run_length_encodings.append(rle)

    # 返回所有分割标识符的运行长度编码列表
    return run_length_encodings
# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
    """
    Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
    `labels`.

    Args:
        masks (`torch.Tensor`):
            A tensor of shape `(num_queries, height, width)`.
        scores (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        labels (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        object_mask_threshold (`float`):
            A number between 0 and 1 used to binarize the masks.
    Raises:
        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
    Returns:
        `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
        < `object_mask_threshold`.
    """
    # 检查输入张量的第一个维度是否匹配
    if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
        raise ValueError("mask, scores and labels must have the same shape!")

    # 根据阈值和标签数筛选保留的对象
    to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)

    return masks[to_keep], scores[to_keep], labels[to_keep]
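
# ---- 演示示例(非库源码):过滤低分查询与“无目标”查询 ----
import torch

masks = torch.rand(3, 4, 4)
scores = torch.tensor([0.9, 0.2, 0.7])
labels = torch.tensor([1, 3, 5])  # 假设 num_labels=5,标签 5 表示“无目标”类
kept_masks, kept_scores, kept_labels = remove_low_and_no_objects(
    masks, scores, labels, object_mask_threshold=0.5, num_labels=5
)
print(kept_scores, kept_labels)  # tensor([0.9000]) tensor([1])
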


# Copied from transformers.models.detr.image_processing_detr.check_segment_validity
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
    """
    Determine the validity of a segment based on mask labels and probabilities.

    Args:
        mask_labels (`torch.Tensor`):
            Tensor indicating mask labels.
        mask_probs (`torch.Tensor`):
            Tensor of mask probabilities.
        k (`int`):
            Class index to evaluate.
        mask_threshold (`float`, optional):
            Threshold value for binarizing masks. Default is 0.5.
        overlap_mask_area_threshold (`float`, optional):
            Threshold for determining valid segment based on area overlap. Default is 0.8.
    Returns:
        `Tuple[bool, torch.Tensor]`: A tuple indicating segment validity and the mask for the class `k`.
    """
    # 获取与类别 k 相关的掩码
    mask_k = mask_labels == k
    # 计算类别 k 的掩码区域面积
    mask_k_area = mask_k.sum()

    # 计算查询 k 中所有内容的区域面积
    original_area = (mask_probs[k] >= mask_threshold).sum()
    # 检查掩码是否存在
    mask_exists = mask_k_area > 0 and original_area > 0

    # 消除断开的小段
    if mask_exists:
        # 计算区域比例
        area_ratio = mask_k_area / original_area
        # 如果区域比例低于阈值,则认为掩码不存在
        if not area_ratio.item() > overlap_mask_area_threshold:
            mask_exists = False

    return mask_exists, mask_k


# Copied from transformers.models.detr.image_processing_detr.compute_segments
def compute_segments(
    mask_probs,
    pred_scores,
    pred_labels,
    mask_threshold: float = 0.5,
    overlap_mask_area_threshold: float = 0.8,
    label_ids_to_fuse: Optional[Set[int]] = None,
    target_size: Tuple[int, int] = None,
):
    """
    Compute segments based on mask probabilities, prediction scores, and labels.

    Args:
        mask_probs (`torch.Tensor`):
            Tensor of mask probabilities.
        pred_scores (`torch.Tensor`):
            Tensor of prediction scores.
        pred_labels (`torch.Tensor`):
            Tensor of prediction labels.
        mask_threshold (`float`, optional):
            Threshold value for binarizing masks. Default is 0.5.
        overlap_mask_area_threshold (`float`, optional):
            Threshold for determining valid segment based on area overlap. Default is 0.8.
        label_ids_to_fuse (`Optional[Set[int]]`, optional):
            Set of label IDs to fuse. Default is None.
        target_size (`Tuple[int, int]`, optional):
            Tuple specifying target size. Default is None.
    Returns:
        `Tuple[torch.Tensor, List[Dict]]`: The segmentation map as an integer tensor of shape `(height, width)` and a
        list of dictionaries describing each segment.
    """
    # 根据目标大小或默认大小获取高度和宽度
    height = mask_probs.shape[1] if target_size is None else target_size[0]
    width = mask_probs.shape[2] if target_size is None else target_size[1]

    # 初始化分割结果
    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
    # 初始化段列表
    segments: List[Dict] = []

    # 如果有指定目标大小,则插值调整 mask_probs
    if target_size is not None:
        mask_probs = nn.functional.interpolate(
            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
        )[0]

    # 当前段的 ID
    current_segment_id = 0

    # 根据预测分数加权每个掩码
    mask_probs *= pred_scores.view(-1, 1, 1)
    # 确定每个像素的主要标签
    mask_labels = mask_probs.argmax(0)  # [height, width]

    # 跟踪需要融合的类别(如 stuff 类别)已分配到的段 id
    # 初始化一个空字典:键为类别 id,值为该类别当前对应的段标识符
    stuff_memory_list: Dict[str, int] = {}
    
    # 遍历预测标签的每一行
    for k in range(pred_labels.shape[0]):
        # 获取当前预测的类别标签
        pred_class = pred_labels[k].item()
        
        # 检查当前类别是否需要融合
        should_fuse = pred_class in label_ids_to_fuse
    
        # 检查当前索引 k 对应的掩码是否有效且足够大作为一个段
        mask_exists, mask_k = check_segment_validity(
            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
        )
    
        # 如果存在有效掩码
        if mask_exists:
            # 如果当前预测类别已经在 stuff_memory_list 中存在
            if pred_class in stuff_memory_list:
                # 获取当前类别对应的段标识符
                current_segment_id = stuff_memory_list[pred_class]
            else:
                # 如果不存在,则增加段标识符并更新到 stuff_memory_list 中
                current_segment_id += 1
    
            # 将当前对象段添加到最终的分割地图中,使用掩码索引 mask_k
            segmentation[mask_k] = current_segment_id
            
            # 获取当前预测得分,并四舍五入保留六位小数
            segment_score = round(pred_scores[k].item(), 6)
            
            # 将当前段的信息添加到 segments 列表中
            segments.append(
                {
                    "id": current_segment_id,
                    "label_id": pred_class,
                    "was_fused": should_fuse,
                    "score": segment_score,
                }
            )
            
            # 如果当前类别需要融合,则更新 stuff_memory_list 中对应类别的段标识符
            if should_fuse:
                stuff_memory_list[pred_class] = current_segment_id
    
    # 返回最终的分割地图和段列表
    return segmentation, segments
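
# ---- 演示示例(非库源码):两个查询预测同一个可融合类别时,compute_segments 会把它们合并为同一个段 ----
import torch

mask_probs = torch.tensor(
    [[[0.9, 0.1],
      [0.8, 0.2]],
     [[0.1, 0.9],
      [0.2, 0.7]]]
)                                      # (num_queries=2, height=2, width=2)
pred_scores = torch.tensor([0.95, 0.80])
pred_labels = torch.tensor([17, 17])   # 假设 17 是一个 stuff 类别
segmentation, segments = compute_segments(mask_probs, pred_scores, pred_labels, label_ids_to_fuse={17})
print(segmentation)   # 全为 1:两个查询共享同一个段 id
print(len(segments))  # 2:每个查询各记录一条段信息,但 id 相同
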
# 定义一个 Deformable DETR 图像处理器类,继承自 BaseImageProcessor 基类
class DeformableDetrImageProcessor(BaseImageProcessor):
    """
    Constructs a Deformable DETR image processor.

    Args:
        format (`str`, *optional*, defaults to `"coco_detection"`):
            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
        do_resize (`bool`, *optional*, defaults to `True`):
            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
            overridden by the `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
            the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
            `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_annotations (`bool`, *optional*, defaults to `True`):
            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    model_input_names = ["pixel_values", "pixel_mask"]

    # 从transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__中复制而来,初始化函数
    def __init__(
        self,
        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
        # 如果kwargs中存在"pad_and_return_pixel_mask"参数,则将do_pad设置为该参数值并将其从kwargs中删除
        if "pad_and_return_pixel_mask" in kwargs:
            do_pad = kwargs.pop("pad_and_return_pixel_mask")

        # 如果kwargs中存在"max_size"参数,则发出警告提示,推荐使用size字典中的"longest_edge"参数
        if "max_size" in kwargs:
            logger.warning_once(
                "The `max_size` parameter is deprecated and will be removed in v4.26. "
                "Please specify in `size['longest_edge'] instead`.",
            )
            max_size = kwargs.pop("max_size")
        else:
            # 否则将max_size设置为None或者size字典中的"longest_edge"参数,最大尺寸为1333
            max_size = None if size is None else 1333

        # 如果size为None,则将size设置为{"shortest_edge": 800, "longest_edge": 1333}字典
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        # 调用get_size_dict函数,根据max_size和default_to_square参数调整size字典的内容
        size = get_size_dict(size, max_size=max_size, default_to_square=False)

        # 兼容处理,如果do_convert_annotations为None,则设置其值等于do_normalize
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize

        # 调用父类的初始化方法,传入kwargs中的其它参数
        super().__init__(**kwargs)
        # 设置对象的各个属性值
        self.format = format
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
        # 设置有效的处理器键列表
        self._valid_processor_keys = [
            "images",
            "annotations",
            "return_segmentation_masks",
            "masks_path",
            "do_resize",
            "size",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "do_convert_annotations",
            "image_mean",
            "image_std",
            "do_pad",
            "format",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    @classmethod
    # 从transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict中复制而来,修改为DeformableDetr
    # 从字典中重新构建 DeformableDetrImageProcessor 对象,更新参数
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        """
        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
        created using from_dict and kwargs e.g. `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600,
        max_size=800)`
        """
        # 复制输入的字典,以确保不会修改原始数据
        image_processor_dict = image_processor_dict.copy()
        # 如果 kwargs 中有 "max_size" 参数,则更新到 image_processor_dict 中,并从 kwargs 中移除该参数
        if "max_size" in kwargs:
            image_processor_dict["max_size"] = kwargs.pop("max_size")
        # 如果 kwargs 中有 "pad_and_return_pixel_mask" 参数,则更新到 image_processor_dict 中,并从 kwargs 中移除该参数
        if "pad_and_return_pixel_mask" in kwargs:
            image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
        # 调用父类的 from_dict 方法,将更新后的 image_processor_dict 和剩余的 kwargs 传递给它
        return super().from_dict(image_processor_dict, **kwargs)

    # 从 DETR 的代码中复制,准备注释以供 DeformableDetr 使用
    def prepare_annotation(
        self,
        image: np.ndarray,
        target: Dict,
        format: Optional[AnnotationFormat] = None,
        return_segmentation_masks: bool = None,
        masks_path: Optional[Union[str, pathlib.Path]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> Dict:
        """
        Prepare an annotation for feeding into DeformableDetr model.
        """
        # 如果未指定格式,则使用类中定义的格式
        format = format if format is not None else self.format

        # 如果格式是 COCO_DETECTION
        if format == AnnotationFormat.COCO_DETECTION:
            # 如果未指定是否返回分割掩码,则默认为 False
            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
            # 调用 prepare_coco_detection_annotation 函数,准备 COCO_DETECTION 类型的注释
            target = prepare_coco_detection_annotation(
                image, target, return_segmentation_masks, input_data_format=input_data_format
            )
        # 如果格式是 COCO_PANOPTIC
        elif format == AnnotationFormat.COCO_PANOPTIC:
            # 如果未指定是否返回分割掩码,则默认为 True
            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
            # 调用 prepare_coco_panoptic_annotation 函数,准备 COCO_PANOPTIC 类型的注释
            target = prepare_coco_panoptic_annotation(
                image,
                target,
                masks_path=masks_path,
                return_masks=return_segmentation_masks,
                input_data_format=input_data_format,
            )
        else:
            # 如果格式不是 COCO_DETECTION 或 COCO_PANOPTIC,则抛出异常
            raise ValueError(f"Format {format} is not supported.")
        
        # 返回处理后的目标字典
        return target

    # 从 DETR 的代码中复制,警告该方法即将被弃用,建议使用 prepare_annotation 方法代替
    def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
        logger.warning_once(
            "The `prepare` method is deprecated and will be removed in a v4.33. "
            "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
            "does not return the image anymore.",
        )
        # 调用 prepare_annotation 方法来准备图像和目标
        target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
        # 返回图像和处理后的目标
        return image, target

    # 从 DETR 的代码中复制,用于将 COCO 多边形转换为掩码的方法
    # 发出警告日志,提醒该方法即将在 v4.33 版本中移除
    def convert_coco_poly_to_mask(self, *args, **kwargs):
        logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
        # 调用被复制的函数 convert_coco_poly_to_mask,并传递所有参数和关键字参数
        return convert_coco_poly_to_mask(*args, **kwargs)

    # 从 transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection 复制而来
    def prepare_coco_detection(self, *args, **kwargs):
        # 发出警告日志,提醒该方法即将在 v4.33 版本中移除
        logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
        # 调用被复制的函数 prepare_coco_detection_annotation,并传递所有参数和关键字参数
        return prepare_coco_detection_annotation(*args, **kwargs)

    # 从 transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic 复制而来
    def prepare_coco_panoptic(self, *args, **kwargs):
        # 发出警告日志,提醒该方法即将在 v4.33 版本中移除
        logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
        # 调用被复制的函数 prepare_coco_panoptic_annotation,并传递所有参数和关键字参数
        return prepare_coco_panoptic_annotation(*args, **kwargs)

    # 从 transformers.models.detr.image_processing_detr.DetrImageProcessor.resize 复制而来
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
        int, smaller edge of the image will be matched to this number.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Union[int, Tuple[int, int]]`):
                Size to resize to. Can be an integer or a tuple of height and width.
            resample (`PILImageResampling`, optional):
                Resampling filter to use if resizing the image.
            data_format (`str`, optional):
                The channel dimension format for the output image.
            input_data_format (`str`, optional):
                The channel dimension format of the input image.
        """
        # 如果参数中包含 'max_size',发出警告并从 kwargs 中移除该参数,将其值赋给 max_size;否则 max_size 设为 None
        if "max_size" in kwargs:
            logger.warning_once(
                "The `max_size` parameter is deprecated and will be removed in v4.26. "
                "Please specify in `size['longest_edge'] instead`.",
            )
            max_size = kwargs.pop("max_size")
        else:
            max_size = None
        
        # 调用 get_size_dict 函数,根据参数 size 和 max_size 获得实际的调整大小结果,不默认为正方形
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        
        # 如果 size 中同时包含 'shortest_edge' 和 'longest_edge',调用 get_resize_output_image_size 函数获取调整后的大小
        if "shortest_edge" in size and "longest_edge" in size:
            size = get_resize_output_image_size(
                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
            )
        # 如果 size 中同时包含 'height' 和 'width',直接使用这两个值作为调整后的大小
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            # 如果 size 不符合以上格式要求,抛出 ValueError 异常
            raise ValueError(
                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                f" {size.keys()}."
            )
        
        # 调用 resize 函数,实际执行图像调整大小的操作
        image = resize(
            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
        )
        
        # 返回调整大小后的图像
        return image

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
    # 定义函数 resize_annotation,用于调整注释(标注)的大小以匹配调整后的图像
    def resize_annotation(
        self,
        annotation,
        orig_size,
        size,
        resample: PILImageResampling = PILImageResampling.NEAREST,
    ) -> Dict:
        """
        Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
        to this number.
        """
        # 调用 resize_annotation 函数,实际执行标注调整大小的操作
        return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    # 定义一个方法用于对图像进行重新缩放
    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The value to use for rescaling.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
                one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
        # 调用外部方法,返回重新缩放后的图像数组
        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        # 调用外部方法,返回规范化后的注释字典
        return normalize_annotation(annotation, image_size=image_size)

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,
        update_bboxes,
    ) -> Dict:
        """
        Update the annotation to reflect changes made due to image padding.

        Args:
            annotation (`Dict`):
                The annotation dictionary to update.
            input_image_size (`Tuple[int, int]`):
                The size of the original input image (height, width).
            output_image_size (`Tuple[int, int]`):
                The size of the padded output image (height, width).
            padding:
                The padding applied to the image.
            update_bboxes (`bool`):
                Whether to update bounding boxes in the annotation.
        """
        # 创建一个新的空注释字典
        new_annotation = {}
        # 将输出图像大小添加到新注释字典中的 "size" 键
        new_annotation["size"] = output_image_size

        # 遍历现有注释字典中的每个键值对
        for key, value in annotation.items():
            # 如果键是 "masks"
            if key == "masks":
                # 获取 masks 数据
                masks = value
                # 对 masks 应用零填充,使用指定的填充模式和常量值
                masks = pad(
                    masks,
                    padding,
                    mode=PaddingMode.CONSTANT,
                    constant_values=0,
                    input_data_format=ChannelDimension.FIRST,
                )
                # 压缩 masks 的第一个维度,确保形状适合预期
                masks = safe_squeeze(masks, 1)
                # 将处理后的 masks 存入新注释字典中的 "masks" 键
                new_annotation["masks"] = masks
            # 如果键是 "boxes" 并且 update_bboxes 为真
            elif key == "boxes" and update_bboxes:
                # 获取 boxes 数据
                boxes = value
                # 缩放边界框坐标,以适应输出图像大小
                boxes *= np.asarray(
                    [
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                    ]
                )
                # 将处理后的 boxes 存入新注释字典中的 "boxes" 键
                new_annotation["boxes"] = boxes
            # 如果键是 "size"
            elif key == "size":
                # 将输出图像大小添加到新注释字典中的 "size" 键
                new_annotation["size"] = output_image_size
            else:
                # 对于其他键,直接将其值复制到新注释字典中
                new_annotation[key] = value
        # 返回更新后的注释字典
        return new_annotation

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
        """
        # 获取输入图像的高度和宽度
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # 获取输出图像的高度和宽度
        output_height, output_width = output_size

        # 计算需要在图像底部和右侧填充的像素数
        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        # 构造填充元组
        padding = ((0, pad_bottom), (0, pad_right))
        # 使用指定的填充模式和常量值对图像进行填充
        padded_image = pad(
            image,
            padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        # 如果存在注释数据,则更新注释以适应填充后的图像
        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
            )
        # 返回填充后的图像和更新后的注释数据(如果有)
        return padded_image, annotation

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    # 定义类的方法 pad,用于填充图像数组,并处理相关的注释
    def pad(
        self,
        images: List[np.ndarray],  # 图像数组列表,每个元素是一个 numpy 数组
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # 可选的注释数据,可以是单个注释或注释列表
        constant_values: Union[float, Iterable[float]] = 0,  # 填充使用的常数值,可以是单个浮点数或可迭代对象
        return_pixel_mask: bool = True,  # 是否返回像素掩码,默认为 True
        return_tensors: Optional[Union[str, TensorType]] = None,  # 返回的数据类型,可以是字符串或张量类型
        data_format: Optional[ChannelDimension] = None,  # 图像数据的格式,可以是通道维度对象或 None
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # 输入图像的数据格式,可以是字符串或通道维度对象
        update_bboxes: bool = True,  # 是否更新边界框信息,默认为 True



    # 从 transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess 复制而来的方法
    def preprocess(
        self,
        images: ImageInput,  # 图像输入,可以是单个图像或图像列表
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # 可选的注释数据,可以是单个注释或注释列表
        return_segmentation_masks: bool = None,  # 是否返回分割掩码
        masks_path: Optional[Union[str, pathlib.Path]] = None,  # 掩码文件的路径,可以是字符串或路径对象的可选对象
        do_resize: Optional[bool] = None,  # 是否调整图像大小,可选布尔值
        size: Optional[Dict[str, int]] = None,  # 图像大小的字典,包含宽度和高度
        resample=None,  # PIL 图像重新采样方法
        do_rescale: Optional[bool] = None,  # 是否重新缩放图像,可选布尔值
        rescale_factor: Optional[Union[int, float]] = None,  # 重新缩放的因子,可以是整数或浮点数
        do_normalize: Optional[bool] = None,  # 是否归一化图像像素值,可选布尔值
        do_convert_annotations: Optional[bool] = None,  # 是否转换注释数据格式,可选布尔值
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像像素均值,可以是单个浮点数或均值列表
        image_std: Optional[Union[float, List[float]]] = None,  # 图像像素标准差,可以是单个浮点数或标准差列表
        do_pad: Optional[bool] = None,  # 是否填充图像,可选布尔值
        format: Optional[Union[str, AnnotationFormat]] = None,  # 注释数据的格式,可以是字符串或注释格式对象
        return_tensors: Optional[Union[TensorType, str]] = None,  # 返回的数据类型,可以是张量类型或字符串
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,  # 图像数据的格式,可以是字符串或通道维度对象,默认为第一种通道维度
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # 输入图像的数据格式,可以是字符串或通道维度对象
        **kwargs,  # 其他参数,用于接收额外的关键字参数



    # 后处理方法 - TODO: 添加对其他框架的支持
    def post_process(self, outputs, target_sizes):
        """
        将 [`DeformableDetrForObjectDetection`] 的原始输出转换为最终的边界框,格式为 (top_left_x, top_left_y, bottom_right_x, bottom_right_y)。仅支持 PyTorch。

        Args:
            outputs ([`DeformableDetrObjectDetectionOutput`]):
                模型的原始输出。
            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                包含批处理中每个图像的大小(高度,宽度)的张量。在评估时,这必须是原始图像大小(在任何数据增强之前)。在可视化时,这应该是数据增强后,但在填充之前的图像大小。
        Returns:
            `List[Dict]`: 一个字典列表,每个字典包含模型预测的批处理中每个图像的分数、标签和边界框。
        """
        logger.warning_once(
            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
        )

        # 提取输出中的分类 logits 和边界框
        out_logits, out_bbox = outputs.logits, outputs.pred_boxes

        # 检查输出 logits 和目标大小的维度是否匹配
        if len(out_logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        # 检查目标大小的每个元素是否包含批处理中每个图像的大小(h, w)
        if target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        # 对 logits 应用 sigmoid 函数得到概率
        prob = out_logits.sigmoid()

        # 获取每个图像中前 100 个预测的最高分和其索引
        topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
        scores = topk_values
        # 计算 topk_boxes 的索引
        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
        labels = topk_indexes % out_logits.shape[2]
        
        # 将边界框转换为 (top_left_x, top_left_y, bottom_right_x, bottom_right_y) 格式
        boxes = center_to_corners_format(out_bbox)
        # 使用 topk_boxes 获取每个图像的 top-k 边界框
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # 将相对坐标 [0, 1] 转换为绝对坐标 [0, height] 和 [0, width]
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        # 创建结果列表,每个元素是一个字典,包含预测的分数、标签和边界框
        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

        return results

    # 目标检测后处理:按阈值与 top_k 把模型原始输出转换为最终的边界框
    def post_process_object_detection(
        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
    ):
        """
        Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.

        Args:
            outputs ([`DetrObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                (height, width) of each image in the batch. If left to None, predictions will not be resized.
            top_k (`int`, *optional*, defaults to 100):
                Keep only top k bounding boxes before filtering by thresholding.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        # Extract logits and predicted bounding boxes from the model's outputs
        out_logits, out_bbox = outputs.logits, outputs.pred_boxes

        # Check if target sizes are provided and validate their length against logits batch size
        if target_sizes is not None:
            if len(out_logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )

        # Apply sigmoid function to logits to get probabilities and reshape them
        prob = out_logits.sigmoid()
        prob = prob.view(out_logits.shape[0], -1)

        # Determine the number of top-k boxes to consider
        k_value = min(top_k, prob.size(1))

        # Find top-k values and their corresponding indices
        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
        scores = topk_values

        # Convert top-k indexes to top-k boxes
        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
        labels = topk_indexes % out_logits.shape[2]

        # Convert bounding boxes from center format to corner format
        boxes = center_to_corners_format(out_bbox)

        # Gather top-k boxes from all predicted boxes
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # Convert boxes from relative [0, 1] to absolute [0, height] coordinates if target sizes are provided
        if target_sizes is not None:
            if isinstance(target_sizes, list):
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]

        # Filter out boxes with scores below the threshold and construct results dictionary
        results = []
        for s, l, b in zip(scores, labels, boxes):
            score = s[s > threshold]
            label = l[s > threshold]
            box = b[s > threshold]
            results.append({"scores": score, "labels": label, "boxes": box})

        return results
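
# ---- 用法示意(非库源码):完整的目标检测后处理流程 ----
# 假设:能访问 "SenseTime/deformable-detr" 检查点,且本地存在示例图片 "cats.jpg"(均为示例性假设)。
import torch
from PIL import Image
from transformers import AutoImageProcessor, DeformableDetrForObjectDetection

image = Image.open("cats.jpg")
processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes 传入每张原图的 (height, width),用于把归一化坐标还原为像素坐标
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(v, 1) for v in box.tolist()])
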

.\models\deformable_detr\load_custom.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Loading of Deformable DETR's CUDA kernels"""

import os  # 导入操作系统相关的模块
from pathlib import Path  # 导入处理文件路径的模块


def load_cuda_kernels():
    from torch.utils.cpp_extension import load  # 导入加载自定义C++扩展的函数

    # 获取当前脚本文件的父目录的父目录,并拼接出CUDA内核源文件所在路径
    root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deformable_detr"
    # 定义需要加载的所有源文件的路径列表
    src_files = [
        root / filename
        for filename in [
            "vision.cpp",
            os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
            os.path.join("cuda", "ms_deform_attn_cuda.cu"),
        ]
    ]

    # 使用torch的cpp_extension模块加载CUDA扩展,并指定相关配置
    load(
        "MultiScaleDeformableAttention",  # 扩展名
        src_files,  # 源文件路径列表
        with_cuda=True,  # 指定是否包含CUDA支持
        extra_include_paths=[str(root)],  # 额外的头文件包含路径
        extra_cflags=["-DWITH_CUDA=1"],  # 额外的C编译标志
        extra_cuda_cflags=[
            "-DCUDA_HAS_FP16=1",  # CUDA支持的FP16
            "-D__CUDA_NO_HALF_OPERATORS__",  # 禁用CUDA半精度操作符
            "-D__CUDA_NO_HALF_CONVERSIONS__",  # 禁用CUDA半精度转换
            "-D__CUDA_NO_HALF2_OPERATORS__",  # 禁用CUDA半精度操作符
        ],
    )

    import MultiScaleDeformableAttention as MSDA  # 导入加载的扩展模块作为MSDA

    return MSDA  # 返回加载后的扩展模块对象
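
# ---- 用法示意(非库源码):手动触发内核的即时编译与加载 ----
# 需要本地具备 CUDA 工具链与 ninja;首次调用会编译,之后命中缓存。
if __name__ == "__main__":
    MSDA = load_cuda_kernels()
    print(MSDA.__name__)  # MultiScaleDeformableAttention
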

.\models\deformable_detr\modeling_deformable_detr.py

# coding=utf-8
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Deformable DETR model."""

import copy
import math
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable

# Importing various utilities and dependencies from transformers and related libraries
from ...activations import ACT2FN
from ...file_utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_scipy_available,
    is_timm_available,
    is_torch_cuda_available,
    is_vision_available,
    replace_return_docstrings,
    requires_backends,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import meshgrid
from ...utils import is_accelerate_available, is_ninja_available, logging
from ...utils.backbone_utils import load_backbone
from .configuration_deformable_detr import DeformableDetrConfig

# Get logger instance for logging messages
logger = logging.get_logger(__name__)

# Initialize MultiScaleDeformableAttention to None initially
MultiScaleDeformableAttention = None

# Function to load CUDA kernels required for MultiScaleDeformableAttention
def load_cuda_kernels():
    from torch.utils.cpp_extension import load

    global MultiScaleDeformableAttention

    # Define the path to the CUDA and CPU source files
    root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deformable_detr"
    src_files = [
        root / filename
        for filename in [
            "vision.cpp",
            os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
            os.path.join("cuda", "ms_deform_attn_cuda.cu"),
        ]
    ]

    # Load the CUDA kernels using torch's cpp_extension.load()
    MultiScaleDeformableAttention = load(
        "MultiScaleDeformableAttention",
        src_files,
        with_cuda=True,
        extra_include_paths=[str(root)],
        extra_cflags=["-DWITH_CUDA=1"],
        extra_cuda_cflags=[
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ],
    )

# Check if vision utilities are available and import center_to_corners_format
if is_vision_available():
    from transformers.image_transforms import center_to_corners_format

# Check if accelerate library is available and import necessary components
if is_accelerate_available():
    from accelerate import PartialState
    from accelerate.utils import reduce

# Define the Function class for MultiScaleDeformableAttentionFunction
class MultiScaleDeformableAttentionFunction(Function):
    @staticmethod
    # 定义静态方法 `forward`,用于执行前向传播操作
    def forward(
        context,
        value,
        value_spatial_shapes,
        value_level_start_index,
        sampling_locations,
        attention_weights,
        im2col_step,
    ):
        # 将 im2col_step 参数保存到上下文对象 context 中
        context.im2col_step = im2col_step
        # 调用 MultiScaleDeformableAttention 类的静态方法 ms_deform_attn_forward 进行多尺度可变形注意力的前向传播
        output = MultiScaleDeformableAttention.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            context.im2col_step,
        )
        # 将前向传播中需要保存的张量保存到 context 的备忘录中
        context.save_for_backward(
            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights
        )
        # 返回前向传播的输出结果
        return output

    # 定义静态方法 `backward`,用于执行反向传播操作
    @staticmethod
    @once_differentiable
    def backward(context, grad_output):
        # 从 context 的备忘录中获取前向传播时保存的张量
        (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        ) = context.saved_tensors
        # 调用 MultiScaleDeformableAttention 类的静态方法 ms_deform_attn_backward 进行多尺度可变形注意力的反向传播
        # 返回各个梯度值:grad_value 为输入值的梯度,grad_sampling_loc 为采样位置的梯度,grad_attn_weight 为注意力权重的梯度
        grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output,
            context.im2col_step,
        )

        # 返回梯度值,其中输入值的梯度 grad_value 需要返回,其他梯度项为 None
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
# 如果 scipy 可用,导入线性求解模块
if is_scipy_available():
    from scipy.optimize import linear_sum_assignment

# 如果 timm 可用,导入模型创建函数
if is_timm_available():
    from timm import create_model

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 文档用的配置名称
_CONFIG_FOR_DOC = "DeformableDetrConfig"

# 文档用的检查点名称
_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr"

# Deformable DETR 预训练模型存档列表
DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "sensetime/deformable-detr",
    # 查看所有 Deformable DETR 模型的列表链接
    # https://huggingface.co/models?filter=deformable-detr
]

@dataclass
class DeformableDetrDecoderOutput(ModelOutput):
    """
    DeformableDetrDecoder 的输出的基类。这个类向 BaseModelOutputWithCrossAttentions 添加了两个属性:
    - 一个堆叠的中间解码器隐藏状态张量(即每个解码器层的输出)
    - 一个堆叠的中间参考点张量

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列。
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            堆叠的中间隐藏状态(解码器每层的输出)。
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
            堆叠的中间参考点(解码器每层的参考点)。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            `torch.FloatTensor` 元组(一个用于嵌入输出 + 一个用于每层输出),形状为 `(batch_size, sequence_length, hidden_size)`。
            模型每层输出的隐藏状态加上初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            `torch.FloatTensor` 元组(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
            注意力权重经过注意力 softmax 后的结果,在自注意力头中用于计算加权平均。
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            `torch.FloatTensor` 元组(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
            解码器交叉注意力层的注意力权重,在注意力 softmax 后用于计算加权平均。
    """
    last_hidden_state: torch.FloatTensor = None
    intermediate_hidden_states: torch.FloatTensor = None
    # 定义一个变量 intermediate_reference_points,类型为 torch.FloatTensor,初始值为 None
    intermediate_reference_points: torch.FloatTensor = None
    
    # 定义一个变量 hidden_states,类型为 Optional[Tuple[torch.FloatTensor]],初始值为 None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    
    # 定义一个变量 attentions,类型为 Optional[Tuple[torch.FloatTensor]],初始值为 None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    
    # 定义一个变量 cross_attentions,类型为 Optional[Tuple[torch.FloatTensor]],初始值为 None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# 使用 `dataclass` 装饰器定义了一个数据类 `DeformableDetrModelOutput`,表示Deformable DETR模型的输出。
@dataclass
class DeformableDetrModelOutput(ModelOutput):
    """
    Base class for outputs of the Deformable DETR encoder-decoder model.
    """

    # 下面是该类的属性定义,每个属性都是一个 `torch.FloatTensor` 类型的张量,用于存储不同的模型输出。
    init_reference_points: torch.FloatTensor = None  # 初始参考点
    last_hidden_state: torch.FloatTensor = None  # 最后隐藏状态
    intermediate_hidden_states: torch.FloatTensor = None  # 中间隐藏状态
    intermediate_reference_points: torch.FloatTensor = None  # 中间参考点
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 解码器隐藏状态
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 解码器注意力
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 交叉注意力
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None  # 编码器最后隐藏状态
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 编码器隐藏状态
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 编码器注意力
    enc_outputs_class: Optional[torch.FloatTensor] = None  # 输出类别
    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None  # 输出坐标对数

# 使用 `dataclass` 装饰器定义了另一个数据类 `DeformableDetrObjectDetectionOutput`,表示Deformable DETR模型的目标检测输出。
@dataclass
class DeformableDetrObjectDetectionOutput(ModelOutput):
    """
    Output type of [`DeformableDetrForObjectDetection`].
    """

    # 下面是该类的属性定义,每个属性都是与模型输出相关的数据。
    loss: Optional[torch.FloatTensor] = None  # 损失值
    loss_dict: Optional[Dict] = None  # 损失字典
    logits: torch.FloatTensor = None  # 对数
    pred_boxes: torch.FloatTensor = None  # 预测框
    auxiliary_outputs: Optional[List[Dict]] = None  # 辅助输出
    init_reference_points: Optional[torch.FloatTensor] = None  # 初始参考点
    last_hidden_state: Optional[torch.FloatTensor] = None  # 最后隐藏状态
    intermediate_hidden_states: Optional[torch.FloatTensor] = None  # 中间隐藏状态
    intermediate_reference_points: Optional[torch.FloatTensor] = None  # 中间参考点
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 解码器隐藏状态
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 解码器注意力
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 交叉注意力
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None  # 编码器最后隐藏状态
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 编码器隐藏状态
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 编码器注意力
    enc_outputs_class: Optional = None  # 输出类别
    enc_outputs_coord_logits: Optional = None  # 输出坐标对数

# 定义了一个函数 `_get_clones`,用于克隆指定模块多次。
def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

# 定义了一个函数 `inverse_sigmoid`,计算输入张量的逆sigmoid值。
def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)  # 将输入张量限制在 [0, 1] 范围内
    x1 = x.clamp(min=eps)  # 将输入张量在较小值处截断为 `eps`
    x2 = (1 - x).clamp(min=eps)  # 将 (1 - x) 在较小值处截断为 `eps`
    return torch.log(x1 / x2)  # 返回计算后的逆sigmoid值
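
# ---- 演示示例(非库源码):inverse_sigmoid 与 sigmoid 互为反函数 ----
import torch

p = torch.tensor([0.1, 0.5, 0.9])
logits = inverse_sigmoid(p)
print(logits)                 # ≈ tensor([-2.1972,  0.0000,  2.1972]),即 log(p / (1 - p))
print(torch.sigmoid(logits))  # ≈ tensor([0.1000, 0.5000, 0.9000]),还原出原概率
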

# 定义了一个类 `DeformableDetrFrozenBatchNorm2d`,继承自 `nn.Module`,用于冻结统计数据和仿射参数的批标准化。
class DeformableDetrFrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
    
    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
    torchvision.models.resnet[18,34,50,101] produce nans.
    """

    # 类初始化函数,接受一个参数 `n`。
    def __init__(self, n):
        super().__init__()
        # 注册 `weight`、`bias`、`running_mean`、`running_var` 四个缓冲区。
        self.register_buffer("weight", torch.ones(n))  # 权重初始化为全1
        self.register_buffer("bias", torch.zeros(n))  # 偏置初始化为全0
        self.register_buffer("running_mean", torch.zeros(n))  # 运行时均值初始化为全0
        self.register_buffer("running_var", torch.ones(n))  # 运行时方差初始化为全1
    # 从状态字典中加载模型参数,并根据给定的前缀处理键名,处理缺失和意外的键
    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # 构建用于追踪批次数的键名
        num_batches_tracked_key = prefix + "num_batches_tracked"
        # 如果追踪批次数的键名存在于状态字典中,则删除该键
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        # 调用父类的_load_from_state_dict方法,加载模型参数
        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    # 定义前向传播方法
    def forward(self, x):
        # 将权重张量重塑为指定形状,以便适合前向传播的需求
        weight = self.weight.reshape(1, -1, 1, 1)
        # 将偏置张量重塑为指定形状,以便适合前向传播的需求
        bias = self.bias.reshape(1, -1, 1, 1)
        # 将运行时方差张量重塑为指定形状,以便适合前向传播的需求
        running_var = self.running_var.reshape(1, -1, 1, 1)
        # 将运行时均值张量重塑为指定形状,以便适合前向传播的需求
        running_mean = self.running_mean.reshape(1, -1, 1, 1)
        # 定义一个极小值常量 epsilon,用于数值稳定性
        epsilon = 1e-5
        # 计算缩放系数,用于规范化输入数据
        scale = weight * (running_var + epsilon).rsqrt()
        # 计算偏置项,用于将规范化后的数据重新调整到正确的范围
        bias = bias - running_mean * scale
        # 返回经过规范化和调整后的数据
        return x * scale + bias
# 从 transformers.models.detr.modeling_detr.replace_batch_norm 复制,并将 Detr 替换为 DeformableDetr
def replace_batch_norm(model):
    """
    递归地将所有 `torch.nn.BatchNorm2d` 替换为 `DeformableDetrFrozenBatchNorm2d`。

    Args:
        model (torch.nn.Module):
            输入的模型
    """
    # 遍历模型的每个子模块
    for name, module in model.named_children():
        # 如果当前模块是 nn.BatchNorm2d 类型
        if isinstance(module, nn.BatchNorm2d):
            # 创建一个新的 DeformableDetrFrozenBatchNorm2d 模块,与原始模块的特征数相同
            new_module = DeformableDetrFrozenBatchNorm2d(module.num_features)

            # 如果原始模块的权重不在 torch.device("meta") 上
            if not module.weight.device == torch.device("meta"):
                # 复制原始模块的权重、偏置、运行时均值和方差到新模块
                new_module.weight.data.copy_(module.weight)
                new_module.bias.data.copy_(module.bias)
                new_module.running_mean.data.copy_(module.running_mean)
                new_module.running_var.data.copy_(module.running_var)

            # 将模型中原始的 BatchNorm2d 模块替换为新创建的 DeformableDetrFrozenBatchNorm2d 模块
            model._modules[name] = new_module

        # 如果当前模块还有子模块,则递归调用替换函数
        if len(list(module.children())) > 0:
            replace_batch_norm(module)
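
# --- Illustrative aside (not part of the original file): in eval mode, BatchNorm2d reduces to the
# affine map `x * scale + bias` with scale = weight * rsqrt(running_var + eps) and
# bias = bias - running_mean * scale, which is exactly what the frozen module above computes.
# A standalone sketch using only torch / torch.nn (already imported by this module):
_bn = nn.BatchNorm2d(3).eval()
_bn.running_mean.uniform_(-1, 1)
_bn.running_var.uniform_(0.5, 2.0)
_bn.weight.data.uniform_(0.5, 1.5)
_bn.bias.data.uniform_(-0.5, 0.5)
_x = torch.randn(2, 3, 4, 4)
_scale = (_bn.weight * (_bn.running_var + _bn.eps).rsqrt()).reshape(1, -1, 1, 1)
_bias = (_bn.bias - _bn.running_mean * _bn.weight * (_bn.running_var + _bn.eps).rsqrt()).reshape(1, -1, 1, 1)
print(torch.allclose(_bn(_x), _x * _scale + _bias, atol=1e-5))  # True
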


class DeformableDetrConvEncoder(nn.Module):
    """
    使用 AutoBackbone API 或 timm 库之一的卷积主干网络。

    所有 nn.BatchNorm2d 层都被上面定义的 DeformableDetrFrozenBatchNorm2d 替换。

    """

    def __init__(self, config):
        super().__init__()

        self.config = config

        # 根据配置选择使用 timm 库的 backbone 还是自定义的加载
        if config.use_timm_backbone:
            # 确保需要的后端库已导入
            requires_backends(self, ["timm"])
            kwargs = {}
            if config.dilation:
                kwargs["output_stride"] = 16
            # 创建 timm 库中指定的 backbone 模型
            backbone = create_model(
                config.backbone,
                pretrained=config.use_pretrained_backbone,
                features_only=True,
                out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,),
                in_chans=config.num_channels,
                **kwargs,
            )
        else:
            # 自定义加载 backbone 模型
            backbone = load_backbone(config)

        # 使用 torch.no_grad() 替换所有的 BatchNorm 层为冻结的 BatchNorm 层
        with torch.no_grad():
            replace_batch_norm(backbone)

        # 将处理后的 backbone 设置为模型的一部分
        self.model = backbone
        # 获取中间层的通道数信息
        self.intermediate_channel_sizes = (
            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
        )

        # 根据 backbone 的类型和配置冻结特定的参数
        backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
        if "resnet" in backbone_model_type:
            for name, parameter in self.model.named_parameters():
                if config.use_timm_backbone:
                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
                        parameter.requires_grad_(False)
                else:
                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
                        parameter.requires_grad_(False)
    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
        # 将像素值通过模型传递,以获取特征图列表
        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
        
        out = []
        for feature_map in features:
            # 将像素掩码下采样至与对应特征图相同的形状
            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
            out.append((feature_map, mask))
        # 返回特征图和相应的掩码组成的列表
        return out
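
# --- Illustrative aside (not part of the original file): how the pixel mask travels with each feature
# map. The boolean mask is interpolated (nearest neighbour) down to every feature-map resolution so
# padded positions stay identifiable at all scales. Standalone sketch:
_pixel_mask = torch.zeros(1, 64, 64, dtype=torch.bool)
_pixel_mask[:, :48, :32] = True  # only the top-left 48x32 region contains real pixels
for _size in [(32, 32), (16, 16), (8, 8)]:
    _mask = nn.functional.interpolate(_pixel_mask[None].float(), size=_size).to(torch.bool)[0]
    print(_size, _mask.float().mean().item())  # fraction of valid positions at each scale
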
# 从 transformers.models.detr.modeling_detr.DetrConvModel 复制并修改为 DeformableDetrConvModel
class DeformableDetrConvModel(nn.Module):
    """
    This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
    """

    def __init__(self, conv_encoder, position_embedding):
        super().__init__()
        self.conv_encoder = conv_encoder  # 初始化卷积编码器
        self.position_embedding = position_embedding  # 初始化位置编码器

    def forward(self, pixel_values, pixel_mask):
        # 通过骨干网络(backbone)传递像素值和像素掩码,获取 (特征图, 像素掩码) 元组列表
        out = self.conv_encoder(pixel_values, pixel_mask)
        pos = []
        for feature_map, mask in out:
            # 执行位置编码
            pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))

        return out, pos


class DeformableDetrSinePositionEmbedding(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    """

    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.embedding_dim = embedding_dim  # 设置嵌入维度
        self.temperature = temperature  # 温度参数
        self.normalize = normalize  # 是否进行归一化
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale  # 缩放参数,默认为2π

    def forward(self, pixel_values, pixel_mask):
        if pixel_mask is None:
            raise ValueError("No pixel mask provided")  # 如果未提供像素掩码,抛出异常
        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)  # 在第一维度上累积求和,得到y方向的位置编码
        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)  # 在第二维度上累积求和,得到x方向的位置编码
        if self.normalize:
            eps = 1e-6
            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale  # 对y方向位置编码进行归一化和缩放
            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale  # 对x方向位置编码进行归一化和缩放

        dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()  # 创建维度参数
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)  # 计算温度调整

        pos_x = x_embed[:, :, :, None] / dim_t  # 计算x方向的位置编码
        pos_y = y_embed[:, :, :, None] / dim_t  # 计算y方向的位置编码
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)  # 对x方向位置编码应用正弦余弦变换
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)  # 对y方向位置编码应用正弦余弦变换
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # 组合并转置位置编码,以适应模型输入要求
        return pos
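
# --- Illustrative aside (not part of the original file): the sine embedding above, written out flat on a
# toy mask so the shapes are concrete. `embedding_dim` is per axis, so the output has 2 * embedding_dim
# channels. Standalone sketch using only torch / math (already imported by this module):
_embedding_dim, _temperature, _scale = 64, 10000, 2 * math.pi
_mask = torch.ones(2, 8, 10)  # (batch_size, height, width), every position valid
_y = (_mask.cumsum(1) - 0.5) / (_mask.cumsum(1)[:, -1:, :] + 1e-6) * _scale
_x = (_mask.cumsum(2) - 0.5) / (_mask.cumsum(2)[:, :, -1:] + 1e-6) * _scale
_dim_t = torch.arange(_embedding_dim, dtype=torch.float32)
_dim_t = _temperature ** (2 * torch.div(_dim_t, 2, rounding_mode="floor") / _embedding_dim)
_pos_x = torch.stack(((_x[..., None] / _dim_t)[..., 0::2].sin(), (_x[..., None] / _dim_t)[..., 1::2].cos()), dim=4).flatten(3)
_pos_y = torch.stack(((_y[..., None] / _dim_t)[..., 0::2].sin(), (_y[..., None] / _dim_t)[..., 1::2].cos()), dim=4).flatten(3)
print(torch.cat((_pos_y, _pos_x), dim=3).permute(0, 3, 1, 2).shape)  # torch.Size([2, 128, 8, 10])
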


# 从 transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding 复制的类,已用于 DeformableDetr
class DeformableDetrLearnedPositionEmbedding(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """
    # 初始化函数,定义了类的初始化方法,设置了嵌入维度为256
    def __init__(self, embedding_dim=256):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个行索引的嵌入层,50表示索引的范围,embedding_dim表示每个嵌入的维度
        self.row_embeddings = nn.Embedding(50, embedding_dim)
        # 创建一个列索引的嵌入层,参数与行索引的嵌入层类似
        self.column_embeddings = nn.Embedding(50, embedding_dim)

    # 前向传播方法,接收像素值和像素掩码作为输入
    def forward(self, pixel_values, pixel_mask=None):
        # 获取像素值的高度和宽度
        height, width = pixel_values.shape[-2:]
        # 生成宽度的索引张量,设备与像素值张量相同
        width_values = torch.arange(width, device=pixel_values.device)
        # 生成高度的索引张量,设备与像素值张量相同
        height_values = torch.arange(height, device=pixel_values.device)
        # 通过列嵌入层获取每个宽度索引的嵌入表示
        x_emb = self.column_embeddings(width_values)
        # 通过行嵌入层获取每个高度索引的嵌入表示
        y_emb = self.row_embeddings(height_values)
        # 拼接 x_emb 和 y_emb 成为位置嵌入张量,最后一个维度为2*embedding_dim
        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
        # 将 pos 张量的维度重新排列为 (2*embedding_dim, height, width)
        pos = pos.permute(2, 0, 1)
        # 在第一维度上添加一个维度,成为 (1, 2*embedding_dim, height, width)
        pos = pos.unsqueeze(0)
        # 将 pos 张量沿着第一维度复制 pixel_values.shape[0] 次,形成 (batch_size, 2*embedding_dim, height, width)
        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
        # 返回位置编码张量 pos
        return pos
# 从 transformers.models.detr.modeling_detr.build_position_encoding 复制并修改为 DeformableDetr 的位置编码构建函数
def build_position_encoding(config):
    # 计算位置编码的步数,使用模型维度的一半
    n_steps = config.d_model // 2
    # 根据配置选择位置编码类型为 "sine"
    if config.position_embedding_type == "sine":
        # 使用 DeformableDetrSinePositionEmbedding 类创建正弦位置编码对象,进行正则化
        position_embedding = DeformableDetrSinePositionEmbedding(n_steps, normalize=True)
    # 根据配置选择位置编码类型为 "learned"
    elif config.position_embedding_type == "learned":
        # 使用 DeformableDetrLearnedPositionEmbedding 类创建学习位置编码对象
        position_embedding = DeformableDetrLearnedPositionEmbedding(n_steps)
    else:
        # 若配置中的位置编码类型不支持,则抛出异常
        raise ValueError(f"Not supported {config.position_embedding_type}")

    # 返回创建的位置编码对象
    return position_embedding


def multi_scale_deformable_attention(
    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
    # 获取 value 张量的维度信息
    batch_size, _, num_heads, hidden_dim = value.shape
    # 获取 sampling_locations 张量的维度信息
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    # 根据 value 的空间形状将 value 分割为列表
    value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
    # 计算采样网格位置
    sampling_grids = 2 * sampling_locations - 1
    # 初始化采样值列表
    sampling_value_list = []
    # 遍历 value 的空间形状和对应的 value 列表
    for level_id, (height, width) in enumerate(value_spatial_shapes):
        # 将 value_list[level_id] 展平并转置,以便进行 grid_sample 操作
        value_l_ = (
            value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
        )
        # 获取当前级别的采样网格
        sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
        # 使用 bilinear 插值方式进行 grid_sample,得到采样值
        sampling_value_l_ = nn.functional.grid_sample(
            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        # 将采样值添加到列表中
        sampling_value_list.append(sampling_value_l_)
    # 调整注意力权重的形状,以便与采样值列表进行乘积操作
    attention_weights = attention_weights.transpose(1, 2).reshape(
        batch_size * num_heads, 1, num_queries, num_levels * num_points
    )
    # 计算最终输出,将采样值与注意力权重相乘并求和,然后调整形状
    output = (
        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
        .sum(-1)
        .view(batch_size, num_heads * hidden_dim, num_queries)
    )
    # 调整输出的维度顺序并保持连续性
    return output.transpose(1, 2).contiguous()
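
# --- Illustrative aside (not part of the original file): the core operation above is `grid_sample`.
# Sampling locations live in [0, 1] and are mapped to grid_sample's [-1, 1] convention via
# `2 * locations - 1`; each query then reads one bilinearly interpolated value per sampling point.
# Standalone sketch:
_value = torch.randn(1, 16, 8, 12)   # (batch * num_heads, hidden_dim, height, width)
_locations = torch.rand(1, 5, 4, 2)  # (batch * num_heads, num_queries, num_points, 2) in [0, 1]
_sampled = nn.functional.grid_sample(
    _value, 2 * _locations - 1, mode="bilinear", padding_mode="zeros", align_corners=False
)
print(_sampled.shape)  # torch.Size([1, 16, 5, 4]): one interpolated vector per (query, point) pair
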


class DeformableDetrMultiscaleDeformableAttention(nn.Module):
    """
    Deformable DETR 中提出的多尺度可变形注意力模块。
    """
    # 初始化函数,接受配置对象、注意力头数目和采样点数目作为参数
    def __init__(self, config: DeformableDetrConfig, num_heads: int, n_points: int):
        # 调用父类的初始化方法
        super().__init__()

        # 检查是否已经加载了多尺度可变形注意力的内核
        kernel_loaded = MultiScaleDeformableAttention is not None
        # 如果CUDA可用并且已安装Ninja并且内核未加载,则尝试加载CUDA内核
        if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
            try:
                load_cuda_kernels()
            except Exception as e:
                # 记录警告信息,指出无法加载多尺度可变形注意力的自定义内核
                logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")

        # 检查配置中的d_model是否能被num_heads整除,否则抛出数值错误
        if config.d_model % num_heads != 0:
            raise ValueError(
                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
            )

        # 计算每个注意力头的维度
        dim_per_head = config.d_model // num_heads
        # 检查dim_per_head是否是2的幂
        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
            # 发出警告,建议将embed_dim设置为2的幂,这在CUDA实现中更有效
            warnings.warn(
                "You'd better set embed_dim (d_model) in DeformableDetrMultiscaleDeformableAttention to make the"
                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
                " implementation."
            )

        # 初始化im2col步长为64
        self.im2col_step = 64

        # 设置对象的属性值
        self.d_model = config.d_model
        self.n_levels = config.num_feature_levels
        self.n_heads = num_heads
        self.n_points = n_points

        # 初始化采样偏移量的线性层,输出维度为num_heads * n_levels * n_points * 2
        self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
        # 初始化注意力权重的线性层,输出维度为num_heads * n_levels * n_points
        self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
        # 初始化值投影的线性层,输入和输出维度都是config.d_model
        self.value_proj = nn.Linear(config.d_model, config.d_model)
        # 初始化输出投影的线性层,输入和输出维度都是config.d_model
        self.output_proj = nn.Linear(config.d_model, config.d_model)

        # 设置是否禁用自定义内核的标志
        self.disable_custom_kernels = config.disable_custom_kernels

        # 调用内部方法_reset_parameters,用于初始化参数
        self._reset_parameters()

    # 内部方法,用于初始化模型的参数
    def _reset_parameters(self):
        # 将采样偏移量的权重初始化为常数0.0
        nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
        # 获取默认数据类型
        default_dtype = torch.get_default_dtype()
        # 创建一组角度thetas,用于初始化采样偏移量
        thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads)
        # 初始化网格grid_init,形状为(n_heads, n_levels, n_points, 2),用于采样偏移量
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.n_heads, 1, 1, 2)
            .repeat(1, self.n_levels, self.n_points, 1)
        )
        # 根据采样点的索引调整grid_init的值
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        # 使用torch.no_grad()上下文管理器,设置采样偏移量的偏置值为grid_init
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        # 将注意力权重的权重和偏置初始化为常数0.0
        nn.init.constant_(self.attention_weights.weight.data, 0.0)
        nn.init.constant_(self.attention_weights.bias.data, 0.0)
        # 使用xavier_uniform方法初始化值投影和输出投影的权重,并将偏置初始化为常数0.0
        nn.init.xavier_uniform_(self.value_proj.weight.data)
        nn.init.constant_(self.value_proj.bias.data, 0.0)
        nn.init.xavier_uniform_(self.output_proj.weight.data)
        nn.init.constant_(self.output_proj.bias.data, 0.0)
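
    # --- Illustrative aside (not part of the original file): the offset initialization above gives every
    # head its own starting direction (cos θ, sin θ), with the offset magnitude growing with the
    # sampling-point index. A standalone sketch of the same arithmetic (shown as comments so the class
    # body stays intact):
    #
    #     n_heads, n_levels, n_points = 8, 4, 4
    #     thetas = torch.arange(n_heads).float() * (2.0 * math.pi / n_heads)
    #     grid = torch.stack([thetas.cos(), thetas.sin()], -1)
    #     grid = (grid / grid.abs().max(-1, keepdim=True)[0]).view(n_heads, 1, 1, 2)
    #     grid = grid.repeat(1, n_levels, n_points, 1)
    #     for i in range(n_points):
    #         grid[:, :, i, :] *= i + 1
    #     print(grid[0, 0])  # [[1, 0], [2, 0], [3, 0], [4, 0]] -> head 0 samples further and further along +x
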
    # 如果位置嵌入不为 None,则将其加到输入张量上,实现位置编码的加法操作
    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        return tensor if position_embeddings is None else tensor + position_embeddings

    # Transformer 模型的前向传播方法
    def forward(
        self,
        hidden_states: torch.Tensor,  # 输入的隐藏状态张量
        attention_mask: Optional[torch.Tensor] = None,  # 注意力遮罩,可选参数,默认为 None
        encoder_hidden_states=None,  # 编码器的隐藏状态,用于注意力机制中的键值对
        encoder_attention_mask=None,  # 编码器的注意力遮罩,用于注意力机制中的键值对
        position_embeddings: Optional[torch.Tensor] = None,  # 位置嵌入,可选参数,默认为 None
        reference_points=None,  # 参考点,用于空间注意力机制
        spatial_shapes=None,  # 空间形状,用于空间注意力机制
        level_start_index=None,  # 层级开始索引,用于分层注意力机制
        output_attentions: bool = False,  # 是否输出注意力权重,默认为 False
class DeformableDetrMultiheadAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim  # 设置注意力机制的输入/输出维度
        self.num_heads = num_heads  # 设置注意力头的数量
        self.dropout = dropout  # 设置dropout比率
        self.head_dim = embed_dim // num_heads  # 计算每个注意力头的维度
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5  # 缩放因子

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # 初始化投影矩阵 k
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # 初始化投影矩阵 v
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # 初始化投影矩阵 q
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # 初始化输出投影矩阵

    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
        # 重塑输入张量的形状,以便多头注意力机制处理

    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        return tensor if position_embeddings is None else tensor + position_embeddings
        # 将位置嵌入添加到输入张量中,如果位置嵌入不为 None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,



class DeformableDetrEncoderLayer(nn.Module):
    def __init__(self, config: DeformableDetrConfig):
        super().__init__()
        self.embed_dim = config.d_model  # 设置编码器层的输入/输出维度
        self.self_attn = DeformableDetrMultiscaleDeformableAttention(
            config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points
        )  # 初始化多尺度可变形注意力机制
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)  # Layer normalization层
        self.dropout = config.dropout  # dropout比率
        self.activation_fn = ACT2FN[config.activation_function]  # 激活函数
        self.activation_dropout = config.activation_dropout  # 激活函数的dropout比率
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)  # 第一个全连接层
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)  # 第二个全连接层
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)  # 最终的Layer normalization层

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: torch.Tensor = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        output_attentions: bool = False,
    ):
            """
            Args:
                hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                    输入的张量数据,代表层的输入。
                attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                    注意力掩码,用于指示哪些元素需要被忽略。
                position_embeddings (`torch.FloatTensor`, *optional*):
                    位置嵌入,将被加到 `hidden_states` 上。
                reference_points (`torch.FloatTensor`, *optional*):
                    参考点。
                spatial_shapes (`torch.LongTensor`, *optional*):
                    主干特征图的空间形状。
                level_start_index (`torch.LongTensor`, *optional*):
                    级别起始索引。
                output_attentions (`bool`, *optional*):
                    是否返回所有注意力层的注意力张量。详见返回的张量中的 `attentions`。
            """
            # 将原始输入保存为残差连接的基础
            residual = hidden_states
    
            # 在多尺度特征图上应用多尺度变形注意力模块
            hidden_states, attn_weights = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=hidden_states,
                encoder_attention_mask=attention_mask,
                position_embeddings=position_embeddings,
                reference_points=reference_points,
                spatial_shapes=spatial_shapes,
                level_start_index=level_start_index,
                output_attentions=output_attentions,
            )
    
            # 应用 dropout 层,用于防止过拟合
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            
            # 将残差连接和处理后的数据相加
            hidden_states = residual + hidden_states
            
            # 应用自注意力层的 layer normalization
            hidden_states = self.self_attn_layer_norm(hidden_states)
    
            # 将处理后的数据保存为新的残差连接基础
            residual = hidden_states
    
            # 应用激活函数和全连接层 fc1
            hidden_states = self.activation_fn(self.fc1(hidden_states))
            
            # 再次应用 dropout 层,用于进一步防止过拟合
            hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
    
            # 应用最后的全连接层 fc2
            hidden_states = self.fc2(hidden_states)
            
            # Apply dropout one final time
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
    
            # 将残差连接和处理后的数据相加
            hidden_states = residual + hidden_states
            
            # 应用最后的 layer normalization
            hidden_states = self.final_layer_norm(hidden_states)
    
            # 在训练模式下,检查处理后的数据是否包含无穷大或 NaN
            if self.training:
                if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                    # 将数据的范围限制在一个较小的值域内,防止数值溢出
                    clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                    hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
    
            # 将最终处理后的结果打包成一个 tuple 输出
            outputs = (hidden_states,)
    
            # 如果需要输出注意力权重,则将它们添加到输出中
            if output_attentions:
                outputs += (attn_weights,)
    
            return outputs
# 定义 DeformableDetrDecoderLayer 类,继承自 nn.Module
class DeformableDetrDecoderLayer(nn.Module):
    # 初始化方法,接受一个 DeformableDetrConfig 类型的 config 参数
    def __init__(self, config: DeformableDetrConfig):
        # 调用父类的初始化方法
        super().__init__()
        # 设置 embed_dim 属性为 config.d_model,即模型的维度
        self.embed_dim = config.d_model

        # self-attention
        # 初始化 self.self_attn 属性为 DeformableDetrMultiheadAttention 对象
        self.self_attn = DeformableDetrMultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
        )
        # 设置 dropout 属性为 config.dropout
        self.dropout = config.dropout
        # 设置 activation_fn 属性为 ACT2FN[config.activation_function],激活函数
        self.activation_fn = ACT2FN[config.activation_function]
        # 设置 activation_dropout 属性为 config.activation_dropout
        self.activation_dropout = config.activation_dropout

        # 初始化 self.self_attn_layer_norm 属性为 nn.LayerNorm 对象,对 self-attention 结果进行归一化
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # cross-attention
        # 初始化 self.encoder_attn 属性为 DeformableDetrMultiscaleDeformableAttention 对象
        self.encoder_attn = DeformableDetrMultiscaleDeformableAttention(
            config,
            num_heads=config.decoder_attention_heads,
            n_points=config.decoder_n_points,
        )
        # 初始化 self.encoder_attn_layer_norm 属性为 nn.LayerNorm 对象,对 cross-attention 结果进行归一化
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # feedforward neural networks
        # 初始化 self.fc1 属性为 nn.Linear 对象,进行线性变换
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        # 初始化 self.fc2 属性为 nn.Linear 对象,进行线性变换
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        # Initialize self.final_layer_norm as an nn.LayerNorm applied to the block's final output
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    # 前向传播方法,接受多个输入参数并返回结果
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[torch.Tensor] = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        # Save the input as the basis of the first residual connection
        residual = hidden_states

        # Self-attention over the object queries; position embeddings are added to queries and keys
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
        )

        # Dropout, residual connection and layer norm after self-attention
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        second_residual = hidden_states

        # Cross-attention: multi-scale deformable attention over the encoder's flattened feature maps
        cross_attn_weights = None
        hidden_states, cross_attn_weights = self.encoder_attn(
            hidden_states=hidden_states,
            attention_mask=encoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            output_attentions=output_attentions,
        )

        # Dropout, residual connection and layer norm after cross-attention
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = second_residual + hidden_states
        hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Feed-forward network with its own residual connection and the final layer norm
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        # Optionally also return the self- and cross-attention weights
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs
# Pretrained base class shared by all Deformable DETR models; the excerpt below shows its weight initialization
class DeformableDetrPreTrainedModel(PreTrainedModel):
    config_class = DeformableDetrConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"

    # Initialize the weights of the given sub-module
    def _init_weights(self, module):
        std = self.config.init_std  # 获取初始化标准差

        # 如果模块是 DeformableDetrLearnedPositionEmbedding 类型
        if isinstance(module, DeformableDetrLearnedPositionEmbedding):
            # 对行和列嵌入的权重进行均匀分布初始化
            nn.init.uniform_(module.row_embeddings.weight)
            nn.init.uniform_(module.column_embeddings.weight)
        
        # 如果模块是 DeformableDetrMultiscaleDeformableAttention 类型
        elif isinstance(module, DeformableDetrMultiscaleDeformableAttention):
            # 调用模块的参数重置方法
            module._reset_parameters()
        
        # 如果模块是 nn.Linear, nn.Conv2d, nn.BatchNorm2d 中的一种
        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
            # 使用正态分布初始化权重,均值为0,标准差为设定的std
            module.weight.data.normal_(mean=0.0, std=std)
            # 如果存在偏置项,则将其初始化为0
            if module.bias is not None:
                module.bias.data.zero_()
        
        # 如果模块是 nn.Embedding 类型
        elif isinstance(module, nn.Embedding):
            # 使用正态分布初始化权重,均值为0,标准差为设定的std
            module.weight.data.normal_(mean=0.0, std=std)
            # 如果设置了padding_idx,将其对应的权重初始化为0
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        
        # 如果模块具有 "reference_points" 属性且不是两阶段配置
        if hasattr(module, "reference_points") and not self.config.two_stage:
            # 使用Xavier均匀分布初始化 reference_points 的权重
            nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
            # 将 reference_points 的偏置项初始化为0
            nn.init.constant_(module.reference_points.bias.data, 0.0)
        
        # 如果模块具有 "level_embed" 属性
        if hasattr(module, "level_embed"):
            # 使用正态分布初始化 level_embed 的权重
            nn.init.normal_(module.level_embed)
DEFORMABLE_DETR_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeformableDetrConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

DEFORMABLE_DETR_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            # 像素值。默认情况下会忽略填充部分。

            Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`]
            for details.
            # 可以使用 `AutoImageProcessor` 获取像素值。详见 [`DeformableDetrImageProcessor.__call__`]。

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            # 遮罩,用于在填充像素值上避免执行注意力操作。遮罩的值在 `[0, 1]` 之间:

            - 1 表示真实像素(即**未遮罩**),
            - 0 表示填充像素(即**已遮罩**)。

            [What are attention masks?](../glossary#attention-mask)
            # 注意力遮罩是什么?

        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            # 默认不使用。可以用来遮罩对象查询。

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            # 元组包含 (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            # `last_hidden_state` 的形状为 `(batch_size, sequence_length, hidden_size)`,可选部分是编码器最后一层的隐藏状态。
            # 在解码器的交叉注意力中使用。

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            # 可选的,可以直接传递图像的平坦表示,而不是传递后骨干网络和投影层的输出特征图。

        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            # 可选的,可以直接传递嵌入表示来初始化查询,而不是用零张量初始化。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。详见返回的张量中的 `attentions` 获取更多细节。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。详见返回的张量中的 `hidden_states` 获取更多细节。

        return_dict (`bool`, *optional*):
            # 是否返回 [`~file_utils.ModelOutput`] 而不是普通元组。
"""
class DeformableDetrEncoder(DeformableDetrPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
    [`DeformableDetrEncoderLayer`].

    The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.

    Args:
        config: DeformableDetrConfig
    """

    def __init__(self, config: DeformableDetrConfig):
        super().__init__(config)
        self.gradient_checkpointing = False

        # 设置 dropout 概率
        self.dropout = config.dropout
        # 创建多个 DeformableDetrEncoderLayer 层,并放入 ModuleList 中
        self.layers = nn.ModuleList([DeformableDetrEncoderLayer(config) for _ in range(config.encoder_layers)])

        # 初始化权重并进行最终处理
        self.post_init()

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """
        Get reference points for each feature map. Used in decoder.

        Args:
            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
                Spatial shapes of each feature map.
            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
                Valid ratios of each feature map.
            device (`torch.device`):
                Device on which to create the tensors.
        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
        """
        reference_points_list = []
        # 遍历每个特征图的空间形状
        for level, (height, width) in enumerate(spatial_shapes):
            # 创建网格矩阵,作为参考点的初始值
            ref_y, ref_x = meshgrid(
                torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device),
                torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device),
                indexing="ij",
            )
            # 对参考点进行调整,考虑有效比例因子和特征图的高度和宽度
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
            ref = torch.stack((ref_x, ref_y), -1)
            reference_points_list.append(ref)
        # 将参考点列表堆叠起来,形成最终的参考点张量
        reference_points = torch.cat(reference_points_list, 1)
        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
        return reference_points
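
    # --- Illustrative aside (not part of the original file): what `get_reference_points` produces for a
    # single, fully valid 2x3 feature level. Each cell contributes its normalized (x, y) center, shown
    # here as comments so the class body stays intact:
    #
    #     ref_y, ref_x = torch.meshgrid(
    #         torch.linspace(0.5, 1.5, 2), torch.linspace(0.5, 2.5, 3), indexing="ij"
    #     )
    #     centers = torch.stack((ref_x.reshape(-1) / 3, ref_y.reshape(-1) / 2), -1)
    #     # -> [[0.167, 0.25], [0.5, 0.25], [0.833, 0.25], [0.167, 0.75], [0.5, 0.75], [0.833, 0.75]]
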

    def forward(
        self,
        inputs_embeds=None,
        attention_mask=None,
        position_embeddings=None,
        spatial_shapes=None,
        level_start_index=None,
        valid_ratios=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    """
    Some tweaks for Deformable DETR:

    - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
    - it also returns a stack of intermediate outputs and reference points from all decoding layers.

    Args:
        config: DeformableDetrConfig
    """

    # 初始化函数,根据给定的配置参数初始化 Deformable DETR 模型
    def __init__(self, config: DeformableDetrConfig):
        # 调用父类的初始化方法
        super().__init__(config)

        # 设定模型中使用的 dropout 概率
        self.dropout = config.dropout
        # 创建多个 DeformableDetrDecoderLayer 层组成的列表
        self.layers = nn.ModuleList([DeformableDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
        # 是否使用渐变检查点,默认为 False
        self.gradient_checkpointing = False

        # hack 实现,用于迭代边界框细化和两阶段 Deformable DETR
        self.bbox_embed = None  # 边界框嵌入,目前未指定具体的实现
        self.class_embed = None  # 类别嵌入,目前未指定具体的实现

        # 初始化权重并应用最终处理
        self.post_init()

    # 前向传播函数,接收多个输入和参数,执行模型的前向计算过程
    def forward(
        self,
        inputs_embeds=None,  # 输入的嵌入表示,通常是编码器的输出
        encoder_hidden_states=None,  # 编码器的隐藏状态
        encoder_attention_mask=None,  # 编码器的注意力掩码
        position_embeddings=None,  # 位置嵌入,用于处理空间信息的嵌入向量
        reference_points=None,  # 参考点,用于变形注意力机制
        spatial_shapes=None,  # 空间形状,用于处理不同层次的空间信息
        level_start_index=None,  # 层级开始索引,用于多层级处理
        valid_ratios=None,  # 有效比率,用于多尺度处理
        output_attentions=None,  # 是否输出注意力权重
        output_hidden_states=None,  # 是否输出隐藏状态
        return_dict=None,  # 是否返回字典形式的输出
"""
The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
hidden-states without any specific head on top.
"""
# 使用装饰器将类的文档字符串与已有的文档字符串合并
@add_start_docstrings(
    """
    The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
    hidden-states without any specific head on top.
    """,
    DEFORMABLE_DETR_START_DOCSTRING,
)
# 定义 DeformableDetrModel 类,继承自 DeformableDetrPreTrainedModel 类
class DeformableDetrModel(DeformableDetrPreTrainedModel):
    # 构造函数,接收一个 DeformableDetrConfig 类型的 config 参数
    def __init__(self, config: DeformableDetrConfig):
        # 调用父类的构造函数
        super().__init__(config)

        # 创建 backbone + positional encoding
        # 使用 DeformableDetrConvEncoder 创建 backbone
        backbone = DeformableDetrConvEncoder(config)
        # 构建位置编码
        position_embeddings = build_position_encoding(config)
        # 将 backbone 和位置编码传递给 DeformableDetrConvModel,并赋值给 self.backbone
        self.backbone = DeformableDetrConvModel(backbone, position_embeddings)

        # 创建输入投影层
        if config.num_feature_levels > 1:
            # 获取 backbone 的中间通道大小列表
            num_backbone_outs = len(backbone.intermediate_channel_sizes)
            input_proj_list = []
            # 根据中间通道大小列表创建输入投影层列表
            for _ in range(num_backbone_outs):
                in_channels = backbone.intermediate_channel_sizes[_]
                input_proj_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, config.d_model, kernel_size=1),
                        nn.GroupNorm(32, config.d_model),
                    )
                )
            # 如果配置中的特征级别数大于 backbone 输出的特征级别数,则继续添加投影层
            for _ in range(config.num_feature_levels - num_backbone_outs):
                input_proj_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
                        nn.GroupNorm(32, config.d_model),
                    )
                )
                in_channels = config.d_model
            # 将输入投影层列表转换为 ModuleList,并赋值给 self.input_proj
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            # 如果只有一个特征级别,创建单个输入投影层并赋值给 self.input_proj
            self.input_proj = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
                        nn.GroupNorm(32, config.d_model),
                    )
                ]
            )

        # 如果不是两阶段模型,创建查询位置编码层
        if not config.two_stage:
            self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)

        # 创建 DeformableDetrEncoder 和 DeformableDetrDecoder 实例,并赋值给 self.encoder 和 self.decoder
        self.encoder = DeformableDetrEncoder(config)
        self.decoder = DeformableDetrDecoder(config)

        # 创建级别嵌入参数,并赋值给 self.level_embed
        self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))

        # 如果是两阶段模型,创建额外的层和正则化
        if config.two_stage:
            self.enc_output = nn.Linear(config.d_model, config.d_model)
            self.enc_output_norm = nn.LayerNorm(config.d_model)
            self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
            self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
        else:
            # 如果不是两阶段模型,创建参考点层
            self.reference_points = nn.Linear(config.d_model, 2)

        # 执行初始化后的操作
        self.post_init()

    # 返回 encoder 对象
    def get_encoder(self):
        return self.encoder

    # 返回 decoder 对象
    def get_decoder(self):
        return self.decoder

    # 冻结 backbone 的参数
    def freeze_backbone(self):
        # 遍历 backbone 的模型参数,并设置为不可训练
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(False)
    def unfreeze_backbone(self):
        # 解冻模型的骨干网络(backbone)中的所有参数,使其可以进行梯度计算
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(True)

    def get_valid_ratio(self, mask, dtype=torch.float32):
        """Get the valid ratio of all feature maps."""
        
        # 获取掩码(mask)的高度和宽度
        _, height, width = mask.shape
        # 计算每个特征图在高度和宽度上的有效比例
        valid_height = torch.sum(mask[:, :, 0], 1)
        valid_width = torch.sum(mask[:, 0, :], 1)
        valid_ratio_height = valid_height.to(dtype) / height
        valid_ratio_width = valid_width.to(dtype) / width
        # 将高度和宽度的有效比例组合成一个张量
        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
        return valid_ratio
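
    # --- Illustrative aside (not part of the original file): a toy valid-ratio computation. For a batch
    # where image 0 fills the whole 32x32 canvas and image 1 only occupies its top-left 16x24 corner,
    # the per-image (width, height) ratios come out as follows (comments only, to keep the class intact):
    #
    #     mask = torch.zeros(2, 32, 32, dtype=torch.bool)
    #     mask[0] = True
    #     mask[1, :16, :24] = True
    #     valid_height = mask[:, :, 0].sum(1)   # valid rows, measured down the first column
    #     valid_width = mask[:, 0, :].sum(1)    # valid columns, measured along the first row
    #     torch.stack([valid_width / 32, valid_height / 32], -1)
    #     # -> tensor([[1.0000, 1.0000],
    #     #            [0.7500, 0.5000]])
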

    def get_proposal_pos_embed(self, proposals):
        """Get the position embedding of the proposals."""
        
        # 获取位置嵌入(position embedding)的维度
        num_pos_feats = self.config.d_model // 2
        temperature = 10000
        scale = 2 * math.pi

        # 生成维度张量,用于计算位置嵌入
        dim_t = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float()
        dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
        
        # 对提议框进行 sigmoid 转换,并乘以比例尺度
        proposals = proposals.sigmoid() * scale
        
        # 计算位置嵌入,将结果展开为(batch_size, num_queries, 512)的形式
        pos = proposals[:, :, :, None] / dim_t
        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
        return pos
    def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
        """Generate the encoder output proposals from encoded enc_output.

        Args:
            enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
            padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
            spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps.

        Returns:
            `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
                - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
                  directly predict a bounding box. (without the need of a decoder)
                - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
                  sigmoid.
        """
        batch_size = enc_output.shape[0]  # 获取批量大小
        proposals = []  # 初始化建议列表
        _cur = 0  # 当前处理的位置索引初始化为0
        for level, (height, width) in enumerate(spatial_shapes):  # 遍历空间形状列表
            mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)  # 根据当前级别的高度和宽度计算扁平化的掩码
            valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)  # 计算有效的高度
            valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)  # 计算有效的宽度

            grid_y, grid_x = meshgrid(
                torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device),
                torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device),
                indexing="ij",
            )  # 创建网格坐标

            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # 合并网格坐标

            scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)  # 计算比例
            grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale  # 根据比例调整网格
            width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level)  # 计算宽度和高度
            proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4)  # 构建建议
            proposals.append(proposal)  # 将建议添加到列表中
            _cur += height * width  # 更新当前位置索引

        output_proposals = torch.cat(proposals, 1)  # 合并所有建议
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)  # 确定有效的建议
        output_proposals = torch.log(output_proposals / (1 - output_proposals))  # 对建议进行逆sigmoid转换
        output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))  # 将填充位置置为无穷大
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))  # 将无效的建议位置置为无穷大

        # 每个像素分配为一个对象查询
        object_query = enc_output  # 使用编码输出作为对象查询
        object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))  # 将填充位置置为0
        object_query = object_query.masked_fill(~output_proposals_valid, float(0))  # 将无效的建议位置置为0
        object_query = self.enc_output_norm(self.enc_output(object_query))  # 对对象查询进行归一化处理
        return object_query, output_proposals  # 返回对象查询和输出建议
    # 给模型的前向传播方法添加文档字符串,文档字符串的内容来源于 DEFORMABLE_DETR_INPUTS_DOCSTRING
    @add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING)
    # 替换前向传播方法的返回文档字符串,指定输出类型为 DeformableDetrModelOutput,配置类为 _CONFIG_FOR_DOC
    @replace_return_docstrings(output_type=DeformableDetrModelOutput, config_class=_CONFIG_FOR_DOC)
    # 定义模型的前向传播方法
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
"""
Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
top, for tasks such as COCO detection.
"""
# 导入开始文档字符串装饰器和相关的模块文档字符串
@add_start_docstrings(
    """
    Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
    top, for tasks such as COCO detection.
    """,
    DEFORMABLE_DETR_START_DOCSTRING,
)
# 继承自预训练的 Deformable DETR 模型
class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
    # 当使用克隆时,所有大于 0 的层都将被克隆,但层 0 是必需的
    _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
    # 不能在元设备上初始化模型,因为某些权重在初始化过程中会被修改
    _no_split_modules = None

    def __init__(self, config: DeformableDetrConfig):
        super().__init__(config)

        # Deformable DETR encoder-decoder 模型
        self.model = DeformableDetrModel(config)

        # 放置在顶部的检测头
        self.class_embed = nn.Linear(config.d_model, config.num_labels)
        self.bbox_embed = DeformableDetrMLPPredictionHead(
            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
        )

        # 设置先验概率和偏置值
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value
        nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)

        # 如果是两阶段模型,最后的 class_embed 和 bbox_embed 用于区域提议生成
        num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
        if config.with_box_refine:
            self.class_embed = _get_clones(self.class_embed, num_pred)
            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
            # 对迭代式边界框细化的 hack 实现
            self.model.decoder.bbox_embed = self.bbox_embed
        else:
            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
            self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
            self.model.decoder.bbox_embed = None
        if config.two_stage:
            # 对两阶段模型的 hack 实现
            self.model.decoder.class_embed = self.class_embed
            for box_embed in self.bbox_embed:
                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)

        # 初始化权重并应用最终处理
        self.post_init()

    # 从 https://github.com/facebookresearch/detr/blob/master/models/detr.py 中获取的未使用的 torch.jit 注解
    @torch.jit.unused
    # 设置辅助损失函数,接受分类输出和坐标输出作为参数
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # 这是为了使 torchscript 能够正常工作的一种解决方法,因为 torchscript
        # 不支持包含非同质值的字典,例如既有张量又有列表的字典。
        # 返回一个列表,其中每个元素是一个字典,包含"logits"和"pred_boxes"两个键,分别对应 outputs_class 和 outputs_coord 的每个元素(除最后一个)。
        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

    # 将模型前向方法(forward)添加文档字符串
    @add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING)
    # 替换返回值的文档字符串为 DeformableDetrObjectDetectionOutput 类型,使用 _CONFIG_FOR_DOC 作为配置类
    @replace_return_docstrings(output_type=DeformableDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[List[dict]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
    """
    Compute the DICE loss, similar to generalized IOU for masks

    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs (0 for the negative class and 1 for the positive
                 class).
    """
    # 对模型输出进行 sigmoid 激活,使其在 (0, 1) 范围内
    inputs = inputs.sigmoid()
    # 将输入扁平化,以便计算损失
    inputs = inputs.flatten(1)
    # 计算 DICE 损失的分子部分
    numerator = 2 * (inputs * targets).sum(1)
    # 计算 DICE 损失的分母部分
    denominator = inputs.sum(-1) + targets.sum(-1)
    # 计算最终的 DICE 损失
    loss = 1 - (numerator + 1) / (denominator + 1)
    # 对所有样本的损失求和并取平均
    return loss.sum() / num_boxes
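
# --- Illustrative aside (not part of the original file): a toy numeric check of the DICE loss above,
# with the function re-declared locally so the snippet runs standalone:
def _demo_dice_loss(inputs, targets, num_boxes):
    inputs = inputs.sigmoid().flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    return (1 - (numerator + 1) / (denominator + 1)).sum() / num_boxes

_targets = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
_good_logits = torch.tensor([[8.0, 8.0, -8.0, -8.0]])  # confident, correct mask
_bad_logits = torch.tensor([[-8.0, -8.0, 8.0, 8.0]])   # confident, inverted mask
print(_demo_dice_loss(_good_logits, _targets, num_boxes=1))  # close to 0
print(_demo_dice_loss(_bad_logits, _targets, num_boxes=1))   # much larger (about 0.8 here)
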


# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`)
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        alpha (`float`, *optional*, defaults to `0.25`):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
        gamma (`int`, *optional*, defaults to `2`):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
    """
    # 对模型输出进行 sigmoid 激活,将其转换为概率值
    prob = inputs.sigmoid()
    # 使用二元交叉熵损失计算损失,reduction="none"表示不进行求和
    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # 计算 modulating factor
    p_t = prob * targets + (1 - prob) * (1 - targets)
    # 计算最终的 focal loss
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        # 计算 alpha 加权
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    # 对所有样本的损失求和并取平均
    return loss.mean(1).sum() / num_boxes


class DeformableDetrLoss(nn.Module):
    """
    This class computes the losses for `DeformableDetrForObjectDetection`. The process happens in two steps: 1) we
    compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
    matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`DeformableDetrHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        num_classes (`int`):
            Number of object categories, omitting the special no-object category.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`List[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    """
    def __init__(self, matcher, num_classes, focal_alpha, losses):
        super().__init__()
        self.matcher = matcher
        self.num_classes = num_classes
        self.focal_alpha = focal_alpha
        self.losses = losses

    # removed logging parameter, which was part of the original implementation
    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]
        """
        # Make sure the model outputs contain classification logits
        if "logits" not in outputs:
            raise KeyError("No logits were found in the outputs")
        # Classification logits predicted by the model
        source_logits = outputs["logits"]

        # Indices that select the matched predictions, in batch order
        idx = self._get_source_permutation_idx(indices)
        # Class labels of the matched ground-truth boxes
        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
        # Start from a tensor filled with the "no object" class index
        target_classes = torch.full(
            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
        )
        target_classes[idx] = target_classes_o

        # Build a one-hot encoding with one extra slot for the "no object" class
        target_classes_onehot = torch.zeros(
            [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
            dtype=source_logits.dtype,
            layout=source_logits.layout,
            device=source_logits.device,
        )
        # Scatter ones at the target class positions to obtain the one-hot encoding
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)

        # Drop the extra "no object" slot so unmatched queries get an all-zero target
        target_classes_onehot = target_classes_onehot[:, :, :-1]
        # Sigmoid focal loss over all queries, scaled by the number of queries
        loss_ce = (
            sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
            * source_logits.shape[1]
        )
        losses = {"loss_ce": loss_ce}

        return losses



    @torch.no_grad()
    # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.

        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
        """
        # Classification logits predicted by the model
        logits = outputs["logits"]
        device = logits.device
        # Number of ground-truth boxes in each image of the batch
        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
        # Count predictions whose argmax is not the "no object" (last) class
        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
        # L1 error between the predicted and the true number of boxes
        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
        losses = {"cardinality_error": card_err}
        return losses



    # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """
        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.

        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
        are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        # Make sure the model outputs contain predicted boxes
        if "pred_boxes" not in outputs:
            raise KeyError("No predicted boxes found in outputs")
        # Indices that select the matched predictions, in batch order
        idx = self._get_source_permutation_idx(indices)
        # Gather the matched predicted boxes and concatenate the matched target boxes in the same order
        source_boxes = outputs["pred_boxes"][idx]
        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

        # L1 regression loss between matched predicted and target boxes
        loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")

        losses = {}
        # Sum the L1 loss and normalize by the number of boxes
        losses["loss_bbox"] = loss_bbox.sum() / num_boxes

        # GIoU loss, taken from the diagonal of the pairwise generalized IoU matrix
        loss_giou = 1 - torch.diag(
            generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
        )
        # Sum the GIoU loss and normalize by the number of boxes
        losses["loss_giou"] = loss_giou.sum() / num_boxes
        return losses

    # Copied from DETR: returns the indices that permute the predictions (source side)
    def _get_source_permutation_idx(self, indices):
        # Permute the predictions following the matching indices
        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
        source_idx = torch.cat([source for (source, _) in indices])
        return batch_idx, source_idx

    # Copied from DETR: returns the indices that permute the targets (target side)
    def _get_target_permutation_idx(self, indices):
        # Permute the targets following the matching indices
        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
        target_idx = torch.cat([target for (_, target) in indices])
        return batch_idx, target_idx

    # Dispatch to the loss function selected by `loss` and call it
    def get_loss(self, loss, outputs, targets, indices, num_boxes):
        loss_map = {
            "labels": self.loss_labels,
            "cardinality": self.loss_cardinality,
            "boxes": self.loss_boxes,
        }
        if loss not in loss_map:
            raise ValueError(f"Loss {loss} not supported")
        return loss_map[loss](outputs, targets, indices, num_boxes)

    def forward(self, outputs, targets):
        """
        This performs the loss computation.

        Args:
             outputs (`dict`, *optional*):
                Dictionary of tensors, see the output specification of the model for the format.
             targets (`List[dict]`, *optional*):
                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
                losses applied, see each loss' doc.
        """
        # Filter out auxiliary outputs from the main outputs dictionary
        outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"}

        # Retrieve the indices that match the outputs with the corresponding targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the total number of target boxes for normalization
        num_boxes = sum(len(t["class_labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        world_size = 1

        # Adjust num_boxes and world_size if using the `accelerate` library
        if is_accelerate_available():
            if PartialState._shared_state != {}:
                num_boxes = reduce(num_boxes)
                world_size = PartialState().num_processes

        # Normalize num_boxes and clamp the result to ensure it's at least 1
        num_boxes = torch.clamp(num_boxes / world_size, min=1).item()

        # Compute all requested losses and store them in the losses dictionary
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # If there are auxiliary outputs, compute losses for each and append to the losses dictionary
        if "auxiliary_outputs" in outputs:
            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
                indices = self.matcher(auxiliary_outputs, targets)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        # If there are encoder outputs, compute losses specific to these outputs and add to the losses dictionary
        if "enc_outputs" in outputs:
            enc_outputs = outputs["enc_outputs"]
            bin_targets = copy.deepcopy(targets)
            for bt in bin_targets:
                bt["class_labels"] = torch.zeros_like(bt["class_labels"])  # Zero out class labels
            indices = self.matcher(enc_outputs, bin_targets)
            for loss in self.losses:
                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
                l_dict = {k + "_enc": v for k, v in l_dict.items()}
                losses.update(l_dict)

        # Return the computed losses dictionary
        return losses
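

# --- Example (editor's sketch, not part of the original source) -------------
# A minimal end-to-end sketch of how this criterion is typically wired up. It
# assumes the module-level imports of the original file (torch, scipy,
# center_to_corners_format, ...) are available; the shapes, class count and
# cost weights below are illustrative values, not taken from a specific config.
def _demo_deformable_detr_loss():
    matcher = DeformableDetrHungarianMatcher(class_cost=1, bbox_cost=5, giou_cost=2)
    criterion = DeformableDetrLoss(
        matcher, num_classes=91, focal_alpha=0.25, losses=["labels", "boxes", "cardinality"]
    )
    outputs = {
        "logits": torch.randn(2, 300, 91),  # (batch_size, num_queries, num_classes)
        "pred_boxes": torch.rand(2, 300, 4),  # (center_x, center_y, w, h), normalized
    }
    targets = [
        {"class_labels": torch.tensor([3, 17]), "boxes": torch.rand(2, 4)},
        {"class_labels": torch.tensor([0]), "boxes": torch.rand(1, 4)},
    ]
    loss_dict = criterion(outputs, targets)
    print({k: round(v.item(), 4) for k, v in loss_dict.items()})  # loss_ce, loss_bbox, loss_giou, cardinality_error

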
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
class DeformableDetrMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        # Stack of linear layers mapping input_dim -> hidden_dim -> ... -> output_dim
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        # Apply ReLU after every layer except the last one
        for i, layer in enumerate(self.layers):
            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
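

# --- Example (editor's sketch, not part of the original source) -------------
# How this prediction head is typically instantiated: a 3-layer MLP mapping
# each decoder hidden state to 4 normalized box coordinates. `input_dim=256`
# is an illustrative value, not taken from a specific config; `torch` is
# assumed to be imported as above.
def _demo_mlp_prediction_head():
    head = DeformableDetrMLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
    hidden_states = torch.randn(2, 300, 256)  # (batch_size, num_queries, d_model)
    boxes = head(hidden_states).sigmoid()  # squash the raw outputs to [0, 1]
    print(boxes.shape)  # torch.Size([2, 300, 4])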


class DeformableDetrHungarianMatcher(nn.Module):
    """
    This class computes an assignment between the targets and the predictions of the network.

    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).

    Args:
        class_cost:
            The relative weight of the classification error in the matching cost.
        bbox_cost:
            The relative weight of the L1 error of the bounding box coordinates in the matching cost.
        giou_cost:
            The relative weight of the giou loss of the bounding box in the matching cost.
    """

    def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
        super().__init__()
        # The matcher needs scipy for the Hungarian algorithm (linear_sum_assignment)
        requires_backends(self, ["scipy"])

        self.class_cost = class_cost
        self.bbox_cost = bbox_cost
        self.giou_cost = giou_cost
        # At least one cost term must be non-zero, otherwise matching is meaningless
        if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
            raise ValueError("All costs of the Matcher can't be 0")

    @torch.no_grad()
    def forward(self, outputs, targets):
        """
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
            targets (`List[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        batch_size, num_queries = outputs["logits"].shape[:2]

        # We flatten the batch and query dimensions to compute the cost matrices in one go
        out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        target_ids = torch.cat([v["class_labels"] for v in targets])  # class labels of all targets in the batch
        target_bbox = torch.cat([v["boxes"] for v in targets])  # box coordinates of all targets in the batch

        # Compute the classification cost.
        alpha = 0.25
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())  # focal-style cost for the negative class
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())  # focal-style cost for the positive class
        class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]  # classification cost w.r.t. each target's class

        # Compute the L1 cost between boxes
        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)

        # Compute the giou cost between boxes
        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))

        # Final cost matrix: weighted sum of the three cost terms
        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()  # reshape per image and move to CPU for scipy

        sizes = [len(v["boxes"]) for v in targets]  # number of ground-truth boxes per image
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]  # Hungarian matching per image

        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
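

# --- Example (editor's sketch, not part of the original source) -------------
# A tiny sketch of what the matcher returns: one (prediction_indices,
# target_indices) pair per image. Assumes torch, scipy and the helper
# functions of this file are available; shapes and cost weights are
# illustrative only.
def _demo_hungarian_matcher():
    matcher = DeformableDetrHungarianMatcher(class_cost=1, bbox_cost=5, giou_cost=2)
    outputs = {
        "logits": torch.randn(1, 10, 91),  # (batch_size, num_queries, num_classes)
        "pred_boxes": torch.rand(1, 10, 4),  # (center_x, center_y, w, h), normalized
    }
    targets = [{"class_labels": torch.tensor([5, 7]), "boxes": torch.rand(2, 4)}]
    indices = matcher(outputs, targets)
    print(indices)  # e.g. [(tensor([2, 9]), tensor([0, 1]))] -- two queries matched to the two targets

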
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
    """
    计算一组边界框的面积,这些边界框由它们的 (x1, y1, x2, y2) 坐标指定。

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            需要计算面积的边界框。它们应以 (x1, y1, x2, y2) 格式提供,其中 `0 <= x1 < x2` 和 `0 <= y1 < y2`。

    Returns:
        `torch.FloatTensor`: 包含每个边界框面积的张量。
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union
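

# --- Example (editor's sketch, not part of the original source) -------------
# A worked IoU example on two hand-picked corner-format boxes: the unit square
# and a square shifted by half its width overlap in half of each box.
# Assumes `torch` is imported as above.
def _demo_box_iou():
    boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
    boxes2 = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
    iou, union = box_iou(boxes1, boxes2)
    print(iou)  # 0.5 / 1.5 = 0.3333
    print(union)  # 1.0 + 1.0 - 0.5 = 1.5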


# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
    """
    来自 https://giou.stanford.edu/ 的广义 IoU 计算方法。边界框应处于 [x0, y0, x1, y1] (角点) 格式。

    Returns:
        `torch.FloatTensor`: 一个 [N, M] 的成对矩阵,其中 N = len(boxes1),M = len(boxes2)
    """
    # 退化的边界框会产生无穷大 / NaN 的结果,因此进行早期检查
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 必须以 [x0, y0, x1, y1] (角点) 格式提供,但给定的是 {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 必须以 [x0, y0, x1, y1] (角点) 格式提供,但给定的是 {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
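

# --- Example (editor's sketch, not part of the original source) -------------
# GIoU for the same pair of boxes as in the IoU example above: the smallest
# enclosing box has area 1.5, equal to the union, so GIoU reduces to plain IoU
# there; for disjoint boxes the enclosing-area penalty makes GIoU negative.
# Assumes `torch` is imported as above.
def _demo_generalized_box_iou():
    boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
    overlapping = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
    disjoint = torch.tensor([[2.0, 0.0, 3.0, 1.0]])
    print(generalized_box_iou(boxes1, overlapping))  # 0.3333 (same as IoU, enclosing area == union)
    print(generalized_box_iou(boxes1, disjoint))  # 0 - (3 - 2) / 3 = -0.3333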


# Copied from transformers.models.detr.modeling_detr._max_by_axis
def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    maxes = the_list[0]
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes


# A NestedTensor bundles a padded batch tensor together with an optional padding mask
class NestedTensor(object):
    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors  # padded batch tensor
        self.mask = mask  # optional padding mask (True at padded positions)

    # Move the nested tensor (and its mask, if any) to the given device
    def to(self, device):
        cast_tensor = self.tensors.to(device)
        mask = self.mask
        if mask is not None:
            cast_mask = mask.to(device)
        else:
            cast_mask = None
        return NestedTensor(cast_tensor, cast_mask)

    # Return the underlying tensor and mask
    def decompose(self):
        return self.tensors, self.mask

    # The string representation is that of the underlying tensor
    def __repr__(self):
        return str(self.tensors)


# Build a NestedTensor (padded batch tensor + mask) from a list of tensors
# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    if tensor_list[0].ndim == 3:  # only (channels, height, width) image tensors are supported
        max_size = _max_by_axis([list(img.shape) for img in tensor_list])  # maximum size along each axis over the batch
        batch_shape = [len(tensor_list)] + max_size  # shape of the padded batch tensor
        batch_size, num_channels, height, width = batch_shape
        dtype = tensor_list[0].dtype
        device = tensor_list[0].device
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)  # zero-padded batch tensor
        mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)  # mask starts as all True (padding)
        # Copy each image into the padded tensor and mark its valid region in the mask
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
            m[: img.shape[1], : img.shape[2]] = False  # False where real (non-padded) pixels are
    else:
        raise ValueError("Only 3-dimensional tensors are supported")
    return NestedTensor(tensor, mask)
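

# --- Example (editor's sketch, not part of the original source) -------------
# Padding two images of different sizes into one batch: the smaller image is
# zero-padded to the largest height/width, and the mask is True exactly on the
# padded positions. Assumes `torch` is imported as above.
def _demo_nested_tensor():
    images = [torch.rand(3, 480, 640), torch.rand(3, 400, 600)]  # two images, different sizes
    nested = nested_tensor_from_tensor_list(images)
    tensor, mask = nested.decompose()
    print(tensor.shape)  # torch.Size([2, 3, 480, 640])
    print(mask.shape)  # torch.Size([2, 480, 640])
    print(mask[1, 450:, :].all())  # tensor(True): rows beyond the second image's height are padding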