Transformers-源码解析-四十二-

Transformers 源码解析(四十二)

.\models\dpr\tokenization_dpr.py

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team, The Hugging Face Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DPR."""


import collections
from typing import List, Optional, Union

from ...tokenization_utils_base import BatchEncoding
from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
from ..bert.tokenization_bert import BertTokenizer


# 获取当前模块(__name__)对应的日志记录器对象
logger = logging.get_logger(__name__)

# 定义词汇文件名字典,包括词汇文件和分词器文件
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# 上下文编码器预训练模型的词汇文件映射字典
CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-ctx_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt"
        ),
        "facebook/dpr-ctx_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt"
        ),
    },
    "tokenizer_file": {
        "facebook/dpr-ctx_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json"
        ),
        "facebook/dpr-ctx_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json"
        ),
    },
}

# 问题编码器预训练模型的词汇文件映射字典
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-question_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt"
        ),
        "facebook/dpr-question_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt"
        ),
    },
    "tokenizer_file": {
        "facebook/dpr-question_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json"
        ),
        "facebook/dpr-question_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json"
        ),
    },
}

# 读者预训练模型的词汇文件映射字典
READER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-reader-single-nq-base": (
            "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt"
        ),
        "facebook/dpr-reader-multiset-base": (
            "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt"
        ),
    },
    # tokenizer_file 字典包含两个条目,每个条目的键是模型名称,值是其对应的 tokenizer.json 文件的 URL
    "tokenizer_file": {
        "facebook/dpr-reader-single-nq-base": (
            "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json"
        ),
        "facebook/dpr-reader-multiset-base": (
            "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json"
        ),
    },
}

# 定义用于不同模型的预训练位置嵌入大小的映射字典
CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-ctx_encoder-single-nq-base": 512,
    "facebook/dpr-ctx_encoder-multiset-base": 512,
}
QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-question_encoder-single-nq-base": 512,
    "facebook/dpr-question_encoder-multiset-base": 512,
}
READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-reader-single-nq-base": 512,
    "facebook/dpr-reader-multiset-base": 512,
}

# 定义用于不同模型的预训练初始化配置的字典
CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-ctx_encoder-single-nq-base": {"do_lower_case": True},
    "facebook/dpr-ctx_encoder-multiset-base": {"do_lower_case": True},
}
QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-question_encoder-single-nq-base": {"do_lower_case": True},
    "facebook/dpr-question_encoder-multiset-base": {"do_lower_case": True},
}
READER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-reader-single-nq-base": {"do_lower_case": True},
    "facebook/dpr-reader-multiset-base": {"do_lower_case": True},
}

# 定义一个自定义的类,继承自BertTokenizer,用于DPR上下文编码器的分词器
class DPRContextEncoderTokenizer(BertTokenizer):
    r"""
    Construct a DPRContextEncoder tokenizer.

    [`DPRContextEncoderTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
    splitting and wordpiece.

    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
    """

    # 设置词汇文件名列表
    vocab_files_names = VOCAB_FILES_NAMES
    # 设置预训练词汇文件的映射字典
    pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    # 设置最大模型输入大小的映射字典
    max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 设置预训练初始化配置的映射字典
    pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION

# 定义一个自定义的类,继承自BertTokenizer,用于DPR问题编码器的分词器
class DPRQuestionEncoderTokenizer(BertTokenizer):
    r"""
    Constructs a DPRQuestionEncoder tokenizer.

    [`DPRQuestionEncoderTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
    splitting and wordpiece.

    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
    """

    # 设置词汇文件名列表
    vocab_files_names = VOCAB_FILES_NAMES
    # 设置预训练词汇文件的映射字典
    pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    # 设置最大模型输入大小的映射字典
    max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 设置预训练初始化配置的映射字典
    pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION
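
这两个分词器类只是在 BertTokenizer 之上替换了预训练资源映射,用法与 BERT 完全一致。下面是一个简单的使用示意(非源码内容,假设环境已安装 transformers 且能访问 Hugging Face Hub):

# 示意:上下文编码器与问题编码器分词器的基本用法
from transformers import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer

ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# 分词行为与 BertTokenizer 相同:[CLS] ... [SEP],并按 do_lower_case=True 转为小写
ctx_inputs = ctx_tokenizer("Paris is the capital of France.", return_tensors="pt")
q_inputs = q_tokenizer("What is the capital of France?", return_tensors="pt")
print(ctx_inputs["input_ids"].shape, q_inputs["input_ids"].shape)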

# 定义一个命名元组,用于存储DPR阅读器的输出结果
DPRSpanPrediction = collections.namedtuple(
    "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"]
)

# 定义一个命名元组,用于存储DPR阅读器的输出结果
DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "end_logits", "relevance_logits"])
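
为了直观说明这两个 namedtuple 的结构,下面用假设的数值构造一组示例(仅作演示,数值为虚构):

# 示意:DPRReaderOutput 与 DPRSpanPrediction 都是普通的 namedtuple,可按字段名访问
example_output = DPRReaderOutput(start_logits=[[0.1, 0.9]], end_logits=[[0.2, 0.8]], relevance_logits=[0.7])
print(example_output.relevance_logits)  # [0.7]

example_span = DPRSpanPrediction(
    span_score=1.7, relevance_score=0.7, doc_id=0, start_index=1, end_index=1, text="Paris"
)
print(example_span.text, example_span.span_score)  # Paris 1.7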

# 自定义DPR阅读器文档字符串
CUSTOM_DPR_READER_DOCSTRING = r"""
    Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`.
    It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers),
    using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)`
    with the format:

        [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>

    Returns:
        `Dict[str, List[List[int]]]`: A dictionary containing the following keys:

        - `input_ids`: List of token IDs formatted as `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`.
        - `attention_mask`: List indicating which tokens should be attended to by the model.
    """
# 将自定义的文档字符串添加到类的注释中
@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
# 定义一个混合类,用于处理DPR阅读器的定制标记器
class CustomDPRReaderTokenizerMixin:
    # 实现__call__方法,使类实例可以像函数一样调用
    def __call__(
        self,
        questions,  # 输入的问题或问题列表
        titles: Optional[str] = None,  # 输入的标题或标题列表(可选)
        texts: Optional[str] = None,  # 输入的文本或文本列表(可选)
        padding: Union[bool, str] = False,  # 是否填充到最大长度或指定填充方法
        truncation: Union[bool, str] = False,  # 是否截断到最大长度或指定截断方法
        max_length: Optional[int] = None,  # 最大序列长度(可选)
        return_tensors: Optional[Union[str, TensorType]] = None,  # 返回的张量类型(可选)
        return_attention_mask: Optional[bool] = None,  # 是否返回注意力遮罩(可选)
        **kwargs,  # 其他关键字参数
    ) -> BatchEncoding:  # 返回类型为BatchEncoding对象
        # 如果标题和文本均为空,则调用父类的__call__方法
        if titles is None and texts is None:
            return super().__call__(
                questions,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )
        # 如果标题或文本有一个为空,则处理成对的标题-文本
        elif titles is None or texts is None:
            text_pair = titles if texts is None else texts
            return super().__call__(
                questions,
                text_pair,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )
        # 如果标题和文本均为单个字符串,则转换为列表
        titles = titles if not isinstance(titles, str) else [titles]
        texts = texts if not isinstance(texts, str) else [texts]
        n_passages = len(titles)  # 获取标题的数量
        # 如果问题是单个字符串,则复制为与标题数量相匹配的列表
        questions = questions if not isinstance(questions, str) else [questions] * n_passages
        # 检查标题和文本的数量是否相同,不同则引发值错误
        if len(titles) != len(texts):
            raise ValueError(
                f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
            )
        # 获取问题和标题的编码输入(input_ids)
        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
        # 获取文本的编码输入(input_ids),不添加特殊标记
        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
        # 构建编码输入字典
        encoded_inputs = {
            "input_ids": [
                (encoded_question_and_title + encoded_text)[:max_length]  # 若截断则截断到最大长度
                if max_length is not None and truncation
                else encoded_question_and_title + encoded_text  # 否则直接拼接
                for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts)
            ]
        }
        # 如果需要返回attention_mask,则生成对应的attention_mask
        if return_attention_mask is not False:
            attention_mask = []
            for input_ids in encoded_inputs["input_ids"]:
                attention_mask.append([int(input_id != self.pad_token_id) for input_id in input_ids])
            encoded_inputs["attention_mask"] = attention_mask
        # 调用pad方法进行填充处理,并返回结果
        return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors)
    def decode_best_spans(
        self,
        reader_input: BatchEncoding,
        reader_output: DPRReaderOutput,
        num_spans: int = 16,
        max_answer_length: int = 64,
        num_spans_per_passage: int = 4,
    ):
        """
        解码最佳跨度,用于从抽取式问答模型中找出一个段落的最佳答案跨度。
        按照 `span_score` 降序排列,保留最大的 `top_spans` 个跨度。忽略超过 `max_answer_length` 的跨度。

        注意:下面节选的函数体来自内部辅助方法 `_get_best_spans(start_logits, end_logits, max_answer_length, top_spans)`,
        其中出现的 `start_logits`、`end_logits`、`top_spans` 即该辅助方法的参数;`decode_best_spans` 本身会按相关性遍历
        各个段落,从 `reader_output` 中取出对应的 logits 后再调用它。
        """
        scores = []
        for start_index, start_score in enumerate(start_logits):
            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]):
                scores.append(((start_index, start_index + answer_length), start_score + end_score))
        # 根据得分降序排序所有跨度
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        chosen_span_intervals = []
        for (start_index, end_index), score in scores:
            # 检查跨度索引的合法性
            if start_index > end_index:
                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
            length = end_index - start_index + 1
            # 检查跨度长度是否超过最大答案长度
            if length > max_answer_length:
                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
            # 检查是否存在重叠的跨度
            if any(
                start_index <= prev_start_index <= prev_end_index <= end_index
                or prev_start_index <= start_index <= end_index <= prev_end_index
                for (prev_start_index, prev_end_index) in chosen_span_intervals
            ):
                continue
            chosen_span_intervals.append((start_index, end_index))

            # 如果已选出了指定数量的跨度,则停止
            if len(chosen_span_intervals) == top_spans:
                break
        return chosen_span_intervals
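
为了单独演示上面这段跨度选择逻辑,下面给出一个自包含的简化实现(仅作示意,函数名与数值均为假设,省略了源码中的合法性检查):

# 简化版的最佳跨度选择:枚举、打分、按得分降序挑选互不包含的跨度
def pick_best_spans(start_logits, end_logits, max_answer_length=10, top_spans=2):
    scores = []
    # 枚举所有 (起点, 终点) 组合,得分为起点 logit 与终点 logit 之和
    for start_index, start_score in enumerate(start_logits):
        for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]):
            scores.append(((start_index, start_index + answer_length), start_score + end_score))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    chosen = []
    for (start_index, end_index), _ in scores:
        # 跳过与已选跨度存在包含关系的候选
        if any(s <= start_index <= end_index <= e or start_index <= s <= e <= end_index for s, e in chosen):
            continue
        chosen.append((start_index, end_index))
        if len(chosen) == top_spans:
            break
    return chosen

print(pick_best_spans([0.1, 2.0, 0.3], [0.2, 0.1, 1.5]))  # [(1, 2), (0, 0)]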
# 使用自定义的文档字符串装饰器来添加文档字符串到类定义中,基于给定的自定义文档字符串
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
# 定义一个类 DPRReaderTokenizer,继承自 CustomDPRReaderTokenizerMixin 和 BertTokenizer
class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
    """
    Construct a DPRReader tokenizer.

    [`DPRReaderTokenizer`] is almost identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
    splitting and wordpiece. The difference is that it has three input strings: question, titles and texts that are
    combined to be fed to the [`DPRReader`] model.

    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
    """

    # 类属性:词汇表文件名列表,值为 VOCAB_FILES_NAMES
    vocab_files_names = VOCAB_FILES_NAMES
    # 类属性:预训练词汇文件映射,值为 READER_PRETRAINED_VOCAB_FILES_MAP
    pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP
    # 类属性:最大模型输入尺寸列表,值为 READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 类属性:预训练初始化配置,值为 READER_PRETRAINED_INIT_CONFIGURATION
    pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
    # 类属性:模型输入名称列表,包含 "input_ids" 和 "attention_mask"
    model_input_names = ["input_ids", "attention_mask"]
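
下面是阅读器分词器的一个调用示意(非源码内容,假设能从 Hub 下载词表),展示 questions/titles/texts 三路输入如何拼接为 `(n_passages, sequence_length)` 的矩阵:

from transformers import DPRReaderTokenizer

reader_tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
encoded = reader_tokenizer(
    questions="What is the capital of France?",
    titles=["Paris", "London"],
    texts=["Paris is the capital of France.", "London is the capital of the UK."],
    padding=True,
    return_tensors="pt",
)
# 每一行的格式为 [CLS] <question> [SEP] <title> [SEP] <text>
print(encoded["input_ids"].shape)       # (2, sequence_length)
print(encoded["attention_mask"].shape)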

.\models\dpr\tokenization_dpr_fast.py

# coding=utf-8
# 代码文件的编码声明,使用UTF-8格式

# 版权声明和许可信息,指明代码的版权归属和许可条件
# Copyright 2018 The HuggingFace Inc. team, The Hugging Face Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization classes for DPR."""
# 本模块提供了DPR模型的tokenization类

# 导入必要的库和模块
import collections
from typing import List, Optional, Union

# 导入基础的tokenization工具和辅助函数
from ...tokenization_utils_base import BatchEncoding
from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging

# 导入BERT模型的快速tokenization类
from ..bert.tokenization_bert_fast import BertTokenizerFast

# 导入DPR模型的tokenization类
from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer

# 获取日志记录器
logger = logging.get_logger(__name__)

# 定义词汇文件的名称映射
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# 上下文编码器预训练模型的词汇文件映射
CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-ctx_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt"
        ),
        "facebook/dpr-ctx_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt"
        ),
    },
    "tokenizer_file": {
        "facebook/dpr-ctx_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json"
        ),
        "facebook/dpr-ctx_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json"
        ),
    },
}

# 问题编码器预训练模型的词汇文件映射
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-question_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt"
        ),
        "facebook/dpr-question_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt"
        ),
    },
    "tokenizer_file": {
        "facebook/dpr-question_encoder-single-nq-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json"
        ),
        "facebook/dpr-question_encoder-multiset-base": (
            "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json"
        ),
    },
}

# 阅读器预训练模型的词汇文件映射
READER_PRETRAINED_VOCAB_FILES_MAP = {
    # 定义一个字典,存储不同模型名称到其对应的词汇表文件 URL 的映射关系
    "vocab_file": {
        "facebook/dpr-reader-single-nq-base": (
            "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt"
        ),
        "facebook/dpr-reader-multiset-base": (
            "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt"
        ),
    },
    # 定义一个字典,存储不同模型名称到其对应的分词器文件 URL 的映射关系
    "tokenizer_file": {
        "facebook/dpr-reader-single-nq-base": (
            "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json"
        ),
        "facebook/dpr-reader-multiset-base": (
            "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json"
        ),
    },
}

CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-ctx_encoder-single-nq-base": 512,  # 定义上下文编码器单一模型的位置编码大小为512
    "facebook/dpr-ctx_encoder-multiset-base": 512,   # 定义上下文编码器多集模型的位置编码大小为512
}
QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-question_encoder-single-nq-base": 512,   # 定义问题编码器单一模型的位置编码大小为512
    "facebook/dpr-question_encoder-multiset-base": 512,    # 定义问题编码器多集模型的位置编码大小为512
}
READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-reader-single-nq-base": 512,     # 定义阅读器单一模型的位置编码大小为512
    "facebook/dpr-reader-multiset-base": 512,      # 定义阅读器多集模型的位置编码大小为512
}


CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-ctx_encoder-single-nq-base": {"do_lower_case": True},    # 上下文编码器单一模型的初始化配置,分词时将文本转为小写
    "facebook/dpr-ctx_encoder-multiset-base": {"do_lower_case": True},     # 上下文编码器多集模型的初始化配置,分词时将文本转为小写
}
QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-question_encoder-single-nq-base": {"do_lower_case": True},   # 问题编码器单一模型的初始化配置,分词时将文本转为小写
    "facebook/dpr-question_encoder-multiset-base": {"do_lower_case": True},    # 问题编码器多集模型的初始化配置,分词时将文本转为小写
}
READER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-reader-single-nq-base": {"do_lower_case": True},    # 阅读器单一模型的初始化配置,分词时将文本转为小写
    "facebook/dpr-reader-multiset-base": {"do_lower_case": True},     # 阅读器多集模型的初始化配置,分词时将文本转为小写
}


class DPRContextEncoderTokenizerFast(BertTokenizerFast):
    r"""
    Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's *tokenizers* library).

    [`DPRContextEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
    punctuation splitting and wordpiece.

    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES   # 设置词汇文件的名称列表为已定义的全局变量 VOCAB_FILES_NAMES
    pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP   # 预训练词汇文件的映射表为上下文编码器的预定义映射
    max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES   # 最大模型输入尺寸为上下文编码器的位置编码大小
    pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION   # 预训练模型的初始化配置为上下文编码器的初始化配置
    slow_tokenizer_class = DPRContextEncoderTokenizer   # 慢速分词器类为 DPRContextEncoderTokenizer


class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
    r"""
    Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's *tokenizers* library).

    [`DPRQuestionEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
    punctuation splitting and wordpiece.

    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES   # 设置词汇文件的名称列表为已定义的全局变量 VOCAB_FILES_NAMES
    pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP   # 预训练词汇文件的映射表为问题编码器的预定义映射
    max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES   # 最大模型输入尺寸为问题编码器的位置编码大小
    pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION   # 预训练模型的初始化配置为问题编码器的初始化配置
    slow_tokenizer_class = DPRQuestionEncoderTokenizer   # 慢速分词器类为 DPRQuestionEncoderTokenizer
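
快速分词器与慢速分词器共享同一套词表与配置,编码结果应当一致,可以用下面的小检查验证(示意,需要能访问 Hub):

from transformers import DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast

slow_tok = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
fast_tok = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

text = "Who wrote Hamlet?"
assert slow_tok(text)["input_ids"] == fast_tok(text)["input_ids"]  # 两者应产生相同的 token id 序列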


DPRSpanPrediction = collections.namedtuple(
    "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"]
)

DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "end_logits", "relevance_logits"])


CUSTOM_DPR_READER_DOCSTRING = r"""
    返回一个包含输入字符串的 token id 及其他信息的字典,用于传递给 `.decode_best_spans` 函数。
    使用分词器和词汇表将问题和不同段落(标题和文本)的字符串转换为一系列整数 ID。得到的 `input_ids`
    是一个大小为 `(n_passages, sequence_length)` 的矩阵,其格式为:

        [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>

    返回:
        `Dict[str, List[List[int]]]`: 包含以下键的字典:

        - `input_ids`: 要输入模型的 token id 列表。
        - `attention_mask`: 指定模型应关注哪些 token 的索引列表。
    """
# 将自定义的文档字符串添加到类上,通常用于API文档生成
@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
# 定义一个混合类,用于处理DPR Reader的自定义Tokenizer功能
class CustomDPRReaderTokenizerMixin:
    # 定义__call__方法,使对象可以像函数一样调用
    def __call__(
        self,
        questions,  # 输入的问题或问题列表
        titles: Optional[str] = None,  # 可选参数,输入的标题或单个标题字符串
        texts: Optional[str] = None,  # 可选参数,输入的文本或单个文本字符串
        padding: Union[bool, str] = False,  # 是否进行填充,可以是布尔值或填充策略字符串
        truncation: Union[bool, str] = False,  # 是否进行截断,可以是布尔值或截断策略字符串
        max_length: Optional[int] = None,  # 可选参数,最大长度限制
        return_tensors: Optional[Union[str, TensorType]] = None,  # 返回张量类型
        return_attention_mask: Optional[bool] = None,  # 是否返回注意力遮罩
        **kwargs,  # 其他未命名的关键字参数
    ) -> BatchEncoding:  # 返回值为BatchEncoding类型的对象
        # 如果标题和文本均未提供,则直接调用父类的__call__方法
        if titles is None and texts is None:
            return super().__call__(
                questions,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )
        # 如果标题或文本中有一个为None,则将其作为文本对处理
        elif titles is None or texts is None:
            text_pair = titles if texts is None else texts
            return super().__call__(
                questions,
                text_pair,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )
        # 如果titles是字符串,则转换为列表
        titles = titles if not isinstance(titles, str) else [titles]
        # 如果texts是字符串,则转换为列表
        texts = texts if not isinstance(texts, str) else [texts]
        # 计算标题的数量,作为文本对的数量
        n_passages = len(titles)
        # 如果问题是字符串,则复制为问题列表,使每个问题对应一个文本对
        questions = questions if not isinstance(questions, str) else [questions] * n_passages
        # 断言标题和文本的数量应该相同
        assert len(titles) == len(
            texts
        ), f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
        # 调用父类的__call__方法对问题和标题进行编码,禁用填充和截断
        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
        # 调用父类的__call__方法对文本进行编码,禁用特殊令牌、填充和截断
        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
        # 合并编码后的问题和标题与文本,并根据最大长度和截断策略进行处理
        encoded_inputs = {
            "input_ids": [
                (encoded_question_and_title + encoded_text)[:max_length]
                if max_length is not None and truncation
                else encoded_question_and_title + encoded_text
                for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts)
            ]
        }
        # 只要未显式要求不返回 attention_mask,就为每个样本构建注意力遮罩列表(pad_token 位置为 0)
        if return_attention_mask is not False:
            attention_mask = []
            for input_ids in encoded_inputs["input_ids"]:
                attention_mask.append([int(input_id != self.pad_token_id) for input_id in input_ids])
            encoded_inputs["attention_mask"] = attention_mask
        # 调用pad方法对编码输入进行填充,根据填充策略和最大长度进行处理
        return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors)
    def decode_best_spans(
        self,
        reader_input: BatchEncoding,
        reader_output: DPRReaderOutput,
        num_spans: int = 16,
        max_answer_length: int = 64,
        num_spans_per_passage: int = 4,
    ):
        """
        解码最佳跨度的函数,用于从抽取式问答模型中找出一个段落的最佳答案跨度。它按照降序的 `span_score` 排序,
        并保留最多 `top_spans` 个跨度。超过 `max_answer_length` 的跨度将被忽略。

        注意:下面节选的函数体来自内部辅助方法 `_get_best_spans(start_logits, end_logits, max_answer_length, top_spans)`,
        其中出现的 `start_logits`、`end_logits`、`top_spans` 即该辅助方法的参数。
        """
        scores = []
        for start_index, start_score in enumerate(start_logits):
            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]):
                scores.append(((start_index, start_index + answer_length), start_score + end_score))
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        chosen_span_intervals = []
        for (start_index, end_index), score in scores:
            assert start_index <= end_index, f"Wrong span indices: [{start_index}:{end_index}]"
            length = end_index - start_index + 1
            assert length <= max_answer_length, f"Span is too long: {length} > {max_answer_length}"
            if any(
                start_index <= prev_start_index <= prev_end_index <= end_index
                or prev_start_index <= start_index <= end_index <= prev_end_index
                for (prev_start_index, prev_end_index) in chosen_span_intervals
            ):
                continue
            chosen_span_intervals.append((start_index, end_index))

            if len(chosen_span_intervals) == top_spans:
                break
        return chosen_span_intervals
# 应用装饰器 @add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING) 来添加自定义文档字符串到类 DPRReaderTokenizerFast
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
# 声明 DPRReaderTokenizerFast 类,继承自 CustomDPRReaderTokenizerMixin 和 BertTokenizerFast
class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
    # 构造函数说明
    r"""
    构造一个“快速” DPRReader 分词器(由 HuggingFace 的 *tokenizers* 库支持)。

    [`DPRReaderTokenizerFast`] 几乎与 [`BertTokenizerFast`] 相同,并运行端到端的分词:
    标点符号拆分和 wordpiece。区别在于它有三个输入字符串:问题、标题和文本,这些被组合后供 [`DPRReader`] 模型使用。

    参考超类 [`BertTokenizerFast`] 以获取有关参数的使用示例和文档。

    """

    # 定义词汇文件的名称
    vocab_files_names = VOCAB_FILES_NAMES
    # 定义预训练词汇文件的映射
    pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP
    # 定义模型最大输入大小
    max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 定义预训练初始化配置
    pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
    # 模型输入名称列表
    model_input_names = ["input_ids", "attention_mask"]
    # 慢速分词器类的定义
    slow_tokenizer_class = DPRReaderTokenizer
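
把上面的分词器、模型输出和 decode_best_spans 串起来,就是一个完整的抽取式问答流程。下面是一个端到端的使用示意(非源码内容,假设已安装 PyTorch 并能下载预训练权重):

import torch
from transformers import DPRReader, DPRReaderTokenizerFast

tokenizer = DPRReaderTokenizerFast.from_pretrained("facebook/dpr-reader-single-nq-base")
model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")

encoded_inputs = tokenizer(
    questions=["What is the capital of France?"],
    titles=["Paris"],
    texts=["Paris is the capital and most populous city of France."],
    return_tensors="pt",
)
with torch.no_grad():
    outputs = model(**encoded_inputs)

# decode_best_spans 结合 relevance_logits 与 start/end logits,返回 DPRSpanPrediction 列表
predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs, num_spans=1)
print(predicted_spans[0].text)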

.\models\dpr\__init__.py

# 引入必要的模块和类型检查
from typing import TYPE_CHECKING

# 从相对路径引入工具函数和异常类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义模块导入结构的字典,用于延迟加载模块
_import_structure = {
    "configuration_dpr": ["DPR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPRConfig"],
    "tokenization_dpr": [
        "DPRContextEncoderTokenizer",
        "DPRQuestionEncoderTokenizer",
        "DPRReaderOutput",
        "DPRReaderTokenizer",
    ],
}

# 检查是否可用 tokenizers,若不可用则抛出异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若可用,则添加快速 tokenization_dpr_fast 模块到导入结构字典中
    _import_structure["tokenization_dpr_fast"] = [
        "DPRContextEncoderTokenizerFast",
        "DPRQuestionEncoderTokenizerFast",
        "DPRReaderTokenizerFast",
    ]

# 检查是否可用 torch,若不可用则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若可用,则添加 modeling_dpr 模块到导入结构字典中
    _import_structure["modeling_dpr"] = [
        "DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPRContextEncoder",
        "DPRPretrainedContextEncoder",
        "DPRPreTrainedModel",
        "DPRPretrainedQuestionEncoder",
        "DPRPretrainedReader",
        "DPRQuestionEncoder",
        "DPRReader",
    ]

# 检查是否可用 tensorflow,若不可用则抛出异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若可用,则添加 modeling_tf_dpr 模块到导入结构字典中
    _import_structure["modeling_tf_dpr"] = [
        "TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFDPRContextEncoder",
        "TFDPRPretrainedContextEncoder",
        "TFDPRPretrainedQuestionEncoder",
        "TFDPRPretrainedReader",
        "TFDPRQuestionEncoder",
        "TFDPRReader",
    ]

# 如果是类型检查阶段,导入必要的类型和模块
if TYPE_CHECKING:
    from .configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
    from .tokenization_dpr import (
        DPRContextEncoderTokenizer,
        DPRQuestionEncoderTokenizer,
        DPRReaderOutput,
        DPRReaderTokenizer,
    )

    # 检查是否可用 tokenizers,在类型检查阶段也进行检查
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 否则,从当前目录下的tokenization_dpr_fast模块中导入以下快速tokenizer类
        from .tokenization_dpr_fast import (
            DPRContextEncoderTokenizerFast,
            DPRQuestionEncoderTokenizerFast,
            DPRReaderTokenizerFast,
        )

    try:
        # 检查是否已经安装了torch依赖
        if not is_torch_available():
            # 如果没有安装,抛出OptionalDependencyNotAvailable异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # 如果捕获到OptionalDependencyNotAvailable异常,则不进行任何操作
        pass
    else:
        # 否则,从当前目录下的modeling_dpr模块中导入以下内容
        from .modeling_dpr import (
            DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPRContextEncoder,
            DPRPretrainedContextEncoder,
            DPRPreTrainedModel,
            DPRPretrainedQuestionEncoder,
            DPRPretrainedReader,
            DPRQuestionEncoder,
            DPRReader,
        )

    try:
        # 检查是否已经安装了tensorflow依赖
        if not is_tf_available():
            # 如果没有安装,抛出OptionalDependencyNotAvailable异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # 如果捕获到OptionalDependencyNotAvailable异常,则不进行任何操作
        pass
    else:
        # 否则,从当前目录下的modeling_tf_dpr模块中导入以下内容
        from .modeling_tf_dpr import (
            TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFDPRContextEncoder,
            TFDPRPretrainedContextEncoder,
            TFDPRPretrainedQuestionEncoder,
            TFDPRPretrainedReader,
            TFDPRQuestionEncoder,
            TFDPRReader,
        )
else:
    # 导入 sys 模块,用于在运行时动态操作 Python 解释器
    import sys

    # 将当前模块的名称添加到 sys.modules 中,并指定为一个懒加载模块对象
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
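
可以用下面的小实验观察懒加载的效果(示意,_LazyModule 的内部实现与模块路径以安装的 transformers 版本为准):

import transformers.models.dpr as dpr

# 此时尚未真正导入 torch/tensorflow 相关子模块
print(type(dpr).__name__)                # 通常为 _LazyModule
print("DPRReaderTokenizer" in dir(dpr))  # True,名字来自 _import_structure
tokenizer_cls = dpr.DPRReaderTokenizer   # 第一次访问属性时才触发 tokenization_dpr 的导入
print(tokenizer_cls.__name__)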

.\models\dpt\configuration_dpt.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DPT model configuration"""

# 引入必要的模块和类
import copy

# 从配置工具中导入预训练配置类
from ...configuration_utils import PretrainedConfig
# 导入日志模块
from ...utils import logging
# 从自动配置中导入配置映射
from ..auto.configuration_auto import CONFIG_MAPPING
# 从bit模块导入BitConfig类
from ..bit import BitConfig

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义预训练模型与配置文件的映射关系
DPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Intel/dpt-large": "https://huggingface.co/Intel/dpt-large/resolve/main/config.json",
    # See all DPT models at https://huggingface.co/models?filter=dpt
}

# 定义DPTConfig类,继承自PretrainedConfig类
class DPTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DPT
    [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import DPTModel, DPTConfig

    >>> # Initializing a DPT dpt-large style configuration
    >>> configuration = DPTConfig()

    >>> # Initializing a model from the dpt-large style configuration
    >>> model = DPTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # 指定模型类型为"dpt"
    model_type = "dpt"
    def __init__(
        self,
        hidden_size=768,  # 设置隐藏层大小,默认为768
        num_hidden_layers=12,  # 设置隐藏层数量,默认为12
        num_attention_heads=12,  # 设置注意力头的数量,默认为12
        intermediate_size=3072,  # 设置中间层的大小,默认为3072
        hidden_act="gelu",  # 设置隐藏层激活函数,默认为'gelu'
        hidden_dropout_prob=0.0,  # 设置隐藏层的丢弃概率,默认为0.0
        attention_probs_dropout_prob=0.0,  # 设置注意力概率的丢弃概率,默认为0.0
        initializer_range=0.02,  # 设置初始化范围,默认为0.02
        layer_norm_eps=1e-12,  # 设置层标准化的 epsilon 值,默认为1e-12
        image_size=384,  # 设置输入图像的尺寸,默认为384
        patch_size=16,  # 设置图像切片的大小,默认为16
        num_channels=3,  # 设置输入图像的通道数,默认为3
        is_hybrid=False,  # 设置是否为混合模型,默认为False
        qkv_bias=True,  # 设置 QKV 是否使用偏置,默认为True
        backbone_out_indices=[2, 5, 8, 11],  # 设置骨干网络输出的层索引,默认为[2, 5, 8, 11]
        readout_type="project",  # 设置读取类型,默认为'project'
        reassemble_factors=[4, 2, 1, 0.5],  # 设置重组因子,默认为[4, 2, 1, 0.5]
        neck_hidden_sizes=[96, 192, 384, 768],  # 设置颈部(neck)各隐藏层的大小,默认为[96, 192, 384, 768]
        fusion_hidden_size=256,  # 设置融合层隐藏层的大小,默认为256
        head_in_index=-1,  # 设置头部输入的索引,默认为-1
        use_batch_norm_in_fusion_residual=False,  # 设置在融合残差中是否使用批量归一化,默认为False
        use_bias_in_fusion_residual=None,  # 设置融合残差中是否使用偏置,默认为None
        add_projection=False,  # 设置是否添加投影层,默认为False
        use_auxiliary_head=True,  # 设置是否使用辅助头,默认为True
        auxiliary_loss_weight=0.4,  # 设置辅助损失权重,默认为0.4
        semantic_loss_ignore_index=255,  # 设置语义损失忽略索引,默认为255
        semantic_classifier_dropout=0.1,  # 设置语义分类器的丢弃概率,默认为0.1
        backbone_featmap_shape=[1, 1024, 24, 24],  # 设置骨干特征图的形状,默认为[1, 1024, 24, 24]
        neck_ignore_stages=[0, 1],  # 设置忽略的颈部阶段,默认为[0, 1]
        backbone_config=None,  # 设置骨干配置,默认为None
        backbone=None,  # 设置骨干网络,默认为None
        use_pretrained_backbone=False,  # 设置是否使用预训练骨干,默认为False
        use_timm_backbone=False,  # 设置是否使用 timm 骨干,默认为False
        backbone_kwargs=None,  # 设置骨干网络的关键字参数,默认为None
        **kwargs,  # 允许额外的关键字参数
    ):
        """
        初始化方法,设置模型的各种参数。
        """
        super().__init__(**kwargs)  # 调用父类的初始化方法

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)  # 深拷贝当前实例的字典属性

        if output["backbone_config"] is not None:  # 如果骨干配置不为空
            output["backbone_config"] = self.backbone_config.to_dict()  # 将骨干配置转换为字典

        output["model_type"] = self.__class__.model_type  # 设置模型类型为当前类的模型类型
        return output  # 返回字典表示
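
下面是 DPTConfig 的一个简单使用示意(非源码内容):默认参数即对应 Intel/dpt-large 风格的配置,to_dict 会把嵌套的 backbone_config(若存在)一并展开为字典:

from transformers import DPTConfig

config = DPTConfig()
config_dict = config.to_dict()
print(config_dict["model_type"])         # dpt
print(config_dict["hidden_size"])        # 768
print(config_dict["neck_hidden_sizes"])  # [96, 192, 384, 768]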

.\models\dpt\convert_dinov2_depth_to_hf.py

# coding=utf-8
# 指定编码格式为 UTF-8

# Copyright 2023 The HuggingFace Inc. team.
# 版权声明,声明代码版权归 HuggingFace Inc. 团队所有。

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 进行许可

# you may not use this file except in compliance with the License.
# 除非遵守许可证规定,否则不得使用此文件。

# You may obtain a copy of the License at
# 可以从以下链接获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 在适用法律要求或书面同意的情况下,根据许可证分发的软件是基于"原样"分发的,没有任何明示或暗示的担保或条件。

# See the License for the specific language governing permissions and
# 请查阅许可证了解具体的权限和限制

# limitations under the License.
# 许可证下的限制。

"""Convert DINOv2 + DPT checkpoints from the original repository. URL:
https://github.com/facebookresearch/dinov2/tree/main"""
# 代码的简要描述和参考链接

import argparse
import itertools
import math
from pathlib import Path

import requests
import torch
from PIL import Image
from torchvision import transforms

from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
from transformers.utils import logging

# 设置日志输出级别为 info
logging.set_verbosity_info()

# 获取 logger 实例
logger = logging.get_logger(__name__)


def get_dpt_config(model_name):
    if "small" in model_name:
        # 使用预训练的 Dinov2Config,选择特定的输出索引和参数设置
        backbone_config = Dinov2Config.from_pretrained(
            "facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False
        )
        # 设置 neck 层的隐藏层大小
        neck_hidden_sizes = [48, 96, 192, 384]
    elif "base" in model_name:
        backbone_config = Dinov2Config.from_pretrained(
            "facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False
        )
        neck_hidden_sizes = [96, 192, 384, 768]
    elif "large" in model_name:
        backbone_config = Dinov2Config.from_pretrained(
            "facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False
        )
        neck_hidden_sizes = [128, 256, 512, 1024]
    elif "giant" in model_name:
        backbone_config = Dinov2Config.from_pretrained(
            "facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False
        )
        neck_hidden_sizes = [192, 384, 768, 1536]
    else:
        # 若未指定模型名称,抛出未实现的错误
        raise NotImplementedError("To do")

    # 创建 DPTConfig 实例
    config = DPTConfig(
        backbone_config=backbone_config,
        neck_hidden_sizes=neck_hidden_sizes,
        use_bias_in_fusion_residual=False,
        add_projection=True,
    )

    return config
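
调用示意(非源码内容,需要能从 Hub 下载对应的 Dinov2 配置):

config = get_dpt_config("dpt-dinov2-small-nyu")
print(type(config).__name__)                  # DPTConfig
print(type(config.backbone_config).__name__)  # Dinov2Config
print(config.neck_hidden_sizes)               # [48, 96, 192, 384]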


# here we list all DPT keys to be renamed (original name on the left, our name on the right)
# 列出需要重命名的所有 DPT 键(左边为原始名称,右边为新名称)
def create_rename_keys_dpt(config):
    rename_keys = []

    # fmt: off
    # 格式化关闭,用于避免 IDE 格式化工具干扰代码的排版
    # activation postprocessing (projections, readout projections + resize blocks)
    for i in range(4):
        # 添加重命名键值对,将"decode_head.reassemble_blocks.projects.{i}.conv.weight"映射到"neck.reassemble_stage.layers.{i}.projection.weight"
        rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight"))
        # 添加重命名键值对,将"decode_head.reassemble_blocks.projects.{i}.conv.bias"映射到"neck.reassemble_stage.layers.{i}.projection.bias"
        rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias"))

        # 添加重命名键值对,将"decode_head.reassemble_blocks.readout_projects.{i}.0.weight"映射到"neck.reassemble_stage.readout_projects.{i}.0.weight"
        rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight"))
        # 添加重命名键值对,将"decode_head.reassemble_blocks.readout_projects.{i}.0.bias"映射到"neck.reassemble_stage.readout_projects.{i}.0.bias"
        rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias"))

        # 如果i不等于2,则添加重命名键值对,将"decode_head.reassemble_blocks.resize_layers.{i}.weight"映射到"neck.reassemble_stage.layers.{i}.resize.weight"
        if i != 2:
            rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight"))
            # 添加重命名键值对,将"decode_head.reassemble_blocks.resize_layers.{i}.bias"映射到"neck.reassemble_stage.layers.{i}.resize.bias"
            rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias"))

    # fusion layers
    for i in range(4):
        # 添加重命名键值对,将"decode_head.fusion_blocks.{i}.project.conv.weight"映射到"neck.fusion_stage.layers.{i}.projection.weight"
        rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight"))
        # 添加重命名键值对,将"decode_head.fusion_blocks.{i}.project.conv.bias"映射到"neck.fusion_stage.layers.{i}.projection.bias"
        rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", f"neck.fusion_stage.layers.{i}.projection.bias"))
        
        # 如果i不等于0,则添加重命名键值对,将"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight"映射到"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight"
        if i != 0:
            rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight"))
            # 添加重命名键值对,将"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight"映射到"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight"
            rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight"))
        
        # 添加重命名键值对,将"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight"映射到"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight"
        rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight"))
        # 添加重命名键值对,将"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight"映射到"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight"
        rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight"))

    # neck convolutions
    for i in range(4):
        # 添加重命名键值对,将"decode_head.convs.{i}.conv.weight"映射到"neck.convs.{i}.weight"
        rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight"))

    # head
    # 添加重命名键值对,将"decode_head.project.conv.weight"映射到"head.projection.weight"
    rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight"))
    # 添加重命名键值对,将"decode_head.project.conv.bias"映射到"head.projection.bias"
    rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias"))

    for i in range(0, 5, 2):
        # 添加重命名键值对,将"decode_head.conv_depth.head.{i}.weight"映射到"head.head.{i}.weight"
        rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight"))
        # 添加重命名键值对,将"decode_head.conv_depth.head.{i}.bias"映射到"head.head.{i}.bias"
        rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias"))

    # 返回所有重命名的键值对列表
    return rename_keys
# 定义函数:创建用于重命名骨干网络参数的键列表
def create_rename_keys_backbone(config):
    # 初始化一个空的重命名键列表
    rename_keys = []

    # fmt: off
    # 开始忽略格式化,便于在下面进行嵌套的键值对添加
    # patch embedding layer
    # 添加需要重命名的键值对:("原始名称", "我们的名称")
    rename_keys.append(("cls_token", "backbone.embeddings.cls_token"))
    rename_keys.append(("mask_token", "backbone.embeddings.mask_token"))
    rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings"))
    rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
    rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))

    # Transfomer encoder
    # 对于每一个编码器层进行迭代
    for i in range(config.backbone_config.num_hidden_layers):
        # layernorms
        rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight"))
        rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias"))
        rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight"))
        rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias"))

        # MLP
        # 根据配置选择使用不同的MLP结构
        if config.backbone_config.use_swiglu_ffn:
            rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight"))
            rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias"))
            rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight"))
            rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias"))
        else:
            rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight"))
            rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias"))
            rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight"))
            rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias"))

        # layerscale
        rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1"))
        rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1"))

        # attention projection layer
        rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight"))
        rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias"))
    # fmt: on

    # 添加最后两个需要重命名的键值对
    rename_keys.append(("norm.weight", "backbone.layernorm.weight"))
    rename_keys.append(("norm.bias", "backbone.layernorm.bias"))

    # 返回最终的重命名键列表
    return rename_keys

# 将每个编码器层合并存储的 QKV 权重与偏置拆分为独立的 query、key、value 条目
def read_in_q_k_v(state_dict, config):
    # 遍历指定范围内的隐藏层数量,按顺序处理每一层
    for i in range(config.backbone_config.num_hidden_layers):
        # 弹出当前层的注意力机制的查询、键、值的权重和偏置
        in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
        in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
        # 获取隐藏层的大小
        hidden_size = config.backbone_config.hidden_size
        
        # 将查询(query)、键(key)、值(value)依次添加到状态字典中
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:]
# 从字典中删除旧键,将其对应的值存储在变量val中
def rename_key(dct, old, new):
    val = dct.pop(old)
    # 将旧键对应的值以新键的形式重新插入字典中
    dct[new] = val

# 下载一个可爱猫咪图片并返回其Image对象
def prepare_img():
    # 图片的URL地址
    url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg"
    # 使用requests库获取图片的原始数据流,并由PIL库打开为Image对象
    im = Image.open(requests.get(url, stream=True).raw)
    return im

# 包含模型名称到其对应预训练权重文件URL的字典
name_to_url = {
    "dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth",
    "dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth",
    "dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth",
    "dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth",
    "dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth",
    "dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth",
    "dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth",
    "dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth",
}

# 获取图像的原始像素值
def get_original_pixel_values(image):
    # 定义一个用于图像预处理的类CenterPadding
    class CenterPadding(object):
        def __init__(self, multiple):
            super().__init__()
            self.multiple = multiple

        # 计算填充值以使图像大小成为multiple的整数倍
        def _get_pad(self, size):
            new_size = math.ceil(size / self.multiple) * self.multiple
            pad_size = new_size - size
            pad_size_left = pad_size // 2
            pad_size_right = pad_size - pad_size_left
            return pad_size_left, pad_size_right

        # 对图像进行填充操作
        def __call__(self, img):
            pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1]))
            output = torch.nn.functional.pad(img, pads)
            return output

        # 返回类的描述字符串
        def __repr__(self):
            return self.__class__.__name__ + "()"

    # 定义图像转换的函数make_depth_transform
    def make_depth_transform() -> transforms.Compose:
        return transforms.Compose(
            [
                transforms.ToTensor(),  # 将图像转换为Tensor
                lambda x: 255.0 * x[:3],  # 丢弃alpha通道并将像素值缩放到0-255范围
                transforms.Normalize(
                    mean=(123.675, 116.28, 103.53),
                    std=(58.395, 57.12, 57.375),
                ),
                CenterPadding(multiple=14),  # 使用CenterPadding类进行图像填充
            ]
        )

    # 创建图像转换操作
    transform = make_depth_transform()
    # 对输入的图像应用转换操作,并在第0维度增加一个维度
    original_pixel_values = transform(image).unsqueeze(0)

    return original_pixel_values
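
上面 CenterPadding._get_pad 的填充量计算可以用下面的小例子直观验证(复用同一公式,数值仅作示意):

import math

def get_pad(size, multiple=14):
    # 先向上取整到 multiple 的整数倍,再把差值平均分到两侧
    new_size = math.ceil(size / multiple) * multiple
    pad = new_size - size
    return pad // 2, pad - pad // 2

print(get_pad(640))  # (2, 2):宽 640 会被填充到 644
print(get_pad(480))  # (5, 5):高 480 会被填充到 490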

# 用于无梯度计算的装饰器,用于转换DPT模型的检查点
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
    """
    Copy/paste/tweak model's weights to our DPT structure.
    """

    # 根据模型名称获取检查点的URL
    checkpoint_url = name_to_url[model_name]
    # 根据模型名称获取DPT的配置信息
    config = get_dpt_config(model_name)

    # 打印检查点的URL地址
    print("URL:", checkpoint_url)
    # 从指定的 URL 加载预训练模型的状态字典,使用 CPU 运行
    dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]

    # 根据配置文件创建重命名键列表
    rename_keys = create_rename_keys_dpt(config)
    # 遍历重命名键列表,将模型状态字典中的键进行重命名
    for src, dest in rename_keys:
        rename_key(dpt_state_dict, src, dest)

    # 根据模型名称加载原始的骨干网络状态字典
    if "small" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
    elif "base" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")
    elif "large" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14")
    elif "giant" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14")
    else:
        raise NotImplementedError("To do")
    # 将模型设置为评估模式
    original_model.eval()
    # 获取原始骨干网络的状态字典
    backbone_state_dict = original_model.state_dict()

    # 根据配置文件创建重命名键列表
    rename_keys = create_rename_keys_backbone(config)
    # 遍历重命名键列表,将骨干网络状态字典中的键进行重命名
    for src, dest in rename_keys:
        rename_key(backbone_state_dict, src, dest)

    # 从骨干网络状态字典中读取 QKV 矩阵
    read_in_q_k_v(backbone_state_dict, config)

    # 复制骨干网络状态字典的条目,处理特定键的名称替换
    for key, val in backbone_state_dict.copy().items():
        val = backbone_state_dict.pop(key)
        if "w12" in key:
            key = key.replace("w12", "weights_in")
        if "w3" in key:
            key = key.replace("w3", "weights_out")
        backbone_state_dict[key] = val

    # 合并骨干网络状态字典和 DPT 模型状态字典
    state_dict = {**backbone_state_dict, **dpt_state_dict}

    # 加载 HuggingFace 模型
    model = DPTForDepthEstimation(config)
    # 加载模型的状态字典,并允许部分匹配
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    # 断言确保缺失的键符合预期
    assert missing_keys == [
        "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight",
        "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight",
    ]
    # 设置模型为评估模式
    model.eval()

    # 验证图像处理器配置
    processor = DPTImageProcessor(
        do_resize=False,
        do_rescale=False,
        do_pad=True,
        size_divisor=14,
        do_normalize=True,
        image_mean=(123.675, 116.28, 103.53),
        image_std=(58.395, 57.12, 57.375),
    )

    # 准备图像数据
    image = prepare_img()
    # 使用图像处理器处理图像并获取像素值张量
    pixel_values = processor(image, return_tensors="pt").pixel_values.float()
    # 获取原始图像的像素值
    original_pixel_values = get_original_pixel_values(image)

    # 断言确保处理后的像素值与原始像素值接近
    assert torch.allclose(pixel_values, original_pixel_values)

    # 验证模型的前向传播
    with torch.no_grad():
        outputs = model(pixel_values)

    # 获取预测的深度图
    predicted_depth = outputs.predicted_depth

    # 打印预测深度的形状信息和部分预测值
    print("Shape of predicted depth:", predicted_depth.shape)
    print("First values of predicted depth:", predicted_depth[0, :3, :3])

    # 如果需要验证 logits,则执行以下操作(断言预测结果符合预期值)
    if verify_logits:
        # 如果模型名称是 "dpt-dinov2-small-nyu"
        if model_name == "dpt-dinov2-small-nyu":
            # 设置预期的深度图形状
            expected_shape = torch.Size([1, 576, 736])
            # 设置预期的深度图片段数据
            expected_slice = torch.tensor(
                [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]]
            )

        # 断言预测的深度图形状是否符合预期
        assert predicted_depth.shape == torch.Size(expected_shape)
        # 断言预测的深度图片段是否与预期片段在指定的误差范围内一致
        assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5)
        # 打印确认信息
        print("Looks ok!")

    # 如果指定了 PyTorch 模型保存路径
    if pytorch_dump_folder_path is not None:
        # 创建目录(如果不存在)
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 打印保存模型和处理器的消息
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 将处理器保存到指定路径
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要推送到 Hub
    if push_to_hub:
        # 打印推送模型和处理器到 Hub 的消息
        print("Pushing model and processor to hub...")
        # 将模型推送到 Hub
        model.push_to_hub(repo_id=f"facebook/{model_name}")
        # 将处理器推送到 Hub
        processor.push_to_hub(repo_id=f"facebook/{model_name}")
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建命令行参数解析器对象

    # 必需的参数
    parser.add_argument(
        "--model_name",
        default="dpt-dinov2-small-nyu",
        type=str,
        choices=name_to_url.keys(),
        help="Name of the model you'd like to convert."
    )
    # 添加一个参数:模型名称,类型为字符串,默认为"dpt-dinov2-small-nyu",可选值为name_to_url字典的键,用于指定要转换的模型名称

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory."
    )
    # 添加一个参数:PyTorch 模型输出目录的路径,类型为字符串,默认为None,用于指定输出的PyTorch模型存储目录的路径

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model to the hub after conversion."
    )
    # 添加一个参数:是否在转换后将模型推送到模型中心(hub),采用布尔值标志,默认为False,用于指定是否在转换后将模型推送到hub

    parser.add_argument(
        "--verify_logits",
        action="store_true",
        required=False,
        help="Whether to verify the logits after conversion."
    )
    # Add a boolean flag: whether to verify the logits after conversion (defaults to False)

    args = parser.parse_args()
    # 解析命令行参数并将其存储在args变量中

    convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits)
    # 调用convert_dpt_checkpoint函数,传入解析后的参数args中的模型名称、PyTorch模型输出目录路径、推送到hub的标志、验证logits的标志作为参数
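A rough sketch (not part of the conversion script itself) of how the converted checkpoint could be loaded back for inference; the local folder path is hypothetical and stands for whatever --pytorch_dump_folder_path pointed to:

import requests
import torch
from PIL import Image
from transformers import DPTForDepthEstimation, DPTImageProcessor

folder = "./dpt-dinov2-small-nyu"  # hypothetical output folder of the conversion above
processor = DPTImageProcessor.from_pretrained(folder)
model = DPTForDepthEstimation.from_pretrained(folder)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth
print(predicted_depth.shape)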

.\models\dpt\convert_dpt_beit_to_hf.py

# coding=utf-8
# 版权所有 2023 年 HuggingFace Inc. 团队。
#
# 根据 Apache 许可证 2.0 版本许可,除非符合许可协议,否则不得使用此文件。
# 您可以在以下网址获取许可证副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按"原样"分发本软件,
# 无任何明示或暗示的担保或条件。
# 请参阅许可协议了解具体语言的权限和限制。
"""从 MiDaS 仓库转换 DPT 3.1 检查点。URL:https://github.com/isl-org/MiDaS"""

import argparse  # 导入命令行参数解析库
from pathlib import Path  # 导入路径操作库

import requests  # 导入 HTTP 请求库
import torch  # 导入 PyTorch 深度学习库
from PIL import Image  # 导入图像处理库

from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor  # 导入转换器库的相关组件
from transformers.utils import logging  # 导入转换器库的日志模块


logging.set_verbosity_info()  # 设置日志级别为信息
logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


def get_dpt_config(model_name):
    hidden_size = 768  # 隐藏层大小设为 768
    num_hidden_layers = 12  # 隐藏层层数设为 12
    num_attention_heads = 12  # 注意力头数设为 12
    intermediate_size = 3072  # 中间层大小设为 3072
    out_features = ["stage3", "stage6", "stage9", "stage12"]  # 输出特征设为 ["stage3", "stage6", "stage9", "stage12"]

    if "large" in model_name:
        hidden_size = 1024  # 如果模型名中包含 "large",则将隐藏层大小设为 1024
        num_hidden_layers = 24  # 将隐藏层层数设为 24
        num_attention_heads = 16  # 将注意力头数设为 16
        intermediate_size = 4096  # 将中间层大小设为 4096
        out_features = ["stage6", "stage12", "stage18", "stage24"]  # 输出特征设为 ["stage6", "stage12", "stage18", "stage24"]

    if "512" in model_name:
        image_size = 512  # 如果模型名中包含 "512",则将图像大小设为 512
    elif "384" in model_name:
        image_size = 384  # 如果模型名中包含 "384",则将图像大小设为 384
    else:
        raise ValueError("Model not supported")  # 如果模型不支持,则引发值错误异常

    # Create the backbone configuration object
    backbone_config = BeitConfig(
        image_size=image_size,
        num_hidden_layers=num_hidden_layers,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_attention_heads=num_attention_heads,
        use_relative_position_bias=True,
        reshape_hidden_states=False,
        out_features=out_features,
    )

    # 根据模型名称设置颈部隐藏层大小列表
    neck_hidden_sizes = [256, 512, 1024, 1024] if "large" in model_name else [96, 192, 384, 768]
    
    # 创建 DPT 配置对象
    config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes)

    return config, image_size  # 返回配置对象和图像大小
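
A minimal sanity check of the helper above (a sketch, assuming it is run in the same module as get_dpt_config):

config, image_size = get_dpt_config("dpt-beit-large-384")
assert image_size == 384
assert config.backbone_config.hidden_size == 1024
assert config.neck_hidden_sizes == [256, 512, 1024, 1024]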


# 此处列出所有要重命名的键(原始名称在左侧,我们的名称在右侧)
def create_rename_keys(config):
    rename_keys = []  # 初始化空的重命名键列表

    # fmt: off
    # stem
    rename_keys.append(("pretrained.model.cls_token", "backbone.embeddings.cls_token"))  # 添加重命名键:("pretrained.model.cls_token", "backbone.embeddings.cls_token")
    rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))  # 添加重命名键:("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")
    rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))  # 添加重命名键:("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")

    # Transformer encoder
    # fmt: on
    # 遍历从配置中获取的隐藏层数量的范围
    for i in range(config.backbone_config.num_hidden_layers):
        # 添加转换后的键值对,将预训练模型中的参数映射到新的后骨干网络结构中的位置
        rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1"))
        rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2"))
        rename_keys.append((f"pretrained.model.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.layernorm_before.weight"))
        rename_keys.append((f"pretrained.model.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.layernorm_before.bias"))
        rename_keys.append((f"pretrained.model.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.layernorm_after.weight"))
        rename_keys.append((f"pretrained.model.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.layernorm_after.bias"))
        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.intermediate.dense.weight"))
        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.intermediate.dense.bias"))
        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.output.dense.weight"))
        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.output.dense.bias"))
        rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight"))
        rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias"))
        rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_bias_table", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table"))
        rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_index", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index"))

    # 激活后处理(读出投影 + 调整块)
    for i in range(4):
        # 读出投影权重和偏置的映射
        rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight"))
        rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias"))

        # 中间层投影权重和偏置的映射
        rename_keys.append((f"pretrained.act_postprocess{i+1}.3.weight", f"neck.reassemble_stage.layers.{i}.projection.weight"))
        rename_keys.append((f"pretrained.act_postprocess{i+1}.3.bias", f"neck.reassemble_stage.layers.{i}.projection.bias"))

        # 如果不是第二个块,映射调整块权重和偏置
        if i != 2:
            rename_keys.append((f"pretrained.act_postprocess{i+1}.4.weight", f"neck.reassemble_stage.layers.{i}.resize.weight"))
            rename_keys.append((f"pretrained.act_postprocess{i+1}.4.bias", f"neck.reassemble_stage.layers.{i}.resize.bias"))

    # RefineNet(这里有点棘手)
    mapping = {1:3, 2:2, 3:1, 4:0}
    # 遍历范围为 1 到 4,根据映射表 mapping 将每个 i 映射到 j
    for i in range(1, 5):
        j = mapping[i]
        # 向 rename_keys 列表中添加元组,将模型参数名从 scratch.refinenet{i} 映射到 neck.fusion_stage.layers.{j}
        rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight"))
        rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias"))

    # 遍历范围为 0 到 4,向 rename_keys 列表中添加元组,映射 scratch.layer{i+1}_rn 到 neck.convs.{i}
    for i in range(4):
        rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight"))

    # 遍历范围为 0 到 5(步长为 2),向 rename_keys 列表中添加元组,映射 scratch.output_conv.{i} 到 head.head.{i}
    for i in range(0, 5, 2):
        rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight"))
        rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias"))

    # 返回存储了所有模型参数重命名信息的 rename_keys 列表
    return rename_keys
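
# For example, the mapping {1: 3, 2: 2, 3: 1, 4: 0} above reverses the layer order, so the RefineNet
# loop produces pairs such as:
#   "scratch.refinenet4.out_conv.weight" -> "neck.fusion_stage.layers.0.projection.weight"
#   "scratch.refinenet1.out_conv.weight" -> "neck.fusion_stage.layers.3.projection.weight"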
# 从给定的状态字典中移除特定的键
def remove_ignore_keys_(state_dict):
    ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"]
    for k in ignore_keys:
        state_dict.pop(k, None)

# 将每个编码器层的矩阵拆分为查询(queries)、键(keys)和值(values)
def read_in_q_k_v(state_dict, config):
    # 获取隐藏层大小
    hidden_size = config.backbone_config.hidden_size
    # 遍历编码器层次的数量
    for i in range(config.backbone_config.num_hidden_layers):
        # 读取输入投影层的权重和偏置(在原始实现中,这是一个单独的矩阵加上偏置)
        in_proj_weight = state_dict.pop(f"pretrained.model.blocks.{i}.attn.qkv.weight")
        q_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.q_bias")
        v_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.v_bias")
        # 将查询(query)、键(key)和值(value)依次添加到状态字典中
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :]
        state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias

# 重命名字典中的键
def rename_key(dct, old, new):
    val = dct.pop(old)
    dct[new] = val

# 准备一张猫咪图片,用于验证我们的结果
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True).raw)
    return im

# 将模型的权重复制/粘贴/调整到我们的DPT结构中
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    """
    Copy/paste/tweak model's weights to our DPT structure.
    """

    # 定义基于URL的DPT配置
    name_to_url = {
        "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
        "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt",
        "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt",
    }

    # 根据模型名称选择检查点URL
    checkpoint_url = name_to_url[model_name]
    # 获取DPT配置和图像大小
    config, image_size = get_dpt_config(model_name)
    # 从URL加载原始的状态字典
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    # 移除指定的键
    remove_ignore_keys_(state_dict)
    # 创建重命名键的映射
    rename_keys = create_rename_keys(config)
    # 遍历重命名映射并重命名键
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    # 读取qkv矩阵
    read_in_q_k_v(state_dict, config)

    # 加载HuggingFace模型
    model = DPTForDepthEstimation(config)
    # Load the state dict non-strictly and collect the missing/unexpected keys
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    # 打印缺失的键
    print("Missing keys:", missing_keys)
    # 打印出意外的键列表
    print("Unexpected keys:", unexpected_keys)
    # 确保缺失的键列表为空
    assert missing_keys == []
    # 将模型设置为评估模式
    model.eval()

    # 创建图像处理器对象,设定图像尺寸和其他参数
    # 这里设置 `keep_aspect_ratio=False`,因为当前的 BEiT 不支持任意窗口大小
    processor = DPTImageProcessor(
        size={"height": image_size, "width": image_size}, keep_aspect_ratio=False, ensure_multiple_of=32
    )
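    # Note: `size` fixes both height and width to `image_size`, and ensure_multiple_of=32 makes the
    # processor round the target dimensions to a multiple of 32 (a no-op here, since 384 and 512 are
    # already multiples of 32).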

    # 准备图像数据
    image = prepare_img()
    # 使用图像处理器处理图像,返回像素值的张量表示
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # 打印像素值的一些信息
    print("First values of pixel values:", pixel_values[0, 0, :3, :3])
    print("Mean of pixel values:", pixel_values.mean().item())
    print("Shape of pixel values:", pixel_values.shape)

    # 导入必要的库和模块
    import requests
    from PIL import Image
    from torchvision import transforms

    # 从 URL 加载图像并使用 PIL 打开
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    # 创建图像转换管道,包括调整大小和转换为张量
    transforms = transforms.Compose(
        [
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
        ]
    )
    # 对图像进行转换处理,添加一个维度
    pixel_values = transforms(image).unsqueeze(0)

    # 前向传播,关闭梯度计算
    with torch.no_grad():
        outputs = model(pixel_values)

    # 获取预测的深度图
    predicted_depth = outputs.predicted_depth

    # 打印预测深度图的形状和部分值
    print("Shape of predicted depth:", predicted_depth.shape)
    print("First values of predicted depth:", predicted_depth[0, :3, :3])

    # 断言预测深度图的形状和部分值与预期相符
    if model_name == "dpt-beit-large-512":
        expected_shape = torch.Size([1, 512, 512])
        expected_slice = torch.tensor(
            [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]]
        )
    elif model_name == "dpt-beit-large-384":
        expected_shape = torch.Size([1, 384, 384])
        expected_slice = torch.tensor(
            [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]],
        )
    elif model_name == "dpt-beit-base-384":
        expected_shape = torch.Size([1, 384, 384])
        expected_slice = torch.tensor(
            [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]],
        )

    # 断言预测的深度图的形状和部分值与期望的形状和值相等
    assert predicted_depth.shape == torch.Size(expected_shape)
    assert torch.allclose(predicted_depth[0, :3, :3], expected_slice)
    print("Looks ok!")

    # 如果指定了 PyTorch 模型保存文件夹路径,则保存模型和处理器
    if pytorch_dump_folder_path is not None:
        # 确保文件夹存在或创建
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 打印保存路径信息,并保存模型和处理器
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)
    # 如果 push_to_hub 变量为真,则执行下面的代码块
    if push_to_hub:
        # 打印消息,指示正在将模型和处理器推送到 Hub
        print("Pushing model and processor to hub...")
        # 调用 model 对象的 push_to_hub 方法,将模型推送到指定的仓库ID
        model.push_to_hub(repo_id=f"nielsr/{model_name}")
        # 调用 processor 对象的 push_to_hub 方法,将处理器推送到指定的仓库ID
        processor.push_to_hub(repo_id=f"nielsr/{model_name}")
if __name__ == "__main__":
    # 如果脚本被直接运行而不是作为模块导入,则执行以下代码块
    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # Required parameters
    parser.add_argument(
        "--model_name",
        default="dpt-beit-large-512",
        type=str,
        choices=["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"],
        help="Name of the model you'd like to convert.",
    )
    # 添加一个必需的参数,用于指定要转换的模型名称,提供默认值和选项列表

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    # 添加一个参数,用于指定输出 PyTorch 模型文件的目录路径,可选,默认为 None

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model to the hub after conversion.",
    )
    # 添加一个参数,用于指定是否在转换后将模型推送到指定的 hub 中,这是一个布尔标志参数

    args = parser.parse_args()
    # 解析命令行参数并将其存储在 args 对象中

    convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
    # 调用函数 convert_dpt_checkpoint,传入解析得到的参数进行模型转换操作

.\models\dpt\convert_dpt_hybrid_to_pytorch.py

# 设置脚本的编码格式为 UTF-8
# 版权声明,引用的 HuggingFace Inc. 的团队
#
# 根据 Apache 许可证 2.0 版本授权,除非符合许可证的要求,否则不得使用此文件
# 可以在以下网址获取许可证副本:http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按“原样”分发软件,
# 没有任何明示或暗示的保证或条件。查看许可证以获取具体语言的权限和限制
"""从原始存储库中转换 DPT 检查点。URL:https://github.com/isl-org/DPT"""


# 导入必要的库
import argparse  # 用于解析命令行参数
import json  # 用于处理 JSON 数据
from pathlib import Path  # 用于处理文件路径

import requests  # 发送 HTTP 请求
import torch  # PyTorch 深度学习库
from huggingface_hub import cached_download, hf_hub_url  # 从 Hugging Face Hub 下载模型和数据
from PIL import Image  # Python Imaging Library,用于图像处理

# 导入 DPT 模型和相关工具
from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
from transformers.utils import logging  # 导入日志记录工具


# 设置日志输出级别为 info
logging.set_verbosity_info()
logger = logging.get_logger(__name__)


# 根据给定的 checkpoint_url 返回相应的 DPTConfig 对象和预期的输出形状
def get_dpt_config(checkpoint_url):
    config = DPTConfig(embedding_type="hybrid")

    # 根据 URL 中是否包含 "large" 来设置不同的配置
    if "large" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        config.backbone_out_indices = [5, 11, 17, 23]
        config.neck_hidden_sizes = [256, 512, 1024, 1024]
        expected_shape = (1, 384, 384)

    # Base configuration for the hybrid checkpoints (note: `"nyu" or "midas" in checkpoint_url` is
    # always truthy, so this block runs for every checkpoint URL)
    if "nyu" or "midas" in checkpoint_url:
        config.hidden_size = 768
        config.reassemble_factors = [1, 1, 1, 0.5]
        config.neck_hidden_sizes = [256, 512, 768, 768]
        config.num_labels = 150
        config.patch_size = 16
        expected_shape = (1, 384, 384)
        config.use_batch_norm_in_fusion_residual = False
        config.readout_type = "project"

    # 根据 URL 中是否包含 "ade" 来设置不同的配置
    if "ade" in checkpoint_url:
        config.use_batch_norm_in_fusion_residual = True
        config.hidden_size = 768
        config.reassemble_stage = [1, 1, 1, 0.5]
        config.num_labels = 150
        config.patch_size = 16
        repo_id = "huggingface/label-files"
        filename = "ade20k-id2label.json"
        # 下载并加载 ADE20K 的标签映射文件
        id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        expected_shape = [1, 150, 480, 480]

    return config, expected_shape


# 移除 state_dict 中特定的键
def remove_ignore_keys_(state_dict):
    ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"]
    for k in ignore_keys:
        state_dict.pop(k, None)


# 重命名键名,将 "pretrained.model" 替换为 "dpt.encoder"
def rename_key(name):
    if (
        "pretrained.model" in name
        and "cls_token" not in name
        and "pos_embed" not in name
        and "patch_embed" not in name
    ):
        name = name.replace("pretrained.model", "dpt.encoder")
    # 检查字符串 "pretrained.model" 是否在变量 name 中
    if "pretrained.model" in name:
        # 将字符串 "pretrained.model" 替换为 "dpt.embeddings"
        name = name.replace("pretrained.model", "dpt.embeddings")

    # 检查字符串 "patch_embed" 是否在变量 name 中
    if "patch_embed" in name:
        # 将字符串 "patch_embed" 替换为空字符串 ""
        name = name.replace("patch_embed", "")

    # 检查字符串 "pos_embed" 是否在变量 name 中
    if "pos_embed" in name:
        # 将字符串 "pos_embed" 替换为 "position_embeddings"
        name = name.replace("pos_embed", "position_embeddings")

    # 检查字符串 "attn.proj" 是否在变量 name 中
    if "attn.proj" in name:
        # 将字符串 "attn.proj" 替换为 "attention.output.dense"
        name = name.replace("attn.proj", "attention.output.dense")

    # 检查字符串 "proj" 是否在变量 name 中,并且 "project" 不在 name 中
    if "proj" in name and "project" not in name:
        # 将字符串 "proj" 替换为 "projection"
        name = name.replace("proj", "projection")

    # 检查字符串 "blocks" 是否在变量 name 中
    if "blocks" in name:
        # 将字符串 "blocks" 替换为 "layer"
        name = name.replace("blocks", "layer")

    # 检查字符串 "mlp.fc1" 是否在变量 name 中
    if "mlp.fc1" in name:
        # 将字符串 "mlp.fc1" 替换为 "intermediate.dense"
        name = name.replace("mlp.fc1", "intermediate.dense")

    # 检查字符串 "mlp.fc2" 是否在变量 name 中
    if "mlp.fc2" in name:
        # 将字符串 "mlp.fc2" 替换为 "output.dense"
        name = name.replace("mlp.fc2", "output.dense")

    # 检查字符串 "norm1" 是否在变量 name 中,并且 "backbone" 不在 name 中
    if "norm1" in name and "backbone" not in name:
        # 将字符串 "norm1" 替换为 "layernorm_before"
        name = name.replace("norm1", "layernorm_before")

    # 检查字符串 "norm2" 是否在变量 name 中,并且 "backbone" 不在 name 中
    if "norm2" in name and "backbone" not in name:
        # 将字符串 "norm2" 替换为 "layernorm_after"
        name = name.replace("norm2", "layernorm_after")

    # 检查字符串 "scratch.output_conv" 是否在变量 name 中
    if "scratch.output_conv" in name:
        # 将字符串 "scratch.output_conv" 替换为 "head"
        name = name.replace("scratch.output_conv", "head")

    # 检查字符串 "scratch" 是否在变量 name 中
    if "scratch" in name:
        # 将字符串 "scratch" 替换为 "neck"
        name = name.replace("scratch", "neck")

    # 检查字符串 "layer1_rn" 是否在变量 name 中
    if "layer1_rn" in name:
        # 将字符串 "layer1_rn" 替换为 "convs.0"
        name = name.replace("layer1_rn", "convs.0")

    # 检查字符串 "layer2_rn" 是否在变量 name 中
    if "layer2_rn" in name:
        # 将字符串 "layer2_rn" 替换为 "convs.1"
        name = name.replace("layer2_rn", "convs.1")

    # 检查字符串 "layer3_rn" 是否在变量 name 中
    if "layer3_rn" in name:
        # 将字符串 "layer3_rn" 替换为 "convs.2"
        name = name.replace("layer3_rn", "convs.2")

    # 检查字符串 "layer4_rn" 是否在变量 name 中
    if "layer4_rn" in name:
        # 将字符串 "layer4_rn" 替换为 "convs.3"
        name = name.replace("layer4_rn", "convs.3")

    # 检查字符串 "refinenet" 是否在变量 name 中
    if "refinenet" in name:
        # 获取 refinenet 后面的数字索引
        layer_idx = int(name[len("neck.refinenet"): len("neck.refinenet") + 1])
        # 根据索引映射替换字符串,例如 refinenet4 替换为 fusion_stage.layers.0
        name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}")

    # 检查字符串 "out_conv" 是否在变量 name 中
    if "out_conv" in name:
        # 将字符串 "out_conv" 替换为 "projection"
        name = name.replace("out_conv", "projection")

    # 检查字符串 "resConfUnit1" 是否在变量 name 中
    if "resConfUnit1" in name:
        # 将字符串 "resConfUnit1" 替换为 "residual_layer1"
        name = name.replace("resConfUnit1", "residual_layer1")

    # 检查字符串 "resConfUnit2" 是否在变量 name 中
    if "resConfUnit2" in name:
        # 将字符串 "resConfUnit2" 替换为 "residual_layer2"
        name = name.replace("resConfUnit2", "residual_layer2")

    # 检查字符串 "conv1" 是否在变量 name 中
    if "conv1" in name:
        # 将字符串 "conv1" 替换为 "convolution1"
        name = name.replace("conv1", "convolution1")

    # 检查字符串 "conv2" 是否在变量 name 中
    if "conv2" in name:
        # 将字符串 "conv2" 替换为 "convolution2"
        name = name.replace("conv2", "convolution2")

    # 检查字符串 "pretrained.act_postprocess1.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess1.0.project.0" in name:
        # 将字符串 "pretrained.act_postprocess1.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.0.0"
        name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0")

    # 检查字符串 "pretrained.act_postprocess2.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess2.0.project.0" in name:
        # 将字符串 "pretrained.act_postprocess2.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.1.0"
        name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0")

    # 检查字符串 "pretrained.act_postprocess3.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess3.0.project.0" in name:
        # 将字符串 "pretrained.act_postprocess3.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.2.0"
        name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0")

    # 检查字符串 "pretrained.act_postprocess4.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess4.0.project.0" in name:
        # 将字符串 "pretrained.act_postprocess4.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.3.0"
        name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0")

    # resize blocks
    # 检查字符串 "pretrained.act_postprocess1.3" 是否在变量 name 中
    if "pretrained.act_postprocess1.3" in name:
        # 将字符串 "pretrained.act_postprocess1.3" 替换为 "neck.reassemble_stage.layers.0.projection"
        name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection")
    # 检查字符串 "pretrained.act_postprocess1.4" 是否在变量 name 中
    if "pretrained.act_postprocess1.4" in name:
        # 将字符串 "pretrained.act_postprocess1.4" 替换为 "neck.reassemble_stage.layers.0.resize"
        name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize")
    # 检查字符串 "pretrained.act_postprocess2.3" 是否在变量 name 中
    if "pretrained.act_postprocess2.3" in name:
        # 将字符串 "pretrained.act_postprocess2.3" 替换为 "neck.reassemble_stage.layers.1.projection"
        name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection")
    # 检查字符串 "pretrained.act_postprocess2.4" 是否在变量 name 中
    if "pretrained.act_postprocess2.4" in name:
        # 将字符串 "pretrained.act_postprocess2.4" 替换为 "neck.reassemble_stage.layers.1.resize"
        name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize")
    # 检查字符串 "pretrained.act_postprocess3.3" 是否在变量 name 中
    if "pretrained.act_postprocess3.3" in name:
        # 将字符串 "pretrained.act_postprocess3.3" 替换为 "neck.reassemble_stage.layers.2.projection"
        name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection")
    # 检查字符串 "pretrained.act_postprocess4.3" 是否在变量 name 中
    if "pretrained.act_postprocess4.3" in name:
        # 将字符串 "pretrained.act_postprocess4.3" 替换为 "neck.reassemble_stage.layers.3.projection"
        name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection")
    # 检查字符串 "pretrained.act_postprocess4.4" 是否在变量 name 中
    if "pretrained.act_postprocess4.4" in name:
        # 将字符串 "pretrained.act_postprocess4.4" 替换为 "neck.reassemble_stage.layers.3.resize"
        name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize")
    # 检查字符串 "pretrained" 是否在变量 name 中
    if "pretrained" in name:
        # 将字符串 "pretrained" 替换为 "dpt"
        name = name.replace("pretrained", "dpt")
    # 检查字符串 "bn" 是否在变量 name 中
    if "bn" in name:
        # 将字符串 "bn" 替换为 "batch_norm"
        name = name.replace("bn", "batch_norm")
    # 检查字符串 "head" 是否在变量 name 中
    if "head" in name:
        # 将字符串 "head" 替换为 "head.head"
        name = name.replace("head", "head.head")
    # 检查字符串 "encoder.norm" 是否在变量 name 中
    if "encoder.norm" in name:
        # 将字符串 "encoder.norm" 替换为 "layernorm"
        name = name.replace("encoder.norm", "layernorm")
    # 检查字符串 "auxlayer" 是否在变量 name 中
    if "auxlayer" in name:
        # 将字符串 "auxlayer" 替换为 "auxiliary_head.head"
        name = name.replace("auxlayer", "auxiliary_head.head")
    # 检查字符串 "backbone" 是否在变量 name 中
    if "backbone" in name:
        # 将字符串 "backbone" 替换为 "backbone.bit.encoder"
        name = name.replace("backbone", "backbone.bit.encoder")

    # 检查字符串 ".." 是否在变量 name 中
    if ".." in name:
        # 将字符串 ".." 替换为 "."
        name = name.replace("..", ".")

    # 检查字符串 "stem.conv" 是否在变量 name 中
    if "stem.conv" in name:
        # 将字符串 "stem.conv" 替换为 "bit.embedder.convolution"
        name = name.replace("stem.conv", "bit.embedder.convolution")
    # 检查字符串 "blocks" 是否在变量 name 中
    if "blocks" in name:
        # 将字符串 "blocks" 替换为 "layers"
        name = name.replace("blocks", "layers")
    # 检查字符串 "convolution" 和 "backbone" 是否在变量 name 中
    if "convolution" in name and "backbone" in name:
        # 将字符串 "convolution" 替换为 "conv"
        name = name.replace("convolution", "conv")
    # 检查字符串 "layer" 和 "backbone" 是否在变量 name 中
    if "layer" in name and "backbone" in name:
        # 将字符串 "layer" 替换为 "layers"
        name = name.replace("layer", "layers")
    # 检查字符串 "backbone.bit.encoder.bit" 是否在变量 name 中
    if "backbone.bit.encoder.bit" in name:
        # 将字符串 "backbone.bit.encoder.bit" 替换为 "backbone.bit"
        name = name.replace("backbone.bit.encoder.bit", "backbone.bit")
    # 检查字符串 "embedder.conv" 是否在变量 name 中
    if "embedder.conv" in name:
        # 将字符串 "embedder.conv" 替换为 "embedder.convolution"
        name = name.replace("embedder.conv", "embedder.convolution")
    # 检查字符串 "backbone.bit.encoder.stem.norm" 是否在变量 name 中
    if "backbone.bit.encoder.stem.norm" in name:
        # 将字符串 "backbone.bit.encoder.stem.norm" 替换为 "backbone.bit.embedder.norm"
        name = name.replace("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm")
    # 返回处理后的字符串 name
    return name
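
To illustrate the cascade of replacements above (a sketch, assuming rename_key from this file), two representative keys would be renamed as follows:

print(rename_key("scratch.refinenet4.resConfUnit1.conv1.weight"))
# -> neck.fusion_stage.layers.0.residual_layer1.convolution1.weight
print(rename_key("pretrained.model.blocks.0.mlp.fc1.weight"))
# -> dpt.encoder.layer.0.intermediate.dense.weight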
# 将每个编码器层的权重矩阵分解为查询(queries)、键(keys)和值(values)
def read_in_q_k_v(state_dict, config):
    # 遍历每个编码器层
    for i in range(config.num_hidden_layers):
        # 读取输入投影层的权重和偏置(在timm中,这是一个单独的矩阵加偏置)
        in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight")
        in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias")
        # 将查询、键、值依次添加到状态字典中
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
            config.hidden_size : config.hidden_size * 2, :
        ]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
            config.hidden_size : config.hidden_size * 2
        ]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
            -config.hidden_size :, :
        ]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]


# 我们将在一张可爱猫咪的图片上验证我们的结果
def prepare_img():
    # 图片地址
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 使用requests获取图片的原始流,并由PIL库打开为图像对象
    im = Image.open(requests.get(url, stream=True).raw)
    return im


@torch.no_grad()
def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction):
    """
    复制/粘贴/调整模型权重到我们的DPT结构中。
    """

    # 根据URL定义DPT配置
    config, expected_shape = get_dpt_config(checkpoint_url)
    # 从URL加载原始state_dict
    state_dict = torch.load(checkpoint_url, map_location="cpu")
    # 移除特定的键
    remove_ignore_keys_(state_dict)
    # 重命名键
    for key in state_dict.copy().keys():
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # 读取qkv矩阵
    read_in_q_k_v(state_dict, config)

    # 加载HuggingFace模型
    model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config)
    model.load_state_dict(state_dict)
    model.eval()

    # 在图片上进行输出检查
    size = 480 if "ade" in checkpoint_url else 384
    image_processor = DPTImageProcessor(size=size)

    # 准备图像
    image = prepare_img()
    encoding = image_processor(image, return_tensors="pt")

    # 前向传播
    outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
    # 如果需要展示预测结果
    if show_prediction:
        # 对模型输出进行插值,使其与原始图像大小一致,使用双三次插值,不对齐角落
        prediction = (
            torch.nn.functional.interpolate(
                outputs.unsqueeze(1),
                size=(image.size[1], image.size[0]),
                mode="bicubic",
                align_corners=False,
            )
            # 去除插值后的张量的单维度,将其转移到 CPU 上,并转换为 NumPy 数组
            .squeeze()
            .cpu()
            .numpy()
        )

        # 将 NumPy 数组转换为图像并显示
        Image.fromarray((prediction / prediction.max()) * 255).show()
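        # (Aside) `prediction` is a float32 NumPy array, so Image.fromarray produces a 32-bit float
        # ("F" mode) image; for viewers that only handle 8-bit images one could instead cast first,
        # e.g. Image.fromarray(((prediction / prediction.max()) * 255).astype("uint8")).show()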

    # 如果有指定的 PyTorch 模型保存路径
    if pytorch_dump_folder_path is not None:
        # 创建保存模型的文件夹(如果不存在)
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 打印模型保存的路径
        print(f"Saving model to {pytorch_dump_folder_path}")
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 打印图像处理器保存的路径
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        # 将图像处理器保存到指定路径
        image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要推送到 Hub
    if push_to_hub:
        # 推送模型到指定 Hub 仓库
        model.push_to_hub("ybelkada/dpt-hybrid-midas")
        # 推送图像处理器到指定 Hub 仓库
        image_processor.push_to_hub("ybelkada/dpt-hybrid-midas")
if __name__ == "__main__":
    # 如果这个脚本是作为主程序运行

    parser = argparse.ArgumentParser()
    # 创建一个参数解析器对象

    # Required parameters
    parser.add_argument(
        "--checkpoint_url",
        default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt",
        type=str,
        help="URL of the original DPT checkpoint you'd like to convert.",
    )
    # 添加一个必需的参数 --checkpoint_url,用于指定原始 DPT 模型的下载链接

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=False,
        help="Path to the output PyTorch model directory.",
    )
    # 添加一个可选的参数 --pytorch_dump_folder_path,用于指定输出的 PyTorch 模型存储目录的路径

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
    )
    # 添加一个布尔参数 --push_to_hub,用于指示是否将转换后的模型推送到模型中心(hub)

    parser.add_argument(
        "--model_name",
        default="dpt-large",
        type=str,
        help="Name of the model, in case you're pushing to the hub.",
    )
    # 添加一个参数 --model_name,用于指定模型的名称,如果将其推送到模型中心(hub)

    parser.add_argument(
        "--show_prediction",
        action="store_true",
    )
    # 添加一个布尔参数 --show_prediction,用于指示是否显示模型的预测结果

    args = parser.parse_args()
    # 解析命令行参数并将其存储在 args 变量中

    convert_dpt_checkpoint(
        args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction
    )
    # 调用 convert_dpt_checkpoint 函数,传递命令行参数中解析得到的参数

.\models\dpt\convert_dpt_swinv2_to_hf.py

# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
    rename_keys = []

    # fmt: off
    # stem
    # Map the stem (patch embedding) weights of the original checkpoint to the new model
    rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
    rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
    rename_keys.append(("pretrained.model.patch_embed.norm.weight", "backbone.embeddings.norm.weight"))
    rename_keys.append(("pretrained.model.patch_embed.norm.bias", "backbone.embeddings.norm.bias"))

    # 转换器编码器部分
    # 注意:非转换器(backbone)如Swinv2、LeViT等不需要后处理激活(读取投影 + 调整块)
    
    # refinenet部分(此处比较棘手)
    # 设置映射关系,将refinenet的输出通道映射到融合阶段的层
    mapping = {1:3, 2:2, 3:1, 4:0}

    # 遍历映射关系,生成重命名的键值对,并添加到列表中
    for i in range(1, 5):
        j = mapping[i]
        rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight"))
        rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias"))

    # scratch卷积部分
    # 生成重命名的键值对,将scratch层的权重映射到融合阶段的卷积层
    for i in range(4):
        rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight"))

    # 头部部分
    # 生成重命名的键值对,将scratch的输出卷积权重映射到头部的权重
    for i in range(0, 5, 2):
        rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight"))
        rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias"))

    # 返回所有重命名后的键值对列表
    return rename_keys
# 从状态字典中移除指定的键列表
def remove_ignore_keys_(state_dict):
    ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"]
    for k in ignore_keys:
        state_dict.pop(k, None)

# 读取每个编码器层的查询(queries)、键(keys)和值(values)矩阵
def read_in_q_k_v(state_dict, config, model):
    for i in range(len(config.backbone_config.depths)):
        for j in range(config.backbone_config.depths[i]):
            # 获取当前注意力层的全头尺寸
            dim = model.backbone.encoder.layers[i].blocks[j].attention.self.all_head_size
            # 读取输入投影层权重和偏置(在原始实现中,这是一个单独的矩阵加偏置)
            in_proj_weight = state_dict.pop(f"pretrained.model.layers.{i}.blocks.{j}.attn.qkv.weight")
            # 将查询(query)、键(key)、值(value)依次添加到状态字典中
            state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
            state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[dim: dim * 2, :]
            state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[-dim:, :]

# 重命名字典中的键
def rename_key(dct, old, new):
    val = dct.pop(old)
    dct[new] = val

# 准备图像数据,从指定的 URL 获取图像
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True).raw)
    return im

# 将模型的权重复制/粘贴/调整到我们的 DPT 结构中
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, verify_logits, push_to_hub):
    """
    Copy/paste/tweak model's weights to our DPT structure.
    """
    # 定义模型名称到 URL 的映射
    name_to_url = {
        "dpt-swinv2-tiny-256": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt",
        "dpt-swinv2-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt",
        "dpt-swinv2-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
    }
    
    # 根据模型名称获取检查点 URL
    checkpoint_url = name_to_url[model_name]
    # 根据 URL 获取 DPT 配置和图像大小
    config, image_size = get_dpt_config(model_name)
    # 从 URL 加载原始状态字典
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    
    # 加载 HuggingFace 模型
    model = DPTForDepthEstimation(config)
    
    # 移除特定的键
    remove_ignore_keys_(state_dict)
    # 创建键重命名映射
    rename_keys = create_rename_keys(config)
    # 对每对源键和目标键执行重命名操作
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    # 读取 QKV 矩阵
    read_in_q_k_v(state_dict, config, model)
    
    # 使用非严格模式加载模型状态字典,并获取缺失和意外的键列表
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    # 将模型设置为评估模式
    model.eval()
    
    # 在图像上验证输出结果
    # 创建一个 DPTImageProcessor 对象,设置图像大小为指定的 image_size
    processor = DPTImageProcessor(size={"height": image_size, "width": image_size})

    # 准备图像数据
    image = prepare_img()
    # 使用 processor 对象处理图像数据,返回 PyTorch 张量格式
    processor(image, return_tensors="pt")

    # 如果需要验证 logits
    if verify_logits:
        # 导入必要的库和模块
        from torchvision import transforms
        
        # 从网络下载并打开指定 URL 的图像
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)

        # 定义图像转换操作序列
        transforms = transforms.Compose(
            [
                transforms.Resize((image_size, image_size)),  # 调整图像大小为指定尺寸
                transforms.ToTensor(),  # 转换图像为 PyTorch 张量格式
            ]
        )
        # 对图像进行转换操作
        pixel_values = transforms(image).unsqueeze(0)

        # 执行模型的前向传播
        with torch.no_grad():
            outputs = model(pixel_values)

        # 获取预测的深度图
        predicted_depth = outputs.predicted_depth

        # 打印预测深度图的形状信息
        print("Shape of predicted depth:", predicted_depth.shape)
        # 打印预测深度图的前三行三列数据
        print("First values of predicted depth:", predicted_depth[0, :3, :3])

        # 根据模型名称验证预测深度图的形状和部分切片值
        if model_name == "dpt-swinv2-base-384":
            # 确认预期形状和切片值(已验证)
            expected_shape = torch.Size([1, 384, 384])
            expected_slice = torch.tensor(
                [
                    [1998.5575, 1997.3887, 2009.2981],
                    [1952.8607, 1979.6488, 2001.0854],
                    [1953.7697, 1961.7711, 1968.8904],
                ],
            )
        elif model_name == "dpt-swinv2-tiny-256":
            # 确认预期形状和切片值(已验证)
            expected_shape = torch.Size([1, 256, 256])
            expected_slice = torch.tensor(
                [[978.9163, 976.5215, 978.5349], [974.1859, 971.7249, 975.8046], [971.3419, 970.3118, 971.6830]],
            )
        elif model_name == "dpt-swinv2-large-384":
            # 确认预期形状和切片值(已验证)
            expected_shape = torch.Size([1, 384, 384])
            expected_slice = torch.tensor(
                [
                    [1203.7206, 1200.1495, 1197.8234],
                    [1196.2484, 1183.5033, 1186.4640],
                    [1178.8131, 1182.3260, 1174.3975],
                ],
            )

        # 使用断言确认预测深度图的形状和切片值与期望相符
        assert predicted_depth.shape == torch.Size(expected_shape)
        assert torch.allclose(predicted_depth[0, :3, :3], expected_slice)
        # 打印确认信息
        print("Looks ok!")

    # 如果指定了 pytorch_dump_folder_path,则保存模型和处理器
    if pytorch_dump_folder_path is not None:
        # 创建目录(如果不存在)
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 打印保存模型和处理器的信息
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 将处理器保存到指定路径
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要将模型和处理器推送到 Hub
    if push_to_hub:
        # 打印推送模型和处理器到 Hub 的信息
        print("Pushing model and processor to hub...")
        # 推送模型到指定的 Hub 仓库
        model.push_to_hub(repo_id=f"Intel/{model_name}")
        # 推送处理器到指定的 Hub 仓库
        processor.push_to_hub(repo_id=f"Intel/{model_name}")
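
Assuming the converted checkpoint has been pushed to the Hub under the repo_id used above (e.g. Intel/dpt-swinv2-tiny-256), a quick way to try it is the depth-estimation pipeline; this is an illustrative sketch, not part of the conversion script:

from transformers import pipeline

depth_estimator = pipeline("depth-estimation", model="Intel/dpt-swinv2-tiny-256")
result = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
print(result["predicted_depth"].shape)
result["depth"].show()  # normalized depth map as a PIL image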
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行,则执行以下代码
    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # 必选参数
    parser.add_argument(
        "--model_name",
        default="dpt-swinv2-base-384",
        type=str,
        choices=["dpt-swinv2-tiny-256", "dpt-swinv2-base-384", "dpt-swinv2-large-384"],
        help="Name of the model you'd like to convert.",
    )
    # 添加名为--model_name的参数,指定默认值为"dpt-swinv2-base-384",类型为字符串
    # 可选值为指定的三种模型名称,用于选择要转换的模型

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    # 添加名为--pytorch_dump_folder_path的参数,指定默认值为None,类型为字符串
    # 用于指定输出的PyTorch模型存储目录的路径

    parser.add_argument(
        "--verify_logits",
        action="store_true",
        help="Whether to verify logits after conversion.",
    )
    # Add the --verify_logits flag: set to True when present on the command line
    # Used to specify whether to verify the model outputs (predicted depth values) after conversion

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model to the hub after conversion.",
    )
    # 添加名为--push_to_hub的参数,当命令行中有该选项时,设置为True
    # 用于指定是否在转换后将模型推送到hub(模型分享和托管服务)

    args = parser.parse_args()
    # 解析命令行参数,并将其存储在args变量中

    convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub)
    # 调用函数convert_dpt_checkpoint,传入解析得到的参数

.\models\dpt\convert_dpt_to_pytorch.py

# 设置文件编码格式为 UTF-8
# 版权声明,指明此代码的版权归 The HuggingFace Inc. 团队所有
# 根据 Apache 许可证 2.0 版本,除非符合许可证,否则不得使用此文件
# 可以在以下网址获得许可证的副本:http://www.apache.org/licenses/LICENSE-2.0
# 除非适用法律要求或书面同意,否则按“原样”分发的软件
# 没有任何形式的担保或条件,包括但不限于适销性的担保或适用于特定目的的担保
# 请查阅许可证以获取具体的法律条款和限制条件
"""从原始代码库中转换 DPT 模型的检查点。URL: https://github.com/isl-org/DPT"""

# 导入必要的库和模块
import argparse  # 解析命令行参数的库
import json  # 处理 JSON 格式数据的库
from pathlib import Path  # 处理文件路径的类

import requests  # 发送 HTTP 请求的库
import torch  # PyTorch 深度学习库
from huggingface_hub import cached_download, hf_hub_url  # 使用 HF Hub 的函数
from PIL import Image  # Python 图像处理库

# 导入 DPT 模型相关的类和函数
from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
from transformers.utils import logging  # 导入日志记录工具

# 设置日志记录的详细程度为信息级别
logging.set_verbosity_info()
logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


def get_dpt_config(checkpoint_url):
    config = DPTConfig()  # 创建 DPTConfig 实例

    # 根据检查点 URL 中的关键词调整配置
    if "large" in checkpoint_url:
        # 调整大模型的配置参数
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        config.backbone_out_indices = [5, 11, 17, 23]
        config.neck_hidden_sizes = [256, 512, 1024, 1024]
        expected_shape = (1, 384, 384)  # 预期输入形状为 (batch_size, height, width)

    if "ade" in checkpoint_url:
        # 根据检查点 URL 中的关键词调整配置,这里针对 ADE 模型的配置调整
        config.use_batch_norm_in_fusion_residual = True

        config.num_labels = 150  # ADE 模型的标签数目为 150
        repo_id = "huggingface/label-files"
        filename = "ade20k-id2label.json"
        # 从 HF Hub 下载 ADE 模型的标签映射文件并加载为字典
        id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
        id2label = {int(k): v for k, v in id2label.items()}  # 将键转换为整数类型
        config.id2label = id2label  # 设置 ID 到标签的映射
        config.label2id = {v: k for k, v in id2label.items()}  # 设置标签到 ID 的映射
        expected_shape = [1, 150, 480, 480]  # 预期输入形状为 (batch_size, num_labels, height, width)

    return config, expected_shape  # 返回配置对象和预期输入形状信息


def remove_ignore_keys_(state_dict):
    # 移除状态字典中指定的键
    ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"]
    for k in ignore_keys:
        state_dict.pop(k, None)


def rename_key(name):
    # 根据名称的特定规则重命名键名
    if (
        "pretrained.model" in name
        and "cls_token" not in name
        and "pos_embed" not in name
        and "patch_embed" not in name
    ):
        name = name.replace("pretrained.model", "dpt.encoder")
    if "pretrained.model" in name:
        name = name.replace("pretrained.model", "dpt.embeddings")
    if "patch_embed" in name:
        name = name.replace("patch_embed", "patch_embeddings")
    if "pos_embed" in name:
        name = name.replace("pos_embed", "position_embeddings")
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")
    if "proj" in name and "project" not in name:
        name = name.replace("proj", "projection")
    # 检查字符串 "blocks" 是否在变量 name 中
    if "blocks" in name:
        # 将变量 name 中的字符串 "blocks" 替换为 "layer"
        name = name.replace("blocks", "layer")
    
    # 检查字符串 "mlp.fc1" 是否在变量 name 中
    if "mlp.fc1" in name:
        # 将变量 name 中的字符串 "mlp.fc1" 替换为 "intermediate.dense"
        name = name.replace("mlp.fc1", "intermediate.dense")
    
    # 检查字符串 "mlp.fc2" 是否在变量 name 中
    if "mlp.fc2" in name:
        # 将变量 name 中的字符串 "mlp.fc2" 替换为 "output.dense"
        name = name.replace("mlp.fc2", "output.dense")
    
    # 检查字符串 "norm1" 是否在变量 name 中
    if "norm1" in name:
        # 将变量 name 中的字符串 "norm1" 替换为 "layernorm_before"
        name = name.replace("norm1", "layernorm_before")
    
    # 检查字符串 "norm2" 是否在变量 name 中
    if "norm2" in name:
        # 将变量 name 中的字符串 "norm2" 替换为 "layernorm_after"
        name = name.replace("norm2", "layernorm_after")
    
    # 检查字符串 "scratch.output_conv" 是否在变量 name 中
    if "scratch.output_conv" in name:
        # 将变量 name 中的字符串 "scratch.output_conv" 替换为 "head"
        name = name.replace("scratch.output_conv", "head")
    
    # 检查字符串 "scratch" 是否在变量 name 中
    if "scratch" in name:
        # 将变量 name 中的字符串 "scratch" 替换为 "neck"
        name = name.replace("scratch", "neck")
    
    # 检查字符串 "layer1_rn" 是否在变量 name 中
    if "layer1_rn" in name:
        # 将变量 name 中的字符串 "layer1_rn" 替换为 "convs.0"
        name = name.replace("layer1_rn", "convs.0")
    
    # 检查字符串 "layer2_rn" 是否在变量 name 中
    if "layer2_rn" in name:
        # 将变量 name 中的字符串 "layer2_rn" 替换为 "convs.1"
        name = name.replace("layer2_rn", "convs.1")
    
    # 检查字符串 "layer3_rn" 是否在变量 name 中
    if "layer3_rn" in name:
        # 将变量 name 中的字符串 "layer3_rn" 替换为 "convs.2"
        name = name.replace("layer3_rn", "convs.2")
    
    # 检查字符串 "layer4_rn" 是否在变量 name 中
    if "layer4_rn" in name:
        # 将变量 name 中的字符串 "layer4_rn" 替换为 "convs.3"
        name = name.replace("layer4_rn", "convs.3")
    
    # 检查字符串 "refinenet" 是否在变量 name 中
    if "refinenet" in name:
        # 提取 refinenet 后的数字,计算新的索引并替换字符串
        layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1])
        name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}")
    
    # 检查字符串 "out_conv" 是否在变量 name 中
    if "out_conv" in name:
        # 将变量 name 中的字符串 "out_conv" 替换为 "projection"
        name = name.replace("out_conv", "projection")
    
    # 检查字符串 "resConfUnit1" 是否在变量 name 中
    if "resConfUnit1" in name:
        # 将变量 name 中的字符串 "resConfUnit1" 替换为 "residual_layer1"
        name = name.replace("resConfUnit1", "residual_layer1")
    
    # 检查字符串 "resConfUnit2" 是否在变量 name 中
    if "resConfUnit2" in name:
        # 将变量 name 中的字符串 "resConfUnit2" 替换为 "residual_layer2"
        name = name.replace("resConfUnit2", "residual_layer2")
    
    # 检查字符串 "conv1" 是否在变量 name 中
    if "conv1" in name:
        # 将变量 name 中的字符串 "conv1" 替换为 "convolution1"
        name = name.replace("conv1", "convolution1")
    
    # 检查字符串 "conv2" 是否在变量 name 中
    if "conv2" in name:
        # 将变量 name 中的字符串 "conv2" 替换为 "convolution2"
        name = name.replace("conv2", "convolution2")
    
    # 检查字符串 "pretrained.act_postprocess1.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess1.0.project.0" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess1.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.0.0"
        name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0")
    
    # 检查字符串 "pretrained.act_postprocess2.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess2.0.project.0" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess2.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.1.0"
        name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0")
    
    # 检查字符串 "pretrained.act_postprocess3.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess3.0.project.0" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess3.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.2.0"
        name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0")
    
    # 检查字符串 "pretrained.act_postprocess4.0.project.0" 是否在变量 name 中
    if "pretrained.act_postprocess4.0.project.0" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess4.0.project.0" 替换为 "neck.reassemble_stage.readout_projects.3.0"
        name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0")
    
    # 检查字符串 "pretrained.act_postprocess1.3" 是否在变量 name 中
    if "pretrained.act_postprocess1.3" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess1.3" 替换为 "neck.reassemble_stage.layers.0.projection"
        name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection")
    
    # 检查字符串 "pretrained.act_postprocess1.4" 是否在变量 name 中
    if "pretrained.act_postprocess1.4" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess1.4" 替换为 "neck.reassemble_stage.layers.0.resize"
        name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize")
    
    # 检查字符串 "pretrained.act_postprocess2.3" 是否在变量 name 中
    if "pretrained.act_postprocess2.3" in name:
        # 将变量 name 中的字符串 "pretrained.act_postprocess2.3" 替换为 "neck.reassemble_stage.layers.1.projection"
        name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection")
    
    # 检查字符串 "pretrained.act_postprocess2.4" 是否在变量 name 中
    if "pretrained.act_postprocess2.4" in name:
        # Replace "pretrained.act_postprocess2.4" with "neck.reassemble_stage.layers.1.resize"
        name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "pretrained.act_postprocess3.3" in name:
        name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "pretrained.act_postprocess4.3" in name:
        name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "pretrained.act_postprocess4.4" in name:
        name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "pretrained" in name:
        name = name.replace("pretrained", "dpt")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "bn" in name:
        name = name.replace("bn", "batch_norm")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "head" in name:
        name = name.replace("head", "head.head")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "encoder.norm" in name:
        name = name.replace("encoder.norm", "layernorm")
    # 检查名称中是否包含特定字符串,然后替换成指定的新字符串
    if "auxlayer" in name:
        name = name.replace("auxlayer", "auxiliary_head.head")

    # 返回经过所有替换操作后的名称
    return name
# 将每个编码器层的权重矩阵分割为查询(queries)、键(keys)和值(values)
def read_in_q_k_v(state_dict, config):
    # 遍历每个隐藏层
    for i in range(config.num_hidden_layers):
        # 读取输入投影层的权重和偏置(在timm中,这是单个矩阵和偏置)
        in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight")
        in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias")
        
        # 添加查询(query)、键(key)和值(value)到状态字典中,顺序为查询、键、值
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
            config.hidden_size : config.hidden_size * 2, :
        ]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
            config.hidden_size : config.hidden_size * 2
        ]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
            -config.hidden_size :, :
        ]
        state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]


# 在一张可爱猫咪的图像上准备我们的结果验证
def prepare_img():
    # 图片URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 使用requests获取图像原始流,并打开为PIL图像
    im = Image.open(requests.get(url, stream=True).raw)
    return im


@torch.no_grad()
def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name):
    """
    复制/粘贴/调整模型权重到我们的DPT结构。
    """

    # 基于URL定义DPT配置
    config, expected_shape = get_dpt_config(checkpoint_url)
    # 从URL加载原始的state_dict
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    # 移除特定的键
    remove_ignore_keys_(state_dict)
    # 重命名键
    for key in state_dict.copy().keys():
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # 读取qkv矩阵
    read_in_q_k_v(state_dict, config)

    # 根据URL加载HuggingFace模型
    model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config)
    model.load_state_dict(state_dict)
    model.eval()

    # 在一张图像上检查输出
    size = 480 if "ade" in checkpoint_url else 384
    image_processor = DPTImageProcessor(size=size)

    image = prepare_img()
    encoding = image_processor(image, return_tensors="pt")

    # 前向传播
    outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth

    # 断言Logits
    expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]])
    # 如果 checkpoint_url 字符串中包含 "ade",则定义预期的切片张量
    if "ade" in checkpoint_url:
        expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]])
    
    # 断言输出张量的形状与预期形状相等
    assert outputs.shape == torch.Size(expected_shape)
    
    # 断言输出张量的部分内容与预期的切片张量在数值上相近
    assert (
        torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4)
        if "ade" in checkpoint_url
        else torch.allclose(outputs[0, :3, :3], expected_slice)
    )
    
    # 打印信息表明一切正常
    print("Looks ok!")

    # 如果指定了 pytorch_dump_folder_path,则保存模型和图像处理器
    if pytorch_dump_folder_path is not None:
        # 创建目录(如果不存在)
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        print(f"Saving model to {pytorch_dump_folder_path}")
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        # 将图像处理器保存到指定路径
        image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要将模型推送到 hub
    if push_to_hub:
        print("Pushing model to hub...")
        # 将模型推送到指定的 hub 仓库
        model.push_to_hub(
            repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
            organization="nielsr",
            commit_message="Add model",
            use_temp_dir=True,
        )
        # 将图像处理器推送到指定的 hub 仓库
        image_processor.push_to_hub(
            repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
            organization="nielsr",
            commit_message="Add image processor",
            use_temp_dir=True,
        )
if __name__ == "__main__":
    # 如果作为主程序执行,则开始解析命令行参数
    parser = argparse.ArgumentParser()
    
    # 添加必需的参数:checkpoint_url
    parser.add_argument(
        "--checkpoint_url",
        default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt",
        type=str,
        help="URL of the original DPT checkpoint you'd like to convert.",
    )
    
    # 添加可选的参数:pytorch_dump_folder_path
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=False,
        help="Path to the output PyTorch model directory.",
    )
    
    # 添加开关参数:push_to_hub
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
    )
    
    # 添加可选的参数:model_name
    parser.add_argument(
        "--model_name",
        default="dpt-large",
        type=str,
        required=False,
        help="Name of the model, in case you're pushing to the hub.",
    )

    # 解析命令行参数并将其存储在 args 对象中
    args = parser.parse_args()
    
    # 调用函数 convert_dpt_checkpoint,传入命令行参数
    convert_dpt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name)
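
作为使用示意(假设脚本以一个虚构的 --pytorch_dump_folder_path=./dpt-large-converted 运行过),转换得到的目录可以直接用标准的 from_pretrained 接口重新加载:

from transformers import DPTForDepthEstimation, DPTImageProcessor

# "./dpt-large-converted" is a hypothetical value for --pytorch_dump_folder_path
model = DPTForDepthEstimation.from_pretrained("./dpt-large-converted")
image_processor = DPTImageProcessor.from_pretrained("./dpt-large-converted")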

.\models\dpt\feature_extraction_dpt.py

# 设置编码格式为 utf-8

# 版权声明及许可证信息,保留版权声明,提供 Apache License, Version 2.0 的获取链接

# 导入警告模块
import warnings

# 导入 logging 模块,用于记录日志
from ...utils import logging

# 从 .image_processing_dpt 模块中导入 DPTImageProcessor 类
from .image_processing_dpt import DPTImageProcessor

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义 DPTFeatureExtractor 类,继承自 DPTImageProcessor 类
class DPTFeatureExtractor(DPTImageProcessor):
    # 初始化方法
    def __init__(self, *args, **kwargs) -> None:
        # 发出警告,提示 DPTFeatureExtractor 类已弃用,并将在 Transformers 版本 5 中移除
        warnings.warn(
            "The class DPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use DPTImageProcessor instead.",
            FutureWarning,
        )
        # 调用父类的初始化方法
        super().__init__(*args, **kwargs)

.\models\dpt\image_processing_dpt.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for DPT."""

import math
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import pad, resize, to_channel_dimension_format
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    is_torch_available,
    is_torch_tensor,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging


if is_torch_available():
    import torch

if is_vision_available():
    import PIL


logger = logging.get_logger(__name__)


def get_resize_output_image_size(
    input_image: np.ndarray,
    output_size: Union[int, Iterable[int]],
    keep_aspect_ratio: bool,
    multiple: int,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Calculate the output size for resizing an image while optionally constraining to multiples and maintaining aspect ratio.

    Args:
        input_image (np.ndarray): The input image as a NumPy array.
        output_size (Union[int, Iterable[int]]): Desired output size for resizing.
        keep_aspect_ratio (bool): Whether to maintain the aspect ratio of the input image.
        multiple (int): Constraint to resize dimensions to multiples of this value.
        input_data_format (Optional[Union[str, ChannelDimension]], optional):
            Format of the input image data. Defaults to None.

    Returns:
        Tuple[int, int]: Output height and width after resizing.
    """

    def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):
        """
        Helper function to constrain a value to be a multiple of another value within specified bounds.

        Args:
            val (float): Value to constrain.
            multiple (int): Constraint value.
            min_val (int, optional): Minimum value constraint. Defaults to 0.
            max_val (int, optional): Maximum value constraint. Defaults to None.

        Returns:
            float: Constrained value.
        """
        x = round(val / multiple) * multiple

        if max_val is not None and x > max_val:
            x = math.floor(val / multiple) * multiple

        if x < min_val:
            x = math.ceil(val / multiple) * multiple

        return x

    # Convert output_size to a tuple if it's an integer
    output_size = (output_size, output_size) if isinstance(output_size, int) else output_size

    # Get dimensions of the input image
    input_height, input_width = get_image_size(input_image, input_data_format)
    output_height, output_width = output_size

    # Calculate scaling factors for height and width
    scale_height = output_height / input_height
    scale_width = output_width / input_width

    # Adjust scaling factors if maintaining aspect ratio
    if keep_aspect_ratio:
        if abs(1 - scale_width) < abs(1 - scale_height):
            scale_height = scale_width  # Fit width
        else:
            scale_width = scale_height  # Fit height

    # Calculate new height and width constrained to multiples
    new_height = constraint_to_multiple_of(scale_height * input_height, multiple=multiple)
    new_width = constraint_to_multiple_of(scale_width * input_width, multiple=multiple)

    return (new_height, new_width)
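
下面是对上述尺寸计算逻辑的独立示意(输入尺寸和目标尺寸均为假设值),展示 keep_aspect_ratio 如何选取最接近 1 的缩放比例,以及每条边如何被吸附到 multiple 的倍数:

import math

def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):
    x = round(val / multiple) * multiple
    if max_val is not None and x > max_val:
        x = math.floor(val / multiple) * multiple
    if x < min_val:
        x = math.ceil(val / multiple) * multiple
    return x

# Hypothetical input: a 480x640 (height x width) image resized towards 384x384,
# keeping the aspect ratio and snapping each side to a multiple of 32.
input_height, input_width = 480, 640
scale_height, scale_width = 384 / input_height, 384 / input_width  # 0.8 and 0.6

# keep_aspect_ratio=True keeps the scale that is closest to 1 (here 0.8, i.e. "fit height")
if abs(1 - scale_width) < abs(1 - scale_height):
    scale_height = scale_width
else:
    scale_width = scale_height

new_height = constraint_to_multiple_of(scale_height * input_height, multiple=32)  # 384
new_width = constraint_to_multiple_of(scale_width * input_width, multiple=32)     # 512
print(new_height, new_width)  # 384 512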


class DPTImageProcessor(BaseImageProcessor):
    r"""
    Constructs a DPT image processor.
    
    This class extends `BaseImageProcessor` and provides the preprocessing methods specific to DPT. The following
    arguments control and configure the preprocessing pipeline:

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            是否调整图像的(高度,宽度)尺寸。可以被 `preprocess` 中的 `do_resize` 覆盖。
        size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
            调整后的图像尺寸。可以被 `preprocess` 中的 `size` 覆盖。
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            定义调整图像时要使用的重采样滤波器。可以被 `preprocess` 中的 `resample` 覆盖。
        keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
            如果为 `True`,则调整图像尺寸以保持宽高比最大可能的大小。可以被 `preprocess` 中的 `keep_aspect_ratio` 覆盖。
        ensure_multiple_of (`int`, *optional*, defaults to 1):
            如果 `do_resize` 为 `True`,则调整图像尺寸为此值的倍数。可以被 `preprocess` 中的 `ensure_multiple_of` 覆盖。
        do_rescale (`bool`, *optional*, defaults to `True`):
            是否按照指定的比例因子 `rescale_factor` 进行图像缩放。可以被 `preprocess` 中的 `do_rescale` 覆盖。
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            如果进行图像缩放,使用的缩放因子。可以被 `preprocess` 中的 `rescale_factor` 覆盖。
        do_normalize (`bool`, *optional*, defaults to `True`):
            是否对图像进行归一化。可以被 `preprocess` 中的 `do_normalize` 参数覆盖。
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            如果进行图像归一化,使用的均值。这是一个浮点数或与图像通道数相等长度的浮点数列表。可以被 `preprocess` 中的 `image_mean` 参数覆盖。
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            如果进行图像归一化,使用的标准差。这是一个浮点数或与图像通道数相等长度的浮点数列表。可以被 `preprocess` 中的 `image_std` 参数覆盖。
        do_pad (`bool`, *optional*, defaults to `False`):
            是否应用中心填充。这在与 DPT 结合使用的 DINOv2 论文中被引入。
        size_divisor (`int`, *optional*):
            如果 `do_pad` 为 `True`,则将图像维度填充为此值的倍数。这在与 DPT 结合使用的 DINOv2 论文中被引入。
    """

    # 定义了模型输入的名称,这是一个包含单个元素 "pixel_values" 的列表
    model_input_names = ["pixel_values"]
    # 初始化方法,用于实例化对象时进行初始化设置
    def __init__(
        self,
        do_resize: bool = True,  # 是否进行图片大小调整的标志,默认为True
        size: Dict[str, int] = None,  # 图片大小的字典,包含"height"和"width"两个键,默认为384x384
        resample: PILImageResampling = PILImageResampling.BICUBIC,  # 图片调整大小时的重采样方法,默认为双三次插值
        keep_aspect_ratio: bool = False,  # 是否保持图片宽高比的标志,默认为False
        ensure_multiple_of: int = 1,  # 调整后的图片尺寸需为此值的倍数,默认为1
        do_rescale: bool = True,  # 是否对图片进行重新缩放的标志,默认为True
        rescale_factor: Union[int, float] = 1 / 255,  # 图片缩放因子,将像素值缩放到[0, 1]区间,默认为1/255
        do_normalize: bool = True,  # 是否对图片进行归一化处理的标志,默认为True
        image_mean: Optional[Union[float, List[float]]] = None,  # 图片归一化的均值,默认为ImageNet数据集的标准均值
        image_std: Optional[Union[float, List[float]]] = None,  # 图片归一化的标准差,默认为ImageNet数据集的标准差
        do_pad: bool = False,  # 是否对图片进行填充的标志,默认为False
        size_divisor: int = None,  # 图片调整后尺寸需为此值的倍数,默认为None
        **kwargs,  # 其他可选参数
    ) -> None:
        # 调用父类的初始化方法
        super().__init__(**kwargs)
        # 如果未提供size参数,则使用默认值384x384
        size = size if size is not None else {"height": 384, "width": 384}
        # 根据提供的size参数获取标准化后的尺寸字典
        size = get_size_dict(size)
        # 初始化对象的各个属性
        self.do_resize = do_resize  # 是否进行图片大小调整的标志
        self.size = size  # 图片大小的字典
        self.keep_aspect_ratio = keep_aspect_ratio  # 是否保持图片宽高比的标志
        self.ensure_multiple_of = ensure_multiple_of  # 调整后的图片尺寸需为此值的倍数
        self.resample = resample  # 图片调整大小时的重采样方法
        self.do_rescale = do_rescale  # 是否对图片进行重新缩放的标志
        self.rescale_factor = rescale_factor  # 图片缩放因子
        self.do_normalize = do_normalize  # 是否对图片进行归一化处理的标志
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN  # 图片归一化的均值
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD  # 图片归一化的标准差
        self.do_pad = do_pad  # 是否对图片进行填充的标志
        self.size_divisor = size_divisor  # 图片调整后尺寸需为此值的倍数
        # 验证处理器可接受的键列表
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "keep_aspect_ratio",
            "ensure_multiple_of",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_pad",
            "size_divisor",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        keep_aspect_ratio: bool = False,
        ensure_multiple_of: int = 1,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image
        is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is
        set, the image is resized to a size that is a multiple of this value.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Target size of the output image.
            keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
                If `True`, the image is resized while preserving its aspect ratio.
            ensure_multiple_of (`int`, *optional*, defaults to 1):
                The image is resized to a size that is a multiple of this value.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the output image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size)  # 调用函数 `get_size_dict` 将 `size` 参数转换为标准尺寸字典
        if "height" not in size or "width" not in size:
            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")

        output_size = get_resize_output_image_size(
            image,
            output_size=(size["height"], size["width"]),
            keep_aspect_ratio=keep_aspect_ratio,
            multiple=ensure_multiple_of,
            input_data_format=input_data_format,
        )
        # 调用 `get_resize_output_image_size` 函数计算输出图像的尺寸,并确保尺寸为 `size` 的倍数
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def pad_image(
        self,
        image: np.array,
        size_divisor: int,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,  # 允许额外的关键字参数
    ):
        """
        Pad an image to ensure its dimensions are divisible by `size_divisor`.

        Args:
            image (`np.array`):
                Image to pad.
            size_divisor (`int`):
                The divisor that the dimensions of the padded image should be divisible by.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the output image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
            **kwargs:
                Additional keyword arguments to be passed to the `resize` function.

        Returns:
            `np.ndarray`: Padded image.
        """
        """
        Center pad an image to be a multiple of `multiple`.

        Args:
            image (`np.ndarray`):
                Image to pad.
            size_divisor (`int`):
                The width and height of the image will be padded to a multiple of this number.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """

        def _get_pad(size, size_divisor):
            """
            Calculate padding sizes for an image dimension to be a multiple of `size_divisor`.

            Args:
                size (`int`): Original size of the image dimension.
                size_divisor (`int`): The width or height will be padded to a multiple of this number.

            Returns:
                tuple: Left and right padding sizes.
            """
            new_size = math.ceil(size / size_divisor) * size_divisor
            pad_size = new_size - size
            pad_size_left = pad_size // 2
            pad_size_right = pad_size - pad_size_left
            return pad_size_left, pad_size_right

        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        
        height, width = get_image_size(image, input_data_format)

        pad_size_left, pad_size_right = _get_pad(height, size_divisor)
        pad_size_top, pad_size_bottom = _get_pad(width, size_divisor)

        return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format)
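
下面用一个假设的边长(500 像素)和 size_divisor=32,独立演示上面 _get_pad 的填充量计算:

import math

def get_pad(size, size_divisor):
    # Pad `size` up to the next multiple of `size_divisor`, splitting the padding evenly on both sides.
    new_size = math.ceil(size / size_divisor) * size_divisor
    pad_size = new_size - size
    return pad_size // 2, pad_size - pad_size // 2

# Hypothetical example: a 500-pixel side is padded up to 512 (the next multiple of 32),
# so 12 extra pixels are split into (6, 6).
print(get_pad(500, 32))  # (6, 6)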

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: int = None,
        keep_aspect_ratio: bool = None,
        ensure_multiple_of: int = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: bool = None,
        size_divisor: int = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocess images according to specified transformations.

        Args:
            images (ImageInput): Input images to be preprocessed.
            do_resize (bool, optional): Whether to resize the images.
            size (int, optional): Size to which images should be resized.
            keep_aspect_ratio (bool, optional): Whether to maintain aspect ratio during resizing.
            ensure_multiple_of (int, optional): Ensure image dimensions are multiples of this number.
            resample (PILImageResampling, optional): Resampling method for resizing.
            do_rescale (bool, optional): Whether to rescale image pixel values.
            rescale_factor (float, optional): Factor to rescale image pixel values.
            do_normalize (bool, optional): Whether to normalize image pixel values.
            image_mean (float or List[float], optional): Mean values for image normalization.
            image_std (float or List[float], optional): Standard deviation values for image normalization.
            do_pad (bool, optional): Whether to pad images.
            size_divisor (int, optional): Pad image dimensions to be multiples of this number.
            return_tensors (str or TensorType, optional): Desired tensor type for output images.
            data_format (ChannelDimension, optional): Output image channel format.
            input_data_format (str or ChannelDimension, optional): Input image channel format.

            **kwargs: Additional keyword arguments for preprocessing.

        Returns:
            Preprocessed images according to the specified transformations.
        """
        # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
    # 后处理语义分割模型输出,将[`DPTForSemanticSegmentation`]的输出转换为语义分割地图。仅支持PyTorch。

    # 获取模型输出中的 logits
    logits = outputs.logits

    # 调整 logits 的尺寸并计算语义分割图
    if target_sizes is not None:
        # 检查目标大小列表的长度是否与 logits 的批次维度相匹配
        if len(logits) != len(target_sizes):
            raise ValueError(
                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
            )

        # 如果目标大小是PyTorch张量,则转换为NumPy数组
        if is_torch_tensor(target_sizes):
            target_sizes = target_sizes.numpy()

        # 初始化语义分割结果列表
        semantic_segmentation = []

        # 对每个样本的 logits 进行处理
        for idx in range(len(logits)):
            # 使用双线性插值方法调整 logits 的尺寸
            resized_logits = torch.nn.functional.interpolate(
                logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
            )
            # 计算每个像素点的语义类别(最大概率对应的类别)
            semantic_map = resized_logits[0].argmax(dim=0)
            # 将处理后的语义分割地图添加到结果列表中
            semantic_segmentation.append(semantic_map)
    else:
        # 若未指定目标大小,则直接计算每个样本的语义分割结果
        semantic_segmentation = logits.argmax(dim=1)
        semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]

    # 返回所有样本的语义分割结果列表
    return semantic_segmentation
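
下面是该后处理步骤的使用示意(其中的 checkpoint 名称 Intel/dpt-large-ade 与图片 URL 仅作演示用途):

import requests
import torch
from PIL import Image
from transformers import DPTForSemanticSegmentation, DPTImageProcessor

image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade")
model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes expects (height, width); PIL's image.size is (width, height)
segmentation_map = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(segmentation_map.shape)  # (height, width) tensor of predicted class indices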

.\models\dpt\modeling_dpt.py

    """
    PyTorch DPT (Dense Prediction Transformers) model.

    This implementation is heavily inspired by OpenMMLab's implementation, found here:
    https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.
    """

# 导入必要的模块和库
import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

# 导入工具函数和类
from ...activations import ACT2FN
from ...file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, logging
from ...utils.backbone_utils import load_backbone
from .configuration_dpt import DPTConfig

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 用于文档的常量和模型配置信息
_CONFIG_FOR_DOC = "DPTConfig"
_CHECKPOINT_FOR_DOC = "Intel/dpt-large"
_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024]

# 预训练模型列表
DPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Intel/dpt-large",
    "Intel/dpt-hybrid-midas",
    # See all DPT models at https://huggingface.co/models?filter=dpt
]

@dataclass
class BaseModelOutputWithIntermediateActivations(ModelOutput):
    """
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    """
    last_hidden_states: torch.FloatTensor = None
    intermediate_activations: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used at later stages.
    """
    # 最后一个隐藏状态:模型最后一层输出的隐藏状态,形状为(batch_size, sequence_length, hidden_size)
    last_hidden_state: torch.FloatTensor = None
    
    # 汇聚器输出:经过进一步处理后,序列中第一个标记(分类标记)的最后一层隐藏状态。例如,对于BERT系列模型,这是经过线性层和tanh激活函数处理后的分类标记。
    # 线性层的权重在预训练阶段通过下一个句子预测(分类)目标进行训练。
    pooler_output: torch.FloatTensor = None
    
    # 隐藏状态:模型在每一层输出的隐藏状态的元组,如果模型有嵌入层,则包括嵌入层的输出。
    # 形状为(batch_size, sequence_length, hidden_size)。
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    
    # 注意力权重:注意力softmax后的注意力权重,用于在自注意力头中计算加权平均值。
    # 形状为(batch_size, num_heads, sequence_length, sequence_length)的元组,每个元素对应一个层。
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    
    # 中间激活:可用于计算各层模型隐藏状态的中间激活。
    intermediate_activations: Optional[Tuple[torch.FloatTensor, ...]] = None
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config, feature_size=None):
        super().__init__()
        
        # Extract configuration parameters
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # Ensure image_size and patch_size are iterable, defaulting to tuple if not
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        
        # Calculate number of patches based on image and patch size
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        # Load backbone model defined by configuration
        self.backbone = load_backbone(config)
        feature_dim = self.backbone.channels[-1]

        # Ensure backbone has exactly 3 output features
        if len(self.backbone.channels) != 3:
            raise ValueError(f"Expected backbone to have 3 output features, got {len(self.backbone.channels)}")

        # Define indices of backbone stages to use for residual feature maps
        self.residual_feature_map_index = [0, 1]  # Always take the output of the first and second backbone stage

        # Determine feature size based on configuration or input feature_size
        if feature_size is None:
            feat_map_shape = config.backbone_featmap_shape
            feature_size = feat_map_shape[-2:]
            feature_dim = feat_map_shape[1]
        else:
            feature_size = (
                feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
            )
            feature_dim = self.backbone.channels[-1]

        # Store relevant configuration parameters
        self.image_size = image_size
        self.patch_size = patch_size[0]
        self.num_channels = num_channels

        # Projection layer to convert feature dimension to hidden size
        self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=1)

        # Initialize classification token and positional embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        """
        Resize the positional embeddings to match a specified grid size.

        Args:
            posemb (torch.Tensor): Positional embeddings tensor.
            grid_size_height (int): Target grid height.
            grid_size_width (int): Target grid width.
            start_index (int, optional): Starting index for grid reshaping. Defaults to 1.

        Returns:
            torch.Tensor: Resized positional embeddings tensor.
        """
        # Separate token and grid positional embeddings
        posemb_tok = posemb[:, :start_index]
        posemb_grid = posemb[0, start_index:]

        # Determine current grid size
        old_grid_size = int(math.sqrt(len(posemb_grid)))

        # Reshape grid embeddings and interpolate to target size
        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        # Concatenate token and resized grid embeddings
        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

        return posemb

    def forward(
        self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False, return_dict: bool = False
    ):
        """
        Perform forward pass of the model.

        Args:
            pixel_values (torch.Tensor): Input tensor of shape `(batch_size, num_channels, height, width)`.
            interpolate_pos_encoding (bool): Whether to interpolate positional embeddings. Defaults to False.
            return_dict (bool): Whether to return output as dictionary. Defaults to False.

        Returns:
            torch.Tensor or dict: Output tensor or dictionary depending on `return_dict`.
        """
        batch_size, num_channels, height, width = pixel_values.shape
        # 获取输入张量的维度信息,分别是批量大小、通道数、高度和宽度

        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # 如果输入的通道数与模型期望的通道数不一致,则抛出数值错误

        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        # 如果不需要插值位置编码,并且输入图像的高度或宽度与模型期望的不匹配,则抛出数值错误

        position_embeddings = self._resize_pos_embed(
            self.position_embeddings, height // self.patch_size, width // self.patch_size
        )
        # 调整位置编码的大小,以匹配当前输入图像的尺寸,将其存储在 position_embeddings 中

        backbone_output = self.backbone(pixel_values)
        # 使用预训练的骨干网络处理输入的像素值,获取骨干网络的输出

        features = backbone_output.feature_maps[-1]
        # 从骨干网络的输出中获取最后一层的特征图作为特征

        # Retrieve also the intermediate activations to use them at later stages
        output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index]
        # 获取中间层的激活结果,以备后续使用

        embeddings = self.projection(features).flatten(2).transpose(1, 2)
        # 将提取的特征投影到嵌入空间,并进行展平和转置操作

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        # 扩展分类令牌,以匹配当前批次的大小

        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
        # 将分类令牌和特征嵌入拼接在一起,形成最终的嵌入表示

        # add positional encoding to each token
        embeddings = embeddings + position_embeddings
        # 将位置编码添加到每个令牌的嵌入表示中

        if not return_dict:
            return (embeddings, output_hidden_states)
        # 如果不需要返回字典形式的输出,则直接返回嵌入表示和中间层激活结果的元组

        # Return hidden states and intermediate activations
        return BaseModelOutputWithIntermediateActivations(
            last_hidden_states=embeddings,
            intermediate_activations=output_hidden_states,
        )
        # 否则,返回包含最终隐藏状态和中间激活状态的 BaseModelOutputWithIntermediateActivations 对象
# 定义一个名为 DPTViTEmbeddings 的 nn.Module 类,用于处理 DPT-ViT 模型的嵌入部分,包括 CLS token、位置编码和图像补丁的嵌入。

class DPTViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings.
    构建 CLS token、位置编码和图像补丁的嵌入。

    """

    def __init__(self, config):
        super().__init__()

        # 初始化 CLS token 参数
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        # 初始化图像补丁的嵌入
        self.patch_embeddings = DPTViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        # 初始化位置编码的参数
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        # 初始化 Dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 保存配置信息
        self.config = config

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        # 提取位置编码中的 token 部分
        posemb_tok = posemb[:, :start_index]
        # 提取位置编码中的 grid 部分
        posemb_grid = posemb[0, start_index:]

        # 获取旧的 grid 尺寸
        old_grid_size = int(math.sqrt(len(posemb_grid)))

        # 重塑 grid 部分并进行双线性插值
        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        # 合并 token 和插值后的 grid
        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

        return posemb

    def forward(self, pixel_values, return_dict=False):
        batch_size, num_channels, height, width = pixel_values.shape

        # 可能需要插值位置编码以处理不同大小的图像
        patch_size = self.config.patch_size
        position_embeddings = self._resize_pos_embed(
            self.position_embeddings, height // patch_size, width // patch_size
        )

        # 计算图像补丁的嵌入
        embeddings = self.patch_embeddings(pixel_values)

        batch_size, seq_len, _ = embeddings.size()

        # 将 [CLS] token 添加到嵌入的补丁 token 中
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # 添加位置编码到每个 token
        embeddings = embeddings + position_embeddings

        # 应用 Dropout
        embeddings = self.dropout(embeddings)

        # 如果不需要返回字典,则返回嵌入结果元组
        if not return_dict:
            return (embeddings,)

        # 如果需要返回字典,则构建 BaseModelOutputWithIntermediateActivations 并返回
        return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings)
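
_resize_pos_embed 中的位置编码插值可以脱离模型单独复现;下面的尺寸(24x24 网格插值到 30x40,隐藏大小 8)均为假设值,仅用于演示形状变化:

import torch
import torch.nn.functional as F

# Hypothetical sizes: 1 [CLS] token + a 24x24 grid of patch position embeddings (hidden size 8),
# interpolated to a 30x40 grid, e.g. for a larger, non-square input image.
hidden_size, old_grid, new_h, new_w = 8, 24, 30, 40
posemb = torch.randn(1, 1 + old_grid * old_grid, hidden_size)

posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:]
posemb_grid = posemb_grid.reshape(1, old_grid, old_grid, -1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(posemb_grid, size=(new_h, new_w), mode="bilinear")
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, new_h * new_w, -1)

resized = torch.cat([posemb_tok, posemb_grid], dim=1)
assert resized.shape == (1, 1 + new_h * new_w, hidden_size)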


class DPTViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    图像到补丁的嵌入。

    """
    # 初始化函数,用于初始化一个类实例
    def __init__(self, config):
        # 调用父类的初始化函数
        super().__init__()
        # 从配置中获取图像大小和patch大小
        image_size, patch_size = config.image_size, config.patch_size
        # 从配置中获取通道数和隐藏层大小
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # 如果图像大小和patch大小不是可迭代对象,则转换为元组
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        
        # 计算patch的数量
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        
        # 将相关信息保存到类实例的属性中
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        # 使用 nn.Conv2d 定义一个投影层,将输入的通道数映射到隐藏层大小
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    # 前向传播函数,接受像素值作为输入并返回嵌入向量
    def forward(self, pixel_values):
        # 获取输入张量的批次大小、通道数、高度和宽度
        batch_size, num_channels, height, width = pixel_values.shape
        
        # 如果输入通道数与配置中设置的不符,则抛出 ValueError 异常
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        
        # 将输入张量通过投影层,然后展平、转置,得到嵌入向量
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        
        # 返回嵌入向量作为结果
        return embeddings
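
下面是 patch embedding 形状计算的独立示意,使用假设的配置(384x384 输入、16x16 patch、隐藏大小 768):

import torch
from torch import nn

# Hypothetical configuration: 384x384 RGB input, 16x16 patches, hidden size 768.
image_size, patch_size, num_channels, hidden_size = 384, 16, 3, 768
projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(2, num_channels, image_size, image_size)
embeddings = projection(pixel_values).flatten(2).transpose(1, 2)

# (384 // 16) ** 2 = 576 patch tokens, each projected to a 768-dim vector
assert embeddings.shape == (2, (image_size // patch_size) ** 2, hidden_size)
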
# 从transformers.models.vit.modeling_vit.ViTSelfAttention复制并修改为DPTViTSelfAttention
class DPTViTSelfAttention(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        # 检查hidden_size是否是attention头数的整数倍,若不是则抛出异常
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        # 设置注意力头数和每个头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 定义query、key、value线性变换层,用于构造注意力机制中的查询、键、值
        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        # 定义dropout层,用于在注意力机制中应用dropout
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    # 将输入张量x转置以匹配注意力头的形状
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    # 前向传播函数,执行自注意力机制的计算过程
    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        # 计算混合查询层
        mixed_query_layer = self.query(hidden_states)

        # 计算转置后的键和值张量,以便进行注意力分数计算
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # 计算原始的注意力分数,即查询与键的点积
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        # 对注意力分数进行缩放
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # 对注意力分数进行softmax操作,得到注意力权重
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 应用dropout到注意力权重上,以减少过拟合风险
        attention_probs = self.dropout(attention_probs)

        # 如果提供了head_mask,则应用到注意力权重上
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # 计算上下文层,即注意力权重与值的乘积
        context_layer = torch.matmul(attention_probs, value_layer)

        # 调整上下文层张量的形状以匹配原始输入张量的形状
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        # 如果需要输出注意力权重,将其添加到输出中
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
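
下面是上述注意力计算的"仅看形状"的示意(为了演示 (batch, heads, seq, seq) 的得分布局,这里用同一个张量同时充当 query 和 key;所有尺寸均为假设值):

import torch

# Hypothetical shapes: batch 2, 10 tokens, 12 heads of size 64 (hidden size 768)
batch, seq_len, num_heads, head_size = 2, 10, 12, 64
hidden = torch.randn(batch, seq_len, num_heads * head_size)

# transpose_for_scores: (batch, seq, hidden) -> (batch, heads, seq, head_size)
per_head = hidden.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)

# Scaled dot-product attention scores: one (seq_len x seq_len) matrix per head
attention_scores = torch.matmul(per_head, per_head.transpose(-1, -2)) / (head_size ** 0.5)
attention_probs = torch.softmax(attention_scores, dim=-1)
assert attention_probs.shape == (batch, num_heads, seq_len, seq_len)
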
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DPT
class DPTViTSelfOutput(nn.Module):
    """
    The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        # 定义一个全连接层,输入和输出的维度都是 config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 定义一个 dropout 层,用于在训练时随机置零输入张量的部分元素
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 通过全连接层 self.dense 进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的 hidden_states 应用 dropout
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class DPTViTAttention(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        # 创建 DPTViTSelfAttention 类的实例,该类负责注意力计算
        self.attention = DPTViTSelfAttention(config)
        # 创建 DPTViTSelfOutput 类的实例,该类负责自注意力输出和残差连接
        self.output = DPTViTSelfOutput(config)
        # 初始化一个集合,用于记录被修剪的注意力头的索引
        self.pruned_heads = set()

    # Copied from transformers.models.vit.modeling_vit.ViTAttention.prune_heads
    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        # 调用 find_pruneable_heads_and_indices 函数,找到需要修剪的头部及其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # 修剪注意力机制中的线性层
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录修剪过的注意力头
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # Copied from transformers.models.vit.modeling_vit.ViTAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        # 调用 DPTViTSelfAttention 的 forward 方法进行注意力计算
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        # 将 self_outputs[0] 作为输入,通过 DPTViTSelfOutput 的 forward 方法进行输出计算
        attention_output = self.output(self_outputs[0], hidden_states)

        # 如果需要输出注意力权重,将它们添加到 outputs 中
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->DPT
class DPTViTIntermediate(nn.Module):
    # 初始化方法,用于初始化对象
    def __init__(self, config: DPTConfig) -> None:
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个线性层,将输入特征的大小调整为中间特征的大小
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 检查隐藏激活函数是否为字符串类型,选择相应的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    # 前向传播方法,处理输入张量并返回处理后的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入张量通过线性层
        hidden_states = self.dense(hidden_states)
        # 应用中间激活函数到线性层的输出
        hidden_states = self.intermediate_act_fn(hidden_states)

        # 返回处理后的张量作为输出
        return hidden_states
# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->DPT
class DPTViTOutput(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        # 定义一个全连接层,将输入维度为 config.intermediate_size 的向量映射到维度为 config.hidden_size 的向量
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 定义一个 dropout 层,以 config.hidden_dropout_prob 的概率随机将输入设置为 0,防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 应用全连接层
        hidden_states = self.dense(hidden_states)
        # 对全连接层的输出进行 dropout 处理
        hidden_states = self.dropout(hidden_states)

        # 将 dropout 处理后的 hidden_states 与输入的 input_tensor 相加,实现残差连接
        hidden_states = hidden_states + input_tensor

        return hidden_states


# copied from transformers.models.vit.modeling_vit.ViTLayer with ViTConfig->DPTConfig, ViTAttention->DPTViTAttention, ViTIntermediate->DPTViTIntermediate, ViTOutput->DPTViTOutput
class DPTViTLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        # 设置用于分块前向传播的块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度的维度设置为1,通常在处理序列时使用
        self.seq_len_dim = 1
        # 创建自注意力层对象 DPTViTAttention
        self.attention = DPTViTAttention(config)
        # 创建中间层对象 DPTViTIntermediate
        self.intermediate = DPTViTIntermediate(config)
        # 创建输出层对象 DPTViTOutput
        self.output = DPTViTOutput(config)
        # 在隐藏状态维度上应用 LayerNorm,epsilon 设置为 config.layer_norm_eps
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 在隐藏状态维度上应用 LayerNorm,epsilon 设置为 config.layer_norm_eps
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        # 在自注意力层之前应用 LayerNorm
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),
            head_mask,
            output_attentions=output_attentions,
        )
        # 获取自注意力层的输出
        attention_output = self_attention_outputs[0]
        # 如果需要输出注意力权重,则添加自注意力层的注意力权重到 outputs
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # 第一个残差连接
        hidden_states = attention_output + hidden_states

        # 在自注意力层之后应用 LayerNorm
        layer_output = self.layernorm_after(hidden_states)
        # 经过中间层处理
        layer_output = self.intermediate(layer_output)

        # 第二个残差连接在这里完成
        layer_output = self.output(layer_output, hidden_states)

        # 将本层的输出添加到 outputs
        outputs = (layer_output,) + outputs

        return outputs


# copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig -> DPTConfig, ViTLayer->DPTViTLayer
class DPTViTEncoder(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.config = config
        # 创建一个包含 config.num_hidden_layers 个 DPTViTLayer 层的 ModuleList
        self.layer = nn.ModuleList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)])
        # 梯度检查点设置为 False,用于指示是否使用梯度检查点来节省内存
        self.gradient_checkpointing = False
    # 定义前向传播函数,接收隐藏状态、头部掩码、是否输出注意力、是否输出隐藏状态、是否返回字典作为参数,返回元组或BaseModelOutput对象
    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        # 如果需要输出隐藏状态,则初始化一个空元组用于存储所有的隐藏状态
        all_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力,则初始化一个空元组用于存储所有的自注意力矩阵
        all_self_attentions = () if output_attentions else None

        # 遍历所有的层次模块
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,则将当前的隐藏状态添加到all_hidden_states元组中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 获取当前层次的头部掩码
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # 如果启用了梯度检查点并且处于训练状态,则使用梯度检查点函数来计算当前层的输出
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                # 否则直接调用当前层次模块计算输出
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            # 更新隐藏状态为当前层的输出的第一个元素
            hidden_states = layer_outputs[0]

            # 如果需要输出注意力,则将当前层的自注意力矩阵添加到all_self_attentions元组中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # 如果需要输出隐藏状态,则将最终的隐藏状态添加到all_hidden_states元组中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要返回字典形式的输出,则将所有非空的元组打包成一个元组返回
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        # 否则,返回一个BaseModelOutput对象,包含最终的隐藏状态、所有隐藏状态和所有自注意力矩阵
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
class DPTReassembleStage(nn.Module):
    """
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config  # 存储模型配置信息
        self.layers = nn.ModuleList()  # 初始化层列表

        # 根据配置选择初始化 DPT-Hybrid 或者普通的 DPT 模型
        if config.is_hybrid:
            self._init_reassemble_dpt_hybrid(config)
        else:
            self._init_reassemble_dpt(config)

        self.neck_ignore_stages = config.neck_ignore_stages  # 存储忽略的 neck stages

    def _init_reassemble_dpt_hybrid(self, config):
        r"""
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        """
        # 根据配置初始化 DPT-Hybrid 模型的各层
        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
            if i <= 1:
                self.layers.append(nn.Identity())  # 前两层设置为 nn.Identity()
            elif i > 1:
                self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))

        # 如果 readout_type 不是 "project",则抛出错误
        if config.readout_type != "project":
            raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.")

        # 当使用 DPT-Hybrid 时,readout type 被设置为 "project",在配置文件中进行了检查
        self.readout_projects = nn.ModuleList()  # 初始化 readout projects 模块列表
        hidden_size = _get_backbone_hidden_size(config)  # 获取 backbone 隐藏层大小
        for i in range(len(config.neck_hidden_sizes)):
            if i <= 1:
                self.readout_projects.append(nn.Sequential(nn.Identity()))  # 前两层设置为 nn.Identity()
            elif i > 1:
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def _init_reassemble_dpt(self, config):
        # 根据配置初始化普通的 DPT 模型的各层
        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
            self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))

        # 如果 readout_type 是 "project",则初始化 readout projects
        if config.readout_type == "project":
            self.readout_projects = nn.ModuleList()  # 初始化 readout projects 模块列表
            hidden_size = _get_backbone_hidden_size(config)  # 获取 backbone 隐藏层大小
            for _ in range(len(config.neck_hidden_sizes)):
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )
    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        """
        out = []  # 用于存储每个阶段处理后的输出

        for i, hidden_state in enumerate(hidden_states):
            if i not in self.neck_ignore_stages:
                # 将隐藏状态重塑为(batch_size, num_channels, height, width)
                cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:]
                batch_size, sequence_length, num_channels = hidden_state.shape
                
                # 根据输入的patch_height和patch_width或者自动计算的size,重塑隐藏状态的形状
                if patch_height is not None and patch_width is not None:
                    hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
                else:
                    size = int(math.sqrt(sequence_length))
                    hidden_state = hidden_state.reshape(batch_size, size, size, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()  # 将维度顺序调整为(batch_size, num_channels, height, width)

                feature_shape = hidden_state.shape
                
                if self.config.readout_type == "project":
                    # 将隐藏状态展平为(batch_size, height*width, num_channels),并调整维度顺序
                    hidden_state = hidden_state.flatten(2).permute((0, 2, 1))
                    readout = cls_token.unsqueeze(1).expand_as(hidden_state)
                    # 将读出向量连接到隐藏状态并投影
                    hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1))
                    # 将隐藏状态形状调整回(batch_size, num_channels, height, width)
                    hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape)
                elif self.config.readout_type == "add":
                    # 将隐藏状态展平并加上CLS标记后,重新调整形状
                    hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1)
                    hidden_state = hidden_state.reshape(feature_shape)
                
                # 经过特定阶段的层处理后的隐藏状态
                hidden_state = self.layers[i](hidden_state)
                
            out.append(hidden_state)  # 将处理后的隐藏状态添加到输出列表中

        return out
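
下面用一个独立的示意复现这次 forward 中"token 序列还原为特征图"的重排过程,并演示较简单的 "add" readout 分支;其中 577 个 token、768 个通道等尺寸均为假设值:

import math
import torch

# Hypothetical hidden states: batch 2, 1 [CLS] token + 576 patch tokens, 768 channels.
batch_size, num_tokens, num_channels = 2, 577, 768
hidden_state = torch.randn(batch_size, num_tokens, num_channels)

cls_token, patch_tokens = hidden_state[:, 0], hidden_state[:, 1:]
size = int(math.sqrt(patch_tokens.shape[1]))  # 24
feature_map = patch_tokens.reshape(batch_size, size, size, num_channels).permute(0, 3, 1, 2).contiguous()

# "add" readout: broadcast the [CLS] token onto every spatial position
fused = feature_map.flatten(2) + cls_token.unsqueeze(-1)
assert fused.reshape(feature_map.shape).shape == (batch_size, num_channels, size, size)
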
# Return the hidden size of the backbone, depending on the configuration
def _get_backbone_hidden_size(config):
    # if a backbone config is given and the model is not hybrid, use the backbone's hidden size
    if config.backbone_config is not None and config.is_hybrid is False:
        return config.backbone_config.hidden_size
    else:
        # otherwise fall back to the hidden size of the main config
        return config.hidden_size


class DPTReassembleLayer(nn.Module):
    def __init__(self, config, channels, factor):
        super().__init__()
        # projection layer: maps from the backbone hidden size to the requested number of channels
        hidden_size = _get_backbone_hidden_size(config)
        self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1)

        # up-/downsample depending on the factor
        if factor > 1:
            # upsample: a transposed convolution enlarges the spatial size
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor == 1:
            # keep the size: identity mapping
            self.resize = nn.Identity()
        elif factor < 1:
            # downsample: a strided convolution shrinks the spatial size
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)

    def forward(self, hidden_state):
        # project the input features to the target number of channels
        hidden_state = self.projection(hidden_state)
        # resize the projected features
        hidden_state = self.resize(hidden_state)
        return hidden_state
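
A quick standalone check of how the three resize branches change the spatial resolution; the channel count and factors below are made-up example values.

import torch
import torch.nn as nn

channels, factor_up, factor_down = 96, 4, 0.5   # illustrative values only
x = torch.randn(1, channels, 24, 24)

upsample = nn.ConvTranspose2d(channels, channels, kernel_size=factor_up, stride=factor_up, padding=0)
identity = nn.Identity()
downsample = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor_down), padding=1)

print(upsample(x).shape)    # torch.Size([1, 96, 96, 96])  -> spatial size * 4
print(identity(x).shape)    # torch.Size([1, 96, 24, 24])  -> unchanged
print(downsample(x).shape)  # torch.Size([1, 96, 12, 12])  -> spatial size / 2
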


class DPTFeatureFusionStage(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList()
        # create one feature fusion layer per entry in the neck hidden size list
        for _ in range(len(config.neck_hidden_sizes)):
            self.layers.append(DPTFeatureFusionLayer(config))

    def forward(self, hidden_states):
        # reverse the list of hidden states so the deepest stage is processed first
        hidden_states = hidden_states[::-1]

        fused_hidden_states = []
        # the first fusion layer only uses the last (deepest) hidden state
        fused_hidden_state = self.layers[0](hidden_states[0])
        fused_hidden_states.append(fused_hidden_state)
        # the remaining layers fuse the previous result with the next shallower hidden state
        for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]):
            fused_hidden_state = layer(fused_hidden_state, hidden_state)
            fused_hidden_states.append(fused_hidden_state)

        return fused_hidden_states


class DPTPreActResidualLayer(nn.Module):
    """
    Pre-activation residual layer, i.e. a ResidualConvUnit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    """
    # initializer, takes a configuration object as its only argument
    def __init__(self, config):
        # call the parent initializer
        super().__init__()

        # whether batch normalization is used inside the fusion residual unit
        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
        # whether the convolutions use a bias; if the config leaves it unset, use a bias only when batch norm is off
        use_bias_in_fusion_residual = (
            config.use_bias_in_fusion_residual
            if config.use_bias_in_fusion_residual is not None
            else not self.use_batch_norm
        )

        # first activation layer (ReLU)
        self.activation1 = nn.ReLU()
        # first 3x3 convolution; it keeps the feature size and uses a bias depending on the setting above
        self.convolution1 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        # second activation layer (ReLU)
        self.activation2 = nn.ReLU()
        # second 3x3 convolution, configured like the first one
        self.convolution2 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        # create the batch normalization layers only if the config asks for them
        if self.use_batch_norm:
            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)

    # forward pass: takes a tensor and returns a tensor of the same shape
    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        # keep the input around as the residual
        residual = hidden_state
        # first activation
        hidden_state = self.activation1(hidden_state)

        # first convolution
        hidden_state = self.convolution1(hidden_state)

        # optional first batch normalization
        if self.use_batch_norm:
            hidden_state = self.batch_norm1(hidden_state)

        # second activation
        hidden_state = self.activation2(hidden_state)
        # second convolution
        hidden_state = self.convolution2(hidden_state)

        # optional second batch normalization
        if self.use_batch_norm:
            hidden_state = self.batch_norm2(hidden_state)

        # add the residual back to produce the output
        return hidden_state + residual
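
The pre-activation ordering (activation before each convolution, residual added at the end) can be reproduced with plain PyTorch modules; this sketch uses an assumed feature size of 256 and batch norm enabled, which is purely illustrative.

import torch
import torch.nn as nn

hidden = 256  # illustrative fusion_hidden_size

# pre-activation order: ReLU -> conv -> BN -> ReLU -> conv -> BN, then add the input back
block = nn.Sequential(
    nn.ReLU(),
    nn.Conv2d(hidden, hidden, kernel_size=3, padding=1, bias=False),
    nn.BatchNorm2d(hidden),
    nn.ReLU(),
    nn.Conv2d(hidden, hidden, kernel_size=3, padding=1, bias=False),
    nn.BatchNorm2d(hidden),
)

x = torch.randn(2, hidden, 24, 24)
out = block(x) + x          # the residual connection preserves the shape
print(out.shape)            # torch.Size([2, 256, 24, 24])
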
        """
        Inputs:
            hidden_state (`torch.Tensor`):
                The input tensor representing the feature maps.
            residual (`torch.Tensor`, *optional*):
                The tensor representing residual feature maps from previous stages.
                Default is `None`.
        
        Returns:
            `torch.Tensor`: The output tensor after feature fusion.
        
        Raises:
            None
        """
        if residual is not None:
            # Resize the residual tensor if its shape doesn't match `hidden_state`
            if hidden_state.shape != residual.shape:
                residual = nn.functional.interpolate(
                    residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
                )
            # Add residual to `hidden_state` using the first residual layer
            hidden_state = hidden_state + self.residual_layer1(residual)
        
        # Process `hidden_state` through the second residual layer
        hidden_state = self.residual_layer2(hidden_state)
        
        # Upsample `hidden_state` using bilinear interpolation
        hidden_state = nn.functional.interpolate(
            hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        )
        
        # Project `hidden_state` using a 1x1 convolution
        hidden_state = self.projection(hidden_state)
        
        return hidden_state
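
The two interpolation steps in this fusion forward pass are easy to verify on dummy tensors: the residual is first resized to the current feature map, and the fused result is then upsampled by a factor of 2. The shapes below are illustrative only.

import torch
import torch.nn.functional as F

# Illustrative shapes only: a deep feature map and a residual from a shallower stage
fused = torch.randn(1, 256, 12, 12)
residual = torch.randn(1, 256, 24, 24)

# 1) bring the residual to the size of the current feature map before adding it
residual = F.interpolate(residual, size=fused.shape[2:], mode="bilinear", align_corners=False)
fused = fused + residual                      # (1, 256, 12, 12)

# 2) upsample the fused map by a factor of 2 so the next (shallower) stage can be added
fused = F.interpolate(fused, scale_factor=2, mode="bilinear", align_corners=True)
print(fused.shape)                            # torch.Size([1, 256, 24, 24])
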
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            # Pixel values. Pixel values can be obtained using `AutoImageProcessor`. See
            # `DPTImageProcessor.__call__` for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            # Mask to nullify selected heads of the self-attention modules. Mask values are in `[0, 1]`:

            # - 1 indicates the head is **not masked**,
            # - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            # Whether or not to return the attention tensors of all attention layers. See `attentions` in the
            # returned tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            # Whether or not to return the hidden states of all layers. See `hidden_states` in the returned
            # tensors for more detail.

        return_dict (`bool`, *optional*):
            # Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
定义一个 DPT 模型,该模型是 DPTPreTrainedModel 的子类,用于输出原始的隐藏状态,没有额外的特定输出头部。

:param config: 模型的配置对象,包含了模型的各种参数和设置
:param add_pooling_layer: 是否添加池化层,默认为 True
"""
class DPTModel(DPTPreTrainedModel):

    def __init__(self, config, add_pooling_layer=True):
        # call the parent initializer with the configuration object
        super().__init__(config)
        self.config = config

        # choose the hybrid or the plain ViT embedding module depending on the config
        if config.is_hybrid:
            self.embeddings = DPTViTHybridEmbeddings(config)
        else:
            self.embeddings = DPTViTEmbeddings(config)

        # ViT encoder
        self.encoder = DPTViTEncoder(config)

        # LayerNorm used to normalize the hidden state vectors
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # create the DPTViTPooler when a pooling layer is requested
        self.pooler = DPTViTPooler(config) if add_pooling_layer else None

        # initialize the weights and apply the final processing
        self.post_init()

    def get_input_embeddings(self):
        # in hybrid mode return the whole embeddings object, otherwise return the patch embeddings
        if self.config.is_hybrid:
            return self.embeddings
        else:
            return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prune attention heads of the model.

        :param heads_to_prune: dict of {layer_number: list of heads to prune in that layer}
        """
        for layer, heads in heads_to_prune.items():
            # prune the listed heads of each layer via attention.prune_heads
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndIntermediateActivations]:
        """
        Forward pass of the DPT model: processes the inputs and returns the corresponding outputs.

        :param pixel_values: input pixel values of shape [batch_size, num_channels, height, width]
        :param head_mask: optional mask tensor used to mask out selected attention heads
        :param output_attentions: optional, whether to return the attention weights
        :param output_hidden_states: optional, whether to return all hidden states
        :param return_dict: optional, whether to return the outputs as a ModelOutput instead of a plain tuple
        :return: the model outputs; the exact format depends on the arguments
        """
        # fall back to the config defaults when the output flags are not specified
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare the head mask if needed
        # 1.0 in the head mask means we keep that head
        # attention_probs has shape bsz x n_heads x N x N
        # the input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # embed the pixel values
        embedding_output = self.embeddings(pixel_values, return_dict=return_dict)

        # extract the last hidden states produced by the embedding module
        embedding_last_hidden_states = embedding_output[0] if not return_dict else embedding_output.last_hidden_states

        # run the encoder on the embedded inputs
        encoder_outputs = self.encoder(
            embedding_last_hidden_states,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # sequence output of the encoder
        sequence_output = encoder_outputs[0]

        # apply layer normalization to the sequence output
        sequence_output = self.layernorm(sequence_output)

        # pool the sequence output if a pooler is present
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        # when return_dict is disabled, return a plain tuple of the encoder and pooler outputs
        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:] + embedding_output[1:]

        # otherwise build a BaseModelOutputWithPoolingAndIntermediateActivations object
        return BaseModelOutputWithPoolingAndIntermediateActivations(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            intermediate_activations=embedding_output.intermediate_activations,
        )
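
A minimal usage sketch of `DPTModel` on a random input tensor; the `Intel/dpt-large` checkpoint name is an assumption chosen for illustration, and in practice the pixel values would come from `DPTImageProcessor` rather than `torch.randn`.

import torch
from transformers import DPTModel

# illustrative checkpoint name
model = DPTModel.from_pretrained("Intel/dpt-large")
model.eval()

pixel_values = torch.randn(1, 3, 384, 384)  # dummy image; normally produced by DPTImageProcessor
with torch.no_grad():
    outputs = model(pixel_values)

print(outputs.last_hidden_state.shape)  # (batch_size, num_patches + 1, hidden_size)
print(outputs.pooler_output.shape)      # (batch_size, hidden_size)
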
# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->DPT
class DPTViTPooler(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        # Initialize a linear transformation layer for pooling
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Define activation function for the pooled output
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # Extract the hidden state corresponding to the first token for pooling
        first_token_tensor = hidden_states[:, 0]
        # Pass through linear layer for pooling
        pooled_output = self.dense(first_token_tensor)
        # Apply activation function to the pooled output
        pooled_output = self.activation(pooled_output)
        # Return the pooled output
        return pooled_output


class DPTNeck(nn.Module):
    """
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Postprocessing stage: reassemble_stage handles reassembling the hidden states
        # depending on the backbone type (only required for non-hierarchical backbones)
        if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
            self.reassemble_stage = None
        else:
            self.reassemble_stage = DPTReassembleStage(config)

        # Initialize a list of convolutional layers for fusion
        self.convs = nn.ModuleList()
        for channel in config.neck_hidden_sizes:
            self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))

        # Fusion stage: feature fusion stage for combining processed features
        self.fusion_stage = DPTFeatureFusionStage(config)

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise ValueError("hidden_states should be a tuple or list of tensors")

        if len(hidden_states) != len(self.config.neck_hidden_sizes):
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # Postprocess hidden states if reassemble_stage is defined
        if self.reassemble_stage is not None:
            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

        # Apply convolutional layers to each hidden state in hidden_states
        features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]

        # Apply fusion stage to combine processed features
        output = self.fusion_stage(features)

        # Return the fused output
        return output
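
The shape flow through the neck can be checked with a default `DPTConfig`; the sketch below feeds four dummy backbone stages (each a sequence of patch tokens plus a CLS token) into a freshly initialized `DPTNeck` and prints the resolutions of the fused outputs. It assumes the default, non-hybrid configuration with randomly initialized weights.

import torch
from transformers import DPTConfig
from transformers.models.dpt.modeling_dpt import DPTNeck

config = DPTConfig()          # default values, purely illustrative
neck = DPTNeck(config)

# four backbone stages: (image_size / patch_size)^2 patch tokens plus one CLS token each
seq_len = (config.image_size // config.patch_size) ** 2 + 1
hidden_states = [torch.randn(1, seq_len, config.hidden_size) for _ in config.neck_hidden_sizes]

with torch.no_grad():
    features = neck(hidden_states)

for feature in features:
    print(feature.shape)      # each output has config.fusion_hidden_size channels at a stage-specific resolution
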


class DPTDepthEstimationHead(nn.Module):
    """
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    """

    def __init__(self, config):
        # call the parent initializer
        super().__init__()

        # keep the configuration around
        self.config = config

        # optional projection layer, only created when the config asks for it
        self.projection = None
        if config.add_projection:
            self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

        # feature dimension taken from the config
        features = config.fusion_hidden_size
        # the head itself
        self.head = nn.Sequential(
            nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),  # first convolution
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),        # upsampling layer
            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),        # second convolution
            nn.ReLU(),                                                               # ReLU activation
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),                    # third convolution
            nn.ReLU(),                                                               # ReLU activation
        )

    def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
        # pick the hidden state selected by head_in_index
        hidden_states = hidden_states[self.config.head_in_index]

        # optionally project the hidden state and apply a ReLU
        if self.projection is not None:
            hidden_states = self.projection(hidden_states)
            hidden_states = nn.ReLU()(hidden_states)

        # run the convolutional head to predict the depth map
        predicted_depth = self.head(hidden_states)

        # drop the channel dimension so the prediction is a (batch_size, height, width) tensor
        predicted_depth = predicted_depth.squeeze(dim=1)

        # return the predicted depth
        return predicted_depth
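
Rebuilding just the head as a plain `nn.Sequential` shows how a fused 256-channel feature map turns into a single-channel depth map at twice the spatial resolution; all sizes here are illustrative and separate from the module above.

import torch
import torch.nn as nn

features = 256                                   # illustrative fusion_hidden_size
head = nn.Sequential(
    nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
    nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
    nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
    nn.ReLU(),
)

fused_feature = torch.randn(1, features, 192, 192)   # e.g. the last output of the fusion stage
depth = head(fused_feature).squeeze(dim=1)           # channel dimension removed
print(depth.shape)                                   # torch.Size([1, 384, 384])
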
"""
DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
"""
# 导入所需的库和模块
@add_start_docstrings(
    """
    Add docstring for model initialization with DPT-specific documentation.
    """,
    DPT_START_DOCSTRING,
)
# 定义 DPTForDepthEstimation 类,继承自 DPTPreTrainedModel
class DPTForDepthEstimation(DPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # either load a standalone backbone or create a full DPT model, depending on the config
        self.backbone = None
        if config.backbone_config is not None and config.is_hybrid is False:
            self.backbone = load_backbone(config)
        else:
            self.dpt = DPTModel(config, add_pooling_layer=False)

        # initialize the DPTNeck and the DPTDepthEstimationHead
        self.neck = DPTNeck(config)
        self.head = DPTDepthEstimationHead(config)

        # initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
    # forward pass, taking pixel values, a head mask, labels and the usual output flags
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Perform the forward pass of the DPT model for depth estimation.
        """
        # The forward computation (encoding, neck, depth head, optional loss) that produces the depth
        # estimation output is omitted in this excerpt.
        pass
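
For reference, a typical inference call with this class looks like the sketch below (taking the `Intel/dpt-large` checkpoint as an example); the relative depth map comes back in `outputs.predicted_depth`.

import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, DPTForDepthEstimation

processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# (batch_size, height, width) relative depth map at the model's internal resolution
print(outputs.predicted_depth.shape)
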

"""
DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
"""
# 定义 DPTForSemanticSegmentation 类,继承自 DPTPreTrainedModel
@add_start_docstrings(
    """
    Add docstring for model initialization with DPT-specific documentation.
    """,
    DPT_START_DOCSTRING,
)
class DPTForSemanticSegmentation(DPTPreTrainedModel):
    # 初始化函数,接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法,传入配置对象
        super().__init__(config)

        # 创建一个 DPTModel 的实例,不添加池化层
        self.dpt = DPTModel(config, add_pooling_layer=False)

        # 创建一个 DPTNeck 的实例作为模型的颈部(neck)
        self.neck = DPTNeck(config)

        # 创建一个 DPTSemanticSegmentationHead 的实例作为模型的主要分割头(head)
        self.head = DPTSemanticSegmentationHead(config)

        # 如果配置允许使用辅助头部(auxiliary_head),则创建一个 DPTAuxiliaryHead 的实例
        self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None

        # 执行初始化权重和最终处理
        self.post_init()

    # 前向传播函数的装饰器,添加模型输入的文档字符串
    @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
    # 替换前向传播函数返回值的文档字符串,输出类型为 SemanticSegmenterOutput,配置类为 _CONFIG_FOR_DOC
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

.\models\dpt\__init__.py

# 版权声明和许可证信息
# 版权所有 2022 年 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本(“许可证”)许可;
# 除非符合许可证的要求,否则您不得使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则本软件以“原样”分发,
# 不附带任何形式的明示或暗示担保或条件。
# 有关详细信息,请参阅许可证。

# 导入必要的类型检查模块
from typing import TYPE_CHECKING

# 导入 LazyModule 和依赖检查函数
from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available
from ...utils import OptionalDependencyNotAvailable

# 定义模块导入结构
_import_structure = {"configuration_dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"]}

# 检查视觉处理依赖是否可用,如果不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,则添加相应的导入结构
    _import_structure["feature_extraction_dpt"] = ["DPTFeatureExtractor"]
    _import_structure["image_processing_dpt"] = ["DPTImageProcessor"]

# 检查 PyTorch 是否可用,如果不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,则添加相应的导入结构
    _import_structure["modeling_dpt"] = [
        "DPT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPTForDepthEstimation",
        "DPTForSemanticSegmentation",
        "DPTModel",
        "DPTPreTrainedModel",
    ]

# 如果是类型检查模式,则导入特定的类和函数
if TYPE_CHECKING:
    from .configuration_dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig

    # 检查视觉处理依赖是否可用,如果可用则导入相应的类
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_dpt import DPTFeatureExtractor
        from .image_processing_dpt import DPTImageProcessor

    # 检查 PyTorch 是否可用,如果可用则导入相应的类
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_dpt import (
            DPT_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPTForDepthEstimation,
            DPTForSemanticSegmentation,
            DPTModel,
            DPTPreTrainedModel,
        )

# 如果不是类型检查模式,则配置 LazyModule 并设置当前模块
else:
    import sys

    # 将当前模块配置为 LazyModule 的实例
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\efficientformer\configuration_efficientformer.py

# 设置文件编码为 UTF-8
# 版权声明:2022 年由 HuggingFace Inc. 团队保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权,除非遵守许可证的要求,否则不得使用此文件
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则软件根据“原样”的基础分发,
# 不提供任何形式的明示或暗示担保或条件
# 有关更多信息,请参阅许可证
""" EfficientFormer 模型配置 """

from typing import List

# 从 transformers 库中导入预训练配置类 PretrainedConfig
from ...configuration_utils import PretrainedConfig
# 从 transformers 库中导入日志工具 logging
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义 EfficientFormer 预训练模型配置文件的映射字典,指定模型名称及其配置文件的 URL
EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "snap-research/efficientformer-l1-300": (
        "https://huggingface.co/snap-research/efficientformer-l1-300/resolve/main/config.json"
    ),
}


class EfficientFormerConfig(PretrainedConfig):
    r"""
    这是配置类,用于存储 [`EfficientFormerModel`] 的配置信息。根据指定的参数实例化 EfficientFormer 模型,
    定义模型架构。使用默认值实例化配置将产生类似于 EfficientFormer
    [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) 架构的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档获取更多信息。

    示例:

    ```
    >>> from transformers import EfficientFormerConfig, EfficientFormerModel

    >>> # 初始化 EfficientFormer efficientformer-l1 风格的配置
    >>> configuration = EfficientFormerConfig()

    >>> # 从 efficientformer-l3 风格的配置初始化 EfficientFormerModel(具有随机权重)
    >>> model = EfficientFormerModel(configuration)

    >>> # 访问模型配置
    >>> configuration = model.config
    ```
    """

    # 模型类型标识符为 "efficientformer"
    model_type = "efficientformer"
    # 定义一个构造函数,初始化一个新的对象
    def __init__(
        self,
        depths: List[int] = [3, 2, 6, 4],  # 设置深度列表,默认为 [3, 2, 6, 4]
        hidden_sizes: List[int] = [48, 96, 224, 448],  # 设置隐藏层大小列表,默认为 [48, 96, 224, 448]
        downsamples: List[bool] = [True, True, True, True],  # 设置是否下采样列表,默认为 [True, True, True, True]
        dim: int = 448,  # 设置维度大小,默认为 448
        key_dim: int = 32,  # 设置键维度大小,默认为 32
        attention_ratio: int = 4,  # 设置注意力比例,默认为 4
        resolution: int = 7,  # 设置分辨率大小,默认为 7
        num_hidden_layers: int = 5,  # 设置隐藏层数量,默认为 5
        num_attention_heads: int = 8,  # 设置注意力头数量,默认为 8
        mlp_expansion_ratio: int = 4,  # 设置MLP扩展比率,默认为 4
        hidden_dropout_prob: float = 0.0,  # 设置隐藏层dropout概率,默认为 0.0
        patch_size: int = 16,  # 设置补丁大小,默认为 16
        num_channels: int = 3,  # 设置通道数量,默认为 3
        pool_size: int = 3,  # 设置池化大小,默认为 3
        downsample_patch_size: int = 3,  # 设置下采样补丁大小,默认为 3
        downsample_stride: int = 2,  # 设置下采样步长,默认为 2
        downsample_pad: int = 1,  # 设置下采样填充大小,默认为 1
        drop_path_rate: float = 0.0,  # 设置DropPath概率,默认为 0.0
        num_meta3d_blocks: int = 1,  # 设置Meta3D块数量,默认为 1
        distillation: bool = True,  # 是否进行蒸馏,默认为 True
        use_layer_scale: bool = True,  # 是否使用层比例,默认为 True
        layer_scale_init_value: float = 1e-5,  # 层比例初始化值,默认为 1e-5
        hidden_act: str = "gelu",  # 隐藏层激活函数,默认为 "gelu"
        initializer_range: float = 0.02,  # 初始化范围,默认为 0.02
        layer_norm_eps: float = 1e-12,  # 层归一化epsilon值,默认为 1e-12
        image_size: int = 224,  # 图像大小,默认为 224
        batch_norm_eps: float = 1e-05,  # 批归一化epsilon值,默认为 1e-05
        **kwargs,  # 接收任意额外的关键字参数
    ) -> None:  # 返回值为 None
        super().__init__(**kwargs)  # 调用父类的构造函数
    
        self.hidden_act = hidden_act  # 将隐藏层激活函数赋值给对象属性
        self.hidden_dropout_prob = hidden_dropout_prob  # 将隐藏层dropout概率赋值给对象属性
        self.hidden_sizes = hidden_sizes  # 将隐藏层大小列表赋值给对象属性
        self.num_hidden_layers = num_hidden_layers  # 将隐藏层数量赋值给对象属性
        self.num_attention_heads = num_attention_heads  # 将注意力头数量赋值给对象属性
        self.initializer_range = initializer_range  # 将初始化范围赋值给对象属性
        self.layer_norm_eps = layer_norm_eps  # 将层归一化epsilon值赋值给对象属性
        self.patch_size = patch_size  # 将补丁大小赋值给对象属性
        self.num_channels = num_channels  # 将通道数量赋值给对象属性
        self.depths = depths  # 将深度列表赋值给对象属性
        self.mlp_expansion_ratio = mlp_expansion_ratio  # 将MLP扩展比率赋值给对象属性
        self.downsamples = downsamples  # 将是否下采样列表赋值给对象属性
        self.dim = dim  # 将维度大小赋值给对象属性
        self.key_dim = key_dim  # 将键维度大小赋值给对象属性
        self.attention_ratio = attention_ratio  # 将注意力比例赋值给对象属性
        self.resolution = resolution  # 将分辨率大小赋值给对象属性
        self.pool_size = pool_size  # 将池化大小赋值给对象属性
        self.downsample_patch_size = downsample_patch_size  # 将下采样补丁大小赋值给对象属性
        self.downsample_stride = downsample_stride  # 将下采样步长赋值给对象属性
        self.downsample_pad = downsample_pad  # 将下采样填充大小赋值给对象属性
        self.drop_path_rate = drop_path_rate  # 将DropPath概率赋值给对象属性
        self.num_meta3d_blocks = num_meta3d_blocks  # 将Meta3D块数量赋值给对象属性
        self.distillation = distillation  # 将是否进行蒸馏赋值给对象属性
        self.use_layer_scale = use_layer_scale  # 将是否使用层比例赋值给对象属性
        self.layer_scale_init_value = layer_scale_init_value  # 将层比例初始化值赋值给对象属性
        self.image_size = image_size  # 将图像大小赋值给对象属性
        self.batch_norm_eps = batch_norm_eps  # 将批归一化epsilon值赋值给对象属性

.\models\efficientformer\convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py

# 设置文件编码为UTF-8,确保可以正确解析中文和其他非ASCII字符
# 版权声明,声明代码版权归The HuggingFace Inc.团队所有,使用Apache License 2.0许可证
# 仅在符合许可证的情况下可以使用本文件
# 可以在以下链接获取完整的许可证文本:http://www.apache.org/licenses/LICENSE-2.0
# 根据适用法律或书面同意的情况,软件按"原样"提供,不附带任何明示或暗示的担保或条件
# 详见许可证,限制软件使用的条件

"""从原始仓库转换EfficientFormer检查点。

URL: https://github.com/snap-research/EfficientFormer
"""

import argparse  # 导入用于解析命令行参数的模块
import re  # 导入正则表达式模块
from pathlib import Path  # 导入用于处理文件和目录路径的模块

import requests  # 导入用于发送HTTP请求的模块
import torch  # 导入PyTorch深度学习框架
from PIL import Image  # 导入PIL图像处理库中的Image模块
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor  # 导入图像转换函数

from transformers import (  # 导入transformers库中的相关类和函数
    EfficientFormerConfig,
    EfficientFormerForImageClassificationWithTeacher,
    EfficientFormerImageProcessor,
)
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling  # 导入图像处理相关常量和函数

def rename_key(old_name, num_meta4D_last_stage):
    new_name = old_name

    if "patch_embed" in old_name:  # 如果旧键名中包含'patch_embed'
        _, layer, param = old_name.split(".")  # 拆分键名,获取层和参数信息

        if layer == "0":  # 如果层为'0'
            new_name = old_name.replace("0", "convolution1")  # 替换为'convolution1'
        elif layer == "1":  # 如果层为'1'
            new_name = old_name.replace("1", "batchnorm_before")  # 替换为'batchnorm_before'
        elif layer == "3":  # 如果层为'3'
            new_name = old_name.replace("3", "convolution2")  # 替换为'convolution2'
        else:  # 其他情况
            new_name = old_name.replace("4", "batchnorm_after")  # 替换为'batchnorm_after'
    # 如果旧名称中包含"network"且包含形如数字.数字的模式
    if "network" in old_name and re.search(r"\d\.\d", old_name):
        # 匹配两位数字的正则表达式模式
        two_digit_num = r"\b\d{2}\b"
        # 如果旧名称中存在两位数字,则匹配数字.数字.的模式
        if bool(re.search(two_digit_num, old_name)):
            match = re.search(r"\d\.\d\d.", old_name).group()
        else:
            match = re.search(r"\d\.\d.", old_name).group()
        # 如果匹配到的第一个数字小于6
        if int(match[0]) < 6:
            # 删除匹配的部分,并替换"network"为第一个数字.meta4D_layers.blocks.剩余部分
            trimmed_name = old_name.replace(match, "")
            trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1])
            # 新名称为intermediate_stages.修剪后的名称
            new_name = "intermediate_stages." + trimmed_name
        else:
            # 删除匹配的部分,并根据条件替换"network"为不同的字符串
            trimmed_name = old_name.replace(match, "")
            if int(match[2]) < num_meta4D_last_stage:
                trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2])
            else:
                layer_index = str(int(match[2]) - num_meta4D_last_stage)
                trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." + layer_index)
                # 如果名称中包含"norm1",替换为"layernorm1"
                if "norm1" in old_name:
                    trimmed_name = trimmed_name.replace("norm1", "layernorm1")
                elif "norm2" in old_name:
                    trimmed_name = trimmed_name.replace("norm2", "layernorm2")
                elif "fc1" in old_name:
                    trimmed_name = trimmed_name.replace("fc1", "linear_in")
                elif "fc2" in old_name:
                    trimmed_name = trimmed_name.replace("fc2", "linear_out")

            # 新名称为last_stage.修剪后的名称
            new_name = "last_stage." + trimmed_name

    # 如果旧名称中包含"network"且包含形如.数字.的模式
    elif "network" in old_name and re.search(r".\d.", old_name):
        # 将"network"替换为"intermediate_stages"
        new_name = old_name.replace("network", "intermediate_stages")

    # 如果新名称中包含"fc",替换为"convolution"
    if "fc" in new_name:
        new_name = new_name.replace("fc", "convolution")
    # 如果新名称中包含"norm1"且不包含"layernorm1",替换为"batchnorm_before"
    elif ("norm1" in new_name) and ("layernorm1" not in new_name):
        new_name = new_name.replace("norm1", "batchnorm_before")
    # 如果新名称中包含"norm2"且不包含"layernorm2",替换为"batchnorm_after"
    elif ("norm2" in new_name) and ("layernorm2" not in new_name):
        new_name = new_name.replace("norm2", "batchnorm_after")
    # 如果新名称中包含"proj",替换为"projection"
    if "proj" in new_name:
        new_name = new_name.replace("proj", "projection")
    # 如果新名称中包含"dist_head",替换为"distillation_classifier"
    if "dist_head" in new_name:
        new_name = new_name.replace("dist_head", "distillation_classifier")
    # 如果新名称中包含"head",替换为"classifier"
    elif "head" in new_name:
        new_name = new_name.replace("head", "classifier")
    # 如果新名称中包含"patch_embed",在新名称前添加"efficientformer."
    elif "patch_embed" in new_name:
        new_name = "efficientformer." + new_name
    # 如果新名称为"norm.weight"或"norm.bias",替换为"layernorm."并在新名称前添加"efficientformer."
    elif new_name == "norm.weight" or new_name == "norm.bias":
        new_name = new_name.replace("norm", "layernorm")
        new_name = "efficientformer." + new_name
    else:
        # 否则在新名称前添加"efficientformer.encoder."
        new_name = "efficientformer.encoder." + new_name

    # 返回处理后的新名称
    return new_name
# 将给定的检查点中的键重命名,返回更新后的检查点
def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage):
    # 使用循环遍历检查点的副本中的所有键
    for key in checkpoint.copy().keys():
        # 弹出当前键对应的值,并用新键重新放置到检查点中
        val = checkpoint.pop(key)
        checkpoint[rename_key(key, num_meta4D_last_stage)] = val

    # 返回更新后的检查点
    return checkpoint
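
The renaming pass boils down to rebuilding the state dict with new keys. The toy sketch below illustrates the same pop-and-reinsert pattern with a deliberately simplified, made-up mapping (the real mapping is the `rename_key` logic above).

import torch

# toy illustration only; this mapping is invented and much simpler than the real rename_key rules
def toy_rename_key(old_name: str) -> str:
    replacements = {"patch_embed.0": "convolution1", "patch_embed.1": "batchnorm_before", "head": "classifier"}
    for old, new in replacements.items():
        if old in old_name:
            return old_name.replace(old, new)
    return old_name


state_dict = {
    "patch_embed.0.weight": torch.zeros(1),
    "patch_embed.1.bias": torch.zeros(1),
    "head.weight": torch.zeros(1),
}
renamed = {toy_rename_key(key): value for key, value in state_dict.items()}
print(list(renamed.keys()))
# ['convolution1.weight', 'batchnorm_before.bias', 'classifier.weight']
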


# 我们将在 COCO 图像上验证我们的结果
def prepare_img():
    # 定义 COCO 数据集中的图像 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 使用 requests 库获取图像的原始字节流,并用 PIL 打开图像
    image = Image.open(requests.get(url, stream=True).raw)

    # 返回打开的图像对象
    return image


def convert_efficientformer_checkpoint(
    checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool
):
    # 加载原始检查点的状态字典,定位到模型部分
    orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    # 从 JSON 文件中加载 EfficientFormer 的配置
    config = EfficientFormerConfig.from_json_file(efficientformer_config_file)
    # 创建 EfficientFormer 模型对象
    model = EfficientFormerForImageClassificationWithTeacher(config)
    # 提取模型名称,用于后续操作
    model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1])

    # 计算最后一个阶段的元 4D 数量
    num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1
    # 转换原始状态字典的键
    new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage)

    # 加载转换后的状态字典到模型中
    model.load_state_dict(new_state_dict)
    # 将模型设置为评估模式
    model.eval()

    # 定义 Pillow 库中支持的图像重采样方式
    pillow_resamplings = {
        "bilinear": PILImageResampling.BILINEAR,
        "bicubic": PILImageResampling.BICUBIC,
        "nearest": PILImageResampling.NEAREST,
    }

    # 准备图像
    image = prepare_img()
    image_size = 256
    crop_size = 224
    # 创建 EfficientFormer 图像处理器实例
    processor = EfficientFormerImageProcessor(
        size={"shortest_edge": image_size},
        crop_size={"height": crop_size, "width": crop_size},
        resample=pillow_resamplings["bicubic"],
    )
    # 使用处理器处理图像并获取像素值张量
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # 原始的图像处理流程
    image_transforms = Compose(
        [
            Resize(image_size, interpolation=pillow_resamplings["bicubic"]),
            CenterCrop(crop_size),
            ToTensor(),
            Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
        ]
    )
    # 对原始图像进行变换并添加批次维度
    original_pixel_values = image_transforms(image).unsqueeze(0)

    # 断言保证处理后的像素值与原始像素值的接近度
    assert torch.allclose(original_pixel_values, pixel_values)

    # 将像素值输入模型并获取输出
    outputs = model(pixel_values)
    logits = outputs.logits

    # 预期的输出形状
    expected_shape = (1, 1000)

    # 根据模型名称验证不同情况下的输出对数是否正确
    if "l1" in model_name:
        expected_logits = torch.Tensor(
            [-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328]
        )
        assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
        assert logits.shape == expected_shape
    elif "l3" in model_name:
        expected_logits = torch.Tensor(
            [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127]
        )
        assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
        assert logits.shape == expected_shape
    # 如果模型名称包含 "l7"
    elif "l7" in model_name:
        # 预期的 logits 值,作为模型输出的期望值
        expected_logits = torch.Tensor(
            [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878]
        )
        # 断言当前模型输出的形状符合预期的形状
        assert logits.shape == expected_shape
    else:
        # 如果模型名称不在已知的支持列表中,抛出异常
        raise ValueError(
            f"Unknown model checkpoint: {checkpoint_path}. Supported versions of efficientformer are l1, l3, and l7"
        )

    # 保存检查点到指定路径
    Path(pytorch_dump_path).mkdir(exist_ok=True)
    # 将模型保存到 PyTorch 的预训练模型路径下
    model.save_pretrained(pytorch_dump_path)
    # 打印保存成功的信息和路径
    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
    # 将处理器保存到 PyTorch 的预训练模型路径下
    processor.save_pretrained(pytorch_dump_path)
    # 打印保存成功的信息和路径
    print(f"Processor successfully saved at {pytorch_dump_path}")

    # 如果需要推送到 Hub
    if push_to_hub:
        # 提示开始将模型推送到 Hub
        print("Pushing model to the hub...")
        # 将模型推送到指定的 Hub 仓库
        model.push_to_hub(
            repo_id=f"Bearnardd/{pytorch_dump_path}",
            commit_message="Add model",
            use_temp_dir=True,
        )
        # 将处理器推送到指定的 Hub 仓库
        processor.push_to_hub(
            repo_id=f"Bearnardd/{pytorch_dump_path}",
            commit_message="Add image processor",
            use_temp_dir=True,
        )
if __name__ == "__main__":
    # 如果作为主程序执行,开始解析命令行参数
    parser = argparse.ArgumentParser()

    # 必需的参数
    parser.add_argument(
        "--pytorch_model_path",
        default=None,
        type=str,
        required=True,
        help="Path to EfficientFormer pytorch checkpoint.",
    )
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The json file for EfficientFormer model config.",
    )
    parser.add_argument(
        "--pytorch_dump_path",
        default=None,
        type=str,
        required=True,
        help="Path to the output PyTorch model."
    )

    # 可选参数:是否将模型推送到 hub
    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
    parser.add_argument(
        "--no-push_to_hub",
        dest="push_to_hub",
        action="store_false",
        help="Do not push model and image processor to the hub",
    )
    parser.set_defaults(push_to_hub=True)

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数来转换 EfficientFormer 检查点
    convert_efficientformer_checkpoint(
        checkpoint_path=args.pytorch_model_path,
        efficientformer_config_file=args.config_file,
        pytorch_dump_path=args.pytorch_dump_path,
        push_to_hub=args.push_to_hub,
    )
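
A hypothetical invocation of the script, with placeholder paths standing in for a locally downloaded checkpoint and config file:

python convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py \
    --pytorch_model_path ./efficientformer_l1_300d.pth \
    --config_file ./efficientformer_l1_config.json \
    --pytorch_dump_path efficientformer-l1-300 \
    --no-push_to_hub
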