Transformers Source Code Walkthrough (65)

.\models\llama\tokenization_llama.py

# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization classes for LLaMA."""
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm  # SentencePiece library used for the actual subword tokenization

from ...convert_slow_tokenizer import import_protobuf  # helper that lazily imports the protobuf-generated model classes
from ...tokenization_utils import AddedToken, PreTrainedTokenizer  # special-token wrapper and slow-tokenizer base class
from ...utils import logging  # logging utilities


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput  # TextInput type is only needed for type checking

logger = logging.get_logger(__name__)  # module-level logger

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}  # mapping of vocabulary file names

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "hf-internal-testing/llama-tokenizer": 2048,  # 预训练位置嵌入大小
}
SPIECE_UNDERLINE = "▁"  # 分词符号

B_INST, E_INST = "[INST]", "[/INST]"  # 实例开始和结束标记
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"  # 系统开始和结束标记

# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on


class LlamaTokenizer(PreTrainedTokenizer):
    """
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    """

    vocab_files_names = VOCAB_FILES_NAMES  # vocabulary file names
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # map of pretrained vocabulary files
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # maximum model input sizes
    # names of the inputs expected by the model
    model_input_names = ["input_ids", "attention_mask"]
    
    # Create a new tokenizer instance
    def __init__(
        self,
        vocab_file,  # path to the SentencePiece vocabulary file
        unk_token="<unk>",  # unknown token, defaults to "<unk>"
        bos_token="<s>",  # beginning-of-sequence token, defaults to "<s>"
        eos_token="</s>",  # end-of-sequence token, defaults to "</s>"
        pad_token=None,  # padding token, unset by default
        sp_model_kwargs: Optional[Dict[str, Any]] = None,  # extra kwargs forwarded to the SentencePiece processor
        add_bos_token=True,  # whether to prepend the bos token, defaults to True
        add_eos_token=False,  # whether to append the eos token, defaults to False
        clean_up_tokenization_spaces=False,  # whether to clean up spaces introduced by tokenization
        use_default_system_prompt=False,  # whether to use the default Llama system prompt
        spaces_between_special_tokens=False,  # whether to insert spaces between special tokens when decoding
        legacy=None,  # whether to use the legacy behaviour
        add_prefix_space=True,  # whether to prepend a space to the input text
        **kwargs,  # remaining keyword arguments
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs  # default to an empty dict
        # wrap string tokens into special AddedToken objects so they are never split
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
    
        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is"
                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                " means, and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True  # fall back to the legacy behaviour when nothing is specified

        self.legacy = legacy
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))  # build the SentencePiece processor
        self.add_prefix_space = add_prefix_space
    
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
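
A quick usage sketch (hedged: it assumes the `hf-internal-testing/llama-tokenizer` checkpoint referenced in `PRETRAINED_VOCAB_FILES_MAP` above is reachable): constructing the tokenizer with `from_pretrained` and encoding a sentence shows the effect of `add_bos_token=True`.

from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
ids = tokenizer("Hello this is a test")["input_ids"]
print(ids)                    # [1, 15043, 445, 338, 263, 1243] -> starts with 1, the "<s>" id
print(tokenizer.decode(ids))  # the decoded string starts with "<s>" because add_bos_token=True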
    
    # number of SentencePiece pieces the unknown token is encoded into
    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    # Copied from within the transformers library:
    # builds the SentencePieceProcessor used to tokenize text
    def get_spm_processor(self, from_slow=False):
        # instantiate the SentencePieceProcessor with the user-provided kwargs
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)

        # with the legacy behaviour (or when converting from a slow tokenizer) load the model file directly
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        # otherwise read the serialized protobuf model and patch its normalizer before loading it
        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            # lazily import the protobuf module and deserialize the SentencePiece model
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            # disable the automatic dummy prefix so no extra "▁" is added by SentencePiece itself
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            # serialize the patched model and load it into the processor
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        
        return tokenizer

    # called when pickling the tokenizer: store a serializable snapshot of its state
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None  # the SentencePieceProcessor itself cannot be pickled
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()  # keep the serialized model proto instead
        return state

    # called when unpickling: rebuild the SentencePiece processor from the stored proto
    def __setstate__(self, d):
        self.__dict__ = d  # restore the plain attributes
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)  # recreate the processor
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)  # reload the model from the saved proto

    # size of the SentencePiece vocabulary
    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    # vocabulary as a token -> id dict
    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Tokenize the given text and return the list of tokens
    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        # with the legacy behaviour (or empty input) simply defer to the parent implementation
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        # replace the SentencePiece underline by a plain space and, if requested, add the prefix space
        text = text.replace(SPIECE_UNDERLINE, " ")
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        # let the parent implementation split the text (it dispatches special tokens and calls _tokenize)
        tokens = super().tokenize(text, **kwargs)

        # drop a leading lone "▁" when it is immediately followed by a special token
        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        # encode the text with the SentencePiece model, returning string pieces
        tokens = self.sp_model.encode(text, out_type=str)

        # in legacy mode, or when the text does not start with "▁" or a space, return the pieces as-is
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens

        # 1. encode the text prefixed with the unknown token, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. strip the pieces that make up the unk_token, e.g. drop '<unk>' from ['<', 'unk', '>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # map the piece to its id via the SentencePiece vocabulary
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # map the id back to its piece via the SentencePiece vocabulary
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # since we manually added a prefix space, it has to be removed again when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using the SentencePiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0 and self.legacy:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string
    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        # make sure the target directory exists, otherwise log an error and bail out
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # build the output vocabulary path, honouring an optional filename prefix
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # if the current vocabulary file exists and is not already the target file, simply copy it
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # otherwise write the serialized SentencePiece model to the target file
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        # return the tuple of saved file paths
        return (out_vocab_file,)
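
Reusing the `tokenizer` from the earlier sketch, saving the vocabulary writes (or copies) `tokenizer.model` into the target directory; the temporary directory below is purely illustrative.

import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    saved = tokenizer.save_vocabulary(tmp_dir)
    print(saved)  # a one-element tuple ending in ".../tokenizer.model"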

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        # build the input list with special tokens around the first sequence
        output = bos_token_id + token_ids_0 + eos_token_id

        # if a second sequence is given, append it with its own special tokens
        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output
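
A hedged illustration of how the special tokens are laid out. The snippet below re-implements the logic standalone for clarity only; the ids are made up, and 1 / 2 are assumed to be the usual LLaMA ids for "<s>" / "</s>".

add_bos_token, add_eos_token = True, False
bos_token_id, eos_token_id = 1, 2

def with_special_tokens(token_ids_0, token_ids_1=None):
    bos = [bos_token_id] if add_bos_token else []
    eos = [eos_token_id] if add_eos_token else []
    output = bos + token_ids_0 + eos
    if token_ids_1 is not None:
        output = output + bos + token_ids_1 + eos
    return output

print(with_special_tokens([15043, 3186]))               # [1, 15043, 3186]
print(with_special_tokens([15043, 3186], [1128, 526]))  # [1, 15043, 3186, 1, 1128, 526]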

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Check if the token list already has special tokens
        if already_has_special_tokens:
            # If yes, delegate to the parent class's method to get special tokens mask
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Determine the beginning of sentence (bos) and end of sentence (eos) token IDs
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        # If token_ids_1 is not provided, return mask for single sequence
        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        
        # Return mask for sequence pairs
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Determine the beginning of sentence (bos) and end of sentence (eos) token IDs
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        # Initialize the output list with zeros based on the length of the sequences with added tokens
        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        # If token_ids_1 is provided, extend the output list to accommodate the second sequence
        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
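
Reusing the `tokenizer` instance from the earlier sketch, a quick check of the two helpers above with the default settings (`add_bos_token=True`, `add_eos_token=False`); the input ids are arbitrary.

ids_a, ids_b = [10, 11, 12], [20, 21]
print(tokenizer.get_special_tokens_mask(ids_a, ids_b))               # [1, 0, 0, 0, 1, 0, 0]
print(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b))  # [0, 0, 0, 0, 1, 1, 1]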

.\models\llama\tokenization_llama_fast.py

# coding=utf-8

# standard library and typing imports
import os  # filesystem helpers
from shutil import copyfile  # used to copy the vocabulary file when saving
from typing import Optional, Tuple  # type hints

from tokenizers import processors  # post-processors from the tokenizers library

# transformers-internal imports
from ...tokenization_utils_fast import PreTrainedTokenizerFast  # fast tokenizer base class
from ...utils import is_sentencepiece_available, logging  # sentencepiece availability check and logging
from ...utils.versions import require_version  # runtime version requirement helper

# the fast tokenizer relies on features introduced in tokenizers 0.13.3
require_version("tokenizers>=0.13.3")

# the slow LlamaTokenizer is only importable when sentencepiece is installed
if is_sentencepiece_available():
    from .tokenization_llama import LlamaTokenizer
else:
    LlamaTokenizer = None

# module-level logger
logger = logging.get_logger(__name__)

# mapping of vocabulary file names
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}

# mapping of pretrained vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}

# instruction and system-prompt delimiters used by the chat format
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# fmt: off
# default system prompt used when use_default_system_prompt=True
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on


class LlamaTokenizerFast(PreTrainedTokenizerFast):
    """
    构建 Llama 快速分词器,基于字节级别的 Byte-Pair-Encoding。

    这个分词器使用了 ByteFallback 和不进行任何标准化处理。

    ```
    >>> from transformers import LlamaTokenizerFast

    >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
    >>> tokenizer.encode("Hello this is a test")
    [1, 15043, 445, 338, 263, 1243]
    ```

    如果需要修改 `bos_token` 或 `eos_token`,请在初始化模型时指定,或调用 `tokenizer.update_post_processor()` 确保后处理正确执行
    (否则编码序列的第一个和最后一个标记的值将不正确)。更多详情,请参阅
    # 定义了一个名为 `vocab_files_names` 的变量,其值来自外部常量 VOCAB_FILES_NAMES
    vocab_files_names = VOCAB_FILES_NAMES
    
    # 定义了一个名为 `pretrained_vocab_files_map` 的变量,其值来自外部常量 PRETRAINED_VOCAB_FILES_MAP
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    
    # 将类 `LlamaTokenizer` 赋值给变量 `slow_tokenizer_class`
    slow_tokenizer_class = LlamaTokenizer
    
    # 指定填充操作在输入的左侧进行
    padding_side = "left"
    
    # 定义了一个包含字符串元素 "input_ids" 和 "attention_mask" 的列表,并赋值给变量 `model_input_names`
    model_input_names = ["input_ids", "attention_mask"]
    
    # Create a new fast tokenizer instance
    def __init__(
        self,
        vocab_file=None,  # optional: path to the SentencePiece vocabulary file
        tokenizer_file=None,  # optional: path to the serialized tokenizers file (tokenizer.json)
        clean_up_tokenization_spaces=False,  # optional: whether to clean up spaces after decoding
        unk_token="<unk>",  # optional: unknown token, defaults to "<unk>"
        bos_token="<s>",  # optional: beginning-of-sequence token, defaults to "<s>"
        eos_token="</s>",  # optional: end-of-sequence token, defaults to "</s>"
        add_bos_token=True,  # optional: whether to prepend the bos token, defaults to True
        add_eos_token=False,  # optional: whether to append the eos token, defaults to False
        use_default_system_prompt=False,  # optional: whether to use the default Llama system prompt
        add_prefix_space=None,  # optional: whether to prepend a space to the input text
        **kwargs,  # remaining keyword arguments
    ):
        if add_prefix_space is not None:
            # when add_prefix_space is set explicitly, the tokenizer has to be rebuilt from the slow tokenizer
            logger.warning_once(
                "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
            )
            # force the conversion from the slow tokenizer during initialization
            kwargs["from_slow"] = True

        # initialize the parent class with all the collected arguments
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            use_default_system_prompt=use_default_system_prompt,
            **kwargs,
        )
        # store add_bos_token / add_eos_token on the instance
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        # refresh the post-processor so it matches the current bos_token / eos_token settings
        self.update_post_processor()
        # remember whether the default system prompt should be used
        self.use_default_system_prompt = use_default_system_prompt
        # keep the path to the vocabulary file (needed to save the slow tokenizer)
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # the slow tokenizer can only be saved if the original vocab_file is still available
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        更新后处理器,使用当前的 bos_token 和 eos_token。
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        # 如果 add_bos_token 为 True,但 bos_token 为 None,则抛出错误
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        # 如果 add_eos_token 为 True,但 eos_token 为 None,则抛出错误
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # 根据 add_bos_token 和 add_eos_token 的设置,生成单句和双句的模板
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        special_tokens = []
        # 如果 add_bos_token 为 True,则将 bos 和其对应的 token_id 加入特殊标记列表
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        # 如果 add_eos_token 为 True,则将 eos 和其对应的 token_id 加入特殊标记列表
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        # 将生成的模板和特殊标记设置到 tokenizer 的后处理器中
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )
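
With the default settings (`add_bos_token=True`, `add_eos_token=False`), the f-strings above produce the template strings shown below, where `$A`/`$B` are the tokenizers placeholders for the first and second sequence and `:0`/`:1` are the type ids. A small hedged sketch of the resulting post-processor (1 is assumed to be the usual LLaMA id for "<s>"):

from tokenizers import processors

single = "<s>:0 $A:0"
pair = "<s>:0 $A:0 <s>:1 $B:1"
post_processor = processors.TemplateProcessing(
    single=single, pair=pair, special_tokens=[("<s>", 1)]
)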

    @property
    def add_eos_token(self):
        # whether an eos token is appended to encoded sequences
        return self._add_eos_token

    @property
    def add_bos_token(self):
        # whether a bos token is prepended to encoded sequences
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        # update the flag and rebuild the post-processor accordingly
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        # update the flag and rebuild the post-processor accordingly
        self._add_bos_token = value
        self.update_post_processor()

    # Save the vocabulary (the SentencePiece model file) to the given directory and return the saved path
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # the slow vocabulary can only be saved when the original file is available
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # make sure the target directory exists, otherwise log an error and bail out
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # build the full output vocabulary path
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # copy the vocabulary file unless it is already the target file
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        # return the tuple of saved file paths
        return (out_vocab_file,)

    # Copied from LlamaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # bos token id to prepend, or an empty list when add_bos_token is False
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        # eos token id to append, or an empty list when add_eos_token is False
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        # special tokens around the first sequence
        output = bos_token_id + token_ids_0 + eos_token_id

        # if a second sequence is given, append it with its own special tokens
        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        # return the combined list of ids
        return output
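
A usage sketch of the fast tokenizer (again assuming the `hf-internal-testing/llama-tokenizer` checkpoint is reachable); toggling `add_eos_token` goes through the property setter above and therefore re-runs `update_post_processor`.

from transformers import LlamaTokenizerFast

tok = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
print(tok.encode("Hello this is a test"))  # [1, 15043, 445, 338, 263, 1243]

tok.add_eos_token = True                   # property setter -> update_post_processor()
print(tok.encode("Hello this is a test"))  # same ids with the "</s>" id (2) appended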

.\models\llama\__init__.py

# Copyright notice: this file is distributed under the Apache License, Version 2.0 (see the license link in the original header)
# typing helper used to guard type-checking-only imports
from typing import TYPE_CHECKING

# lazy-loading helper and optional-dependency checks
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
)

# import structure describing which submodules export which names (resolved lazily)
_import_structure = {
    "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"],
}

# sentencepiece is optional; OptionalDependencyNotAvailable is raised when it is missing
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # when available, register the tokenization_llama module in the import structure
    _import_structure["tokenization_llama"] = ["LlamaTokenizer"]

# tokenizers is optional as well
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # when available, register the tokenization_llama_fast module in the import structure
    _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"]

# torch is optional
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # when available, register the PyTorch modeling_llama module in the import structure
    _import_structure["modeling_llama"] = [
        "LlamaForCausalLM",
        "LlamaModel",
        "LlamaPreTrainedModel",
        "LlamaForSequenceClassification",
        "LlamaForQuestionAnswering",
    ]

# flax is optional
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # when available, register the Flax modeling_flax_llama module in the import structure
    _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"]

# during type checking, import the classes directly so static analyzers can see them
if TYPE_CHECKING:
    from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig

    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # when available, import LlamaTokenizer from tokenization_llama
        from .tokenization_llama import LlamaTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # when available, import LlamaTokenizerFast from tokenization_llama_fast
        from .tokenization_llama_fast import LlamaTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # when available, import the PyTorch model classes from modeling_llama
        from .modeling_llama import (
            LlamaForCausalLM,
            LlamaForQuestionAnswering,
            LlamaForSequenceClassification,
            LlamaModel,
            LlamaPreTrainedModel,
        )
    try:
        # check whether the Flax library is available
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # Flax is missing: silently skip the Flax imports
        pass
    else:
        # Flax is available: import the Flax model classes
        from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel
else:
    # at runtime (not type checking), replace this module with a lazy module:
    import sys

    # submodules listed in _import_structure are then only imported on first attribute access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
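
The practical effect of the lazy module is that importing the package stays cheap; a submodule such as `modeling_llama` is only executed when one of its exported names is first accessed, for example:

from transformers.models.llama import LlamaConfig        # only triggers configuration_llama
from transformers.models.llama import LlamaForCausalLM   # triggers modeling_llama (requires torch)

config = LlamaConfig()  # default LLaMA configuration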

.\models\llava\configuration_llava.py

# coding=utf-8
# Copyright: this file is released under the Apache License, Version 2.0.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, the software is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations.

""" Llava model configuration"""

# warnings module, used for deprecation warnings
import warnings

# base class for all model configurations
from ...configuration_utils import PretrainedConfig

# logging utilities
from ...utils import logging

# mapping from model_type strings to configuration classes
from ..auto import CONFIG_MAPPING

# module-level logger
logger = logging.get_logger(__name__)

# mapping from pretrained model names to the URL of their configuration file
LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "llava-hf/llava-v1.5-7b": "https://huggingface.co/llava-hf/llava-v1.5-7b/resolve/main/config.json",
}

# LlavaConfig stores the full configuration of a LlavaForConditionalGeneration model
class LlavaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an
    Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llava-9B.

    e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `CLIPVisionConfig`):
            The config object or dictionary of the vision backbone.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
            The image token index to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer to select the vision feature.

    Example:

    ```
    >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig

    >>> # Initializing a CLIP-vision config
    >>> vision_config = CLIPVisionConfig()
    ```

    """
    # model type identifier
    model_type = "llava"
    # whether this config is a composition of sub-configs
    is_composition = False

    # Build a configuration from the given arguments
    def __init__(
        self,
        vision_config=None,  # vision backbone config, as a dict or config object, defaults to None
        text_config=None,    # text backbone config, as a dict or config object, defaults to None
        ignore_index=-100,    # ignore index for the loss function, defaults to -100
        image_token_index=32000,  # id of the image placeholder token, defaults to 32000
        projector_hidden_act="gelu",  # activation used by the multimodal projector, defaults to "gelu"
        vision_feature_select_strategy="default",  # vision feature selection strategy, defaults to "default"
        vision_feature_layer=-2,  # index of the vision layer to take features from, defaults to -2
        **kwargs,  # remaining keyword arguments
    ):
        # store the simple scalar attributes
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act

        # validate the vision feature selection strategy
        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'."
                f"Got: {vision_feature_select_strategy}"
            )

        # warn when the deprecated 'vocab_size' argument is passed
        if "vocab_size" in kwargs:
            warnings.warn(
                "The `vocab_size` argument is deprecated and will be removed in v4.42, since it can be inferred from the `text_config`. Passing this argument has no effect",
                FutureWarning,
            )

        # store the vision feature selection strategy and layer
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer

        # build the vision config object; fall back to the default "clip_vision_model" config when None
        if isinstance(vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
            )
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["clip_vision_model"](
                intermediate_size=4096,
                hidden_size=1024,
                patch_size=14,
                image_size=336,
                num_hidden_layers=24,
                num_attention_heads=16,
                vocab_size=32000,
                projection_dim=768,
            )

        # store the vision config
        self.vision_config = vision_config

        # build the text config object; fall back to the default "llama" config when None
        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"]()

        # store the text config and mirror its vocabulary size
        self.text_config = text_config
        self._vocab_size = self.text_config.vocab_size

        # initialize the parent class with the remaining keyword arguments
        super().__init__(**kwargs)

    # Deprecated accessor kept for backward compatibility: warn and return the cached value
    @property
    def vocab_size(self):
        warnings.warn(
            "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.",
            FutureWarning,
        )
        # return the cached `_vocab_size` value
        return self._vocab_size

    # Serialize the configuration to a dict
    def to_dict(self):
        output = super().to_dict()
        # drop the internal `_vocab_size` entry from the serialized output
        output.pop("_vocab_size", None)
        return output
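
A short usage sketch mirroring the docstring example above (the printed values are the library defaults):

from transformers import CLIPVisionConfig, LlamaConfig, LlavaConfig

vision_config = CLIPVisionConfig()
text_config = LlamaConfig()
config = LlavaConfig(vision_config=vision_config, text_config=text_config)

print(config.model_type)              # "llava"
print(config.image_token_index)       # 32000
print(config.text_config.vocab_size)  # 32000 for the default LlamaConfig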

.\models\llava\convert_llava_weights_to_hf.py

# imports
import argparse  # command-line argument parsing

import torch  # PyTorch
from huggingface_hub import hf_hub_download  # downloads files from the Hugging Face Hub

from transformers import (  # classes used to rebuild the model, processor and tokenizer
    AddedToken,
    AutoConfig,
    AutoTokenizer,
    CLIPImageProcessor,
    LlavaConfig,
    LlavaForConditionalGeneration,
    LlavaProcessor,
)

# extra help text shown at the end of the argparse usage message
EPILOG_TXT = """Example:
    python transformers/src/transformers/models/llava/convert_llava_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/llava-v1.5-7b-conv --old_state_dict_id liuhaotian/llava-v1.5-7b

Example for creating the old state dict file with Python:

    import torch
    from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM

    # load model
    kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
    model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", low_cpu_mem_usage=True, **kwargs)

    # load vision tower
    model.get_vision_tower().load_model()

    # Save state dict
    torch.save(model.state_dict(), "tmp/hf_models/llava-v1.5-7b/model_state_dict.bin")
"""

# substring replacements applied to state_dict keys (old prefix -> new prefix)
KEYS_TO_MODIFY_MAPPING = {
    "model.vision_tower.": "",
    "model.mm_projector": "multi_modal_projector",
    "model": "model.model",
    "vision_model.model": "vision_model",
    "lm_head": "language_model.lm_head",
    "model.model": "language_model.model",
    "multi_modal_projector.0": "multi_modal_projector.linear_1",
    "multi_modal_projector.2": "multi_modal_projector.linear_2",
}

# Rename the keys of the original state_dict to match the Hugging Face implementation
def convert_state_dict_to_hf(state_dict):
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.endswith(".inv_freq"):  # 忽略以".inv_freq"结尾的键
            continue
        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
            if key_to_modify in key:
                key = key.replace(key_to_modify, new_key)
        new_state_dict[key] = value
    return new_state_dict
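
A toy illustration of the renaming above (tensor values replaced by `None`; the keys are representative of the original checkpoint layout):

old_keys = {
    "model.mm_projector.0.weight": None,
    "model.layers.0.self_attn.q_proj.weight": None,
    "lm_head.weight": None,
    "model.layers.0.self_attn.rotary_emb.inv_freq": None,  # dropped by the .inv_freq check
}
print(list(convert_state_dict_to_hf(old_keys)))
# ['multi_modal_projector.linear_1.weight',
#  'language_model.model.layers.0.self_attn.q_proj.weight',
#  'language_model.lm_head.weight']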

# Convert an original llava_llama checkpoint into the Hugging Face Llava format and push it to the Hub
def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id):
    torch.set_default_dtype(torch.float16)  # create all new tensors in float16

    text_config = AutoConfig.from_pretrained(text_model_id)  # config of the text backbone
    tokenizer = AutoTokenizer.from_pretrained(text_model_id)  # tokenizer of the text backbone
    tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False), special_tokens=True)  # add the "<image>" placeholder token
    tokenizer.add_special_tokens({"pad_token": "<pad>"})  # add a "<pad>" token

    image_processor = CLIPImageProcessor.from_pretrained(vision_model_id)  # image processor of the vision backbone
    # wrap the tokenizer and image processor into a single LlavaProcessor
    processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)

    # build the LlavaConfig from the text config and set pad_token_id to 32001 (the newly added "<pad>" token)
    config = LlavaConfig(text_config=text_config)
    config.pad_token_id = 32001

    # instantiate the model on the "meta" device (no memory is allocated for the weights yet)
    with torch.device("meta"):
        model = LlavaForConditionalGeneration(config)

    # pad the embedding matrix to a multiple of 64 for performance reasons
    pad_shape = 64

    # download the original state dict from the Hub
    state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin")

    # load the state dict and rename its keys to the Hugging Face layout
    state_dict = torch.load(state_dict_path, map_location="cpu")
    state_dict = convert_state_dict_to_hf(state_dict)

    # load the converted weights into the model; strict=True requires an exact parameter match
    model.load_state_dict(state_dict, strict=True, assign=True)

    # embedding matrix before the vocabulary is extended
    pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data

    # mean mu and (scaled) covariance sigma of the existing embeddings
    mu = torch.mean(pre_expansion_embeddings, dim=0).float()
    n = pre_expansion_embeddings.size()[0]
    sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n

    # multivariate normal distribution fitted to the existing embeddings
    dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)

    # grow the token embeddings to make room for the image and pad tokens
    model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape)

    # initialize the new input-embedding rows with samples from the fitted distribution
    model.language_model.model.embed_tokens.weight.data[32000:] = torch.stack(
        tuple((dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[32000:].shape[0]))),
        dim=0,
    )

    # initialize the new lm_head rows with samples from the fitted distribution as well
    model.language_model.lm_head.weight.data[32000:] = torch.stack(
        tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[32000:].shape[0]))),
        dim=0,
    )

    # push the converted model to the Hub
    model.push_to_hub(output_hub_path)

    # push the processor to the Hub
    processor.push_to_hub(output_hub_path)
# Entry point: parse the command-line arguments and run the conversion
def main():
    # build the argument parser
    parser = argparse.ArgumentParser(
        epilog=EPILOG_TXT,  # extra usage examples appended to the help message
        formatter_class=argparse.RawDescriptionHelpFormatter,  # keep the epilog formatting as-is
    )
    # --text_model_id: Hub location of the text model
    parser.add_argument(
        "--text_model_id",
        help="Hub location of the text model",
    )
    # --vision_model_id: Hub location of the vision model
    parser.add_argument(
        "--vision_model_id",
        help="Hub location of the vision model",
    )
    # --output_hub_path: Hub location to push the converted model to
    parser.add_argument(
        "--output_hub_path",
        help="Location on the hub of the converted model",
    )
    # --old_state_dict_id: Hub location of the raw state dict of the original model
    parser.add_argument(
        "--old_state_dict_id",
        help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`",
    )
    # parse the arguments
    args = parser.parse_args()
    # run the conversion
    convert_llava_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id)


# run main() when the script is executed directly
if __name__ == "__main__":
    main()

.\models\llava\modeling_llava.py

# coding=utf-8
# Copyright the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Llava model. """

# standard library and PyTorch imports
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

# generic pretrained-model base class
from ... import PreTrainedModel
# mapping from activation names to activation functions
from ...activations import ACT2FN
# cache utilities used during generation
from ...cache_utils import Cache
# base class for model outputs
from ...modeling_outputs import ModelOutput
# docstring helpers and logging
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# auto classes used to instantiate the vision backbone and the language model
from ..auto import AutoModel, AutoModelForCausalLM
# Llava configuration
from .configuration_llava import LlavaConfig

# module-level logger
logger = logging.get_logger(__name__)

# configuration name used in the docstrings
_CONFIG_FOR_DOC = "LlavaConfig"

# list of pretrained Llava checkpoints
LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-1.5-13b-hf",
    "llava-hf/bakLlava-v1-hf",
    # see all Llava models at https://huggingface.co/models?filter=llava
]

@dataclass
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava
class LlavaCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Llava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    # language modeling loss (next-token prediction)
    loss: Optional[torch.FloatTensor] = None
    # prediction scores of the language modeling head (pre-softmax logits per vocabulary token)
    logits: torch.FloatTensor = None
    # pre-computed key/value states used to speed up sequential decoding
    past_key_values: Optional[List[torch.FloatTensor]] = None
    # hidden states of every layer plus the optional initial embedding output
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # attention weights after the softmax, per layer
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    # image embeddings produced by the vision encoder
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


# Two-layer MLP that projects vision features into the text embedding space
class LlavaMultiModalProjector(nn.Module):
    def __init__(self, config: LlavaConfig):
        super().__init__()

        # project from the vision hidden size to the text hidden size
        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
        # non-linearity configured by projector_hidden_act
        self.act = ACT2FN[config.projector_hidden_act]
        # second projection within the text hidden size
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, image_features):
        # first linear projection
        hidden_states = self.linear_1(image_features)
        # activation
        hidden_states = self.act(hidden_states)
        # second linear projection
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
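
A minimal shape sketch of the projector, using the default `LlavaConfig` (CLIP vision hidden size 1024, Llama hidden size 4096; 576 is the number of patches for a 336x336 image with 14x14 patches):

import torch

config = LlavaConfig()
projector = LlavaMultiModalProjector(config)

image_features = torch.randn(1, 576, config.vision_config.hidden_size)  # (batch, patches, vision hidden)
print(projector(image_features).shape)  # torch.Size([1, 576, 4096]) -> text hidden size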


# docstring shared by the Llava model classes
LLAVA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlavaConfig`] or [`LlavaVisionConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


# attach the shared docstring to the class
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAVA_START_DOCSTRING,
)
class LlavaPreTrainedModel(PreTrainedModel):
    # configuration class associated with this model
    config_class = LlavaConfig
    # prefix used for the base model when loading/saving weights
    base_model_prefix = "model"
    # gradient checkpointing is supported
    supports_gradient_checkpointing = True
    # modules that must not be split across devices
    _no_split_modules = ["LlavaVisionAttention"]
    # keys whose device placement should be skipped
    _skip_keys_device_placement = "past_key_values"
    # FlashAttention 2 is supported
    _supports_flash_attn_2 = True

    # Initialize the weights of a module according to the configuration
    def _init_weights(self, module):
        # pick the initializer standard deviation from the config (falling back to the text config)
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )

        # modules that carry a class_embedding get a normal initialization for it
        if hasattr(module, "class_embedding"):
            module.class_embedding.data.normal_(mean=0.0, std=std)

        # linear and convolution layers: normal weights, zero bias
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        # embedding layers: normal weights, zero out the padding row
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def _supports_sdpa(self):
        """
        Whether the model supports SDPA, delegated to the wrapped language model.
        """
        # defer to the language model's own _supports_sdpa flag
        return self.language_model._supports_sdpa
LLAVA_INPUTS_DOCSTRING = r"""
"""


@add_start_docstrings(
    """The LLAVA model which consists of a vision backbone and a language model.""",
    LLAVA_START_DOCSTRING,
)
class LlavaForConditionalGeneration(LlavaPreTrainedModel):
    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        # vision backbone, instantiated from the vision config
        self.vision_tower = AutoModel.from_config(config.vision_config)

        # projector that maps vision features into the text embedding space
        self.multi_modal_projector = LlavaMultiModalProjector(config)

        # vocabulary size of the text backbone
        self.vocab_size = config.text_config.vocab_size

        # language model, instantiated from the text config with the configured attention implementation
        self.language_model = AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )

        # id of the padding token, -1 when none is configured
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1

        # run the usual post-initialization (weight init and final processing)
        self.post_init()

    def get_input_embeddings(self):
        # input embeddings of the wrapped language model
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        # set the input embeddings of the wrapped language model
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        # output embeddings (lm head) of the wrapped language model
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        # set the output embeddings (lm head) of the wrapped language model
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        # set the decoder of the wrapped language model
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        # decoder of the wrapped language model
        return self.language_model.get_decoder()

    def tie_weights(self):
        # tie the input and output embeddings of the wrapped language model
        return self.language_model.tie_weights()

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
        # resize the token embeddings of the wrapped language model
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)

        # keep the config and the cached vocab_size in sync with the new embedding matrix
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings

        # return the resized embedding module
        return model_embeds

    @add_start_docstrings_to_model_forward(LLAVA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[int] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        LLAVA模型的前向传播方法,接受多种输入参数并返回模型的输出。

        Args:
            input_ids (torch.LongTensor, optional): 输入的token IDs序列.
            pixel_values (torch.FloatTensor, optional): 图像的像素值.
            attention_mask (torch.Tensor, optional): 注意力掩码.
            position_ids (torch.LongTensor, optional): 位置IDs.
            past_key_values (List[torch.FloatTensor], optional): 上下文关键值.
            inputs_embeds (torch.FloatTensor, optional): 输入的嵌入向量.
            vision_feature_layer (int, optional): 视觉特征层.
            vision_feature_select_strategy (str, optional): 视觉特征选择策略.
            labels (torch.LongTensor, optional): 标签序列.
            use_cache (bool, optional): 是否使用缓存.
            output_attentions (bool, optional): 是否输出注意力.
            output_hidden_states (bool, optional): 是否输出隐藏状态.
            return_dict (bool, optional): 是否返回字典形式的输出.

        Returns:
            output (LlavaCausalLMOutputWithPast or torch.Tensor): LLAVA模型的输出,可能包含上下文关键值的信息.

        """
        # 实现LLAVA模型的前向传播逻辑,具体细节在其他方法中处理
        raise NotImplementedError

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
    ):
        """
        准备生成任务的输入,适用于生成文本的情景.

        Args:
            input_ids: 输入的token IDs序列.
            past_key_values: 上下文关键值.
            inputs_embeds: 输入的嵌入向量.
            pixel_values: 图像的像素值.
            attention_mask: 注意力掩码.
            **kwargs: 其他关键字参数.

        Returns:
            dict: 生成任务的输入字典.

        """
        # 实现为生成任务准备输入的逻辑,具体细节在其他方法中处理
        raise NotImplementedError
    ):
        # if there is already a cache, trim input_ids so that only unprocessed tokens are passed
        if past_key_values is not None:
            # Cache objects expose their length and the number of tokens already seen
            if isinstance(past_key_values, Cache):
                # number of tokens currently stored in the cache
                cache_length = past_key_values.get_seq_length()
                # number of tokens already processed
                past_length = past_key_values.seen_tokens
            else:
                # legacy cache format: infer both lengths from the shape of the first key tensor
                cache_length = past_length = past_key_values[0][0].shape[2]

            # Keep only the unprocessed tokens:
            # 1 - if the attention_mask is longer than input_ids, part of the input was passed only as cache
            #     (e.g. when inputs_embeds were used), so keep the trailing (mask_length - past_length) ids
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - if past_length is smaller than the input length, input_ids holds the full prompt:
            #     discard the ids that were already processed
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - otherwise (past_length >= input_ids.shape[1]) assume input_ids only holds new tokens;
            #     when an image token is present, keep just the last id
            elif self.config.image_token_index in input_ids:
                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
            # if the cache has a size limit and has already seen more tokens than it can hold,
            # drop the oldest attention-mask columns, since their key/value states were evicted
            if cache_length < past_length and attention_mask is not None:
                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batched generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, only use them for the first generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # add the remaining generation inputs
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "pixel_values": pixel_values,
            }
        )
        # return the assembled inputs
        return model_inputs
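
# ---- 补充示例(非原始源码): 演示上面由 attention_mask 动态构造 position_ids 的做法 ----
# 假设输入是左侧 padding 的批量序列, attention_mask 中 0 表示 padding 位置。
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])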

    # 重新排序缓存
    def _reorder_cache(self, *args, **kwargs):
        return self.language_model._reorder_cache(*args, **kwargs)

.\models\llava\processing_llava.py

# 设置文件编码为UTF-8
# 版权声明归HuggingFace Inc.团队所有,使用Apache License, Version 2.0授权
# 除非符合许可证要求或书面同意,否则不得使用此文件
# 获取许可证的副本,请访问http://www.apache.org/licenses/LICENSE-2.0
# 本软件根据"原样"基础分发,不提供任何明示或暗示的担保或条件
# 请参阅许可证以获取特定语言的权限和限制

"""
Llava的处理器类。
"""

from typing import List, Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType

class LlavaProcessor(ProcessorMixin):
    """
    构建一个Llava处理器,将Llava图像处理器和Llava分词器封装到单个处理器中。

    [`LlavaProcessor`] 提供了 [`CLIPImageProcessor`] 和 [`LlamaTokenizerFast`] 的所有功能。查看
    [`~LlavaProcessor.__call__`] 和 [`~LlavaProcessor.decode`] 获取更多信息。

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            图像处理器,必需的输入。
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            分词器,必需的输入。
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ):
        """
        调用处理器进行文本和图像处理。

        Args:
            text: 输入文本或预分词输入的列表。
            images: 输入的图像数据。
            padding: 是否进行填充的策略。
            truncation: 是否进行截断的策略。
            max_length: 最大长度限制。
            return_tensors: 返回的张量类型。

        Returns:
            处理后的文本和图像特征。
        """

    def batch_decode(self, *args, **kwargs):
        """
        此方法将所有参数转发到LlamaTokenizerFast的[`~PreTrainedTokenizer.batch_decode`]。请参阅该方法的文档字符串获取更多信息。
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        # 将所有参数转发给 LlamaTokenizerFast 的 decode 方法,并返回其结果
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
    def model_input_names(self):
        # 获取 tokenizer 的模型输入名称列表
        tokenizer_input_names = self.tokenizer.model_input_names
        # 获取 image_processor 的模型输入名称列表
        image_processor_input_names = self.image_processor.model_input_names
        # 将 tokenizer 和 image_processor 的输入名称合并成一个无重复元素的列表,并返回
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
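
# ---- 补充示例(非原始源码): list(dict.fromkeys(...)) 在保持顺序的同时去重 ----
# 下面两个列表只是假设性的输入, 用来说明合并效果。
tokenizer_input_names = ["input_ids", "attention_mask"]
image_processor_input_names = ["pixel_values", "attention_mask"]
merged = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
print(merged)  # ['input_ids', 'attention_mask', 'pixel_values']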

.\models\llava\__init__.py

# 导入必要的模块和函数
from typing import TYPE_CHECKING
# 从 utils 模块中导入自定义异常 OptionalDependencyNotAvailable、_LazyModule 和 is_torch_available 函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义模块的导入结构,包括配置和处理相关的模块和类
_import_structure = {
    "configuration_llava": ["LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaConfig"],
    "processing_llava": ["LlavaProcessor"],
}

# 尝试检查是否有 torch 可用,如果不可用则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 torch 可用,则添加 modeling_llava 模块的内容到导入结构中
    _import_structure["modeling_llava"] = [
        "LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "LlavaForConditionalGeneration",
        "LlavaPreTrainedModel",
    ]

# 如果 TYPE_CHECKING 为 True,即在类型检查环境下
if TYPE_CHECKING:
    # 从 configuration_llava 模块中导入特定的类和变量
    from .configuration_llava import LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaConfig
    # 从 processing_llava 模块中导入特定的类
    from .processing_llava import LlavaProcessor

    # 尝试检查是否有 torch 可用,如果不可用则忽略异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果 torch 可用,则从 modeling_llava 模块中导入特定的类和变量
        from .modeling_llava import (
            LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
            LlavaForConditionalGeneration,
            LlavaPreTrainedModel,
        )

# 如果不在类型检查环境下
else:
    import sys

    # 将当前模块设置为一个 LazyModule 的实例,延迟加载模块内容
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
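
# ---- 补充示例(非原始源码): _LazyModule 延迟导入思路的极简示意 ----
# 假设: NaiveLazyModule 不是 transformers 中 _LazyModule 的真实实现, 只演示“访问属性时才 import 子模块”的机制。
import importlib

class NaiveLazyModule:
    def __init__(self, package_name, import_structure):
        self._package_name = package_name
        # 建立 “类名/变量名 -> 子模块名” 的反向映射
        self._attr_to_module = {
            attr: module for module, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        # 只有在第一次访问某个属性时, 才真正导入对应的子模块
        module_name = self._attr_to_module[attr]
        module = importlib.import_module(f".{module_name}", self._package_name)
        return getattr(module, attr)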

.\models\llava_next\configuration_llava_next.py

# coding=utf-8
# 定义编码格式为 UTF-8,确保文件能正确处理各种字符
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# 版权声明,指出此代码的版权归HuggingFace Inc.团队所有
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可协议授权使用此代码
# you may not use this file except in compliance with the License.
# 除非你遵守许可协议,否则不得使用此文件
# You may obtain a copy of the License at
# 你可以在以下网址获取许可协议的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# 除非适用法律要求或书面同意,否则软件
# distributed under the License is distributed on an "AS IS" BASIS,
# 根据许可协议分发的软件是基于"AS IS"的基础上分发
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有明示或暗示的任何保证或条件
# See the License for the specific language governing permissions and
# 请查看许可协议以了解详细的授权条款和条件
# limitations under the License.
# 限制条件在许可协议下适用

""" Llava-NeXT model configuration"""
# Llava-NeXT 模型配置

from ...configuration_utils import PretrainedConfig
# 从相对路径导入 PretrainedConfig 类
from ...utils import logging
# 从相对路径导入 logging 模块
from ..auto import CONFIG_MAPPING
# 从相对路径导入 CONFIG_MAPPING 变量


logger = logging.get_logger(__name__)
# 获取当前模块的 logger 实例

LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "llava-hf/llava-v1.6-mistral-7b-hf": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf/resolve/main/config.json",
}
# 定义 LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP 字典,映射预训练模型名到配置文件 URL

class LlavaNextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an
    Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
    model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """
    # LlavaNextConfig 类继承自 PretrainedConfig 类,用于存储 Llava-NeXT 模型的配置信息,指定模型的架构
    # 设置模型类型为 "llava_next"
    model_type = "llava_next"
    # 设置是否为组合模型为 False
    is_composition = False

    # 定义模型初始化方法
    def __init__(
        self,
        vision_config=None,  # 视觉部分的配置对象或字典,默认为 None
        text_config=None,    # 文本部分的配置对象或字典,默认为 None
        ignore_index=-100,   # 损失函数中的忽略索引,默认为 -100
        image_token_index=32000,  # 编码图像提示时的图像标记索引,默认为 32000
        projector_hidden_act="gelu",  # 多模态投影器使用的激活函数,默认为 "gelu"
        vision_feature_select_strategy="default",  # 从视觉主干中选择特征的策略,默认为 "default"
        vision_feature_layer=-2,    # 选择视觉特征的层索引,默认为 -2
        image_grid_pinpoints=None,  # 用于处理高分辨率图像的可能分辨率列表,默认为 None
        **kwargs,   # 其他可选关键字参数
        ):
            # 初始化类的实例变量
            self.ignore_index = ignore_index
            self.image_token_index = image_token_index
            self.projector_hidden_act = projector_hidden_act

            # 检查视觉特征选择策略是否合法
            if vision_feature_select_strategy not in ["default", "full"]:
                raise ValueError(
                    "vision_feature_select_strategy should be one of 'default', 'full'."
                    f"Got: {vision_feature_select_strategy}"
                )

            self.vision_feature_select_strategy = vision_feature_select_strategy
            self.vision_feature_layer = vision_feature_layer

            # 设置图像网格的固定标记点,默认为指定的坐标
            image_grid_pinpoints = (
                image_grid_pinpoints
                if image_grid_pinpoints is not None
                else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
            )
            self.image_grid_pinpoints = image_grid_pinpoints

            # 根据视觉配置初始化视觉模型
            if isinstance(vision_config, dict):
                vision_config["model_type"] = (
                    vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
                )
                vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
            elif vision_config is None:
                vision_config = CONFIG_MAPPING["clip_vision_model"](
                    intermediate_size=4096,
                    hidden_size=1024,
                    patch_size=14,
                    image_size=336,
                    num_hidden_layers=24,
                    num_attention_heads=16,
                    vocab_size=32000,
                    projection_dim=768,
                )

            self.vision_config = vision_config

            # 根据文本配置初始化文本模型
            if isinstance(text_config, dict):
                text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
                text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
            elif text_config is None:
                text_config = CONFIG_MAPPING["llama"]()

            self.text_config = text_config

            # 调用父类的初始化方法,传入额外的关键字参数
            super().__init__(**kwargs)
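
# ---- 补充示例(非原始源码): 用默认参数实例化 LlavaNextConfig ----
# 假设环境中安装了包含 Llava-NeXT 的 transformers 版本。
from transformers import LlavaNextConfig

config = LlavaNextConfig()  # 视觉部分默认是 CLIP 视觉塔, 文本部分默认是 LLaMA
print(config.image_token_index)         # 32000
print(config.image_grid_pinpoints)      # [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
print(config.vision_config.model_type)  # clip_vision_model
print(config.text_config.model_type)    # llama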

.\models\llava_next\convert_llava_next_weights_to_hf.py

# 导入必要的模块和库
import argparse  # 解析命令行参数的库
import glob  # 匹配文件路径名的模式扩展库
import json  # 处理 JSON 格式数据的库
from pathlib import Path  # 处理文件路径的对象模块

import requests  # 发送 HTTP 请求的库
import torch  # PyTorch 深度学习库
from accelerate import init_empty_weights  # 初始化空的模型权重的加速库函数
from huggingface_hub import hf_hub_download, snapshot_download  # 从Hugging Face Hub下载模型和快照的函数
from PIL import Image  # Python Imaging Library,处理图像的库
from safetensors import safe_open  # 安全地打开张量数据的库函数

from transformers import (  # 导入 Hugging Face Transformers 库中的相关模块和类
    AddedToken,
    AutoConfig,
    AutoTokenizer,
    LlavaNextConfig,
    LlavaNextForConditionalGeneration,
    LlavaNextImageProcessor,
    LlavaNextProcessor,
)

# 将需要修改的键值映射关系定义为常量
KEYS_TO_MODIFY_MAPPING = {
    "model.vision_tower.": "",  # 替换模型视觉塔相关的键
    "model.mm_projector": "multi_modal_projector",  # 替换多模态投影器的键
    "model": "model.model",  # 替换模型的键
    "vision_model.model": "vision_model",  # 替换视觉模型的键
    "lm_head": "language_model.lm_head",  # 替换语言模型头部的键
    "model.model": "language_model.model",  # 替换模型的键
    "multi_modal_projector.0": "multi_modal_projector.linear_1",  # 替换多模态投影器的第一层线性层键
    "multi_modal_projector.2": "multi_modal_projector.linear_2",  # 替换多模态投影器的第二层线性层键
    "language_model.model.image_newline": "image_newline",  # 替换语言模型中的图像换行键
}


# 加载原始状态字典的函数
def load_original_state_dict(model_id):
    # 从指定的模型 ID 下载并解压快照,只允许安全张量文件格式
    directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"])

    # 创建一个空的原始状态字典
    original_state_dict = {}
    # 遍历所有解压后的文件
    for path in glob.glob(f"{directory_path}/*"):
        # 如果文件是安全张量文件
        if path.endswith(".safetensors"):
            # 安全地打开文件并使用 PyTorch 框架读取
            with safe_open(path, framework="pt", device="cpu") as f:
                # 遍历文件中的每个键和对应的张量
                for key in f.keys():
                    original_state_dict[key] = f.get_tensor(key)

    # 返回完整的原始状态字典
    return original_state_dict


# 将状态字典转换为适合 Hugging Face 的格式的函数
def convert_state_dict_to_hf(state_dict):
    # 创建一个新的状态字典
    new_state_dict = {}
    # 遍历原始状态字典中的每个键值对
    for key, value in state_dict.items():
        # 如果键以 ".inv_freq" 结尾,则跳过
        if key.endswith(".inv_freq"):
            continue
        # 遍历预定义的需要修改的键值映射关系
        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
            # 如果需要修改的键值映射关系存在于当前键中
            if key_to_modify in key:
                # 替换当前键中的相应部分为新的键
                key = key.replace(key_to_modify, new_key)

        # 将当前处理后的键值对加入新的状态字典,并将值转换为 float16 类型
        new_state_dict[key] = value.to(torch.float16)

    # 返回转换后的新状态字典
    return new_state_dict
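
# ---- 补充示例(非原始源码): 用几个有代表性的键演示上述映射的逐条子串替换效果 ----
# 假设: 这里只抄取映射表中的一部分条目, 输入的键名也是示意性的。
keys_to_modify = {
    "model.vision_tower.": "",
    "model.mm_projector": "multi_modal_projector",
    "model": "model.model",
    "lm_head": "language_model.lm_head",
    "model.model": "language_model.model",
    "multi_modal_projector.0": "multi_modal_projector.linear_1",
}

for key in ["model.mm_projector.0.weight", "model.embed_tokens.weight", "lm_head.weight"]:
    new_key = key
    for old, new in keys_to_modify.items():
        if old in new_key:
            new_key = new_key.replace(old, new)
    print(key, "->", new_key)
# model.mm_projector.0.weight -> multi_modal_projector.linear_1.weight
# model.embed_tokens.weight -> language_model.model.embed_tokens.weight
# lm_head.weight -> language_model.lm_head.weight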


# 加载图像的函数
def load_image():
    # 图像的 URL 地址
    url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
    # 从指定的 URL 获取图像数据,以流的方式读取
    image = Image.open(requests.get(url, stream=True).raw)
    # 返回读取的图像数据
    return image
def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
    # 使用指定的 model_id 从 HF Hub 下载模型配置文件 config.json
    filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model")
    
    # 打开并读取 JSON 文件内容
    with open(filepath) as f:
        data = json.load(f)
        print(data)

    # 根据 model_id 不同设置对应的 text_model_id 和 image_token_index
    if model_id == "liuhaotian/llava-v1.6-mistral-7b":
        text_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
        image_token_index = 32000
    elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
        text_model_id = "lmsys/vicuna-7b-v1.5"
        image_token_index = 32000
    elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
        text_model_id = "lmsys/vicuna-13b-v1.5"
        image_token_index = 32000
    elif model_id == "liuhaotian/llava-v1.6-34b":
        text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B"
        image_token_index = 64000
    
    # 从模型配置文件中获取 vision_model_id
    vision_model_id = data["mm_vision_tower"]

    # 设置默认的 torch 数据类型为 torch.float16
    torch.set_default_dtype(torch.float16)
    
    # 使用 text_model_id 创建 AutoConfig 对象
    text_config = AutoConfig.from_pretrained(text_model_id)

    # 根据 model_id 确定是否使用 fast tokenizer
    use_fast = False if model_id == "liuhaotian/llava-v1.6-34b" else True
    tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast)
    
    # 添加特殊的 "<image>" token 到 tokenizer
    tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False), special_tokens=True)

    if model_id == "liuhaotian/llava-v1.6-mistral-7b":
        # 对于 Mistral-7B 模型,添加 "<pad>" 作为 padding token
        tokenizer.add_special_tokens({"pad_token": "<pad>"})

    # 使用 vision_model_id 创建 LlavaNextImageProcessor 对象
    image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id)
    
    # 创建 LlavaNextProcessor 对象,传入 tokenizer 和 image_processor
    processor = LlavaNextProcessor(tokenizer=tokenizer, image_processor=image_processor)

    # 构建 LlavaNextConfig 对象,包括 text_config、image_grid_pinpoints 等参数
    config = LlavaNextConfig(
        text_config=text_config.to_dict(),
        image_grid_pinpoints=image_processor.image_grid_pinpoints,
        use_image_newline_parameter=True,
        image_token_index=image_token_index,
    )

    # 初始化空的权重,并创建 LlavaNextForConditionalGeneration 模型
    with init_empty_weights():
        model = LlavaNextForConditionalGeneration(config)

    # 加载原始状态字典
    state_dict = load_original_state_dict(model_id)
    state_dict = convert_state_dict_to_hf(state_dict)
    
    # 加载转换后的状态字典到模型中
    model.load_state_dict(state_dict, assign=True)
    
    # 设置模型为评估模式
    model.eval()

    # 获取模型中预扩展的 embeddings
    pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data
    
    # 计算 embeddings 的均值 mu
    mu = torch.mean(pre_expansion_embeddings, dim=0).float()
    n = pre_expansion_embeddings.size()[0]
    
    # 计算 embeddings 的协方差矩阵 sigma
    sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
    
    # 创建多变量正态分布对象 dist
    dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)

    # 添加一个 "<image>" token 以调整模型大小
    # 为了性能原因,将模型的填充形状设为 64
    pad_shape = 64
    vocab_size = config.text_config.vocab_size
    
    if model_id == "liuhaotian/llava-v1.6-34b":
        # 对于该模型,有 3 个额外的 token,即 "<|startoftext|>", "<|endoftext|>" 和 "<image>"
        num_tokens = vocab_size + 3
    else:
        # 对于其他模型,有 2 个额外的 token,即 "<image>" 和 "<pad>"
        num_tokens = vocab_size + 2
    # 调整模型的词嵌入大小,使其能容纳给定的词汇量,并且将其填充到指定的形状
    model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape)

    # 使用分布采样填充词嵌入权重的未初始化部分
    model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack(
        tuple(
            (dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]))
        ),
        dim=0,
    )

    # 使用分布采样填充语言模型头部的未初始化部分
    model.language_model.lm_head.weight.data[vocab_size:] = torch.stack(
        tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))),
        dim=0,
    )
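
# ---- 补充示例(非原始源码): 上面“用已有嵌入的均值/协方差拟合多元正态分布, 再采样初始化新增行”的独立小例子 ----
# 假设: 这里用随机矩阵代替真实的词嵌入, 维度也取得很小, 只为演示流程。
import torch

old_embeddings = torch.randn(1000, 8)          # (vocab_size, hidden_size)
mu = old_embeddings.mean(dim=0)
centered = old_embeddings - mu
sigma = centered.T @ centered / old_embeddings.shape[0]
dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)

new_rows = dist.sample((2,))                   # 为 2 个新增 token 采样初始嵌入
print(new_rows.shape)                          # torch.Size([2, 8])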

    # 设置模型计算设备为 CUDA 第二块GPU
    device = "cuda:2"
    model.to(device)

    # 准备输入数据
    image = load_image()
    if model_id == "liuhaotian/llava-v1.6-mistral-7b":
        # 根据模型ID选择相应的提示文本
        prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
    elif model_id in ["liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b"]:
        prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
    elif model_id == "liuhaotian/llava-v1.6-34b":
        prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
    # 使用处理器对图像和提示文本进行处理,返回PyTorch张量格式的输入数据
    inputs = processor(images=image, text=prompt, return_tensors="pt")

    # 验证输入数据
    # 下载并加载原始像素值数据文件
    filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_pixel_values.pt", repo_type="dataset")
    original_pixel_values = torch.load(filepath, map_location="cpu")
    # 断言原始像素值与输入数据中的像素值相近(使用半精度浮点数比较)
    assert torch.allclose(original_pixel_values, inputs.pixel_values.half())

    if model_id == "liuhaotian/llava-v1.6-mistral-7b":
        # 下载并加载原始输入ID数据文件
        filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_input_ids.pt", repo_type="dataset")
        original_input_ids = torch.load(filepath, map_location="cpu")
        # 将原始输入中的特殊标记 -200 替换为图像标记索引
        original_input_ids[original_input_ids == -200] = image_token_index
        # 解码并打印处理后的输入ID数据(排除特殊标记 -200)
        print(tokenizer.decode([id for id in original_input_ids.tolist()[0] if id != -200]))
        # 断言处理后的输入ID与模型输入ID相同
        assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()

    elif model_id == "liuhaotian/llava-v1.6-34b":
        # 下载并加载特定模型版本的原始输入ID数据文件
        filepath = hf_hub_download(
            repo_id="nielsr/test-image", filename="llava_1_6_34b_input_ids.pt", repo_type="dataset"
        )
        original_input_ids = torch.load(filepath, map_location="cpu")
        # 将原始输入中的特殊标记 -200 替换为图像标记索引
        original_input_ids[original_input_ids == -200] = image_token_index
        # 断言处理后的输入ID与模型输入ID相同
        assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()

    # 断言图像尺寸与输入数据中的图像尺寸相同
    image_sizes = torch.tensor([[899, 1024]])
    assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist()

    # 执行单次前向传播验证
    print("Single forward pass")
    # 进入推断模式,此模式下不会进行梯度计算
    with torch.inference_mode():
        # 将输入数据移到指定设备上
        inputs = inputs.to(device)
        # 使用模型进行推断,获取输出结果
        outputs = model(**inputs)
        # 打印输出 logits 的形状
        print("Shape of logits:", outputs.logits.shape)
        # 打印 logits 的前几个值
        print("First values of logits:", outputs.logits[0, :3, :3])

        # 根据不同的模型 ID 设置预期的输出切片
        if model_id == "liuhaotian/llava-v1.6-mistral-7b":
            expected_slice = torch.tensor(
                [[-4.8555, -4.6992, -0.1996], [-10.5703, -10.7344, -2.7246], [-7.0391, -7.3672, -0.2634]],
                dtype=torch.float32,
                device=device,
            )
        elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
            expected_slice = torch.tensor(
                [[1.4883, 0.9976, -0.6992], [-9.7031, -5.7031, -1.5557], [-5.1328, -5.5586, 8.8281]],
                dtype=torch.float32,
                device=device,
            )
        elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
            expected_slice = torch.tensor(
                [[-0.9614, 7.3125, 0.2106], [-7.2695, -8.5469, 3.6211], [-6.3750, -8.1875, 5.4688]],
                dtype=torch.float32,
                device=device,
            )
        elif model_id == "liuhaotian/llava-v1.6-34b":
            expected_slice = torch.tensor(
                [[-9.0859, -9.1406, 5.9453], [-5.9570, -5.9766, 2.2754], [-5.7305, -5.7539, 4.0000]],
                dtype=torch.float32,
                device=device,
            )
        else:
            # 如果模型 ID 不在预期范围内,抛出异常
            raise ValueError(f"Model {model_id} not supported")

        # 断言实际输出的 logits 切片与预期的非常接近
        assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)
        # 打印确认 logits 正确
        print("Logits are ok!")

    # 验证生成过程
    output_ids = model.generate(
        **inputs,
        max_new_tokens=100,
        use_cache=True,
    )

    # 解码生成的文本并去除特殊标记
    generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    # 打印生成的文本
    print("Generated text:", repr(generated_text))

    # 根据模型 ID 验证生成的文本是否符合预期
    if model_id == "liuhaotian/llava-v1.6-mistral-7b":
        expected_text = '[INST]  \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several axes labeled with different metrics or benchmarks, such as "MMM-Vet," "MMM-Bench," "LLaVA-Bench," "SLED-Bench," "'
    elif model_id == "liuhaotian/llava-v1.6-vicuna-7b":
        expected_text = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions. USER:  \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmarking study comparing the performance of various models or systems. It\'s a scatter plot with a circular layout, where each point represents a different model or system, and the axes represent different metrics or dimensions of comparison.\n\nThe metrics are likely related to machine learning or artificial intelligence performance, as indicated by the terms like "BLIP-2," "Instruct BLIP," "POE," "QWA," "V"""
    elif model_id == "liuhaotian/llava-v1.6-vicuna-13b":
        expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER:  \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM"
    elif model_id == "liuhaotian/llava-v1.6-34b":
        expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? <|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-"
    else:
        raise ValueError(f"Model {model_id} not supported")

    # 确保生成的文本与预期文本一致
    assert generated_text == expected_text
    # 打印确认信息
    print("Generated text is ok!")

    # 验证批量生成
    print("Batched generation...")
    # 指定图像 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 下载并打开图像
    cats_image = Image.open(requests.get(url, stream=True).raw)

    # 处理器接收图像和文本输入,并进行填充和张量化处理
    inputs = processor(
        images=[image, cats_image],  # 图像列表
        text=[prompt, "[INST] <image>\nHow many cats are there? [/INST]"],  # 文本列表
        padding=True,  # 是否填充
        return_tensors="pt",  # 返回 PyTorch 张量
    ).to(device)

    # 打印每个输入项的形状
    for k, v in inputs.items():
        print(k, v.shape)

    # 打印图像尺寸信息
    print("Image sizes:", inputs.image_sizes)

    # 确保图像尺寸相同,以确保批量生成正常工作
    inputs.image_sizes[1] = inputs.image_sizes[0]

    # 再次确认批量生成正在进行
    print("Batched generation...")
    # 使用模型生成输出序列,接收输入参数,并指定最大新增标记数为20,启用缓存
    output_ids = model.generate(
        **inputs,
        max_new_tokens=20,
        use_cache=True,
    )

    # 使用分词器批量解码生成的输出标识符序列,跳过特殊标记并返回文本输出列表
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # 打印生成的文本输出列表
    print(outputs)

    # 如果指定了 PyTorch 模型导出路径
    if pytorch_dump_folder_path is not None:
        # 打印保存模型和处理器的消息,并创建必要的文件夹(如果不存在)
        print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}")
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 将处理器保存到指定路径
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要推送到 Hub
    if push_to_hub:
        # 从模型 ID 中提取仓库 ID
        repo_id = model_id.split("/")[-1]
        # 推送模型到 Hub,命名规则为 llava-hf/{repo_id}-hf
        model.push_to_hub(f"llava-hf/{repo_id}-hf")
        # 推送处理器到 Hub,命名规则为 llava-hf/{repo_id}-hf
        processor.push_to_hub(f"llava-hf/{repo_id}-hf")
# 如果这个脚本被直接运行,执行以下操作
if __name__ == "__main__":
    # 创建一个参数解析器对象
    parser = argparse.ArgumentParser()
    
    # 添加一个命令行参数,用于指定模型的Hub位置以进行转换
    parser.add_argument(
        "--model_id",
        help="Hub location of the model to convert",
        default="liuhaotian/llava-v1.6-mistral-7b",
        choices=[
            "liuhaotian/llava-v1.6-mistral-7b",
            "liuhaotian/llava-v1.6-vicuna-7b",
            "liuhaotian/llava-v1.6-vicuna-13b",
            "liuhaotian/llava-v1.6-34b",
        ],
        required=False,
    )
    
    # 添加一个命令行参数,用于指定输出的PyTorch模型目录的路径
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    
    # 添加一个命令行参数,设置为True表示是否将转换后的模型推送到🤗 hub
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )
    
    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数 convert_llava_to_hf,传递解析后的命令行参数作为参数
    convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub)

.\models\llava_next\image_processing_llava_next.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for LLaVa-NeXT."""

import math
from typing import Dict, List, Optional, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
from ...image_transforms import (
    convert_to_rgb,
    get_resize_output_image_size,
    pad,
    resize,
    to_channel_dimension_format,
)
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging


logger = logging.get_logger(__name__)


if is_vision_available():
    from PIL import Image


def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
    """
    Divides an image into patches of a specified size.

    Args:
        image (`np.array`):
            The input image.
        patch_size (`int`):
            The size of each patch.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        list: A list of np.array representing the patches.
    """
    patches = []
    # 获取输入图像的高度和宽度
    height, width = get_image_size(image, channel_dim=input_data_format)
    # 循环遍历图像并划分成指定大小的补丁
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            if input_data_format == ChannelDimension.LAST:
                # 如果通道维度在最后,则按照 (行, 列) 的方式提取补丁
                patch = image[i : i + patch_size, j : j + patch_size]
            else:
                # 否则按照 (通道, 行, 列) 的方式提取补丁
                patch = image[:, i : i + patch_size, j : j + patch_size]
            patches.append(patch)

    return patches
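
# ---- 补充示例(非原始源码): 按照上面 channels-last 分支的思路, 用纯 numpy 把一张图切成小块 ----
# 假设输入是 (H, W, C) 布局, 且尺寸恰好能被 patch_size 整除。
import numpy as np

image = np.zeros((672, 672, 3))
patch_size = 336
patches = [
    image[i : i + patch_size, j : j + patch_size]
    for i in range(0, image.shape[0], patch_size)
    for j in range(0, image.shape[1], patch_size)
]
print(len(patches), patches[0].shape)  # 4 (336, 336, 3)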


def expand_to_square(image: np.array, background_color, input_data_format) -> np.array:
    """
    Expands an image to a square by adding a background color.

    Args:
        image (`np.array`):
            The input image to be expanded.
        background_color:
            The color value to be used for the background.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        np.array: The expanded square image.
    """
    # 获取输入图像的高度和宽度
    height, width = get_image_size(image, channel_dim=input_data_format)
    # 如果图像已经是正方形,则直接返回
    if width == height:
        return image
    elif width > height:
        # 如果宽度大于高度,需要在顶部和底部添加背景颜色,使其变为正方形
        result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
        result[(width - height) // 2 : (width - height) // 2 + height, :] = image
        return result
    # 如果图像宽度小于高度,则在宽度方向上填充背景色,使其与高度相等的正方形图像
    else:
        # 创建一个与原图像相同高度和通道数的全 1 数组,并用背景色填充
        result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
        # 将原图像嵌入到正方形图像中心区域,保持其原有的宽度
        result[:, (height - width) // 2 : (height - width) // 2 + width] = image
        # 返回处理后的正方形图像
        return result
# 计算调整后图像的尺寸,使其符合目标分辨率
def _get_patch_output_size(image, target_resolution, input_data_format):
    # 获取原始图像的高度和宽度
    original_height, original_width = get_image_size(image, channel_dim=input_data_format)
    
    # 获取目标分辨率的高度和宽度
    target_height, target_width = target_resolution

    # 计算宽度和高度的缩放比例
    scale_w = target_width / original_width
    scale_h = target_height / original_height

    # 根据缩放比例确定新的宽度和高度
    if scale_w < scale_h:
        new_width = target_width
        new_height = min(math.ceil(original_height * scale_w), target_height)
    else:
        new_height = target_height
        new_width = min(math.ceil(original_width * scale_h), target_width)

    # 返回调整后的新高度和新宽度
    return new_height, new_width
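
# ---- 补充示例(非原始源码): 手算一遍 _get_patch_output_size 的等比缩放逻辑 ----
# 假设原图为 899 x 1024 (高 x 宽), 目标分辨率为 672 x 672。
import math

original_height, original_width = 899, 1024
target_height, target_width = 672, 672
scale_w = target_width / original_width     # 0.65625
scale_h = target_height / original_height   # 约 0.7475
if scale_w < scale_h:
    new_width = target_width
    new_height = min(math.ceil(original_height * scale_w), target_height)
else:
    new_height = target_height
    new_width = min(math.ceil(original_width * scale_h), target_width)
print(new_height, new_width)  # 590 672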


class LlavaNextImageProcessor(BaseImageProcessor):
    r"""
    构造一个LLaVa-NeXT图像处理器。基于`CLIPImageProcessor`,结合了处理高分辨率图像的额外技术,
    此技术详见[LLaVa论文](https://arxiv.org/abs/2310.03744)。

    """

    # 模型输入的名称列表
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        image_grid_pinpoints: List = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        
        # 如果未提供size,则默认使用{"shortest_edge": 224}
        size = size if size is not None else {"shortest_edge": 224}
        # 将size参数转换为标准化的尺寸字典,不默认使用方形
        size = get_size_dict(size, default_to_square=False)
        
        # 如果未提供image_grid_pinpoints,则使用默认值[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
        )
        
        # 如果未提供crop_size,则默认使用{"height": 224, "width": 224}
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        # 将crop_size参数转换为标准化的尺寸字典,默认使用方形
        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")

        # 设置对象的各种属性
        self.do_resize = do_resize
        self.size = size
        self.image_grid_pinpoints = image_grid_pinpoints
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

    # 从transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize复制而来,用于LLaVa
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
        resized to keep the input aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        default_to_square = True  # 默认按最短边缩放到指定尺寸,保持长宽比
        if "shortest_edge" in size:
            size = size["shortest_edge"]  # 将尺寸设置为最短边的长度
            default_to_square = False  # 不默认按正方形缩放
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])  # 将尺寸设置为给定的高度和宽度
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
            # 如果尺寸参数不包含最短边或者高度和宽度,则抛出数值错误异常

        output_size = get_resize_output_image_size(
            image,
            size=size,
            default_to_square=default_to_square,
            input_data_format=input_data_format,
        )
        # 调用函数计算输出图像的大小

        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
        # 调用自身方法进行图像缩放操作,并返回缩放后的图像

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: int = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess images with optional resizing, cropping, rescaling, and normalization.

        Args:
            images (`ImageInput`):
                Input images to preprocess.
            do_resize (`bool`, *optional*):
                Whether to perform resizing.
            size (`Dict[str, int]`, *optional*):
                Target size for resizing.
            resample (`PILImageResampling`, *optional*):
                Resampling filter to use for resizing.
            do_center_crop (`bool`, *optional*):
                Whether to perform center cropping.
            crop_size (`int`, *optional*):
                Size for center cropping.
            do_rescale (`bool`, *optional*):
                Whether to perform rescaling.
            rescale_factor (`float`, *optional*):
                Factor for rescaling.
            do_normalize (`bool`, *optional*):
                Whether to perform normalization.
            image_mean (`float` or `List[float]`, *optional*):
                Mean value(s) for normalization.
            image_std (`float` or `List[float]`, *optional*):
                Standard deviation value(s) for normalization.
            data_format (`ChannelDimension`, *optional*):
                Channel dimension format of the images.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Input channel dimension format.

        """
        # 该方法对单张图像依次执行可选的缩放、中心裁剪、重新缩放和归一化操作;其具体实现在本摘录中省略

    def _resize_for_patching(
        self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Resizes an image to a target resolution while maintaining aspect ratio.

        Args:
            image (np.array):
                The input image.
            target_resolution (tuple):
                The target resolution (height, width) of the image.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            np.array: The resized and padded image.
        """
        # Calculate the new height and width for resizing
        new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)

        # Resize the image using specified parameters
        resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)

        return resized_image

    def _pad_for_patching(
        self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Pad an image to a target resolution while maintaining aspect ratio.
        """
        # Extract target height and width from target resolution
        target_height, target_width = target_resolution
        
        # Calculate new output size for resizing
        new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)

        # Calculate paste positions for padding
        paste_x = (target_width - new_width) // 2
        paste_y = (target_height - new_height) // 2

        # Pad the image symmetrically
        padded_image = pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))

        return padded_image

    def get_image_patches(
        self,
        image: np.array,
        grid_pinpoints,
        size: tuple,
        patch_size: int,
        resample: PILImageResampling,
        data_format: ChannelDimension,
        input_data_format: ChannelDimension,
    ) -> List[np.array]:
        """
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image (np.array):
                The input image to be processed.
            grid_pinpoints (List):
                A string representation of a list of possible resolutions.
            size (`tuple`):
                Size to resize the original image to.
            patch_size (`int`):
                Size of the patches to divide the image into.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            data_format (`ChannelDimension` or `str`):
                The channel dimension format for the output image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            List[np.array]: A list of NumPy arrays containing the processed image patches.
        """
        # Check if grid_pinpoints is a list; raise error if not
        if not isinstance(grid_pinpoints, list):
            raise ValueError("grid_pinpoints must be a list of possible resolutions.")

        # Assign grid_pinpoints to possible_resolutions for clarity
        possible_resolutions = grid_pinpoints

        # Determine the size of the input image
        image_size = get_image_size(image, channel_dim=input_data_format)

        # Select the best resolution from possible_resolutions based on image_size
        best_resolution = select_best_resolution(image_size, possible_resolutions)

        # Resize the original image to best_resolution
        resized_image = self._resize_for_patching(
            image, best_resolution, resample=resample, input_data_format=input_data_format
        )

        # Pad the resized image for patching purposes
        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)

        # Divide the padded image into patches of size patch_size
        patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format)

        # Ensure all patches are in the desired output data format
        patches = [
            to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format)
            for patch in patches
        ]

        # Resize the original image to size specified by `size`
        resized_original_image = resize(
            image,
            size=size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )

        # Combine the resized original image and processed patches into image_patches list
        image_patches = [resized_original_image] + patches

        # Return the list of image patches
        return image_patches
    # 图像预处理方法,用于对输入图像进行多种处理操作
    def preprocess(
        self,
        # 输入图像,可以是单张图像或图像列表
        images: ImageInput,
        # 是否进行调整图像大小的操作,默认为 None
        do_resize: bool = None,
        # 调整后的目标大小,字典格式,包含宽度和高度信息
        size: Dict[str, int] = None,
        # 图像网格定位点列表,用于特定的图像处理任务
        image_grid_pinpoints: List = None,
        # 重采样方法,例如最近邻法、双线性插值等
        resample: PILImageResampling = None,
        # 是否进行中心裁剪操作,默认为 None
        do_center_crop: bool = None,
        # 裁剪后的目标大小
        crop_size: int = None,
        # 是否进行图像重新缩放操作,默认为 None
        do_rescale: bool = None,
        # 重新缩放因子,控制缩放的比例
        rescale_factor: float = None,
        # 是否进行图像标准化操作,默认为 None
        do_normalize: bool = None,
        # 图像的均值,可以是单一值或者通道均值列表
        image_mean: Optional[Union[float, List[float]]] = None,
        # 图像的标准差,可以是单一值或者通道标准差列表
        image_std: Optional[Union[float, List[float]]] = None,
        # 是否将图像转换为 RGB 格式,默认为 None
        do_convert_rgb: bool = None,
        # 返回的张量类型,例如字符串或张量类型
        return_tensors: Optional[Union[str, TensorType]] = None,
        # 数据格式,控制通道的位置顺序
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        # 输入数据的格式描述,可以是字符串或通道维度对象
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        # preprocess 的具体处理流程(逐张图像调用上述缩放、裁剪、归一化等步骤并打包返回)在本摘录中被截断省略

.\models\llava_next\modeling_llava_next.py

# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Llava-NeXT model."""

from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_llava_next import LlavaNextConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LlavaNextConfig"

LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "llava-hf/llava-v1.6-mistral-7b-hf",
    # See all LLaVA-NeXT models at https://huggingface.co/models?filter=llava_next
]

# 定义一个函数用于计算图像预处理后的图像补丁网格形状
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (width, height).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if not isinstance(grid_pinpoints, list):
        raise ValueError("grid_pinpoints should be a list of tuples or lists")

    # 从可能的分辨率中选择最佳的分辨率
    height, width = select_best_resolution(image_size, grid_pinpoints)
    # 计算图像补丁网格的形状
    return height // patch_size, width // patch_size
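
# ---- 补充示例(非原始源码): 网格形状就是最佳分辨率对 patch_size 的整除结果 ----
# 假设 select_best_resolution 选出的最佳分辨率为 672 x 1008, patch_size 为 336。
height, width = 672, 1008
patch_size = 336
print(height // patch_size, width // patch_size)  # 2 3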


# 定义一个函数用于解压经过填充和调整大小的图像的 PyTorch 张量
def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (`torch.Tensor`):
            The image tensor, assumed to be of shape (num_channels, height, width).
        original_size (`tuple`):
            The original size of the image (height, width).

    Returns:
        `torch.Tensor`: The unpadded image tensor.
    """
    original_height, original_width = original_size
    current_height, current_width = tensor.shape[1:]

    # 计算原始图像的宽高比
    original_aspect_ratio = original_width / original_height
    # 计算当前图像的宽高比
    current_aspect_ratio = current_width / current_height

    # 检查原始图像的宽高比是否大于当前图像的宽高比
    if original_aspect_ratio > current_aspect_ratio:
        # 如果是,按照宽度比例缩放当前图像,并计算新的高度
        scale_factor = current_width / original_width
        new_height = int(original_height * scale_factor)
        # 计算垂直方向上的填充量,使得缩放后的图像居中
        padding = (current_height - new_height) // 2
        # 在垂直方向上截取不带填充的部分
        unpadded_tensor = tensor[:, padding : current_height - padding, :]
    else:
        # 如果原始图像的宽高比小于等于当前图像的宽高比,按照高度比例缩放当前图像
        scale_factor = current_height / original_height
        new_width = int(original_width * scale_factor)
        # 计算水平方向上的填充量,使得缩放后的图像居中
        padding = (current_width - new_width) // 2
        # 在水平方向上截取不带填充的部分
        unpadded_tensor = tensor[:, :, padding : current_width - padding]

    # 返回经过处理的不带填充的部分图像张量
    return unpadded_tensor
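
# ---- 补充示例(非原始源码): 手算一遍 unpad_image 去除纵向填充的过程 ----
# 假设原图为 899 x 1024 (高 x 宽), 填充并缩放后的特征张量为 (3, 672, 672)。
import torch

tensor = torch.zeros(3, 672, 672)
original_height, original_width = 899, 1024
current_height, current_width = tensor.shape[1:]

if original_width / original_height > current_width / current_height:
    scale_factor = current_width / original_width             # 0.65625
    new_height = int(original_height * scale_factor)          # 589
    padding = (current_height - new_height) // 2              # 41
    unpadded = tensor[:, padding : current_height - padding, :]
else:
    unpadded = tensor
print(unpadded.shape)  # torch.Size([3, 590, 672])
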
@dataclass
# 定义了一个数据类,用于表示LlavaNext模型的因果语言模型输出及其历史信息
class LlavaNextCausalLMOutputWithPast(ModelOutput):
    """
    LlavaNext因果语言模型(或自回归模型)输出的基类。

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, 当提供`labels`时返回):
            语言建模损失(用于下一个标记预测)。
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            语言建模头的预测分数(SoftMax之前的每个词汇标记的分数)。
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, 当传递`use_cache=True`或`config.use_cache=True`时返回):
            长度为`config.n_layers`的元组,每个元组包含2个形状为`(batch_size, num_heads, sequence_length, embed_size_per_head)`的张量。

            包含预先计算的隐藏状态(注意力块中的键和值),可用于加速顺序解码。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当传递`output_hidden_states=True`或`config.output_hidden_states=True`时返回):
            元组`torch.FloatTensor`(如果模型有嵌入层则为一个,每个层的输出为一个)的形状为`(batch_size, sequence_length, hidden_size)`。

            模型每一层输出的隐藏状态加上可选的初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, 当传递`output_attentions=True`或`config.output_attentions=True`时返回):
            元组`torch.FloatTensor`(每个层一个)的形状为`(batch_size, num_heads, sequence_length, sequence_length)`。

            自注意力头中注意力softmax后的注意力权重,用于计算自注意力头中加权平均值。
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            元组`torch.FloatTensor`(图像嵌入的输出一个)的形状为`(batch_size, num_images, sequence_length, hidden_size)`。

            模型通过视觉编码器生成的图像隐藏状态,以及可选的感知器生成的图像隐藏状态。
    """
    
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


# 从transformers.models.llava.modeling_llava.LlavaMultiModalProjector复制并改为LlavaNext
class LlavaNextMultiModalProjector(nn.Module):
    # 初始化函数,用于创建一个新的神经网络模型对象
    def __init__(self, config: LlavaNextConfig):
        # 调用父类(nn.Module)的初始化方法
        super().__init__()

        # 创建一个线性层,将输入特征的大小映射到文本配置中隐藏层的大小
        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
        
        # 选择激活函数,根据配置文件中指定的激活函数类型从预定义的字典中选择
        self.act = ACT2FN[config.projector_hidden_act]
        
        # 创建第二个线性层,将第一个线性层的输出映射到文本配置中隐藏层的大小
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    # 前向传播函数,定义了数据从输入到输出的流动方式
    def forward(self, image_features):
        # 第一层线性变换,将输入特征映射到文本配置中隐藏层的大小
        hidden_states = self.linear_1(image_features)
        
        # 应用选定的激活函数到第一层的输出
        hidden_states = self.act(hidden_states)
        
        # 第二层线性变换,将第一层的输出映射到文本配置中隐藏层的大小
        hidden_states = self.linear_2(hidden_states)
        
        # 返回最终的隐藏状态作为输出
        return hidden_states
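
# ---- 补充示例(非原始源码): 用假设的维度走一遍多模态投影器的形状变化 ----
# 假设 vision_config.hidden_size = 1024, text_config.hidden_size = 4096, 每张图有 576 个 patch 特征。
import torch
from torch import nn

linear_1 = nn.Linear(1024, 4096, bias=True)
act = nn.GELU()
linear_2 = nn.Linear(4096, 4096, bias=True)

image_features = torch.randn(1, 576, 1024)          # (batch, num_patches, vision_hidden)
hidden_states = linear_2(act(linear_1(image_features)))
print(hidden_states.shape)                          # torch.Size([1, 576, 4096])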
# LLAVA_NEXT_START_DOCSTRING 变量,包含了关于 LLAVA-NeXT 模型的文档字符串,描述了其继承自 PreTrainedModel 的特性,
# 以及作为 PyTorch 的 nn.Module 的子类使用的相关信息和参数说明。

@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAVA_NEXT_START_DOCSTRING,
)
# 使用 add_start_docstrings 装饰器为 LlavaNextPreTrainedModel 类添加文档字符串,描述了其作为基础模型输出原始隐藏状态的特性,
# 并引用了 LLAVA_NEXT_START_DOCSTRING 中定义的模型配置和参数说明。

class LlavaNextPreTrainedModel(PreTrainedModel):
    # LlavaNextPreTrainedModel 类,继承自 PreTrainedModel 类。
    config_class = LlavaNextConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlavaNextVisionAttention"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        # _init_weights 方法用于初始化模型参数,根据模型配置设置不同类型模块的权重初始化方式。
        # 在此版本的 LlavaNext 中,仅支持推断和微调,不支持从头开始训练,因此移除了原始代码中的适用于从头训练的初始化权重代码。

        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )

        if hasattr(module, "class_embedding"):
            # 如果模块具有 class_embedding 属性,则对其进行正态分布初始化,均值为 0,标准差为 std。
            module.class_embedding.data.normal_(mean=0.0, std=std)

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # 如果模块是 nn.Linear 或 nn.Conv2d 类型,则对权重进行正态分布初始化,均值为 0,标准差为 std。
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                # 如果模块有偏置项,则将偏置项初始化为 0。
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            # 如果模块是 nn.Embedding 类型,则对权重进行正态分布初始化,均值为 0,标准差为 std。
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                # 如果模块有 padding_idx,则将该位置的权重初始化为 0。
                module.weight.data[module.padding_idx].zero_()

    @property
    def _supports_sdpa(self):
        """
        Retrieve language_model's attribute to check whether the model supports
        SDPA or not.
        """
        # _supports_sdpa 属性,用于检查语言模型是否支持 SDPA(Scaled Dot-Product Attention,缩放点积注意力)。
        return self.language_model._supports_sdpa


LLAVA_NEXT_INPUTS_DOCSTRING = r"""
"""
# LLAVA_NEXT_INPUTS_DOCSTRING 变量,目前为空字符串,用于定义 LLAVA-NeXT 模型的输入文档字符串。

@add_start_docstrings(
    """The LLAVA-NeXT model which consists of a vision backbone and a language model.""",
    LLAVA_NEXT_START_DOCSTRING,
)
# 使用 add_start_docstrings 装饰器为 LLAVA-NeXT 模型添加文档字符串,描述了该模型由视觉骨干和语言模型组成的特性,
# 并引用了 LLAVA_NEXT_START_DOCSTRING 中定义的模型配置和参数说明。
    # 继承自 LlavaNextPreTrainedModel 类的条件生成模型,用于生成下一步预测的输出
    class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
        def __init__(self, config: LlavaNextConfig):
            # 调用父类的初始化方法,传入配置对象
            super().__init__(config)
            # 根据视觉配置创建自动模型对象
            self.vision_tower = AutoModel.from_config(config.vision_config)

            # 创建多模态投影器对象
            self.multi_modal_projector = LlavaNextMultiModalProjector(config)

            # 创建可学习的 image_newline 参数:在把图像特征展平拼接进文本序列时,用作分隔图像行的特殊嵌入
            self.image_newline = nn.Parameter(torch.empty(config.text_config.hidden_size, dtype=self.dtype))

            # 获取文本配置中的词汇表大小
            self.vocab_size = config.text_config.vocab_size

            # 根据文本配置创建自动语言模型对象,支持因果语言模型
            self.language_model = AutoModelForCausalLM.from_config(
                config.text_config, attn_implementation=config._attn_implementation
            )

            # 设置填充标记 ID,如果配置中未指定填充标记 ID,则设置为 -1
            self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1

            # 执行初始化后的处理逻辑
            self.post_init()
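
__init__ 中各组件的组织方式可以用一个小规模配置直观验证。下面是一个示意(各超参数只是为了构造一个可实例化的小模型而取的假设值,并非官方配置;实际使用时应通过 from_pretrained 加载预训练权重):

```python
from transformers import (
    CLIPVisionConfig,
    LlamaConfig,
    LlavaNextConfig,
    LlavaNextForConditionalGeneration,
)

# 构造一个很小的视觉配置与文本配置,仅用于演示模型结构
vision_cfg = CLIPVisionConfig(hidden_size=64, intermediate_size=128, num_hidden_layers=2,
                              num_attention_heads=4, image_size=64, patch_size=16)
text_cfg = LlamaConfig(hidden_size=64, intermediate_size=128, num_hidden_layers=2,
                       num_attention_heads=4, vocab_size=1000)
config = LlavaNextConfig(vision_config=vision_cfg, text_config=text_cfg)

model = LlavaNextForConditionalGeneration(config)
print(type(model.vision_tower).__name__)      # 视觉骨干,例如 CLIPVisionModel
print(type(model.language_model).__name__)    # 因果语言模型,例如 LlamaForCausalLM
print(model.image_newline.shape)              # torch.Size([64]),与 text_config.hidden_size 一致
```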

        # 从语言模型中获取输入嵌入层
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings
        def get_input_embeddings(self):
            return self.language_model.get_input_embeddings()

        # 设置语言模型的输入嵌入层
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings
        def set_input_embeddings(self, value):
            self.language_model.set_input_embeddings(value)

        # 从语言模型中获取输出嵌入层
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings
        def get_output_embeddings(self):
            return self.language_model.get_output_embeddings()

        # 设置语言模型的输出嵌入层
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings
        def set_output_embeddings(self, new_embeddings):
            self.language_model.set_output_embeddings(new_embeddings)

        # 设置语言模型的解码器
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder
        def set_decoder(self, decoder):
            self.language_model.set_decoder(decoder)

        # 从语言模型中获取解码器
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder
        def get_decoder(self):
            return self.language_model.get_decoder()

        # 绑定语言模型的权重
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights
        def tie_weights(self):
            return self.language_model.tie_weights()

        # 调整语言模型的标记嵌入大小
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings
        def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
            model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
            # 更新配置中的词汇表大小
            self.config.text_config.vocab_size = model_embeds.num_embeddings
            self.vocab_size = model_embeds.num_embeddings
            return model_embeds

        # 合并输入标记 ID 与图像特征的处理逻辑,支持多模态输入
        # 复制自 transformers.models.llava.modeling_llava.LlavaForConditionalGeneration._merge_input_ids_with_image_features
        @add_start_docstrings_to_model_forward(LLAVA_NEXT_INPUTS_DOCSTRING)
        @replace_return_docstrings(output_type=LlavaNextCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # 定义模型的前向传播方法,接受多个输入参数
    def forward(
        self,
        input_ids: torch.LongTensor = None,  # 输入的token ID张量,用于输入模型的文本序列
        pixel_values: torch.FloatTensor = None,  # 输入的像素值张量,用于输入模型的图像特征
        image_sizes: Optional[torch.LongTensor] = None,  # 可选的图像尺寸张量,用于指定输入图像的尺寸
        attention_mask: Optional[torch.Tensor] = None,  # 可选的注意力掩码张量,用于指定模型关注的位置
        position_ids: Optional[torch.LongTensor] = None,  # 可选的位置 ID 张量,用于指定输入的位置信息
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 可选的过去键值张量列表,用于缓存先前计算的键值信息
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 可选的嵌入输入张量,用于直接提供嵌入的输入
        vision_feature_layer: Optional[int] = None,  # 可选的视觉特征层索引,用于指定从哪个视觉特征层提取特征
        vision_feature_select_strategy: Optional[str] = None,  # 可选的视觉特征选择策略,用于控制视觉特征的选择方式
        labels: Optional[torch.LongTensor] = None,  # 可选的标签张量,用于计算模型的损失
        use_cache: Optional[bool] = None,  # 可选的缓存使用标志,用于控制是否使用缓存
        output_attentions: Optional[bool] = None,  # 可选的输出注意力张量标志,用于控制是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 可选的输出隐藏状态标志,用于控制是否输出中间层的隐藏状态
        return_dict: Optional[bool] = None,  # 可选的返回字典标志,用于控制是否返回字典形式的输出
    ):
        pass  # 前向传播的具体实现此处从略,上方仅标注各参数的含义与类型

    # 定义生成过程的输入准备方法,接受多个输入参数和额外关键字参数
    def prepare_inputs_for_generation(
        self,
        input_ids,  # 输入的token ID张量,用于生成过程中的输入文本序列
        past_key_values=None,  # 可选的过去键值张量,用于缓存先前计算的键值信息
        inputs_embeds=None,  # 可选的嵌入输入张量,用于直接提供嵌入的输入
        pixel_values=None,  # 可选的像素值张量,用于生成过程中的输入图像特征
        image_sizes=None,  # 可选的图像尺寸张量,用于指定生成过程中输入图像的尺寸
        attention_mask=None,  # 可选的注意力掩码张量,用于指定生成过程中模型关注的位置
        **kwargs,  # 其余的关键字参数,用于兼容可能添加的未列出参数
    ):
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
            elif self.config.image_token_index in input_ids:
                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
            # older attention values, as their corresponding values are not part of the input.
            if cache_length < past_length and attention_mask is not None:
                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "pixel_values": pixel_values,
                "image_sizes": image_sizes,
            }
        )
        return model_inputs

    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration._reorder_cache
    def _reorder_cache(self, *args, **kwargs):
        return self.language_model._reorder_cache(*args, **kwargs)
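
prepare_inputs_for_generation 中根据 attention_mask 动态构造 position_ids 的技巧(先 cumsum 再把 padding 位置填 1)可以单独验证。下面是一个纯 PyTorch 的小例子,输入为假设的虚拟 batch:

```python
import torch

# 两条序列:第一条左侧有两个 padding(attention_mask 为 0)
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
# 有效 token 从位置 0 开始连续编号,padding 位置统一填 1(其值不参与注意力计算)
```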

.\models\llava_next\processing_llava_next.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LLaVa-NeXT.
"""


from typing import List, Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType


class LlavaNextProcessor(ProcessorMixin):
    r"""
    Constructs a LLaVa-NeXT processor which wraps a LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.

    [`LlavaNextProcessor`] offers all the functionalities of [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.

    Args:
        image_processor ([`LlavaNextImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "LlavaNextImageProcessor"
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ):
        """
        This method is the entry point for preprocessing textual and image inputs using the LLaVa-NeXT processor.

        Args:
            text (Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]):
                Input text or pre-tokenized text to be processed.
            images (ImageInput, optional):
                Input images to be processed.
            padding (Union[bool, str, PaddingStrategy], optional):
                Argument specifying if and how to pad the sequences.
            truncation (Union[bool, str, TruncationStrategy], optional):
                Argument specifying if and how to truncate sequences.
            max_length (int, optional):
                Maximum length of the sequences after tokenization.
            return_tensors (Optional[Union[str, TensorType]], optional):
                Desired framework tensors (PyTorch, TensorFlow) for the returned data.

        Returns:
            dict: A dictionary containing processed inputs suitable for model ingestion.
        """
        # Implementation of processing logic omitted for brevity
        pass

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.

        Returns:
            List[str]: Decoded texts corresponding to the input tokens or IDs.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        # 调用 LlamaTokenizerFast 的 decode 方法,将所有参数传递给它
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # 从 transformers.models.clip.processing_clip.CLIPProcessor.model_input_names 复制而来
    def model_input_names(self):
        # 获取 tokenizer 的模型输入名称列表
        tokenizer_input_names = self.tokenizer.model_input_names
        # 获取 image_processor 的模型输入名称列表
        image_processor_input_names = self.image_processor.model_input_names
        # 合并去重 tokenizer 和 image_processor 的模型输入名称,返回列表
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
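
下面给出该处理器的典型调用方式示意(checkpoint 名称以 Hub 上实际发布的为准,此处仅作示例;运行需要联网下载权重与图片):

```python
import requests
from PIL import Image
from transformers import LlavaNextProcessor

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

inputs = processor(text=prompt, images=image, return_tensors="pt")
print(sorted(inputs.keys()))
# 预期包含 attention_mask、image_sizes、input_ids、pixel_values 等键
```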

.\models\llava_next\__init__.py

# Copyright 2024 The HuggingFace Team. All rights reserved.
# 版权声明,版权归HuggingFace团队所有

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# 本文件采用Apache许可证2.0版授权,除非符合许可证要求,否则不得使用本文件

# You may obtain a copy of the License at
# 可以在以下网址获取许可证副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意,否则按"原样"分发本软件,无任何明示或暗示的保证或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 请参阅许可证了解具体的语言规定和限制

from typing import TYPE_CHECKING

# 从HuggingFace内部utils模块中导入OptionalDependencyNotAvailable和_LazyModule
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块导入结构
_import_structure = {
    "configuration_llava_next": ["LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaNextConfig"],
    "processing_llava_next": ["LlavaNextProcessor"],
}

# 检查是否Torch可用,若不可用则引发OptionalDependencyNotAvailable异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若Torch可用,则更新导入结构中的模型相关模块
    _import_structure["modeling_llava_next"] = [
        "LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "LlavaNextForConditionalGeneration",
        "LlavaNextPreTrainedModel",
    ]

# 检查是否Vision模块可用,若不可用则引发OptionalDependencyNotAvailable异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若Vision模块可用,则更新导入结构中的图像处理相关模块
    _import_structure["image_processing_llava_next"] = ["LlavaNextImageProcessor"]

# 如果是类型检查模式(如mypy),则导入相应的模块
if TYPE_CHECKING:
    from .configuration_llava_next import LLAVA_NEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaNextConfig
    from .processing_llava_next import LlavaNextProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_llava_next import (
            LLAVA_NEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
            LlavaNextForConditionalGeneration,
            LlavaNextPreTrainedModel,
        )

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_llava_next import LlavaNextImageProcessor

# 如果不是类型检查模式,则使用_LazyModule懒加载模块
else:
    import sys

    # 将当前模块注册为_LazyModule,以实现延迟加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
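
得益于上述懒加载机制,下面两种导入方式拿到的是同一个类;真正的模块文件只在首次访问属性时才被导入(示例,需要安装包含 llava_next 的 transformers 版本及相应依赖):

```python
import transformers
from transformers.models.llava_next import LlavaNextProcessor

# 顶层包与子包导出的对象一致,说明 _LazyModule 在属性访问时完成了真实导入
assert transformers.LlavaNextProcessor is LlavaNextProcessor
print(LlavaNextProcessor.__module__)  # transformers.models.llava_next.processing_llava_next
```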

.\models\longformer\configuration_longformer.py

# coding=utf-8
# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Longformer configuration"""

# 导入必要的库和模块
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Union

# 导入预训练配置类
from ...configuration_utils import PretrainedConfig
# 导入ONNX配置类
from ...onnx import OnnxConfig
# 导入TensorType和logging工具
from ...utils import TensorType, logging

# 检查类型,导入额外依赖
if TYPE_CHECKING:
    from ...onnx.config import PatchingSpec
    from ...tokenization_utils_base import PreTrainedTokenizerBase

# 获取日志记录器
logger = logging.get_logger(__name__)

# 预训练模型配置文件映射字典
LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/config.json",
    "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/config.json",
    "allenai/longformer-large-4096-finetuned-triviaqa": (
        "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/config.json"
    ),
    "allenai/longformer-base-4096-extra.pos.embd.only": (
        "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/config.json"
    ),
    "allenai/longformer-large-4096-extra.pos.embd.only": (
        "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/config.json"
    ),
}


class LongformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LongformerModel`] or a [`TFLongformerModel`]. It
    is used to instantiate a Longformer model according to the specified arguments, defining the model architecture.

    This is the configuration class to store the configuration of a [`LongformerModel`]. It is used to instantiate an
    Longformer model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the LongFormer
    [allenai/longformer-base-4096](https://huggingface.co/allenai/longformer-base-4096) architecture with a sequence
    length 4,096.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
            # 设置 Longformer 模型的词汇表大小
            vocab_size (`int`, *optional*, defaults to 30522):
                Vocabulary size of the Longformer model. Defines the number of different tokens that can be represented by
                the `inputs_ids` passed when calling [`LongformerModel`] or [`TFLongformerModel`].
            # 设置编码器层和池化层的隐藏单元数
            hidden_size (`int`, *optional*, defaults to 768):
                Dimensionality of the encoder layers and the pooler layer.
            # 设置 Transformer 编码器中的隐藏层数量
            num_hidden_layers (`int`, *optional*, defaults to 12):
                Number of hidden layers in the Transformer encoder.
            # 设置每个注意力层中的注意力头数
            num_attention_heads (`int`, *optional*, defaults to 12):
                Number of attention heads for each attention layer in the Transformer encoder.
            # 设置 Transformer 编码器中“中间”(通常称为前馈)层的维度
            intermediate_size (`int`, *optional*, defaults to 3072):
                Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
            # 设置编码器和池化器中的非线性激活函数
            hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
                The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
                `"relu"`, `"silu"` and `"gelu_new"` are supported.
            # 设置嵌入层、编码器和池化器中所有全连接层的 dropout 概率
            hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
            # 设置注意力概率的 dropout 比率
            attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the attention probabilities.
            # 设置模型可能使用的最大序列长度
            max_position_embeddings (`int`, *optional*, defaults to 512):
                The maximum sequence length that this model might ever be used with. Typically set this to something large
                just in case (e.g., 512 or 1024 or 2048).
            # 设置 `token_type_ids` 的词汇表大小
            type_vocab_size (`int`, *optional*, defaults to 2):
                The vocabulary size of the `token_type_ids` passed when calling [`LongformerModel`] or
                [`TFLongformerModel`].
            # 设置所有权重矩阵初始化时的截断正态分布的标准差
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            # 设置层归一化层使用的 epsilon 值
            layer_norm_eps (`float`, *optional*, defaults to 1e-12):
                The epsilon used by the layer normalization layers.
            # 设置每个标记周围的注意力窗口大小
            attention_window (`int` or `List[int]`, *optional*, defaults to 512):
                Size of an attention window around each token. If an `int`, use the same size for all layers. To specify a
                different window size for each layer, use a `List[int]` where `len(attention_window) == num_hidden_layers`.
    
        Example:
    
        ```
        >>> from transformers import LongformerConfig, LongformerModel
    
        >>> # Initializing a Longformer configuration
        >>> configuration = LongformerConfig()
    
        >>> # Initializing a model from the configuration
        >>> model = LongformerModel(configuration)
    
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
        
    """

    # 设置模型类型为 Longformer
    model_type = "longformer"
    def __init__(
        self,
        attention_window: Union[List[int], int] = 512,
        sep_token_id: int = 2,
        pad_token_id: int = 1,
        bos_token_id: int = 0,
        eos_token_id: int = 2,
        vocab_size: int = 30522,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 512,
        type_vocab_size: int = 2,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        onnx_export: bool = False,
        **kwargs,
    ):
        """
        构造函数,初始化 LongformerConfig 对象。

        参数:
        - attention_window: 注意力窗口大小,可以是整数或整数列表,默认为 512
        - sep_token_id: 分隔符 token 的 ID,默认为 2
        - pad_token_id: 填充 token 的 ID,默认为 1
        - bos_token_id: 文本开始 token 的 ID,默认为 0
        - eos_token_id: 文本结束 token 的 ID,默认为 2
        - vocab_size: 词汇表大小,默认为 30522
        - hidden_size: 隐藏层大小,默认为 768
        - num_hidden_layers: 隐藏层的数量,默认为 12
        - num_attention_heads: 注意力头的数量,默认为 12
        - intermediate_size: 中间层大小,默认为 3072
        - hidden_act: 隐藏层激活函数,默认为 "gelu"
        - hidden_dropout_prob: 隐藏层的 dropout 概率,默认为 0.1
        - attention_probs_dropout_prob: 注意力概率的 dropout 概率,默认为 0.1
        - max_position_embeddings: 最大位置嵌入大小,默认为 512
        - type_vocab_size: 类型词汇表的大小,默认为 2
        - initializer_range: 初始化范围,默认为 0.02
        - layer_norm_eps: 层归一化的 epsilon 值,默认为 1e-12
        - onnx_export: 是否导出到 ONNX 格式,默认为 False
        """
        # 调用父类构造函数,初始化基类的填充 token ID 和其他关键字参数
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # 将参数赋值给对象的属性
        self.attention_window = attention_window
        self.sep_token_id = sep_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.onnx_export = onnx_export
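
除了文档示例中用默认参数实例化之外,attention_window 也可以按层指定(列表长度必须等于 num_hidden_layers,且每个值为正偶数)。下面用一组小超参数示意,取值仅为演示:

```python
from transformers import LongformerConfig, LongformerModel

config = LongformerConfig(
    num_hidden_layers=4,
    hidden_size=64,
    num_attention_heads=4,
    intermediate_size=128,
    attention_window=[32, 32, 64, 64],   # 逐层窗口大小,len == num_hidden_layers
)
model = LongformerModel(config)
print(model.config.attention_window)     # [32, 32, 64, 64]
```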
# 定义一个继承自OnnxConfig的LongformerOnnxConfig类,用于配置Longformer模型的导出设置
class LongformerOnnxConfig(OnnxConfig):

    # 初始化方法,接收预训练配置、任务名称和补丁规格列表
    def __init__(self, config: "PretrainedConfig", task: str = "default", patching_specs: "List[PatchingSpec]" = None):
        # 调用父类的初始化方法
        super().__init__(config, task, patching_specs)
        # 设置onnx_export属性为True,表示要导出为ONNX格式
        config.onnx_export = True

    # inputs属性,返回一个有序字典,描述模型的输入格式
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # 如果任务是多选题(multiple-choice),动态轴设置为包含batch、choice、sequence
        if self.task == "multiple-choice":
            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            # 否则动态轴设置为包含batch、sequence
            dynamic_axis = {0: "batch", 1: "sequence"}
        return OrderedDict(
            [
                ("input_ids", dynamic_axis),                 # 输入的token IDs
                ("attention_mask", dynamic_axis),           # 注意力遮罩
                ("global_attention_mask", dynamic_axis),    # 全局注意力遮罩
            ]
        )

    # outputs属性,返回一个描述模型输出格式的字典
    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        # 调用父类的outputs方法获取输出字典
        outputs = super().outputs
        # 如果任务是默认任务(default),添加额外的汇聚输出(pooler_output)
        if self.task == "default":
            outputs["pooler_output"] = {0: "batch"}
        return outputs

    # atol_for_validation属性,返回模型转换验证时的绝对误差容差
    @property
    def atol_for_validation(self) -> float:
        """
        What absolute tolerance value to use during model conversion validation.

        Returns:
            Float absolute tolerance value.
        """
        return 1e-4

    # default_onnx_opset属性,返回模型导出时所需的默认ONNX操作集版本号
    @property
    def default_onnx_opset(self) -> int:
        # 需要>=14版本支持tril运算符
        return max(super().default_onnx_opset, 14)

    # generate_dummy_inputs方法,生成用于模型导出的虚拟输入数据
    def generate_dummy_inputs(
        self,
        tokenizer: "PreTrainedTokenizerBase",
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        # 调用父类的generate_dummy_inputs方法生成基础输入
        inputs = super().generate_dummy_inputs(
            preprocessor=tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )
        import torch

        # 设置全局注意力遮罩为与input_ids相同形状的全零张量
        inputs["global_attention_mask"] = torch.zeros_like(inputs["input_ids"])
        # 每隔一个token将全局注意力遮罩的相应位置设为1
        inputs["global_attention_mask"][:, ::2] = 1

        return inputs
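
generate_dummy_inputs 中构造 global_attention_mask 的方式(先全零,再每隔一个 token 置 1)可以单独演示如下(纯 PyTorch 示例,输入为虚拟数据):

```python
import torch

input_ids = torch.ones((2, 8), dtype=torch.long)     # 假设的虚拟 input_ids
global_attention_mask = torch.zeros_like(input_ids)  # 默认全部使用局部(滑窗)注意力
global_attention_mask[:, ::2] = 1                    # 每隔一个 token 赋予全局注意力

print(global_attention_mask[0])  # tensor([1, 0, 1, 0, 1, 0, 1, 0])
```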

.\models\longformer\convert_longformer_original_pytorch_lightning_to_pytorch.py

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert RoBERTa checkpoint."""


import argparse  # 导入 argparse 模块,用于解析命令行参数

import pytorch_lightning as pl  # 导入 PyTorch Lightning 库
import torch  # 导入 PyTorch 库
from torch import nn  # 从 PyTorch 导入神经网络模块

from transformers import LongformerForQuestionAnswering, LongformerModel  # 从 transformers 库导入 Longformer 模型和问答模型


class LightningModel(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model  # 初始化 Lightning 模型
        self.num_labels = 2  # 设置标签数量
        self.qa_outputs = nn.Linear(self.model.config.hidden_size, self.num_labels)  # 初始化问答输出层

    # implement only because lightning requires to do so
    def forward(self):
        pass


def convert_longformer_qa_checkpoint_to_pytorch(
    longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str
):
    # load longformer model from model identifier
    longformer = LongformerModel.from_pretrained(longformer_model)  # 加载预训练的 Longformer 模型
    lightning_model = LightningModel(longformer)  # 初始化 Lightning 模型

    ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu"))  # 加载 PyTorch Lightning 检查点
    lightning_model.load_state_dict(ckpt["state_dict"])  # 载入模型权重

    # init longformer question answering model
    longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model)  # 初始化 Longformer 问答模型

    # transfer weights
    longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict())  # 转移 Longformer 权重
    longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict())  # 转移问答输出层权重
    longformer_for_qa.eval()  # 设置为评估模式

    # save model
    longformer_for_qa.save_pretrained(pytorch_dump_folder_path)  # 保存 PyTorch 模型至指定路径

    print(f"Conversion successful. Model saved under {pytorch_dump_folder_path}")  # 打印成功转换信息


if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # 创建参数解析器
    # Required parameters
    parser.add_argument(
        "--longformer_model",
        default=None,
        type=str,
        required=True,
        help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.",
    )  # 添加 Longformer 模型标识参数
    parser.add_argument(
        "--longformer_question_answering_ckpt_path",
        default=None,
        type=str,
        required=True,
        help="Path the official PyTorch Lightning Checkpoint.",
    )  # 添加 PyTorch Lightning 检查点路径参数
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )  # 添加 PyTorch 模型保存路径参数
    args = parser.parse_args()  # 解析命令行参数
    # 调用函数 convert_longformer_qa_checkpoint_to_pytorch 将给定的 Longformer QA 模型的检查点文件转换为 PyTorch 格式
    convert_longformer_qa_checkpoint_to_pytorch(
        args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path
    )
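
该脚本通常从命令行运行并传入三个必需参数;也可以在 Python 中直接调用同名函数。下面的路径参数均为假设的示例值,需替换为本地实际文件,且环境中需安装 pytorch_lightning:

```python
from transformers.models.longformer.convert_longformer_original_pytorch_lightning_to_pytorch import (
    convert_longformer_qa_checkpoint_to_pytorch,
)

# 示例路径,仅作演示
convert_longformer_qa_checkpoint_to_pytorch(
    longformer_model="allenai/longformer-base-4096",
    longformer_question_answering_ckpt_path="./longformer_qa.ckpt",
    pytorch_dump_folder_path="./converted_longformer_qa",
)
```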

.\models\longformer\modeling_longformer.py

# coding=utf-8
# 版权归 The Allen Institute for AI team 和 The HuggingFace Inc. team 所有。
#
# 根据 Apache License, Version 2.0 授权使用本文件;
# 除非遵守许可证,否则不得使用此文件。
# 可在以下网址获取许可证副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则根据"现状"分发软件,
# 没有任何明示或暗示的担保或条件。
# 有关特定语言的权限,请参阅许可证。
"""PyTorch Longformer model."""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_longformer import LongformerConfig

# 获取记录器,用于记录日志
logger = logging.get_logger(__name__)

# 用于文档的检查点名称
_CHECKPOINT_FOR_DOC = "allenai/longformer-base-4096"
# 用于文档的配置名称
_CONFIG_FOR_DOC = "LongformerConfig"

# 预训练模型的存档列表
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "allenai/longformer-base-4096",
    "allenai/longformer-large-4096",
    "allenai/longformer-large-4096-finetuned-triviaqa",
    "allenai/longformer-base-4096-extra.pos.embd.only",
    "allenai/longformer-large-4096-extra.pos.embd.only",
    # 所有 Longformer 模型详见 https://huggingface.co/models?filter=longformer
]


@dataclass
class LongformerBaseModelOutput(ModelOutput):
    """
    Longformer 输出的基类,包含潜在的隐藏状态、本地和全局注意力。
    """
    # 字段 `last_hidden_state`:模型最后一层的隐藏状态,形状为 `(batch_size, sequence_length, hidden_size)` 的 `torch.FloatTensor`。
    last_hidden_state: torch.FloatTensor
    # 定义可选参数 `hidden_states`,当 `output_hidden_states=True` 时返回,表示模型每一层的隐藏状态的元组。
    # 每个元素是一个形状为 `(batch_size, sequence_length, hidden_size)` 的 `torch.FloatTensor`。
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


# 定义可选参数 `attentions`,当 `output_attentions=True` 时返回,表示模型每一层的本地注意力权重的元组。
# 每个元素是一个形状为 `(batch_size, num_heads, sequence_length, x + attention_window + 1)` 的 `torch.FloatTensor`。
# 这些是经过注意力 softmax 后的本地注意力权重,用于计算自注意力头中的加权平均值。
# 前 `x` 个值是对全局注意力掩码中的令牌的注意力权重,剩余的 `attention_window + 1` 个值是对注意力窗口中的令牌的注意力权重。
# 注意,前 `x` 个值指的是文本中固定位置的令牌的注意力权重,但剩余的 `attention_window + 1` 个值是相对位置的注意力权重。
# 如果注意力窗口包含具有全局注意力的令牌,则相应索引处的注意力权重设置为 0,其值应从第一个 `x` 个注意力权重中访问。
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

# 定义可选参数 `global_attentions`,当 `output_attentions=True` 时返回,表示模型每一层的全局注意力权重的元组。
# 每个元素是一个形状为 `(batch_size, num_heads, sequence_length, x)` 的 `torch.FloatTensor`。
# 这些是经过注意力 softmax 后的全局注意力权重,用于计算自注意力头中的加权平均值。
# 这些是从每个具有全局注意力的令牌到序列中每个令牌的注意力权重。
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

使用 @dataclass 装饰器声明一个数据类,用于表示带池化的 Longformer 模型的基本输出

@dataclass
class LongformerBaseModelOutputWithPooling(ModelOutput):
"""
Longformer 模型的输出基类,同时包含最后隐藏状态的池化结果。

"""

# 最后的隐藏状态,类型为 torch.FloatTensor
last_hidden_state: torch.FloatTensor
# 可选项:池化层的输出,类型为 torch.FloatTensor,默认为 None
pooler_output: torch.FloatTensor = None
# 可选项:隐藏状态的元组,包含多个 torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:注意力分布的元组,包含多个 torch.FloatTensor
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:全局注意力的元组,包含多个 torch.FloatTensor
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

使用 @dataclass 装饰器声明一个数据类,用于表示 Longformer 掩码语言模型的输出

@dataclass
class LongformerMaskedLMOutput(ModelOutput):
"""
Longformer 掩码语言模型输出的基类。

"""

# 可选项:损失值,类型为 torch.FloatTensor,默认为 None
loss: Optional[torch.FloatTensor] = None
# 可选项:预测的 logits,类型为 torch.FloatTensor
logits: torch.FloatTensor = None
# 可选项:隐藏状态的元组,包含多个 torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:注意力分布的元组,包含多个 torch.FloatTensor
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:全局注意力的元组,包含多个 torch.FloatTensor
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

使用 @dataclass 装饰器声明一个数据类,用于表示 Longformer 问答模型的输出

@dataclass
class LongformerQuestionAnsweringModelOutput(ModelOutput):
"""
Longformer 问答模型输出的基类。

"""

# 可选项:损失值,类型为 torch.FloatTensor,默认为 None
loss: Optional[torch.FloatTensor] = None
# 可选项:起始位置的 logits,类型为 torch.FloatTensor
start_logits: torch.FloatTensor = None
# 可选项:结束位置的 logits,类型为 torch.FloatTensor
end_logits: torch.FloatTensor = None
# 可选项:隐藏状态的元组,包含多个 torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:注意力分布的元组,包含多个 torch.FloatTensor
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:全局注意力的元组,包含多个 torch.FloatTensor
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

使用 @dataclass 装饰器声明一个数据类,用于表示 Longformer 序列分类模型的输出

@dataclass
class LongformerSequenceClassifierOutput(ModelOutput):
"""
Longformer 序列分类模型输出的基类。

"""

# 可选项:损失值,类型为 torch.FloatTensor,默认为 None
loss: Optional[torch.FloatTensor] = None
# 可选项:预测的 logits,类型为 torch.FloatTensor
logits: torch.FloatTensor = None
# 可选项:隐藏状态的元组,包含多个 torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:注意力分布的元组,包含多个 torch.FloatTensor
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:全局注意力的元组,包含多个 torch.FloatTensor
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            分类(或者当`config.num_labels==1`时为回归)的损失值。
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            分类(或者当`config.num_labels==1`时为回归)的分数(SoftMax 之前)。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            包含模型每一层输出的隐藏状态的元组,每个元素是一个 `torch.FloatTensor`,形状为 `(batch_size, sequence_length, hidden_size)`。

            模型每一层的输出的隐藏状态以及初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            包含局部注意力权重的元组,每个元素是一个 `torch.FloatTensor`,形状为 `(batch_size, num_heads, sequence_length, x + attention_window + 1)`,其中 `x` 是具有全局注意力掩码的令牌数量。

            局部注意力softmax后的权重,用于计算自注意力头中的加权平均值。这些是每个令牌到具有全局注意力的每个令牌(前 `x` 个值)和到注意力窗口中的每个令牌(剩余的 `attention_window + 1` 个值)的注意力权重。
            注意,前 `x` 个值指的是文本中具有固定位置的令牌,但剩余的 `attention_window + 1` 个值指的是具有相对位置的令牌:一个令牌到自身的注意力权重位于索引 `x + attention_window / 2` 处,前 `attention_window / 2`(后 `attention_window / 2`)个值是指前 `attention_window / 2`(后 `attention_window / 2`)个令牌的注意力权重。
            如果注意力窗口中包含一个具有全局注意力的令牌,则相应索引处的注意力权重设为0;这些值应从前 `x` 个注意力权重中获取。如果一个令牌具有全局注意力,则到`attentions`中的所有其他令牌的注意力权重为0,这些值应从`global_attentions`中获取。
        global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            包含全局注意力权重的元组,每个元素是一个 `torch.FloatTensor`,形状为 `(batch_size, num_heads, sequence_length, x)`,其中 `x` 是具有全局注意力掩码的令牌数量。

            全局注意力softmax后的权重,用于计算自注意力头中的加权平均值。这些是每个具有全局注意力的令牌到序列中的每个令牌的注意力权重。

@dataclass
class LongformerMultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice Longformer models.
"""

# 可选项:损失值,用于存储模型的损失值(浮点数张量)
loss: Optional[torch.FloatTensor] = None
# 输出:逻辑回归值,模型的逻辑回归输出(浮点数张量)
logits: torch.FloatTensor = None
# 可选项:隐藏状态,包含模型的隐藏状态的元组(浮点数张量的元组)
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:注意力分布,包含模型的注意力分布的元组(浮点数张量的元组)
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 可选项:全局注意力分布,包含模型的全局注意力分布的元组(浮点数张量的元组)
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

@dataclass
class LongformerTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
"""
# 定义 loss 变量,用于存储分类损失(如果提供标签的话)
loss: Optional[torch.FloatTensor] = None
# 定义一个变量 logits,类型为 torch 的 FloatTensor,初始值为 None
logits: torch.FloatTensor = None
# 定义一个变量 hidden_states,类型为一个可选的元组,元组内包含多个 torch 的 FloatTensor 对象,初始值为 None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 定义一个变量 attentions,类型为一个可选的元组,元组内包含多个 torch 的 FloatTensor 对象,初始值为 None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 定义一个变量 global_attentions,类型为一个可选的元组,元组内包含多个 torch 的 FloatTensor 对象,初始值为 None
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

计算输入中第一个 sep_token_id 的索引位置

def _get_question_end_index(input_ids, sep_token_id):
# 找到所有 sep_token_id 的索引位置
sep_token_indices = (input_ids == sep_token_id).nonzero()
batch_size = input_ids.shape[0]

# 断言确保 `input_ids` 是二维的
assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions"
# 断言确保每个样本中恰好有三个分隔符 `sep_token_id`
assert sep_token_indices.shape[0] == 3 * batch_size, (
    f"There should be exactly three separator tokens: {sep_token_id} in every sample for questions answering. You"
    " might also consider to set `global_attention_mask` manually in the forward function to avoid this error."
)
# 返回每个样本中第一个 `sep_token_id` 的索引
return sep_token_indices.view(batch_size, 3, 2)[:, 0, 1]

计算全局注意力掩码,根据 before_sep_token 决定在 sep_token_id 之前或之后放置注意力

def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
# 获取问题结束的索引位置
question_end_index = _get_question_end_index(input_ids, sep_token_id)
question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1

# 创建布尔类型的注意力掩码,全局注意力位置为 True
attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)
if before_sep_token is True:
    # 将小于 `question_end_index` 的位置设置为 True
    attention_mask = (attention_mask.expand_as(input_ids) < question_end_index).to(torch.bool)
else:
    # 如果不在 `before_sep_token` 模式下,将 `sep_token_id` 之后的位置设置为 True
    attention_mask = (attention_mask.expand_as(input_ids) > (question_end_index + 1)).to(torch.bool) * (
        attention_mask.expand_as(input_ids) < input_ids.shape[-1]
    ).to(torch.bool)

return attention_mask
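
把上面两个辅助函数串起来的效果可以用一个很小的例子复现:对于“问题 + 上下文”式的输入,第一个分隔符之前的问题部分获得全局注意力。下面的数值均为假设的演示输入:

```python
import torch

sep_token_id = 2
# 形如 <s> 问题 </s></s> 上下文 </s> 的编码结果,每条样本恰好包含三个分隔符
input_ids = torch.tensor([[0, 10, 11, 2, 2, 20, 21, 22, 2]])

sep_token_indices = (input_ids == sep_token_id).nonzero()
question_end_index = sep_token_indices.view(1, 3, 2)[:, 0, 1].unsqueeze(1)  # 第一个 </s> 的位置

positions = torch.arange(input_ids.shape[1], device=input_ids.device)
global_attention_mask = (positions.expand_as(input_ids) < question_end_index).to(torch.bool)

print(global_attention_mask.long())
# tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0]]) —— 问题部分的 token 获得全局注意力
```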

根据输入的 input_idspadding_idx 创建位置编号

def create_position_ids_from_input_ids(input_ids, padding_idx):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's utils.make_positions.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
"""
# 创建掩码,标识非填充符号的位置
mask = input_ids.ne(padding_idx).int()
# 计算递增的位置编号,从 `padding_idx+1` 开始
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
return incremental_indices.long() + padding_idx
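
create_position_ids_from_input_ids 的效果可以用一个小例子直观看到:非 padding 位置从 padding_idx+1 开始连续编号,padding 位置保持为 padding_idx(示例输入为假设值):

```python
import torch

padding_idx = 1
input_ids = torch.tensor([[5, 6, 7, 1, 1],
                          [8, 9, 1, 1, 1]])

mask = input_ids.ne(padding_idx).int()
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
position_ids = incremental_indices.long() + padding_idx

print(position_ids)
# tensor([[2, 3, 4, 1, 1],
#         [2, 3, 1, 1, 1]])
```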

class LongformerEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
# 初始化函数,接受配置对象 config
def __init__(self, config):
# 调用父类构造函数初始化
super().__init__()
# 创建词嵌入层,根据配置中的词汇大小(vocab_size)和隐藏大小(hidden_size),并设置填充 token 的索引
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 创建标记类型嵌入层,根据配置中的类型词汇大小(type_vocab_size)和隐藏大小(hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

    # 创建 LayerNorm 层,用于规范化隐藏状态向量,保持与 TensorFlow 模型变量名称的一致性,并能够加载任何 TensorFlow 检查点文件
    self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    # 创建 Dropout 层,用于在训练过程中随机丢弃部分隐藏状态向量,以防止过拟合
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 设置填充 token 的索引,以便后续使用
    self.padding_idx = config.pad_token_id
    # 创建位置嵌入层,根据配置中的最大位置嵌入数(max_position_embeddings)和隐藏大小(hidden_size),并设置填充 token 的索引
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
    )

# 前向传播函数,接受输入参数 input_ids、token_type_ids、position_ids 和 inputs_embeds
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
    # 如果未提供 position_ids 参数
    if position_ids is None:
        # 如果提供了 input_ids 参数
        if input_ids is not None:
            # 根据 input_ids 创建 position_ids,保持任何填充的 token 仍然是填充状态
            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
        else:
            # 否则,从 inputs_embeds 创建 position_ids
            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

    # 如果提供了 input_ids 参数
    if input_ids is not None:
        # 获取 input_ids 的形状
        input_shape = input_ids.size()
    else:
        # 否则,获取 inputs_embeds 的形状,去掉最后一维(即序列长度维度)
        input_shape = inputs_embeds.size()[:-1]

    # 如果未提供 token_type_ids 参数,则创建全零的 token_type_ids,形状与 input_shape 相同
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=position_ids.device)

    # 如果未提供 inputs_embeds 参数,则根据 input_ids 获取对应的词嵌入向量
    if inputs_embeds is None:
        inputs_embeds = self.word_embeddings(input_ids)
    # 根据 position_ids 获取对应的位置嵌入向量
    position_embeddings = self.position_embeddings(position_ids)
    # 根据 token_type_ids 获取对应的标记类型嵌入向量
    token_type_embeddings = self.token_type_embeddings(token_type_ids)

    # 将词嵌入向量、位置嵌入向量和标记类型嵌入向量相加得到最终的嵌入表示
    embeddings = inputs_embeds + position_embeddings + token_type_embeddings
    # 对嵌入向量进行 LayerNorm 规范化
    embeddings = self.LayerNorm(embeddings)
    # 对规范化后的向量进行 Dropout 操作
    embeddings = self.dropout(embeddings)
    # 返回嵌入向量作为模型的输出
    return embeddings

# 根据 inputs_embeds 参数创建位置 ids 的函数
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
    """
    We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

    Args:
        inputs_embeds: torch.Tensor inputs_embeds:

    Returns: torch.Tensor
    """
    # 获取 inputs_embeds 的形状,去掉最后一维(即序列长度维度)
    input_shape = inputs_embeds.size()[:-1]
    # 获取序列长度
    sequence_length = input_shape[1]

    # 生成从 padding_idx + 1 开始,到 padding_idx + 1 + sequence_length 的序列作为位置 ids
    position_ids = torch.arange(
        self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
    )
    # 将位置 ids 扩展为与 input_shape 相同的形状,并返回
    return position_ids.unsqueeze(0).expand(input_shape)
# LongformerSelfAttention 类的定义,继承自 nn.Module
class LongformerSelfAttention(nn.Module):
    def __init__(self, config, layer_id):
        super().__init__()
        # 检查隐藏大小是否是注意力头数的倍数
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        # 设置注意力头数和头部维度
        self.num_heads = config.num_attention_heads
        self.head_dim = int(config.hidden_size / config.num_attention_heads)
        self.embed_dim = config.hidden_size

        # 为查询、键和值分别创建线性层
        self.query = nn.Linear(config.hidden_size, self.embed_dim)
        self.key = nn.Linear(config.hidden_size, self.embed_dim)
        self.value = nn.Linear(config.hidden_size, self.embed_dim)

        # 为具有全局注意力的令牌单独创建投影层
        self.query_global = nn.Linear(config.hidden_size, self.embed_dim)
        self.key_global = nn.Linear(config.hidden_size, self.embed_dim)
        self.value_global = nn.Linear(config.hidden_size, self.embed_dim)

        # 设置注意力概率的 dropout 率
        self.dropout = config.attention_probs_dropout_prob

        self.layer_id = layer_id
        attention_window = config.attention_window[self.layer_id]
        # 确保 attention_window 是偶数
        assert (
            attention_window % 2 == 0
        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
        # 确保 attention_window 是正数
        assert (
            attention_window > 0
        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"

        # 设置单侧注意力窗口大小
        self.one_sided_attn_window_size = attention_window // 2

        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        is_index_masked=None,
        is_index_global_attn=None,
        is_global_attn=None,
        output_attentions=False,
    ):
        # _pad_and_transpose_last_two_dims 方法:填充并转置最后两个维度的静态方法
        def _pad_and_transpose_last_two_dims(hidden_states_padded, padding):
            """pads rows and then flips rows and columns"""
            hidden_states_padded = nn.functional.pad(
                hidden_states_padded, padding
            )  # padding value is not important because it will be overwritten
            hidden_states_padded = hidden_states_padded.view(
                *hidden_states_padded.size()[:-2], hidden_states_padded.size(-1), hidden_states_padded.size(-2)
            )
            return hidden_states_padded

        @staticmethod
def _pad_and_diagonalize(chunked_hidden_states):
    """
    shift every row 1 step right, converting columns into diagonals.

    Example:

    ```
    chunked_hidden_states: [
        0.4983,
        2.6918,
        -0.0071,
        1.0492,
        -1.8348,
        0.7672,
        0.2986,
        0.0285,
        -0.7584,
        0.4206,
        -0.0405,
        0.1599,
        2.0514,
        -1.1600,
        0.5372,
        0.2629,
    ]
    window_overlap = num_rows = 4
    ```

                 (pad & diagonalize) => [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
                   0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, 0.0000, -0.7584, 0.4206,
                   -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
    """
    # 获取输入张量的维度信息
    total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
    # 在最后一维度上进行填充,增加 window_overlap + 1 个位置的填充
    chunked_hidden_states = nn.functional.pad(
        chunked_hidden_states, (0, window_overlap + 1)
    )  # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
    # 重新调整张量形状,将多维度张量转换为二维张量
    chunked_hidden_states = chunked_hidden_states.view(
        total_num_heads, num_chunks, -1
    )  # total_num_heads x num_chunks x window_overlap*window_overlap+window_overlap
    # 去除最后一个维度上的部分数据,保留前面的数据
    chunked_hidden_states = chunked_hidden_states[
        :, :, :-window_overlap
    ]  # total_num_heads x num_chunks x window_overlap*window_overlap
    # 将二维张量转换为四维张量,重塑成对角化矩阵的形式
    chunked_hidden_states = chunked_hidden_states.view(
        total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim
    )
    # 去除最后一个维度上多余的数据,保留有效的部分
    chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
    # 返回处理后的张量结果
    return chunked_hidden_states

@staticmethod
def _chunk(hidden_states, window_overlap, onnx_export: bool = False):
    """将隐藏状态转换为重叠的块。块大小 = 2w,重叠大小 = w"""
    if not onnx_export:
        # 对于非导出到ONNX的情况,生成大小为2w的非重叠块
        hidden_states = hidden_states.view(
            hidden_states.size(0),
            torch.div(hidden_states.size(1), (window_overlap * 2), rounding_mode="trunc"),
            window_overlap * 2,
            hidden_states.size(2),
        )
        # 使用 `as_strided` 方法使块之间重叠,重叠大小为 window_overlap
        chunk_size = list(hidden_states.size())
        chunk_size[1] = chunk_size[1] * 2 - 1

        chunk_stride = list(hidden_states.stride())
        chunk_stride[1] = chunk_stride[1] // 2
        return hidden_states.as_strided(size=chunk_size, stride=chunk_stride)

    # 导出到ONNX时,使用以下逻辑,因为 `as_strided`、`unfold` 和二维张量索引在ONNX导出中不受支持(尚未支持)
    # 当 `unfold` 支持后,使用以下方式进行替换:
    # > return hidden_states.unfold(dimension=1, size=window_overlap * 2, step=window_overlap).transpose(2, 3)
    # 如果 hidden_states.size(1) == window_overlap * 2,则可以简单地返回 hidden_states.unsqueeze(1),但这需要控制流

    chunk_size = [
        hidden_states.size(0),
        torch.div(hidden_states.size(1), window_overlap, rounding_mode="trunc") - 1,
        window_overlap * 2,
        hidden_states.size(2),
    ]

    overlapping_chunks = torch.empty(chunk_size, device=hidden_states.device)
    for chunk in range(chunk_size[1]):
        overlapping_chunks[:, chunk, :, :] = hidden_states[
            :, chunk * window_overlap : chunk * window_overlap + 2 * window_overlap, :
        ]
    return overlapping_chunks
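
_chunk 生成的是步长为 w、长度为 2w 的重叠块;非 ONNX 分支里 as_strided 的写法与源码注释中提到的 unfold 等价。下面用序列长度 8、window_overlap=2 的小例子演示(纯 PyTorch 示意):

```python
import torch

window_overlap = 2
hidden_states = torch.arange(8.0).view(1, 8, 1)   # (batch=1, seq_len=8, hidden=1)

# 与 _chunk 非 ONNX 分支等价:取出步长为 w、长度为 2w 的重叠窗口
chunks = hidden_states.unfold(dimension=1, size=window_overlap * 2, step=window_overlap).transpose(2, 3)
print(chunks.shape)        # torch.Size([1, 3, 4, 1]),3 个相互重叠 w 的块
print(chunks.squeeze(-1))
# tensor([[[0., 1., 2., 3.],
#          [2., 3., 4., 5.],
#          [4., 5., 6., 7.]]])
```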

@staticmethod
def _mask_invalid_locations(input_tensor, affected_seq_len) -> torch.Tensor:
    # 创建一个影响序列长度为 affected_seq_len 的二维开始位置掩码
    beginning_mask_2d = input_tensor.new_ones(affected_seq_len, affected_seq_len + 1).tril().flip(dims=[0])
    beginning_mask = beginning_mask_2d[None, :, None, :]
    # 创建一个翻转的结束位置掩码,与开始位置掩码相反
    ending_mask = beginning_mask.flip(dims=(1, 3))

    # 对输入张量的开始位置进行掩码处理
    beginning_input = input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1]
    beginning_mask = beginning_mask.expand(beginning_input.size())
    input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1] = torch.full_like(
        beginning_input, -float("inf")
    ).where(beginning_mask.bool(), beginning_input)

    # 对输入张量的结束位置进行掩码处理
    ending_input = input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :]
    ending_mask = ending_mask.expand(ending_input.size())
    input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :] = torch.full_like(
        ending_input, -float("inf")
    ).where(ending_mask.bool(), ending_input)
def _sliding_chunks_matmul_attn_probs_value(
    self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
):
    """
    Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
    same shape as `attn_probs`
    """
    # 获取 value 张量的维度信息
    batch_size, seq_len, num_heads, head_dim = value.size()

    # 断言确保 seq_len 可以被 window_overlap*2 整除
    assert seq_len % (window_overlap * 2) == 0
    # 断言确保 attn_probs 的前三个维度与 value 的前三个维度相同
    assert attn_probs.size()[:3] == value.size()[:3]
    # 断言确保 attn_probs 的第四个维度等于 2*window_overlap + 1
    assert attn_probs.size(3) == 2 * window_overlap + 1
    
    # 计算 chunk 的数量,即将 seq_len 分成大小为 window_overlap 的 chunk 的数量
    chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1
    
    # 将 attn_probs 转置后重塑成形状为 (batch_size*num_heads, chunks_count, window_overlap, 2*window_overlap+1) 的张量
    chunked_attn_probs = attn_probs.transpose(1, 2).reshape(
        batch_size * num_heads,
        torch.div(seq_len, window_overlap, rounding_mode="trunc"),
        window_overlap,
        2 * window_overlap + 1,
    )

    # 将 value 转置后重塑成形状为 (batch_size*num_heads, seq_len, head_dim) 的张量
    value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)

    # 在序列的开头和结尾各填充 window_overlap 个值为 -1 的元素
    padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1)

    # 将 padded_value 切分成大小为 3*window_overlap 的 chunk,重叠部分为 window_overlap
    chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim)
    chunked_value_stride = padded_value.stride()
    chunked_value_stride = (
        chunked_value_stride[0],
        window_overlap * chunked_value_stride[1],
        chunked_value_stride[1],
        chunked_value_stride[2],
    )
    chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride)

    # 对 chunked_attn_probs 执行 _pad_and_diagonalize 操作
    chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)

    # 使用 Einstein Summation (einsum) 进行张量乘法操作,得到 context 张量
    context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value))
    
    # 调整 context 张量的形状,并将第二和第三维度交换位置
    return context.view(batch_size, num_heads, seq_len, head_dim).transpose(1, 2)
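
The `as_strided` call above carves overlapping windows of length `3 * window_overlap` out of the padded value tensor without copying. A small sketch with hypothetical sizes (one batch*head row, `seq_len=8`, `head_dim=2`, `window_overlap=2`):

```
import torch

window_overlap, seq_len, head_dim = 2, 8, 2
value = torch.arange(seq_len * head_dim, dtype=torch.float).reshape(1, seq_len, head_dim)

padded_value = torch.nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1)
chunks_count = seq_len // window_overlap - 1                       # 3
size = (1, chunks_count + 1, 3 * window_overlap, head_dim)
stride = padded_value.stride()
stride = (stride[0], window_overlap * stride[1], stride[1], stride[2])
chunked_value = padded_value.as_strided(size=size, stride=stride)

print(chunked_value.shape)  # torch.Size([1, 4, 6, 2]); consecutive chunks overlap by 2*window_overlap rows
```
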
def _get_global_attn_indices(is_index_global_attn):
    """计算全局注意力索引,在前向传递中需要的索引"""
    # 计算每个样本中全局注意力索引的数量
    num_global_attn_indices = is_index_global_attn.long().sum(dim=1)

    # 批次中全局注意力索引的最大数量
    max_num_global_attn_indices = num_global_attn_indices.max()

    # 全局注意力索引的位置
    is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True)

    # 辅助变量,表示是否是全局注意力的本地索引
    is_local_index_global_attn = torch.arange(
        max_num_global_attn_indices, device=is_index_global_attn.device
    ) < num_global_attn_indices.unsqueeze(dim=-1)

    # 全局注意力索引中非零值的位置
    is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True)

    # 全局注意力索引中零值(即填充值)的位置
    is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True)

    return (
        max_num_global_attn_indices,
        is_index_global_attn_nonzero,
        is_local_index_global_attn_nonzero,
        is_local_index_no_global_attn_nonzero,
    )
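
A minimal example (with hypothetical inputs) of what `_get_global_attn_indices` computes, for a batch of two sequences where the first has two global tokens and the second has one:

```
import torch

is_index_global_attn = torch.tensor(
    [[True, False, False, True, False],
     [True, False, False, False, False]]
)

num_global_attn_indices = is_index_global_attn.long().sum(dim=1)   # tensor([2, 1])
max_num_global_attn_indices = num_global_attn_indices.max()        # tensor(2)

is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True)
# (tensor([0, 0, 1]), tensor([0, 3, 0])): (batch, position) pairs of every global token

is_local_index_global_attn = (
    torch.arange(max_num_global_attn_indices) < num_global_attn_indices.unsqueeze(dim=-1)
)
# tensor([[ True,  True],
#         [ True, False]])  -> the second sample has one padded (unused) global slot
```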

def _concat_with_global_key_attn_probs(
    self,
    key_vectors,
    query_vectors,
    max_num_global_attn_indices,
    is_index_global_attn_nonzero,
    is_local_index_global_attn_nonzero,
    is_local_index_no_global_attn_nonzero,
):
    batch_size = key_vectors.shape[0]

    # 仅创建全局键向量
    key_vectors_only_global = key_vectors.new_zeros(
        batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
    )

    # 将全局注意力索引对应的键向量填充到新创建的张量中
    key_vectors_only_global[is_local_index_global_attn_nonzero] = key_vectors[is_index_global_attn_nonzero]

    # (batch_size, seq_len, num_heads, max_num_global_attn_indices)
    # 使用 Einstein Summation 计算全局键向量对应的注意力概率
    attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global))

    # 由于 ONNX 导出仅支持连续索引,需要进行转置操作
    attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)

    # 将填充位置的注意力概率置为一个很小的数,以便在处理中被忽略
    attn_probs_from_global_key[
        is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, :
    ] = torch.finfo(attn_probs_from_global_key.dtype).min

    # 再次进行转置,以便输出与原始格式匹配
    attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)

    return attn_probs_from_global_key
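
The einsum above scores every query token against only the (at most `max_num_global_attn_indices`) global key vectors. A shape-only sketch with illustrative sizes:

```
import torch

# batch=1, seq_len=6, num_heads=2, head_dim=4, max_num_global_attn_indices=2 (all hypothetical)
query_vectors = torch.randn(1, 6, 2, 4)
key_vectors_only_global = torch.randn(1, 2, 2, 4)

attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global))
print(attn_probs_from_global_key.shape)  # torch.Size([1, 6, 2, 2]): one score per (token, head, global token)
```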

def _compute_attn_output_with_global_indices(
    self,
    value_vectors,
    attn_probs,
    max_num_global_attn_indices,
    is_index_global_attn_nonzero,
    is_local_index_global_attn_nonzero,
    is_local_index_no_global_attn_nonzero,
):
    # 获取批量大小
    batch_size = attn_probs.shape[0]

    # 仅保留全局注意力的局部注意力概率
    attn_probs_only_global = attn_probs.narrow(-1, 0, max_num_global_attn_indices)
    
    # 仅获取全局注意力对应的数值向量
    value_vectors_only_global = value_vectors.new_zeros(
        batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
    )
    value_vectors_only_global[is_local_index_global_attn_nonzero] = value_vectors[is_index_global_attn_nonzero]

    # 使用 `matmul` 替代 `einsum`,因为在 fp16 下 `einsum` 有时会崩溃
    # 计算仅全局注意力的输出
    attn_output_only_global = torch.matmul(
        attn_probs_only_global.transpose(1, 2).clone(), value_vectors_only_global.transpose(1, 2).clone()
    ).transpose(1, 2)

    # 重塑非全局注意力的注意力概率
    attn_probs_without_global = attn_probs.narrow(
        -1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices
    ).contiguous()

    # 使用滑动窗口方法计算包含全局和非全局注意力的注意力输出
    attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
        attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
    )
    
    # 返回全局注意力输出与非全局注意力输出的总和
    return attn_output_only_global + attn_output_without_global

def _compute_global_attn_output_from_hidden(
    self,
    hidden_states,
    max_num_global_attn_indices,
    layer_head_mask,
    is_local_index_global_attn_nonzero,
    is_index_global_attn_nonzero,
    is_local_index_no_global_attn_nonzero,
    is_index_masked,

# Copied from transformers.models.bert.modeling_bert.BertSelfOutput

class LongformerSelfOutput(nn.Module):
def __init__(self, config):
    super().__init__()
    # Dense layer with input and output dimension config.hidden_size
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # LayerNorm over the hidden size, with eps taken from config.layer_norm_eps
    self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    # Dropout that randomly zeroes inputs with probability config.hidden_dropout_prob
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
    # Linear projection of hidden_states
    hidden_states = self.dense(hidden_states)
    # Apply dropout to the projected states
    hidden_states = self.dropout(hidden_states)
    # Residual connection with input_tensor followed by LayerNorm
    hidden_states = self.LayerNorm(hidden_states + input_tensor)
    # Return the normalized output
    return hidden_states

class LongformerAttention(nn.Module):
def __init__(self, config, layer_id=0):
    super().__init__()
    # Create the LongformerSelfAttention module for this layer
    self.self = LongformerSelfAttention(config, layer_id)
    # Create the LongformerSelfOutput module
    self.output = LongformerSelfOutput(config)
    # Set of attention heads that have already been pruned
    self.pruned_heads = set()

def prune_heads(self, heads):
    if len(heads) == 0:
        return
    # 根据给定的 heads 列表,寻找可剪枝的注意力头和对应的索引
    heads, index = find_pruneable_heads_and_indices(
        heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
    )

    # 对 self.self 中的 query、key、value 线性层进行剪枝
    self.self.query = prune_linear_layer(self.self.query, index)
    self.self.key = prune_linear_layer(self.self.key, index)
    self.self.value = prune_linear_layer(self.self.value, index)
    # 对 self.output 中的 dense 线性层进行剪枝,dim=1 表示在第一个维度上进行剪枝
    self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

    # 更新超参数并存储被剪枝的头
    self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
    self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
    self.pruned_heads = self.pruned_heads.union(heads)

def forward(
    self,
    hidden_states,
    attention_mask=None,
    layer_head_mask=None,
    is_index_masked=None,
    is_index_global_attn=None,
    is_global_attn=None,
    output_attentions=False,
):
    # 调用 self.self 的 forward 方法,传入相应参数,获取自注意力机制的输出
    self_outputs = self.self(
        hidden_states,
        attention_mask=attention_mask,
        layer_head_mask=layer_head_mask,
        is_index_masked=is_index_masked,
        is_index_global_attn=is_index_global_attn,
        is_global_attn=is_global_attn,
        output_attentions=output_attentions,
    )
    # 将 self_outputs[0] 和 hidden_states 作为输入,调用 self.output 进行后续处理
    attn_output = self.output(self_outputs[0], hidden_states)
    # 返回 attn_output 和 self_outputs 的其余部分作为输出
    outputs = (attn_output,) + self_outputs[1:]
    return outputs

# Copied from transformers.models.bert.modeling_bert.BertIntermediate

class LongformerIntermediate(nn.Module):
def __init__(self, config):
    super().__init__()
    # Dense layer mapping config.hidden_size to config.intermediate_size
    self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
    # If config.hidden_act is a string, look up the activation in ACT2FN; otherwise use it directly
    if isinstance(config.hidden_act, str):
        self.intermediate_act_fn = ACT2FN[config.hidden_act]
    else:
        self.intermediate_act_fn = config.hidden_act

# Forward pass: takes the hidden states and returns the transformed tensor
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # Project the hidden states with the dense layer
    hidden_states = self.dense(hidden_states)
    # Apply the intermediate activation function
    hidden_states = self.intermediate_act_fn(hidden_states)
    # Return the transformed hidden states
    return hidden_states

# Copied from transformers.models.bert.modeling_bert.BertOutput

class LongformerOutput(nn.Module):
def __init__(self, config):
    super().__init__()
    # Linear layer projecting from config.intermediate_size back to config.hidden_size
    self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
    # LayerNorm applied to the hidden states
    self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    # Dropout that randomly drops elements of the hidden states to reduce overfitting
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

# Forward pass: takes two tensors and returns one tensor
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
    # Apply the linear projection
    hidden_states = self.dense(hidden_states)
    # Apply dropout to the projection
    hidden_states = self.dropout(hidden_states)
    # LayerNorm over the sum with the input tensor (residual connection)
    hidden_states = self.LayerNorm(hidden_states + input_tensor)
    # Return the processed hidden states
    return hidden_states

# LongformerLayer: a single transformer layer, subclass of nn.Module

class LongformerLayer(nn.Module):
def __init__(self, config, layer_id=0):
    super().__init__()
    # Create the LongformerAttention module
    self.attention = LongformerAttention(config, layer_id)
    # Create the LongformerIntermediate module
    self.intermediate = LongformerIntermediate(config)
    # Create the LongformerOutput module
    self.output = LongformerOutput(config)
    # Chunk size used for the chunked feed-forward pass
    self.chunk_size_feed_forward = config.chunk_size_feed_forward
    # Dimension that holds the sequence length
    self.seq_len_dim = 1

# 前向传播函数,接受多个输入参数,并返回多个输出
def forward(
    self,
    hidden_states,
    attention_mask=None,
    layer_head_mask=None,
    is_index_masked=None,
    is_index_global_attn=None,
    is_global_attn=None,
    output_attentions=False,
):
    # 使用注意力层进行处理,并获取注意力输出
    self_attn_outputs = self.attention(
        hidden_states,
        attention_mask=attention_mask,
        layer_head_mask=layer_head_mask,
        is_index_masked=is_index_masked,
        is_index_global_attn=is_index_global_attn,
        is_global_attn=is_global_attn,
        output_attentions=output_attentions,
    )
    # 获取注意力输出的第一个元素作为注意力输出
    attn_output = self_attn_outputs[0]
    # 对注意力输出应用分块策略来进行前向传播
    layer_output = apply_chunking_to_forward(
        self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attn_output
    )
    # 将层输出和注意力输出的其余部分组合为输出元组
    outputs = (layer_output,) + self_attn_outputs[1:]
    # 返回最终的输出元组
    return outputs

# 前馈分块函数,接受注意力输出并返回层输出
def ff_chunk(self, attn_output):
    # 使用中间层进行处理
    intermediate_output = self.intermediate(attn_output)
    # 使用输出层进行处理,并返回层输出
    layer_output = self.output(intermediate_output, attn_output)
    return layer_output
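
`apply_chunking_to_forward` splits its inputs along `seq_len_dim` into pieces of `chunk_size_feed_forward` tokens, runs `ff_chunk` on each piece, and concatenates the results; this trades peak memory for extra calls without changing the output. A small sketch (the import path assumes a recent transformers version, where the helper lives in `pytorch_utils`):

```
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def toy_ff(x):                      # stand-in for intermediate + output
    return x * 2

hidden = torch.randn(1, 8, 16)      # (batch, seq_len, hidden)

# chunk_size=4 along dim=1: the feed-forward runs on two chunks of 4 tokens each
chunked_out = apply_chunking_to_forward(toy_ff, 4, 1, hidden)
print(torch.allclose(chunked_out, toy_ff(hidden)))  # True
```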

# LongformerEncoder: the stack of LongformerLayer modules, subclass of nn.Module

class LongformerEncoder(nn.Module):
def __init__(self, config):
    super().__init__()
    self.config = config
    # nn.ModuleList holding one LongformerLayer per hidden layer
    self.layer = nn.ModuleList([LongformerLayer(config, layer_id=i) for i in range(config.num_hidden_layers)])
    # Gradient checkpointing is disabled by default
    self.gradient_checkpointing = False

# 前向传播函数,接受多个输入参数并返回多个输出
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    padding_len=0,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=True,
):
    # 遍历每一层 LongformerLayer 并调用其 forward 方法进行处理
    for layer_module in self.layer:
        # 将当前层的输出作为下一层的输入
        hidden_states = layer_module(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=head_mask,
            output_attentions=output_attentions,
        )[0]  # 只保留每层的第一个输出
    # 返回最终的隐藏状态张量
    return hidden_states

# Copied from transformers.models.bert.modeling_bert.BertPooler

class LongformerPooler(nn.Module):
def __init__(self, config):
    super().__init__()
    # Linear layer mapping hidden_size back to hidden_size
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # Tanh activation applied to the pooled output
    self.activation = nn.Tanh()

# Forward pass: takes the hidden states tensor and returns the pooled output
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # "Pool" the model by taking the hidden state corresponding to the first token
    first_token_tensor = hidden_states[:, 0]
    # Feed the first token's hidden state through the dense layer
    pooled_output = self.dense(first_token_tensor)
    # Apply the tanh activation
    pooled_output = self.activation(pooled_output)
    # Return the pooled output tensor
    return pooled_output

# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead and adapted as LongformerLMHead

class LongformerLMHead(nn.Module):
"""Longformer Head for masked language modeling."""

def __init__(self, config):
    super().__init__()
    # 定义一个全连接层,输入和输出维度为config.hidden_size
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # Layer normalization,输入维度为config.hidden_size
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # 解码器线性层,将config.hidden_size映射到config.vocab_size
    self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
    # 偏置项,用于解码器线性层的偏置
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))
    # 将解码器的偏置设置为自定义的偏置项
    self.decoder.bias = self.bias

def forward(self, features, **kwargs):
    # 全连接层的前向传播
    x = self.dense(features)
    # GELU激活函数
    x = gelu(x)
    # Layer normalization
    x = self.layer_norm(x)

    # 使用解码器将特征映射到词汇表大小的向量空间
    x = self.decoder(x)

    return x

def _tie_weights(self):
    # 如果解码器的偏置设备类型为"meta",则将解码器的偏置与自定义的偏置项绑定
    # 否则,将自定义的偏置项与解码器的偏置绑定
    if self.decoder.bias.device.type == "meta":
        self.decoder.bias = self.bias
    else:
        self.bias = self.decoder.bias

class LongformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""

# 指定配置类为LongformerConfig
config_class = LongformerConfig
# 基础模型前缀为"longformer"
base_model_prefix = "longformer"
# 支持梯度检查点
supports_gradient_checkpointing = True
# 不进行模块拆分的模块列表
_no_split_modules = ["LongformerSelfAttention"]

def _init_weights(self, module):
    """初始化权重"""
    if isinstance(module, nn.Linear):
        # 略微不同于TF版本,使用正态分布初始化权重
        # 参考 https://github.com/pytorch/pytorch/pull/5617
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
        # 使用正态分布初始化嵌入层权重
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            # 如果有填充索引,则将填充索引对应的权重置为零
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        # 将LayerNorm层的偏置项置零,权重置为1.0
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)

# LONGFORMER_START_DOCSTRING: a shared docstring describing the basics and usage of the Longformer pretrained models

LONGFORMER_START_DOCSTRING = r"""

This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)

This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
    config ([`LongformerConfig`]): Model configuration class with all the parameters of the
        model. Initializing with a config file does not load the weights associated with the model, only the
        configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.

"""
# LongformerModel: inherits from LongformerPreTrainedModel and implements self-attention capable of handling long sequences

@add_start_docstrings(
"The bare Longformer Model outputting raw hidden-states without any specific head on top.",
LONGFORMER_START_DOCSTRING,
)
class LongformerModel(LongformerPreTrainedModel):
"""
This class is copied from `RobertaModel` and overrides the standard self-attention with Longformer self-attention
to provide the ability to process long sequences, following the self-attention approach described in the paper
"Longformer: the Long-Document Transformer" by Iz Beltagy, Matthew E. Peters and Arman Cohan.

Longformer self-attention combines local (sliding-window) and global attention, so it can scale to long documents
without the O(n^2) growth in memory and compute.

The `LongformerSelfAttention` module implemented here supports the combination of local and global attention, but it
lacks support for autoregressive attention and dilated attention. Autoregressive and dilated attention are more
relevant for autoregressive language modeling than for fine-tuning on downstream tasks. Future releases will add
support for autoregressive attention, while support for dilated attention requires a custom CUDA kernel to be
memory- and compute-efficient.

"""

def __init__(self, config, add_pooling_layer=True):
    """
    初始化函数,接受一个配置对象 `config` 和一个布尔值参数 `add_pooling_layer`。
    """
    super().__init__(config)
    self.config = config

    if isinstance(config.attention_window, int):
        assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
        assert config.attention_window > 0, "`config.attention_window` has to be positive"
        config.attention_window = [config.attention_window] * config.num_hidden_layers  # 为每一层设置一个值
    else:
        assert len(config.attention_window) == config.num_hidden_layers, (
            "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
            f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
        )

    self.embeddings = LongformerEmbeddings(config)  # 初始化 LongformerEmbeddings
    self.encoder = LongformerEncoder(config)  # 初始化 LongformerEncoder
    self.pooler = LongformerPooler(config) if add_pooling_layer else None  # 初始化 LongformerPooler,如果 add_pooling_layer 为 True 则初始化,否则为 None

    # 初始化权重并应用最终处理
    self.post_init()

def get_input_embeddings(self):
    """
    返回输入嵌入层 `word_embeddings`。
    """
    return self.embeddings.word_embeddings

def set_input_embeddings(self, value):
    """
    设置输入嵌入层 `word_embeddings` 的值为 `value`。
    """
    self.embeddings.word_embeddings = value

def _prune_heads(self, heads_to_prune):
    """
    剪枝模型的注意力头。heads_to_prune: dict,键为层号,值为要在该层中剪枝的注意力头列表。参见 PreTrainedModel 基类。
    """
    for layer, heads in heads_to_prune.items():
        self.encoder.layer[layer].attention.prune_heads(heads)
def _pad_to_window_size(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
    inputs_embeds: torch.Tensor,
    pad_token_id: int,
):
    """A helper function to pad tokens and mask to work with implementation of Longformer self-attention."""
    # 获取注意力窗口大小,若为整数则直接使用,否则取最大值
    attention_window = (
        self.config.attention_window
        if isinstance(self.config.attention_window, int)
        else max(self.config.attention_window)
    )

    # 断言确保 attention_window 是偶数
    assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}"
    
    # 获取输入数据的形状信息
    input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape
    batch_size, seq_len = input_shape[:2]

    # 计算需要填充的长度,使得序列长度是 attention_window 的整数倍
    padding_len = (attention_window - seq_len % attention_window) % attention_window

    # 在 ONNX 导出时需要记录这个分支,即使 padding_len == 0 也是可以的
    if padding_len > 0:
        # 发出警告,说明输入的长度被自动填充到多个 attention_window 的倍数
        logger.warning_once(
            f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
            f"`config.attention_window`: {attention_window}"
        )
        # 如果存在 input_ids,则使用 nn.functional.pad 进行填充
        if input_ids is not None:
            input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
        # 如果存在 position_ids,则使用 nn.functional.pad 进行填充,填充值为 pad_token_id
        if position_ids is not None:
            position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id)
        # 如果存在 inputs_embeds,则创建一个新的 padding 数据,填充值为 pad_token_id,并拼接在原 inputs_embeds 后面
        if inputs_embeds is not None:
            input_ids_padding = inputs_embeds.new_full(
                (batch_size, padding_len),
                self.config.pad_token_id,
                dtype=torch.long,
            )
            inputs_embeds_padding = self.embeddings(input_ids_padding)
            inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)

        # 使用 nn.functional.pad 在 attention_mask 上进行填充,填充值为 0,表示填充部分不考虑注意力
        attention_mask = nn.functional.pad(
            attention_mask, (0, padding_len), value=0
        )  # no attention on the padding tokens
        # 使用 nn.functional.pad 在 token_type_ids 上进行填充,填充值为 0
        token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0)  # pad with token_type_id = 0

    # 返回填充后的信息:padding_len 填充长度,以及可能被填充的 input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds
    return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds
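
The padding length computed above simply rounds the sequence length up to the next multiple of the (largest) attention window; for example, with illustrative numbers:

```
seq_len, attention_window = 1000, 512      # hypothetical values
padding_len = (attention_window - seq_len % attention_window) % attention_window
print(padding_len)             # 24
print(seq_len + padding_len)   # 1024, a multiple of attention_window
```
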
def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
    # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
    # (global_attention_mask + 1) => 1 for local attention, 2 for global attention
    # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention
    
    # 如果传入的 attention_mask 不为空
    if attention_mask is not None:
        # 将 attention_mask 乘以 (global_attention_mask + 1),生成最终的合并后的 attention_mask
        attention_mask = attention_mask * (global_attention_mask + 1)
    else:
        # 如果没有传入 attention_mask,则直接使用 global_attention_mask + 1 作为 attention_mask
        attention_mask = global_attention_mask + 1
    
    # 返回合并后的 attention_mask
    return attention_mask
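
A tiny sketch of the merge: multiplying the ordinary mask by `global_attention_mask + 1` yields the three-valued mask that Longformer self-attention expects (0 = no attention on padding, 1 = local attention, 2 = global attention).

```
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])          # last token is padding
global_attention_mask = torch.tensor([[1, 0, 0, 0]])   # first token gets global attention

merged = attention_mask * (global_attention_mask + 1)
print(merged)  # tensor([[2, 1, 1, 0]])
```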

@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LongformerBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    global_attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,

# The decorator below adds a docstring describing this class as a Longformer model with a language modeling head on top

@add_start_docstrings("""Longformer Model with a language modeling head on top.""", LONGFORMER_START_DOCSTRING)
class LongformerForMaskedLM(LongformerPreTrainedModel):
# List of keys whose weights are tied (shared)
_tied_weights_keys = ["lm_head.decoder"]

# 初始化方法,接受一个配置对象,并调用父类的初始化方法
def __init__(self, config):
    super().__init__(config)

    # 创建 Longformer 模型,不包含池化层
    self.longformer = LongformerModel(config, add_pooling_layer=False)
    # 创建 LongformerLMHead 对象作为语言建模头部
    self.lm_head = LongformerLMHead(config)

    # 调用初始化权重和应用最终处理的方法
    self.post_init()

# 返回语言建模头部的解码器
def get_output_embeddings(self):
    return self.lm_head.decoder

# 设置语言建模头部的解码器
def set_output_embeddings(self, new_embeddings):
    self.lm_head.decoder = new_embeddings

# 使用装饰器为前向方法添加文档字符串,描述其接受的输入参数和输出类型
@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LongformerMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    global_attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
        config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
        loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
    kwargs (`Dict[str, any]`, optional, defaults to *{}*):
        Used to hide legacy arguments that have been deprecated.

    Returns:

    Mask filling example:

    ```
    >>> from transformers import AutoTokenizer, LongformerForMaskedLM

    >>> tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
    >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
    ```

    Let's try a very long input.

    ```
    >>> TXT = (
    ...     "My friends are <mask> but they eat too many carbs."
    ...     + " That's why I decide not to eat with them." * 300
    ... )
    >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
    >>> logits = model(input_ids).logits

    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
    >>> probs = logits[0, masked_index].softmax(dim=0)
    >>> values, predictions = probs.topk(5)

    >>> tokenizer.decode(predictions).split()
    ['healthy', 'skinny', 'thin', 'good', 'vegetarian']
    ```"""
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 调用Longformer模型进行预测
    outputs = self.longformer(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        head_mask=head_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # 获取序列输出
    sequence_output = outputs[0]
    # 使用语言建模头部生成预测分数
    prediction_scores = self.lm_head(sequence_output)

    masked_lm_loss = None
    if labels is not None:
        # 定义交叉熵损失函数
        loss_fct = CrossEntropyLoss()

        # 将labels移动到与预测分数相同的设备上
        labels = labels.to(prediction_scores.device)
        # 计算masked语言建模的损失
        masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

    if not return_dict:
        # 如果不使用return_dict,则返回额外的输出
        output = (prediction_scores,) + outputs[2:]
        return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

    # 使用LongformerMaskedLMOutput类来返回结果
    return LongformerMaskedLMOutput(
        loss=masked_lm_loss,
        logits=prediction_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        global_attentions=outputs.global_attentions,
    )

@add_start_docstrings(
"""
Longformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForSequenceClassification(LongformerPreTrainedModel):
"""
Longformer模型,顶部带有序列分类/回归头部(即在汇总输出之上的线性层),例如用于GLUE任务。
继承自LongformerPreTrainedModel。
"""

def __init__(self, config):
    """
    初始化方法,接收一个配置参数config。

    Args:
        config (LongformerConfig): 模型的配置对象。

    """
    super().__init__(config)
    self.num_labels = config.num_labels
    self.config = config

    # 创建Longformer模型,不包含汇总层
    self.longformer = LongformerModel(config, add_pooling_layer=False)
    # 创建Longformer分类头部
    self.classifier = LongformerClassificationHead(config)

    # 初始化权重并应用最终处理
    self.post_init()

@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint="jpwahle/longformer-base-plagiarism-detection",
    output_type=LongformerSequenceClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output="'ORIGINAL'",
    expected_loss=5.44,
)
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    global_attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, LongformerSequenceClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    # 初始化返回字典,如果未提供则根据配置决定是否返回字典
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 如果全局注意力掩码未提供,则发出警告并初始化全局注意力掩码,将第一个token设置为全局关注
    if global_attention_mask is None:
        logger.warning_once("Initializing global attention on CLS token...")
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, 0] = 1  # 在CLS token上开启全局关注

    # 使用Longformer模型进行前向传播
    outputs = self.longformer(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        head_mask=head_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = outputs[0]  # 获取Longformer模型的序列输出
    logits = self.classifier(sequence_output)  # 使用分类器对序列输出进行分类得到logits

    loss = None
    if labels is not None:
        labels = labels.to(logits.device)  # 将标签移到与logits相同的设备上

        # 确定问题类型(回归、单标签分类、多标签分类)
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        # 根据问题类型选择合适的损失函数进行计算损失
        if self.config.problem_type == "regression":
            loss_fct = MSELoss()
            if self.num_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

    # 如果不要求返回字典,则返回一个元组
    if not return_dict:
        output = (logits,) + outputs[2:]  # 组装输出元组
        return ((loss,) + output) if loss is not None else output

    # 返回Longformer模型的输出,作为LongformerSequenceClassifierOutput对象
    return LongformerSequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        global_attentions=outputs.global_attentions,
    )
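
A short usage sketch of the sequence-classification head, using the plagiarism-detection checkpoint referenced in the code-sample decorator above; when no `global_attention_mask` is passed, the forward pass puts global attention on the CLS token automatically.

```
import torch
from transformers import AutoTokenizer, LongformerForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("jpwahle/longformer-base-plagiarism-detection")
model = LongformerForSequenceClassification.from_pretrained("jpwahle/longformer-base-plagiarism-detection")

inputs = tokenizer("A long document to classify ...", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits        # (batch_size, num_labels)

predicted_id = logits.argmax(dim=-1).item()
print(model.config.id2label[predicted_id])
```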

class LongformerClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""

def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # 定义一个全连接层,输入输出维度为config.hidden_size
    self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 定义一个dropout层,概率为config.hidden_dropout_prob
    self.out_proj = nn.Linear(config.hidden_size, config.num_labels)  # 定义一个全连接层,输入为config.hidden_size,输出为config.num_labels

def forward(self, hidden_states, **kwargs):
    hidden_states = hidden_states[:, 0, :]  # 取hidden_states的第一个token(相当于[CLS])
    hidden_states = self.dropout(hidden_states)  # 对hidden_states进行dropout处理
    hidden_states = self.dense(hidden_states)  # 将hidden_states输入全连接层进行线性变换
    hidden_states = torch.tanh(hidden_states)  # 对变换后的hidden_states应用tanh激活函数
    hidden_states = self.dropout(hidden_states)  # 再次对hidden_states进行dropout处理
    output = self.out_proj(hidden_states)  # 将处理后的hidden_states输入输出层进行线性变换得到最终输出
    return output

@add_start_docstrings(
"""
Longformer Model with a span classification head on top for extractive question-answering tasks like SQuAD /
TriviaQA (a linear layers on top of the hidden-states output to compute span start logits and span end logits).
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForQuestionAnswering(LongformerPreTrainedModel):
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.longformer = LongformerModel(config, add_pooling_layer=False)  # 使用LongformerModel初始化一个Longformer层,不加池化层
    self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # 定义一个全连接层,输入维度为config.hidden_size,输出维度为config.num_labels

    # Initialize weights and apply final processing
    self.post_init()  # 执行初始化权重和最终处理步骤

@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    global_attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    start_positions: Optional[torch.Tensor] = None,
    end_positions: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,

@add_start_docstrings(
"""
Longformer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForTokenClassification(LongformerPreTrainedModel):
# Longformer-based token classification model, e.g. for Named-Entity-Recognition (NER) tasks
# Initialization method, taking a config object
def __init__(self, config):
    # Call the parent class initializer with the config
    super().__init__(config)
    # Store the number of labels from the config
    self.num_labels = config.num_labels

    # 使用配置对象初始化 Longformer 模型,不添加池化层
    self.longformer = LongformerModel(config, add_pooling_layer=False)
    # 使用配置对象中的隐藏层 dropout 概率初始化 dropout 层
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # 使用配置对象中的隐藏大小和标签数量初始化线性分类器
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # 调用自定义的后初始化方法,用于初始化权重并进行最终处理
    self.post_init()

# 前向传播方法,根据输入计算输出结果
@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint="brad1141/Longformer-finetuned-norm",
    output_type=LongformerTokenClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output=(
        "['Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence',"
        " 'Evidence', 'Evidence', 'Evidence', 'Evidence']"
    ),
    expected_loss=0.63,
)
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    global_attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    ) -> Union[Tuple, LongformerTokenClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
    """
    # 确定是否返回字典格式的输出,如果未指定则根据配置决定
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 将输入传递给Longformer模型进行处理
    outputs = self.longformer(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        head_mask=head_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # 获取模型输出的序列输出
    sequence_output = outputs[0]

    # 对序列输出应用dropout操作
    sequence_output = self.dropout(sequence_output)
    
    # 将dropout后的序列输出传递给分类器得到logits
    logits = self.classifier(sequence_output)

    # 初始化损失为None
    loss = None
    
    # 如果提供了标签,则计算损失
    if labels is not None:
        # 使用交叉熵损失函数
        loss_fct = CrossEntropyLoss()

        # 将标签移到与logits相同的设备上
        labels = labels.to(logits.device)
        
        # 计算交叉熵损失
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    # 如果不要求返回字典格式的输出
    if not return_dict:
        # 构造输出元组,包括logits和可能的额外输出状态
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    # 返回LongformerTokenClassifierOutput对象,其中包括损失、logits、隐藏状态和注意力权重
    return LongformerTokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        global_attentions=outputs.global_attentions,
    )
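
And a matching sketch for token classification, using the checkpoint named in the code-sample decorator above; each token position gets its own label distribution.

```
import torch
from transformers import AutoTokenizer, LongformerForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("brad1141/Longformer-finetuned-norm")
model = LongformerForTokenClassification.from_pretrained("brad1141/Longformer-finetuned-norm")

inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits        # (batch_size, seq_len, num_labels)

predicted_ids = logits.argmax(dim=-1)[0].tolist()
print([model.config.id2label[i] for i in predicted_ids])
```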

"""
Longformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
"""

继承自 LongformerPreTrainedModel 的 Longformer 多选分类模型

class LongformerForMultipleChoice(LongformerPreTrainedModel):
def init(self, config):
super().init(config)

    # 初始化 Longformer 模型
    self.longformer = LongformerModel(config)
    # Dropout 层
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # 分类器,线性层
    self.classifier = nn.Linear(config.hidden_size, 1)

    # 初始化权重并进行最终处理
    self.post_init()

@add_start_docstrings_to_model_forward(
    LONGFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=LongformerMultipleChoiceModelOutput,
    config_class=_CONFIG_FOR_DOC,
)
# 前向传播函数
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    global_attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,

"""
