Transformers Source Code Analysis (27)

.\models\clvp\number_normalizer.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""English Normalizer class for CLVP."""


import re

class EnglishNormalizer:
    def __init__(self):
        # List of (regular expression, replacement) pairs for abbreviations:
        self._abbreviations = [
            # Compile regular expressions for abbreviations and their replacements
            (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
            for x in [
                ("mrs", "misess"),
                ("mr", "mister"),
                ("dr", "doctor"),
                ("st", "saint"),
                ("co", "company"),
                ("jr", "junior"),
                ("maj", "major"),
                ("gen", "general"),
                ("drs", "doctors"),
                ("rev", "reverend"),
                ("lt", "lieutenant"),
                ("hon", "honorable"),
                ("sgt", "sergeant"),
                ("capt", "captain"),
                ("esq", "esquire"),
                ("ltd", "limited"),
                ("col", "colonel"),
                ("ft", "fort"),
            ]
        ]

        # List of English words for numbers
        self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
        self.teens = [
            "ten",
            "eleven",
            "twelve",
            "thirteen",
            "fourteen",
            "fifteen",
            "sixteen",
            "seventeen",
            "eighteen",
            "nineteen",
        ]
        self.tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    def number_to_words(self, num: int) -> str:
        """
        Converts numbers(`int`) to words(`str`).

        Please note that it only supports up to - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
        trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
        thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
        """
        # 0 maps directly to "zero"
        if num == 0:
            return "zero"
        # Negative numbers: prefix "minus" and recurse on the absolute value
        elif num < 0:
            return "minus " + self.number_to_words(abs(num))
        # 1-9: single-word lookup
        elif num < 10:
            return self.ones[num]
        # 10-19: teen-word lookup
        elif num < 20:
            return self.teens[num - 10]
        # 20-99: tens word plus an optional hyphenated ones word
        elif num < 100:
            return self.tens[num // 10] + ("-" + self.number_to_words(num % 10) if num % 10 != 0 else "")
        # 100-999: hundreds word plus the remainder
        elif num < 1000:
            return (
                self.ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100) if num % 100 != 0 else "")
            )
        # 1,000-999,999: thousands plus the remainder
        elif num < 1_000_000:
            return (
                self.number_to_words(num // 1000)
                + " thousand"
                + (", " + self.number_to_words(num % 1000) if num % 1000 != 0 else "")
            )
        # 1,000,000-999,999,999: millions plus the remainder
        elif num < 1_000_000_000:
            return (
                self.number_to_words(num // 1_000_000)
                + " million"
                + (", " + self.number_to_words(num % 1_000_000) if num % 1_000_000 != 0 else "")
            )
        # Billions plus the remainder
        elif num < 1_000_000_000_000:
            return (
                self.number_to_words(num // 1_000_000_000)
                + " billion"
                + (", " + self.number_to_words(num % 1_000_000_000) if num % 1_000_000_000 != 0 else "")
            )
        # Trillions plus the remainder
        elif num < 1_000_000_000_000_000:
            return (
                self.number_to_words(num // 1_000_000_000_000)
                + " trillion"
                + (", " + self.number_to_words(num % 1_000_000_000_000) if num % 1_000_000_000_000 != 0 else "")
            )
        # Quadrillions plus the remainder
        elif num < 1_000_000_000_000_000_000:
            return (
                self.number_to_words(num // 1_000_000_000_000_000)
                + " quadrillion"
                + (
                    ", " + self.number_to_words(num % 1_000_000_000_000_000)
                    if num % 1_000_000_000_000_000 != 0
                    else ""
                )
            )
        # Anything larger is out of the supported range
        else:
            return "number out of range"

    def convert_to_ascii(self, text: str) -> str:
        """
        Converts unicode to ascii
        """
        # Encode to ASCII, dropping any characters that cannot be represented
        return text.encode("ascii", "ignore").decode("utf-8")
    def _expand_dollars(self, m: str) -> str:
        """
        This method is used to expand numerical dollar values into spoken words.
        """
        # The matched string, i.e. the dollar amount
        match = m.group(1)
        # Split the amount into a whole-dollar part and a cents part
        parts = match.split(".")
        if len(parts) > 2:
            return match + " dollars"  # Unexpected format: more than one decimal point

        # Parse the dollar and cent values
        dollars = int(parts[0]) if parts[0] else 0
        cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
        # Build the spoken form from the dollar and cent values
        if dollars and cents:
            dollar_unit = "dollar" if dollars == 1 else "dollars"
            cent_unit = "cent" if cents == 1 else "cents"
            return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
        elif dollars:
            dollar_unit = "dollar" if dollars == 1 else "dollars"
            return "%s %s" % (dollars, dollar_unit)
        elif cents:
            cent_unit = "cent" if cents == 1 else "cents"
            return "%s %s" % (cents, cent_unit)
        else:
            return "zero dollars"

    def _remove_commas(self, m: str) -> str:
        """
        This method is used to remove commas from sentences.
        """
        # Strip commas from the matched string
        return m.group(1).replace(",", "")

    def _expand_decimal_point(self, m: str) -> str:
        """
        This method is used to expand '.' into spoken word ' point '.
        """
        # Replace the decimal point '.' with the spoken word " point "
        return m.group(1).replace(".", " point ")

    def _expand_ordinal(self, num: str) -> str:
        """
        This method is used to expand ordinals such as '1st', '2nd' into spoken words.
        """
        # Suffix lookup table for ordinals
        ordinal_suffixes = {1: "st", 2: "nd", 3: "rd"}

        # Strip the two-letter suffix and convert the numeric part to an integer
        num = int(num.group(0)[:-2])
        # Pick the correct suffix (11th through 20th always take "th")
        if 10 <= num % 100 and num % 100 <= 20:
            suffix = "th"
        else:
            suffix = ordinal_suffixes.get(num % 10, "th")
        # Convert the integer to words and append the suffix
        return self.number_to_words(num) + suffix

    def _expand_number(self, m: str) -> str:
        """
        This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
        link :
        https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
        """
        # Convert the matched string to an integer
        num = int(m.group(0))

        # Numbers between 1000 and 3000 are read like years (e.g. "two thousand nine", "nineteen hundred")
        if num > 1000 and num < 3000:
            if num == 2000:
                return "two thousand"
            elif num > 2000 and num < 2010:
                return "two thousand " + self.number_to_words(num % 100)
            elif num % 100 == 0:
                return self.number_to_words(num // 100) + " hundred"
            else:
                return self.number_to_words(num)
        else:
            return self.number_to_words(num)


The helpers above cover text cleanup and number-to-word conversion; the remaining methods chain them together into the full normalization pipeline.

    # Normalizes numbers in the text: strips commas, then expands currencies, decimals, ordinals and plain numbers
    def normalize_numbers(self, text: str) -> str:
        # Remove commas inside numbers via self._remove_commas
        text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
        # Rewrite pound amounts as "<number> pounds"
        text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
        # Expand dollar amounts via self._expand_dollars
        text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
        # Expand decimal numbers via self._expand_decimal_point
        text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
        # Expand ordinals such as 1st, 2nd via self._expand_ordinal
        text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
        # Expand any remaining digits via self._expand_number
        text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
        # Return the normalized text
        return text

    # Expands abbreviations such as "mr." and "dr." into full words
    def expand_abbreviations(self, text: str) -> str:
        # Apply each (regex, replacement) abbreviation rule in turn
        for regex, replacement in self._abbreviations:
            text = re.sub(regex, replacement, text)
        # Return the expanded text
        return text

    # Collapses redundant whitespace
    def collapse_whitespace(self, text: str) -> str:
        # Replace any run of whitespace characters with a single space
        return re.sub(re.compile(r"\s+"), " ", text)

    # Makes the normalizer callable: converts to ASCII, lowercases, expands numbers and abbreviations, tidies whitespace
    def __call__(self, text):
        # Convert the text to its ASCII representation
        text = self.convert_to_ascii(text)
        # Lowercase the text
        text = text.lower()
        # Normalize numbers in the text
        text = self.normalize_numbers(text)
        # Expand abbreviations in the text
        text = self.expand_abbreviations(text)
        # Collapse redundant whitespace
        text = self.collapse_whitespace(text)
        # Remove double quotes
        text = text.replace('"', "")

        # Return the processed text
        return text
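
A quick way to see the whole pipeline in action is to run the normalizer directly. The snippet below is a minimal sketch that imports `EnglishNormalizer` from the module shown above; the expected outputs in the comments follow from tracing the rules above, not from a documented API:

```python
from transformers.models.clvp.number_normalizer import EnglishNormalizer

normalizer = EnglishNormalizer()

print(normalizer.number_to_words(1234))
# one thousand, two hundred thirty-four

print(normalizer("Dr. Smith paid $15.50 in 2009."))
# doctor smith paid fifteen dollars, fifty cents in two thousand nine.
```

Note that `normalize_numbers` runs before `expand_abbreviations`, so dollar amounts are expanded first and the abbreviation rules only see already-lowercased text.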

.\models\clvp\processing_clvp.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Processor class for CLVP
"""


from ...processing_utils import ProcessorMixin

# Import the ProcessorMixin utility class


class ClvpProcessor(ProcessorMixin):
    """
    Constructs a CLVP processor which wraps a CLVP Feature Extractor and a CLVP Tokenizer into a single processor.

    [`ClvpProcessor`] offers all the functionalities of [`ClvpFeatureExtractor`] and [`ClvpTokenizer`]. See the
    [`~ClvpProcessor.__call__`], [`~ClvpProcessor.decode`] and [`~ClvpProcessor.batch_decode`] for more information.

    Args:
        feature_extractor (`ClvpFeatureExtractor`):
            An instance of [`ClvpFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`ClvpTokenizer`):
            An instance of [`ClvpTokenizer`]. The tokenizer is a required input.
    """

    feature_extractor_class = "ClvpFeatureExtractor"
    tokenizer_class = "ClvpTokenizer"
    model_input_names = [
        "input_ids",
        "input_features",
        "attention_mask",
    ]

    def __init__(self, feature_extractor, tokenizer):
        # Takes a CLVP feature extractor and a CLVP tokenizer instance and hands them to the parent constructor
        super().__init__(feature_extractor, tokenizer)

    def __call__(self, *args, **kwargs):
        """
        Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text`
        argument to [`~ClvpTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
        information.
        """
        
        raw_speech = kwargs.pop("raw_speech", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)

        if raw_speech is None and text is None:
            # Neither audio nor text was provided, so there is nothing to process
            raise ValueError("You need to specify either an `raw_speech` or `text` input to process.")

        if raw_speech is not None:
            # Run the CLVP feature extractor on the raw audio and sampling rate
            inputs = self.feature_extractor(raw_speech, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            # Run the CLVP tokenizer on the text
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            # Audio only: return the feature extractor output
            return inputs
        elif raw_speech is None:
            # Text only: return the tokenizer output
            return encodings
        else:
            # Both audio and text: merge the tokenizer output into the feature extractor output and return it
            inputs["input_ids"] = encodings["input_ids"]
            inputs["attention_mask"] = encodings["attention_mask"]
            return inputs

    # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode with Whisper->Clvp
    # Forwards all arguments to ClvpTokenizer's `batch_decode` method.
    # Refer to that method's docstring for more information.
    def batch_decode(self, *args, **kwargs):
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.decode with Whisper->Clvp
    # Forwards all arguments to ClvpTokenizer's `decode` method.
    # Refer to that method's docstring for more information.
    def decode(self, *args, **kwargs):
        return self.tokenizer.decode(*args, **kwargs)
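
To make the dispatch logic above concrete, here is a minimal usage sketch. It assumes the `susnato/clvp_dev` checkpoint referenced elsewhere in this model's files is available, and that the audio is a 1-D float array at the expected sampling rate:

```python
import numpy as np
from transformers import ClvpProcessor

processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")

# One second of placeholder audio at 22.05 kHz
audio = np.random.randn(22050).astype(np.float32)

# Passing both audio and text returns the merged dict described above:
# input_features from the feature extractor plus input_ids / attention_mask from the tokenizer
inputs = processor(raw_speech=audio, sampling_rate=22050, text="this is a test", return_tensors="pt")
print(sorted(inputs.keys()))
```

Because `__call__` pops `raw_speech`, `sampling_rate` and `text` from `kwargs`, they must be passed as keyword arguments; positional arguments are silently ignored.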

.\models\clvp\tokenization_clvp.py

# File encoding: UTF-8

# Copyright notice

# Standard-library imports
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple

# Third-party `regex` library, imported under the name `re`
import regex as re

# Hugging Face tokenization utilities
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging

# CLVP's number normalizer
from .number_normalizer import EnglishNormalizer

# Logger for this module
logger = logging.get_logger(__name__)

# Names of the vocabulary files
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Map from pretrained model name to vocabulary file URLs
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "clvp_dev": "https://huggingface.co/susnato/clvp_dev/blob/main/vocab.json",
    },
    "merges_file": {
        "clvp_dev": "https://huggingface.co/susnato/clvp_dev/blob/main/merges.txt",
    },
}

# Positional embedding sizes for the pretrained checkpoints
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "clvp_dev": 1024,
}

# The result of bytes_to_unicode is cached with lru_cache
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your
    vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K
    for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


# (The function above is copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode)

# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
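
A tiny sanity check makes the two helpers easier to picture; this is a minimal sketch that imports them from the module listed above:

```python
from transformers.models.clvp.tokenization_clvp import bytes_to_unicode, get_pairs

byte_to_char = bytes_to_unicode()
print(byte_to_char[ord("a")])  # 'a'  -> printable bytes map to themselves
print(byte_to_char[0])         # 'Ā'  -> control bytes are shifted to unused printable code points

# get_pairs returns the (unordered) set of adjacent symbol pairs in a word
print(get_pairs(("h", "e", "l", "l", "o")))
# e.g. {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}
```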


class ClvpTokenizer(PreTrainedTokenizer):
    """
    Constructs a CLVP tokenizer, based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
    to that superclass for more information regarding those methods.
    """

    # Class attributes: vocabulary file names, pretrained file maps, maximum input sizes and model input names
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = [
        "input_ids",
        "attention_mask",
    ]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="[UNK]",
        bos_token="<|endoftext|>",
        eos_token="[STOP]",
        pad_token="[STOP]",
        add_prefix_space=False,
        add_bos_token=False,
        add_eos_token=False,
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        self.add_bos_token = add_bos_token  # whether to prepend the BOS token
        self.add_eos_token = add_eos_token  # whether to append the EOS token
        self._normalizer = None

        # Read the JSON vocabulary from vocab_file into self.encoder
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        # Build the reverse mapping (id -> token) in self.decoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors when decoding bytes
        self.byte_encoder = bytes_to_unicode()  # byte -> unicode character table
        # Reverse mapping: unicode character -> byte
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Read the BPE merge list from merges_file and build the merge-rank dictionary self.bpe_ranks
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))  # merge pair -> rank
        self.cache = {}  # cache of already-computed BPE splits
        self.add_prefix_space = add_prefix_space  # whether to add a leading space before tokenizing

        # Regex pattern used to pre-split the text into word-like chunks
        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)  # size of the base vocabulary

    @property
    def normalizer(self):
        if self._normalizer is None:
            self._normalizer = EnglishNormalizer()  # lazily create the English normalizer on first use
        return self._normalizer  # return the normalizer instance

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
    # Applies BPE (Byte-Pair Encoding) to a single pre-tokenized chunk
    def bpe(self, token):
        # Return the cached result if this token has been processed before
        if token in self.cache:
            return self.cache[token]
        # Represent the word as a tuple of symbols
        word = tuple(token)
        # Collect all adjacent symbol pairs in the word
        pairs = get_pairs(word)

        # If there are no pairs, return the token unchanged
        if not pairs:
            return token

        # Keep merging pairs until no known merge applies
        while True:
            # Pick the pair with the lowest merge rank, i.e. the highest-priority merge
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            # If that pair has no entry in the merge table, stop
            if bigram not in self.bpe_ranks:
                break
            # Rebuild the word, merging every occurrence of the pair
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                # If the pair occurs here, merge the two symbols and skip past both
                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            # Turn the rebuilt word back into a tuple
            new_word = tuple(new_word)
            word = new_word
            # Stop once the word has collapsed to a single symbol
            if len(word) == 1:
                break
            else:
                # Otherwise recompute the pairs and continue
                pairs = get_pairs(word)
        # Join the final symbols with spaces
        word = " ".join(word)
        # Cache the result
        self.cache[token] = word
        # Return the BPE-encoded word
        return word

    # Builds the model input sequence, surrounding it with special tokens where requested
    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # Prepend the BOS token only when add_bos_token is enabled
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        # Append the EOS token only when add_eos_token is enabled
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        # Wrap the first sequence with the special tokens
        output = bos_token_id + token_ids_0 + eos_token_id

        # If a second sequence is given, wrap and append it as well
        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        # Return the final sequence with special tokens
        return output

    # Builds the mask that marks special tokens in a sequence
    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # If the token list already has special tokens, delegate the masking to the superclass method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # If `add_bos_token` is False, use the superclass method to get special tokens mask
        if not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
            )

        # If token_ids_1 is None, return a mask with one special token followed by sequence tokens
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        
        # Otherwise, return a mask with special tokens followed by sequence tokens of both lists
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        text = self.normalizer(text)
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)

            # Replace special token "Ġ" with "[SPACE]" if "[SPACE]" is in the vocab
            bpe_tokens.extend(
                "[SPACE]" if bpe_token == "\u0120" and "[SPACE]" in self.encoder.keys() else bpe_token
                for bpe_token in self.bpe(token).split(" ")
            )

        return bpe_tokens

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # Join tokens into a string and decode bytes to UTF-8 characters, handling errors
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text
    # Cleans special tokens and redundant spaces out of decoded text
    def clean_up_tokenization(self, text):
        # Join the pieces into a single string
        text = "".join(text)
        # Collect every token known to the base and added vocabularies
        vocab_tokens = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())

        # Replace the "[SPACE]" marker with a plain space if it is part of the vocabulary
        text = text.replace("[SPACE]", " ") if "[SPACE]" in vocab_tokens else text
        # Replace the "[STOP]" marker with a plain space if it is part of the vocabulary
        text = text.replace("[STOP]", " ") if "[STOP]" in vocab_tokens else text

        # Drop unknown tokens and collapse triple/double spaces into single spaces
        text = text.replace(self.unk_token, "").replace("   ", " ").replace("  ", " ")
        return text

    # Saves the vocabulary to a directory, following GPT2Tokenizer.save_vocabulary from the transformers library
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Log an error and return early if the target directory does not exist
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Build the vocabulary file path from the optional prefix and the standard file name
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        # Build the merges file path from the optional prefix and the standard file name
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        # Write the encoder as JSON to the vocabulary file
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        # Write the BPE merges, ordered by rank, to the merges file
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            # Sort by merge rank and write one merge per line
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        # Return the paths of the saved vocabulary and merges files
        return vocab_file, merge_file
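
Putting the pieces together: the tokenizer normalizes the text with `EnglishNormalizer`, byte-level BPE-encodes it, and maps "Ġ" to the "[SPACE]" token when that token exists in the vocabulary. A minimal sketch, assuming the `susnato/clvp_dev` checkpoint from the vocabulary map above is available:

```python
from transformers import ClvpTokenizer

tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")

# Numbers and abbreviations are rewritten by the normalizer before BPE is applied,
# so the round trip does not reproduce the original surface form exactly.
encoding = tokenizer("Mr. Smith bought 3 apples.")
print(encoding["input_ids"])
print(tokenizer.decode(encoding["input_ids"]))  # e.g. "mister smith bought three apples."
```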

.\models\clvp\__init__.py

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

# Import the required utility helpers
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Define the module's import structure
_import_structure = {
    "configuration_clvp": [
        "CLVP_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "ClvpConfig",
        "ClvpDecoderConfig",
        "ClvpEncoderConfig",
    ],
    "feature_extraction_clvp": ["ClvpFeatureExtractor"],
    "processing_clvp": ["ClvpProcessor"],
    "tokenization_clvp": ["ClvpTokenizer"],
}

# Check whether torch is available; if not, raise OptionalDependencyNotAvailable
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If torch is available, add the modeling_clvp module to the import structure
    _import_structure["modeling_clvp"] = [
        "CLVP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ClvpModelForConditionalGeneration",
        "ClvpForCausalLM",
        "ClvpModel",
        "ClvpPreTrainedModel",
        "ClvpEncoder",
        "ClvpDecoder",
    ]

# Under type checking, import the concrete classes and functions directly
if TYPE_CHECKING:
    from .configuration_clvp import (
        CLVP_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ClvpConfig,
        ClvpDecoderConfig,
        ClvpEncoderConfig,
    )
    from .feature_extraction_clvp import ClvpFeatureExtractor
    from .processing_clvp import ClvpProcessor
    from .tokenization_clvp import ClvpTokenizer

    # Check again whether torch is available; skip the modeling imports if it is not
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Import the classes from the modeling_clvp module
        from .modeling_clvp import (
            CLVP_PRETRAINED_MODEL_ARCHIVE_LIST,
            ClvpDecoder,
            ClvpEncoder,
            ClvpForCausalLM,
            ClvpModel,
            ClvpModelForConditionalGeneration,
            ClvpPreTrainedModel,
        )

# Otherwise, register the module as a _LazyModule so submodules are imported on demand
else:
    import sys

    # Replace the current module with a _LazyModule that resolves attributes on first access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
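
The pattern above defers the heavy `modeling_clvp` import (and its torch dependency) until a symbol is actually requested. Below is a simplified, self-contained sketch of the same idea using PEP 562 module `__getattr__`; it is not the actual `_LazyModule` implementation, only an illustration of the mechanism, and the package/module names are hypothetical:

```python
# hypothetical lazy_pkg/__init__.py
import importlib
from typing import TYPE_CHECKING

_import_structure = {"tokenization_clvp": ["ClvpTokenizer"]}

if TYPE_CHECKING:
    # Static type checkers see the real symbol without triggering the import at runtime
    from .tokenization_clvp import ClvpTokenizer
else:
    def __getattr__(name):
        # Called only when an attribute is not found the normal way (PEP 562)
        for module_name, symbols in _import_structure.items():
            if name in symbols:
                module = importlib.import_module(f".{module_name}", __name__)
                return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```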

.\models\codegen\configuration_codegen.py

# coding=utf-8
# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" CodeGen model configuration"""

from collections import OrderedDict  # ordered dict used to describe ONNX inputs
from typing import Any, List, Mapping, Optional  # type hints for parameters and return values

from ... import PreTrainedTokenizer, TensorType, is_torch_available  # tokenizer base class, tensor type enum, torch check

from ...configuration_utils import PretrainedConfig  # base class for model configurations

from ...onnx import OnnxConfigWithPast, PatchingSpec  # ONNX export configuration helpers

from ...utils import logging  # logging utilities

logger = logging.get_logger(__name__)  # module-level logger

CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Salesforce/codegen-350M-nl": "https://huggingface.co/Salesforce/codegen-350M-nl/resolve/main/config.json",
    "Salesforce/codegen-350M-multi": "https://huggingface.co/Salesforce/codegen-350M-multi/resolve/main/config.json",
    "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/config.json",
    "Salesforce/codegen-2B-nl": "https://huggingface.co/Salesforce/codegen-2B-nl/resolve/main/config.json",
    "Salesforce/codegen-2B-multi": "https://huggingface.co/Salesforce/codegen-2B-multi/resolve/main/config.json",
    "Salesforce/codegen-2B-mono": "https://huggingface.co/Salesforce/codegen-2B-mono/resolve/main/config.json",
    "Salesforce/codegen-6B-nl": "https://huggingface.co/Salesforce/codegen-6B-nl/resolve/main/config.json",
    "Salesforce/codegen-6B-multi": "https://huggingface.co/Salesforce/codegen-6B-multi/resolve/main/config.json",
    "Salesforce/codegen-6B-mono": "https://huggingface.co/Salesforce/codegen-6B-mono/resolve/main/config.json",
    "Salesforce/codegen-16B-nl": "https://huggingface.co/Salesforce/codegen-16B-nl/resolve/main/config.json",
    "Salesforce/codegen-16B-multi": "https://huggingface.co/Salesforce/codegen-16B-multi/resolve/main/config.json",
    "Salesforce/codegen-16B-mono": "https://huggingface.co/Salesforce/codegen-16B-mono/resolve/main/config.json",
}
# CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP maps each CodeGen checkpoint name to the URL of its config.json

class CodeGenConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a
    CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the CodeGen
    [Salesforce/codegen-2B-mono](https://huggingface.co/Salesforce/codegen-2B-mono) architecture. Configuration
    objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation
    from [`PretrainedConfig`] for more information.
    """

    # Constructor: stores the architecture hyper-parameters on the configuration object
    def __init__(
        self,
        vocab_size=50400,
        n_positions=2048,
        n_ctx=2048,
        n_embd=4096,
        n_layer=28,
        n_head=16,
        rotary_dim=64,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attn_pdrop=0.0,
        layer_norm_epsilon=1e-05,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_ctx = n_ctx
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.rotary_dim = rotary_dim
        self.n_inner = n_inner if n_inner is not None else 4 * n_embd
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # The token ids and the embedding-tying flag are forwarded to the parent constructor
        super().__init__(
            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
        )

Usage example for the configuration class:

>>> from transformers import CodeGenConfig, CodeGenModel

>>> # Initialize a CodeGenConfig object that configures the CodeGen model's parameters
>>> configuration = CodeGenConfig()

>>> # Initialize a CodeGenModel (with randomly initialized weights) from that configuration
>>> model = CodeGenModel(configuration)

>>> # Read the model's configuration back into the configuration variable
>>> configuration = model.config
# Copied from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig; inherits from OnnxConfigWithPast
class CodeGenOnnxConfig(OnnxConfigWithPast):
    # Constructor: takes a pretrained configuration object plus optional export settings
    def __init__(
        self,
        config: PretrainedConfig,
        task: str = "default",
        patching_specs: List[PatchingSpec] = None,
        use_past: bool = False,
    ):
        # Call the parent OnnxConfigWithPast constructor
        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)

        # If the configuration has no pad_token_id, default it to 0
        if not getattr(self._config, "pad_token_id", None):
            # TODO: how to do that better?
            self._config.pad_token_id = 0

    # Describes the model inputs and their dynamic axes for ONNX export
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # Common input layout: input_ids indexed by batch and sequence
        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})

        # When past key values are used, add the past inputs and extend the attention mask axis
        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")
            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
        else:
            # Otherwise the attention mask only covers the current sequence
            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}

        return common_inputs

    # Number of layers, taken from the configuration's n_layer attribute
    @property
    def num_layers(self) -> int:
        return self._config.n_layer

    # Number of attention heads, taken from the configuration's n_head attribute
    @property
    def num_attention_heads(self) -> int:
        return self._config.n_head

    # Generates dummy inputs for export, given a tokenizer and the desired shapes
    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        # Let the grandparent implementation build the common inputs (tokenized text with the requested
        # batch size, sequence length, pair flag and framework)
        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        # Order the inputs to match the forward() signature
        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})

        # When past key values are requested
        if self.use_past:
            # PyTorch is required to build the dummy past tensors
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch

                # Read the batch size and sequence length from the dummy input ids
                batch, seqlen = common_inputs["input_ids"].shape
                # The past length is the sequence length plus 2, to leave some headroom
                past_key_values_length = seqlen + 2
                # Shape of each past key/value tensor
                past_shape = (
                    batch,
                    self.num_attention_heads,
                    past_key_values_length,
                    self._config.hidden_size // self.num_attention_heads,
                )
                # One (key, value) pair of zero tensors per layer
                ordered_inputs["past_key_values"] = [
                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
                ]

        # Add the common attention mask to the ordered inputs
        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]

        # When past key values are requested
        if self.use_past:
            # Match the dtype of the existing attention mask
            mask_dtype = ordered_inputs["attention_mask"].dtype
            # Extend the attention mask with ones covering the past positions
            ordered_inputs["attention_mask"] = torch.cat(
                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
            )

        # Return the final ordered input dict
        return ordered_inputs

    @property
    def default_onnx_opset(self) -> int:
        # Default ONNX opset version used for export
        return 13
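
The following sketch shows the dummy-input generation in isolation. It assumes a small hand-built `CodeGenConfig`, the tokenizer from the `Salesforce/codegen-350M-mono` checkpoint, and PyTorch tensors as the target framework; it is meant to illustrate the shapes involved, not to perform a full ONNX export:

```python
from transformers import AutoTokenizer, TensorType
from transformers.models.codegen.configuration_codegen import CodeGenConfig, CodeGenOnnxConfig

config = CodeGenConfig(n_layer=2, n_head=4, n_embd=64)  # deliberately tiny, for illustration only
onnx_config = CodeGenOnnxConfig(config, task="default", use_past=True)

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
dummy = onnx_config.generate_dummy_inputs(
    tokenizer, batch_size=2, seq_length=8, framework=TensorType.PYTORCH
)

print(list(dummy.keys()))                    # ['input_ids', 'past_key_values', 'attention_mask']
print(dummy["past_key_values"][0][0].shape)  # (batch, heads, seq + 2, head_dim), e.g. [2, 4, 10, 16]
```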

.\models\codegen\modeling_codegen.py

# coding=utf-8
# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch CodeGen model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_codegen import CodeGenConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Salesforce/codegen-2B-mono"
_CONFIG_FOR_DOC = "CodeGenConfig"


CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Salesforce/codegen-350M-nl",
    "Salesforce/codegen-350M-multi",
    "Salesforce/codegen-350M-mono",
    "Salesforce/codegen-2B-nl",
    "Salesforce/codegen-2B-multi",
    "Salesforce/codegen-2B-mono",
    "Salesforce/codegen-6B-nl",
    "Salesforce/codegen-6B-multi",
    "Salesforce/codegen-6B-mono",
    "Salesforce/codegen-16B-nl",
    "Salesforce/codegen-16B-multi",
    "Salesforce/codegen-16B-mono",
    # See all CodeGen models at https://huggingface.co/models?filter=codegen
]


# Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions
# Builds the sinusoidal position table used for the rotary embeddings (num_pos x dim)
def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


# Copied from transformers.models.gptj.modeling_gptj.rotate_every_two
# Rotates every pair of adjacent elements in the last dimension: (x1, x2) -> (-x2, x1)
def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')


# Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb
# Applies the rotary position embedding (interleaved sin/cos) to the input tensor
def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)
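
The three helpers above are easiest to read with concrete shapes. The sketch below imports them from the module listed in this section and applies the rotary embedding to a random query slice; the shapes mirror how `CodeGenAttention.forward` uses them before the final permute:

```python
import torch
from transformers.models.codegen.modeling_codegen import (
    apply_rotary_pos_emb,
    create_sinusoidal_positions,
)

batch, seq_len, num_heads, rotary_dim = 2, 5, 4, 8

# Position table for up to 2048 positions, rotary_dim wide (sin half + cos half)
embed_positions = create_sinusoidal_positions(2048, rotary_dim)      # (2048, rotary_dim)
position_ids = torch.arange(seq_len).unsqueeze(0).repeat(batch, 1)   # (batch, seq_len)

sincos = embed_positions[position_ids]                               # (batch, seq_len, rotary_dim)
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

query_rot = torch.randn(batch, seq_len, num_heads, rotary_dim)
rotated = apply_rotary_pos_emb(query_rot, sin, cos)
print(rotated.shape)  # torch.Size([2, 5, 4, 8]) -- same shape, with position information mixed in
```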


class CodeGenAttention(nn.Module):
    # Constructor: takes the model configuration
    def __init__(self, config):
        # Initialize the parent nn.Module
        super().__init__()

        # Maximum number of positions supported by the model
        max_positions = config.max_position_embeddings
        # Register a (non-persistent) buffer holding the causal mask
        self.register_buffer(
            "causal_mask",
            # Lower-triangular boolean matrix used as the causal mask
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )

        # Dropout applied to the attention weights
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        # Dropout applied to the residual projection
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # Embedding dimension
        self.embed_dim = config.hidden_size
        # Number of attention heads
        self.num_attention_heads = config.num_attention_heads
        # Dimension of each attention head
        self.head_dim = self.embed_dim // self.num_attention_heads
        # The embedding dimension must be divisible by the number of heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        # Scaling factor applied to the attention scores
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
        # Fused query/key/value projection
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)

        # Output projection
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        # Use the configured rotary dimension if set, otherwise apply rotary embeddings to the full embedding dimension
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        # Sinusoidal position table for the rotary embeddings
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    # Splits the projected tensor into attention heads
    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    # Merges the separate attention heads back into a single tensor
    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        # Permute the dimensions depending on the tensor rank
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        # Reshape so the head and head-size dimensions are fused into one
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    # Core attention computation
    def _attn(
        self,
        query,
        key,
        value,
        attention_mask=None,
        head_mask=None,
    ):
        # Compute the causal mask slice from the registered causal mask buffer
        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length]

        # Cast the query and key to float32 to avoid overflow issues
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        # Raw attention scores
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        # Scale the attention scores
        attn_weights = attn_weights / self.scale_attn

        # Build the mask value with the right dtype and device to avoid mismatch errors
        mask_value = torch.finfo(attn_weights.dtype).min
        mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)

        # Apply the causal mask
        attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the additional attention mask
            attn_weights = attn_weights + attention_mask

        # Normalize the attention scores with softmax
        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # Cast the attention weights back to the value tensor's dtype
        attn_weights = attn_weights.to(value.dtype)

        # Apply attention dropout
        attn_weights = self.attn_dropout(attn_weights)

        # Apply the head mask if one is given
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # Final attention output
        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights
    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
        # Project hidden states to the fused query/key/value representation
        qkv = self.qkv_proj(hidden_states)
        # Number of logical TPU-v4 cores assumed by the original implementation
        # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic
        mp_num = 4
        # Reshape the fused QKV tensor to (batch_size, seq_length, mp_num, local_dim)
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        # Size of each per-shard chunk
        local_dim = self.head_dim * self.num_attention_heads // mp_num
        # Split the fused tensor into query, value and key along the local dimension
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        # Split the query into attention heads
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        # Split the key into attention heads
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        # Split the value into attention heads
        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        # Rearrange the value to (batch_size, num_heads, seq_length, head_dim)
        value = value.permute(0, 2, 1, 3)

        # Sinusoidal position table
        embed_positions = self.embed_positions
        # Move the table to the device of position_ids if they differ
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        # Gather the sin/cos rows corresponding to the position ids
        sincos = embed_positions[position_ids]
        # Split the last dimension into the sin half and the cos half
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        # If a rotary dimension is configured, apply the rotary embedding only to that slice of key and query
        if self.rotary_dim is not None:
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            # Re-join the rotated slice with the untouched remainder
            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            # Otherwise apply the rotary embedding to the whole key and query
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        # Rearrange key and query to (batch_size, num_heads, seq_length, head_dim)
        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        # If cached key/value states exist, prepend them to the current key and value
        if layer_past is not None:
            past_key = layer_past[0]
            past_value = layer_past[1]
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        # When caching, return the present key/value for the next decoding step
        if use_cache is True:
            # Note that this cast is quite ugly, but in the original codebase k_rot is always in fp32
            # Reference: https://github.com/salesforce/CodeGen/blob/f210c3bb1216c975ad858cd4132c0fdeabf4bfc2/codegen1/jaxformer/hf/codegen/modeling_codegen.py#L38
            present = (key.to(hidden_states.dtype), value)
        else:
            present = None

        # Run the attention computation to get the output and the attention weights
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        # Merge the per-head outputs back into a single tensor
        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        # Apply the output projection
        attn_output = self.out_proj(attn_output)
        # Apply residual dropout
        attn_output = self.resid_dropout(attn_output)

        # Assemble the outputs, optionally including the attention weights
        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # (attn_output, present, (attentions))
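
The causal masking in `_attn` is driven entirely by the lower-triangular buffer registered in `__init__`; the slice `[key_length - query_length : key_length, :key_length]` lines the mask up with however many cached key positions precede the new queries. A small standalone sketch of that slicing:

```python
import torch

max_positions = 6
causal_mask = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
    1, 1, max_positions, max_positions
)

# e.g. 3 tokens already in the cache, 2 new query tokens -> key_length 5, query_length 2
query_length, key_length = 2, 5
mask = causal_mask[:, :, key_length - query_length : key_length, :key_length]
print(mask[0, 0].int())
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
```
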
# Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->CodeGen
class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        # Input projection: embed_dim -> intermediate_size
        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        # Output projection: intermediate_size -> embed_dim
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        # Activation function selected by name from the configuration
        self.act = ACT2FN[config.activation_function]
        # Dropout with the residual dropout probability from the configuration
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        # Input projection
        hidden_states = self.fc_in(hidden_states)
        # Activation
        hidden_states = self.act(hidden_states)
        # Output projection
        hidden_states = self.fc_out(hidden_states)
        # Dropout
        hidden_states = self.dropout(hidden_states)
        return hidden_states


# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen
class CodeGenBlock(nn.Module):
    # Ignore copy
    def __init__(self, config):
        super().__init__()
        # Inner MLP dimension; defaults to 4x the embedding dimension when not configured
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        # LayerNorm applied to the block input
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        # Self-attention module
        self.attn = CodeGenAttention(config)
        # Feed-forward (MLP) module
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        # Keep the input for the residual connection
        residual = hidden_states
        # Normalize the input with LayerNorm
        hidden_states = self.ln_1(hidden_states)
        # Run self-attention on the normalized input
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # First element is the attention output
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        # MLP branch, computed from the same normalized input (parallel to the attention branch)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # Add both branches onto the residual
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            # With caching, return the hidden states followed by present (and attentions)
            outputs = (hidden_states,) + outputs
        else:
            # Without caching, drop the (None) present entry
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)
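
Note that, unlike GPT-2, the attention and MLP branches here read the *same* LayerNorm output and are summed with the residual in one step (the GPT-J style "parallel" block). A toy stand-in, assuming plain linear layers in place of the real attention and MLP, makes the wiring explicit:

```python
import torch
from torch import nn


class ToyParallelBlock(nn.Module):
    """Minimal sketch of the parallel-residual wiring used by CodeGenBlock (not the real block)."""

    def __init__(self, dim):
        super().__init__()
        self.ln = nn.LayerNorm(dim)
        self.attn = nn.Linear(dim, dim)  # stand-in for CodeGenAttention
        self.mlp = nn.Linear(dim, dim)   # stand-in for CodeGenMLP

    def forward(self, x):
        h = self.ln(x)
        # Both branches see the same normalized input; their sum is added to the untouched residual
        return self.attn(h) + self.mlp(h) + x


x = torch.randn(2, 5, 16)
print(ToyParallelBlock(16)(x).shape)  # torch.Size([2, 5, 16])
```
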
class CodeGenPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    # Keys to skip during device placement (the cached key/value tensors)
    _skip_keys_device_placement = "past_key_values"

    # Constructor: forwards any positional and keyword arguments to the parent class
    def __init__(self, *inputs, **kwargs):
        # Call the parent constructor
        super().__init__(*inputs, **kwargs)

    # Weight initialization
    def _init_weights(self, module):
        """Initialize the weights."""
        # Linear layers
        if isinstance(module, (nn.Linear,)):
            # Normal initialization with mean 0 and the configured standard deviation
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Zero the bias if present
            if module.bias is not None:
                module.bias.data.zero_()
        # Embedding layers
        elif isinstance(module, nn.Embedding):
            # Normal initialization with mean 0 and the configured standard deviation
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Zero the row corresponding to the padding index, if one is set
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # LayerNorm layers
        elif isinstance(module, nn.LayerNorm):
            # Zero the bias
            module.bias.data.zero_()
            # Set the weight to 1
            module.weight.data.fill_(1.0)
# Docstring prepended to the model classes: it explains that the model is a torch.nn.Module subclass and should be
# used like a regular PyTorch module, with a link to the PyTorch documentation.
CODEGEN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CodeGenConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring describing the inputs accepted by CodeGen models; the `{0}` placeholder is filled in with the
# expected input shape when this docstring is attached to a model's `forward` method.
CODEGEN_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
            `[0, config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare CodeGen Model transformer outputting raw hidden-states without any specific head on top.",
    CODEGEN_START_DOCSTRING,
)
class CodeGenModel(CodeGenPreTrainedModel):
    """
    Implementing a transformer model for code generation without additional task-specific heads.

    Args:
        config: The configuration class for model initialization.

    Attributes:
        embed_dim (int): Dimensionality of the embedding layer.
        vocab_size (int): Size of the vocabulary.
        wte (nn.Embedding): Embedding layer to convert input tokens to embeddings.
        drop (nn.Dropout): Dropout layer for regularization.
        h (nn.ModuleList): List of CodeGenBlock modules representing transformer layers.
        ln_f (nn.LayerNorm): Layer normalization for final layer.
        rotary_dim (int): Dimension for rotary position encodings.
        gradient_checkpointing (bool): Whether to use gradient checkpointing during training.

    Methods:
        get_input_embeddings(): Returns the input embedding layer.
        set_input_embeddings(new_embeddings): Sets new input embeddings for the model.
        forward(...): Performs forward pass through the model with various input tensors.
    """

    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()
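
As a quick sanity check of the constructor above, a tiny configuration is enough to instantiate the bare model and run a forward pass; the hyper-parameter values below are made up for illustration only:

```
import torch
from transformers import CodeGenConfig, CodeGenModel

config = CodeGenConfig(
    vocab_size=128, n_positions=128, n_ctx=128, n_embd=64, n_layer=2, n_head=4, rotary_dim=16
)
model = CodeGenModel(config)

input_ids = torch.randint(0, config.vocab_size, (1, 10))
outputs = model(input_ids=input_ids)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 10, 64])
```
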


@add_start_docstrings(
    """
    The CodeGen Model transformer with a language modeling head on top.
    """,
    CODEGEN_START_DOCSTRING,
)
class CodeGenForCausalLM(CodeGenPreTrainedModel):
    """
    Extended transformer model for code generation with an added language modeling head.

    Args:
        config: The configuration class for model initialization.

    Attributes:
        transformer (CodeGenModel): Instance of the base CodeGenModel transformer.
        lm_head (nn.Linear): Linear layer for language modeling predictions.

    Methods:
        get_output_embeddings(): Returns the output embedding layer.
        set_output_embeddings(new_embeddings): Sets new output embeddings for the model.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # If a cache is available, only the tokens that have not been processed yet need to be fed to the model
        if past_key_values:
            # Length of the cached sequence (number of positions already processed)
            past_length = past_key_values[0][0].shape[2]

            # If input_ids is longer than the cache, drop the prefix that is already cached
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default behavior: keep only the final input ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]
            # Keep token_type_ids aligned with the truncated input_ids
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -input_ids.shape[1] :]

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        # Create position_ids on the fly from the attention mask for batched generation
        if attention_mask is not None and position_ids is None:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            # Keep position_ids aligned with the truncated input_ids when a cache is present
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # Return the prepared inputs
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }
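
The position-id logic above can be checked in isolation; a small sketch with a left-padded batch (the mask values are illustrative):

```
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```

Padding positions receive a dummy value (they are masked out anyway), while real tokens are numbered from 0, so generation with left-padded batches stays consistent.
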

    @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        # Fall back to the config setting when return_dict is not explicitly given
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the base transformer on the inputs
        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        # Make sure sampling in fp16 works correctly and compute the loss in fp32, to match the mesh-tf version
        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens and compute the cross-entropy loss
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            # Cast the loss back to the hidden-state dtype
            loss = loss.to(hidden_states.dtype)

        # Without return_dict, pack the outputs into a tuple (prepending the loss when it exists)
        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # With return_dict, wrap everything in a CausalLMOutputWithPast
        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
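
The shift-by-one trick used for the loss can be illustrated with a toy tensor (shapes and values are made up):

```
import torch
from torch.nn import CrossEntropyLoss

lm_logits = torch.randn(1, 4, 10)      # (batch, seq_len, vocab_size)
labels = torch.tensor([[2, 5, 7, 1]])  # labels == input_ids

shift_logits = lm_logits[..., :-1, :].contiguous()  # predictions at positions 0..2
shift_labels = labels[..., 1:].contiguous()         # targets are the tokens at positions 1..3
loss = CrossEntropyLoss()(shift_logits.view(-1, 10), shift_labels.view(-1))
print(loss.item())
```
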

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )

.\models\codegen\tokenization_codegen.py

# coding=utf-8
# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for CodeGen"""

# Standard library and third-party imports used by the tokenizer
import json
import os
from functools import lru_cache
from typing import TYPE_CHECKING, List, Optional, Tuple, Union

import numpy as np
import regex as re

# Tokenizer base classes, logging utilities and framework-availability helpers
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import is_tf_available, is_torch_available, logging, to_py_obj

# Type-checking-only imports, depending on which deep learning framework is available
if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf

# Logger for this module
logger = logging.get_logger(__name__)

# File names used to store the vocabulary and the BPE merges
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Mapping from pretrained checkpoints to their vocabulary file URLs
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/vocab.json",
    },
    "merges_file": {
        "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/merges.txt",
    },
}

# Maximum sequence lengths (positional embedding sizes) of the pretrained checkpoints
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "Salesforce/codegen-350M-mono": 2048,
}

# LRU-cached helper that maps raw byte values to printable unicode characters
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

# Helper returning the set of adjacent symbol pairs in a word
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
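
Assuming the two helpers above are in scope, a quick check of what they produce (the exact set ordering of `get_pairs` may differ from run to run):

```
mapping = bytes_to_unicode()
print(len(mapping))       # 256 -- every byte value maps to a printable unicode character
print(mapping[ord(" ")])  # 'Ġ' -- the space byte is remapped to a visible character

print(get_pairs(("l", "o", "w", "er")))
# {('l', 'o'), ('o', 'w'), ('w', 'er')}
```
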

# Byte-level BPE tokenizer for CodeGen, inheriting from PreTrainedTokenizer
class CodeGenTokenizer(PreTrainedTokenizer):
    # CodeGenTokenizer performs byte-level Byte-Pair-Encoding (BPE) tokenization.
    #
    # The tokenizer was trained to treat spaces as part of the tokens (much like sentencepiece), so the same word is
    # encoded differently depending on whether it appears at the beginning of the sentence (without a leading space)
    # or not.
    #
    # This behavior can be changed by passing `add_prefix_space=True` when instantiating or calling the tokenizer,
    # but since the model was not pretrained that way, it may hurt performance.
    #
    # When `is_split_into_words=True` is set, the tokenizer adds a space before every word, including the first one.
    #
    # CodeGenTokenizer inherits from PreTrainedTokenizer, which contains most of the main methods; refer to that
    # superclass for more information on those methods.
    #
    # Constructor arguments include:
    # - vocab_file (`str`): path to the vocabulary file.
    # - merges_file (`str`): path to the merges file.
    # - errors (`str`, *optional*, defaults to `"replace"`): how to handle errors when decoding bytes to UTF-8.
    # - unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): token used for out-of-vocabulary tokens.
    # - bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): beginning-of-sequence token.
    # - eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): end-of-sequence token.
    # - pad_token (`str`, *optional*): padding token, e.g. when batching sequences of different lengths.
    # - add_prefix_space (`bool`, *optional*, defaults to `False`): whether to add a leading space so the first word
    #   is treated like any other word.
    # - add_bos_token (`bool`, *optional*, defaults to `False`): whether to add a beginning-of-sequence token.
    vocab_files_names = VOCAB_FILES_NAMES  # names of the vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # mapping of pretrained vocabulary files
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # maximum input sizes of the pretrained checkpoints
    model_input_names = ["input_ids", "attention_mask"]  # names of the model inputs
    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
        self.add_bos_token = add_bos_token

        # 打开并读取词汇文件,使用 UTF-8 编码
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        # 创建词汇的反向映射,从索引到词汇的映射
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # 用于处理解码中的错误
        # 创建字节到 Unicode 的编码器和解码器
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        
        # 打开并读取 BPE 合并文件,使用 UTF-8 编码
        with open(merges_file, encoding="utf-8") as merges_handle:
            # 读取文件内容并解析 BPE 合并规则
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        # 创建 BPE 合并的排序字典
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # 初始化缓存
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        # 设置正则表达式模式,用于标记化文本
        # 应该添加 re.IGNORECASE 以便对大小写不敏感的情况进行 BPE 合并
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        
        # 调用父类的初始化方法,传递参数和关键字参数
        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        # 返回词汇表的大小
        return len(self.encoder)

    def get_vocab(self):
        # 返回词汇表及其扩展标记的编码器
        return dict(self.encoder, **self.added_tokens_encoder)
    def _tokenize(self, text):
        """Tokenize a string."""
        # 初始化空列表,用于存储分词后的结果
        bpe_tokens = []
        # 使用正则表达式找到文本中的所有匹配项,并遍历每个匹配到的 token
        for token in re.findall(self.pat, text):
            # 将 token 编码为 UTF-8 字节,并通过 byte_encoder 映射为 Unicode 字符串,
            # 避免 BPE 中的控制标记(在这里是空格)
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            # 对经过 BPE 处理后的 token 进行拆分,并添加到 bpe_tokens 中
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        # 返回 BPE 处理后的 token 列表
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # 使用 encoder 字典将 token 转换为对应的 id,如果 token 不存在则使用 unk_token
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # 使用 decoder 字典将 index 转换为对应的 token
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # 将 tokens 列表中的所有 token 连接成一个字符串
        text = "".join(tokens)
        # 将 UTF-8 字节序列转换为字符串,使用 byte_decoder 进行解码,处理可能的编码错误
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        # 返回转换后的字符串
        return text
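
Putting `_tokenize`, `_convert_token_to_id` and `convert_tokens_to_string` together, a round trip through the byte-level BPE looks roughly like the sketch below (it requires downloading the `Salesforce/codegen-350M-mono` checkpoint):

```
from transformers import CodeGenTokenizer

tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

ids = tokenizer("def hello_world():")["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens)                                      # byte-level BPE pieces, spaces rendered as 'Ġ'
print(tokenizer.convert_tokens_to_string(tokens))  # "def hello_world():"
```
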
    # 保存词汇表到指定目录下的文件中
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 检查保存目录是否存在,如果不存在则记录错误并返回
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # 构建词汇表文件路径,包括可选的前缀和文件名
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        # 构建合并文件路径,包括可选的前缀和文件名
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        # 将编码器中的内容以 JSON 格式写入词汇表文件
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        # 将 BPE 标记和它们的索引写入合并文件
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # 记录警告信息,提示 BPE 合并索引不是连续的,可能的损坏
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        # 返回保存的词汇表文件路径和合并文件路径
        return vocab_file, merge_file


    # Prepare text before tokenization
    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        # Use the add_prefix_space setting from kwargs, falling back to the instance default
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        # Add a leading space when the input is pre-split into words or when a prefix space is requested
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)


    # 解码操作,将标记 ID 转换为文本
    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        truncate_before_pattern: Optional[List[str]] = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
                A list of regular expression strings that will be used to truncate the returned string. This can be
                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """

        # Convert `token_ids` to Python object
        token_ids = to_py_obj(token_ids)

        # Decode the `token_ids` into text using inherited `_decode` method
        decoded_text = super()._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        # Truncate the decoded text based on `truncate_before_pattern` if specified
        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
            decoded_text = self.truncate(decoded_text, truncate_before_pattern)

        # Return the decoded text as a string
        return decoded_text

    def truncate(self, completion, truncate_before_pattern):
        """
        Truncates `completion` text based on specified `truncate_before_pattern`.

        Args:
            completion (str): The text to truncate.
            truncate_before_pattern (List[str]): List of regular expressions to determine truncation points.

        Returns:
            str: The truncated text.
        """

        def find_re(string, pattern, start_pos):
            """
            Helper function to find the position of a pattern in a string.

            Args:
                string (str): The string to search within.
                pattern (Pattern): The compiled regular expression pattern.
                start_pos (int): The starting position of the search.

            Returns:
                int: The position of the pattern in the string, or -1 if not found.
            """
            m = pattern.search(string, start_pos)
            return m.start() if m else -1

        # Compile regular expression patterns for each element in `truncate_before_pattern`
        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]

        # Find all occurrences of "^print" in `completion` and limit to the second occurrence
        prints = list(re.finditer("^print", completion, re.MULTILINE))

        if len(prints) > 1:
            completion = completion[: prints[1].start()]

        # Find all occurrences of "^def" in `completion` and limit to the second occurrence
        defs = list(re.finditer("^def", completion, re.MULTILINE))

        if len(defs) > 1:
            completion = completion[: defs[1].start()]

        start_pos = 0

        # Find positions of all patterns in `truncate_before_pattern` within `completion`
        terminals_pos = [
            pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
        ]

        # Return `completion` truncated before the smallest found position, or as is if no positions found
        if len(terminals_pos) > 0:
            return completion[: min(terminals_pos)]
        else:
            return completion
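
A short usage sketch of `decode` with `truncate_before_pattern`: the generated completion is cut at the first line starting with a comment marker (again assuming the `Salesforce/codegen-350M-mono` checkpoint is available):

```
import re
from transformers import CodeGenTokenizer

tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

generated = "def add(a, b):\n    return a + b\n\n# example call\nprint(add(1, 2))\n"
ids = tokenizer(generated)["input_ids"]

print(tokenizer.decode(ids, truncate_before_pattern=["^#", re.escape("<|endoftext|>")]))
# def add(a, b):
#     return a + b
```
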

.\models\codegen\tokenization_codegen_fast.py

# 导入所需的模块和库
import json  # 导入处理 JSON 格式数据的模块
import re  # 导入正则表达式模块,用于文本处理
from typing import TYPE_CHECKING, List, Optional, Tuple, Union  # 导入类型提示相关模块

import numpy as np  # 导入处理数组数据的 NumPy 库

# 导入日志记录模块
from ...utils import is_tf_available, is_torch_available, logging

# 检查类型注解,确定是否导入 torch 或 tensorflow 相关模块
if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf

# 导入 tokenizers 库中的预处理模块
from tokenizers import pre_tokenizers

# 导入基础的 tokenization_utils_base 模块中的 BatchEncoding 类
from ...tokenization_utils_base import BatchEncoding

# 导入 tokenization_utils_fast 模块中的 PreTrainedTokenizerFast 类
from ...tokenization_utils_fast import PreTrainedTokenizerFast

# 导入本地的 tokenization_codegen 模块中的 CodeGenTokenizer 类
from .tokenization_codegen import CodeGenTokenizer

# 获取日志记录器
logger = logging.get_logger(__name__)

# 定义词汇文件的名称映射字典
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# 定义预训练模型的词汇文件映射字典
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/vocab.json",
    },
    "merges_file": {
        "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/merges.txt",
    },
    "tokenizer_file": {
        "Salesforce/codegen-350M-mono": (
            "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/tokenizer.json"
        ),
    },
}

# 定义预训练模型的位置编码大小映射字典
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "Salesforce/codegen-350M-mono": 2048,
}


class CodeGenTokenizerFast(PreTrainedTokenizerFast):
    """
    构建一个“快速”CodeGen分词器(由HuggingFace的*tokenizers*库支持)。基于字节级的 Byte-Pair-Encoding。

    这个分词器经过训练,将空格视为标记的一部分(类似于sentencepiece),因此一个单词的编码方式会因其是否位于句子开头而不同(没有空格或有空格):

    ```
    >>> from transformers import CodeGenTokenizerFast

    >>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono")
    >>> tokenizer("Hello world")["input_ids"]
    [15496, 995]

    >>> tokenizer(" Hello world")["input_ids"]
    [18435, 995]
    ```

    如果在实例化分词器时传入 `add_prefix_space=True`,可以避免这种行为,但由于模型未以这种方式进行预训练,可能会降低性能。

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from `PreTrainedTokenizerFast` which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension)
            that contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (The CodeGen tokenizer detects the beginning of words by the preceding space.)
    """

    # Names of the vocabulary files
    vocab_files_names = VOCAB_FILES_NAMES
    # Mapping of pretrained vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Maximum input sizes of the pretrained checkpoints
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Names of the model inputs
    model_input_names = ["input_ids", "attention_mask"]
    # Corresponding slow tokenizer class
    slow_tokenizer_class = CodeGenTokenizer

    # 初始化方法
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        # 调用父类的构造函数,初始化一个新的实例
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        # Adding a BOS token is not supported by the fast tokenizer, so raise a helpful error
        if kwargs.pop("add_bos_token", False):
            model_id = kwargs.pop("name_or_path", "")
            raise ValueError(
                "Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
                "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
                f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
                f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
                "This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005."
                " so that the fast tokenizer works correctly."
            )

        # 获取当前预处理器的状态并将其转换为 JSON 格式
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        # 如果预处理器的 "add_prefix_space" 参数与当前实例中的 add_prefix_space 不一致,则更新预处理器的状态
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        # 设置实例的 add_prefix_space 属性
        self.add_prefix_space = add_prefix_space

    # 重写父类的 _batch_encode_plus 方法,返回 BatchEncoding 对象
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # 获取是否已经将输入拆分为单词的参数,默认为 False
        is_split_into_words = kwargs.get("is_split_into_words", False)
        # 断言如果 add_prefix_space 为 True 或者未将输入拆分为单词,则抛出错误
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        # 调用父类的 _batch_encode_plus 方法并返回结果
        return super()._batch_encode_plus(*args, **kwargs)

    # 重写父类的 _encode_plus 方法,返回 BatchEncoding 对象
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # 获取是否已经将输入拆分为单词的参数,默认为 False
        is_split_into_words = kwargs.get("is_split_into_words", False)

        # 断言如果 add_prefix_space 为 True 或者未将输入拆分为单词,则抛出错误
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        # 调用父类的 _encode_plus 方法并返回结果
        return super()._encode_plus(*args, **kwargs)

    # 保存词汇表到指定目录
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 调用 Tokenizer 的 model.save 方法保存模型文件到指定目录,并返回文件名的元组
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    # 解码 token_ids 到原始文本
    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        truncate_before_pattern: Optional[List[str]] = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
                A list of regular expression strings that will be used to truncate the returned string. This can be
                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """

        # 使用继承自父类的方法 `decode` 对 token_ids 进行解码
        decoded_text = super().decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        # 如果指定了 `truncate_before_pattern`,则根据正则表达式列表进行截断
        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
            decoded_text = self.truncate(decoded_text, truncate_before_pattern)

        # 返回解码后的文本
        return decoded_text

    def truncate(self, completion, truncate_before_pattern):
        # 内部函数,用于在字符串中查找正则表达式的位置
        def find_re(string, pattern, start_pos):
            m = pattern.search(string, start_pos)
            return m.start() if m else -1

        # 编译正则表达式列表为多行模式的正则对象
        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]

        # 查找代码字符串中以 "^print" 开头的所有位置
        prints = list(re.finditer("^print", completion, re.MULTILINE))

        # 如果找到多于一个 "^print" 开头的位置,则截断字符串到第二个 "^print" 之前
        if len(prints) > 1:
            completion = completion[: prints[1].start()]

        # 查找代码字符串中以 "^def" 开头的所有位置
        defs = list(re.finditer("^def", completion, re.MULTILINE))

        # 如果找到多于一个 "^def" 开头的位置,则截断字符串到第二个 "^def" 之前
        if len(defs) > 1:
            completion = completion[: defs[1].start()]

        start_pos = 0

        # 查找代码字符串中所有 `truncate_before_pattern` 匹配的位置
        terminals_pos = [
            pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
        ]

        # 如果找到任何一个 `truncate_before_pattern` 的位置,则截断字符串到最小的位置处
        if len(terminals_pos) > 0:
            return completion[: min(terminals_pos)]
        else:
            return completion

.\models\codegen\__init__.py

# 版权声明及许可信息,指出本代码的所有权和使用许可
#
# 根据 Apache 许可证 2.0 版本授权,除非符合许可证,否则禁止使用本文件
# 您可以在以下网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则不得以任何形式分发本软件
# 本软件基于"按原样"提供,没有任何明示或暗示的担保或条件
# 请参阅许可证,了解详细的法律条文和限制条件
from typing import TYPE_CHECKING

# 导入所需模块和函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# 定义模块的导入结构,包含各个子模块和类的映射关系
_import_structure = {
    "configuration_codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenOnnxConfig"],
    "tokenization_codegen": ["CodeGenTokenizer"],
}

# 尝试导入 tokenizers,若不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若导入成功,添加对应的快速 tokenization_codegen_fast 模块
    _import_structure["tokenization_codegen_fast"] = ["CodeGenTokenizerFast"]

# 尝试导入 torch,若不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若导入成功,添加 modeling_codegen 模块及其内容
    _import_structure["modeling_codegen"] = [
        "CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "CodeGenForCausalLM",
        "CodeGenModel",
        "CodeGenPreTrainedModel",
    ]

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 导入配置、tokenizer 及模型相关的类和映射
    from .configuration_codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenOnnxConfig
    from .tokenization_codegen import CodeGenTokenizer

    # 尝试导入 tokenizers,若不可用则不进行导入
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若导入成功,导入 tokenization_codegen_fast 模块中的类
        from .tokenization_codegen_fast import CodeGenTokenizerFast

    # 尝试导入 torch,若不可用则不进行导入
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若导入成功,导入 modeling_codegen 模块中的类
        from .modeling_codegen import (
            CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST,
            CodeGenForCausalLM,
            CodeGenModel,
            CodeGenPreTrainedModel,
        )

# 非类型检查模式下,使用 LazyModule 实现懒加载模块
else:
    import sys

    # 将当前模块设置为 LazyModule,以便按需加载各个子模块和类
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
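
In practice the lazy module makes `import transformers.models.codegen` cheap; the heavy submodules are only imported when one of their attributes is first accessed, for example:

```
from transformers.models.codegen import CodeGenConfig, CodeGenForCausalLM  # triggers the lazy import

config = CodeGenConfig(vocab_size=128, n_positions=128, n_ctx=128, n_embd=64, n_layer=2, n_head=4, rotary_dim=16)
model = CodeGenForCausalLM(config)
print(type(model).__name__)  # CodeGenForCausalLM
```
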

.\models\code_llama\tokenization_code_llama.py

# 设置文件编码为 UTF-8

# 版权声明和版权信息

# 导入必要的模块和库
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

# 导入 sentencepiece 库,用于分词
import sentencepiece as spm

# 导入其他必要的自定义模块和函数
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging, requires_backends

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义词汇文件名映射
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# 预训练模型的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-code-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-code-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}

# 预训练模型的位置编码尺寸映射
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "hf-internal-testing/llama-code-tokenizer": 2048,
}

# Special underline symbol used by SentencePiece tokenization
SPIECE_UNDERLINE = "▁"

# Special markers delimiting instruction turns and the system prompt
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# Default system prompt describing how the model should behave when generating replies
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
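
These markers are the Llama-2 style chat delimiters; one plausible single-turn prompt built from them looks like the sketch below (the actual assembly is done by the tokenizer's chat template, which is not shown in this excerpt):

```
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

system = "You are a helpful assistant."           # in practice DEFAULT_SYSTEM_PROMPT may be used
user = "Write a function that reverses a string."

prompt = f"{B_INST} {B_SYS}{system}{E_SYS}{user} {E_INST}"
print(prompt)
# [INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# Write a function that reverses a string. [/INST]
```
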
class CodeLlamaTokenizer(PreTrainedTokenizer):
    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        prefix_token="▁<PRE>",
        middle_token="▁<MID>",
        suffix_token="▁<SUF>",
        eot_token="▁<EOT>",
        fill_token="<FILL_ME>",
        suffix_first=False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        additional_special_tokens=None,
        use_default_system_prompt=False,
        **kwargs,
    ):
        # 要求依赖 protobuf 库
        requires_backends(self, "protobuf")

        # 如果未提供 sp_model_kwargs,则设置为空字典
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # 如果 bos_token/eos_token/unk_token 是字符串,则创建相应的特殊 token 对象
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token

        # 设置是否使用默认系统提示
        self.use_default_system_prompt = use_default_system_prompt

        # 将特殊标记添加到 additional_special_tokens 列表中,用于跳过它们
        additional_special_tokens = additional_special_tokens or []
        for token in [prefix_token, middle_token, suffix_token, eot_token]:
            additional_special_tokens += [token] if token is not None else []

        # 初始化实例变量
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self._prefix_token = prefix_token
        self._middle_token = middle_token
        self._suffix_token = suffix_token
        self._eot_token = eot_token
        self.fill_token = fill_token
        self.suffix_first = suffix_first

        # 获取 SPM 处理器
        self.sp_model = self.get_spm_processor()

        # 调用父类初始化方法
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            prefix_token=prefix_token,
            middle_token=middle_token,
            suffix_token=suffix_token,
            eot_token=eot_token,
            fill_token=fill_token,
            sp_model_kwargs=self.sp_model_kwargs,
            suffix_first=suffix_first,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            additional_special_tokens=additional_special_tokens,
            use_default_system_prompt=use_default_system_prompt,
            **kwargs,
        )

    @property
    def unk_token_length(self):
        # 返回未知标记 token 的编码长度
        return len(self.sp_model.encode(str(self.unk_token)))
    # 返回一个 SentencePieceProcessor 对象,用于分词处理
    def get_spm_processor(self):
        # 使用给定的 sp_model_kwargs 初始化 SentencePieceProcessor 对象
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        
        # 打开并读取词汇文件,将其内容作为二进制数据加载到内存
        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            
            # 导入 protobuf 模块中的 import_protobuf 函数
            model_pb2 = import_protobuf()
            
            # 使用 protobuf 解析 sp_model,转换为 ModelProto 对象
            model = model_pb2.ModelProto.FromString(sp_model)
            
            # 创建一个 NormalizerSpec 对象,设定 add_dummy_prefix 属性为 False
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            
            # 将创建的 NormalizerSpec 对象合并到 ModelProto 对象的 normalizer_spec 中
            model.normalizer_spec.MergeFrom(normalizer_spec)
            
            # 将修改后的 ModelProto 对象序列化为字符串
            sp_model = model.SerializeToString()
            
            # 从序列化后的 ModelProto 字符串中加载 tokenizer
            tokenizer.LoadFromSerializedProto(sp_model)
        
        # 返回配置好的 tokenizer 对象
        return tokenizer

    @property
    def prefix_token(self):
        return self._prefix_token

    @property
    def prefix_id(self):
        # 如果 _prefix_token 为 None,则返回 None
        if self._prefix_token is None:
            return None
        # 否则,将 _prefix_token 转换为其对应的 id,并返回
        return self.convert_tokens_to_ids(self.prefix_token)

    @property
    def middle_token(self):
        return self._middle_token

    @property
    def middle_id(self):
        # 如果 _middle_token 为 None,则返回 None
        if self._middle_token is None:
            return None
        # 否则,将 _middle_token 转换为其对应的 id,并返回
        return self.convert_tokens_to_ids(self.middle_token)

    @property
    def suffix_token(self):
        return self._suffix_token

    @property
    def suffix_id(self):
        # 如果 _suffix_token 为 None,则返回 None
        if self._suffix_token is None:
            return None
        # 否则,将 _suffix_token 转换为其对应的 id,并返回
        return self.convert_tokens_to_ids(self.suffix_token)

    @property
    def eot_token(self):
        return self._eot_token

    @property
    def eot_id(self):
        # 如果 _eot_token 为 None,则返回 None
        if self._eot_token is None:
            return None
        # 否则,将 _eot_token 转换为其对应的 id,并返回
        return self.convert_tokens_to_ids(self.eot_token)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        # 返回 sp_model 的词汇大小,即词汇表中的词汇数量
        return self.sp_model.get_piece_size()

    # 从 transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab 复制而来
    def get_vocab(self):
        """Returns vocab as a dict"""
        # 创建一个包含所有词汇及其对应 id 的字典 vocab
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        
        # 将 added_tokens_encoder 中的内容更新到 vocab 中
        vocab.update(self.added_tokens_encoder)
        
        # 返回构建好的词汇表字典 vocab
        return vocab
    # Tokenization entry point; also handles the optional infilling `suffix`
    def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
        # If `prefix` contains `self.fill_token` and no `suffix` was given, split it into a prefix and a suffix
        if self.fill_token is not None and self.fill_token in prefix and suffix is None:
            prefix, suffix = prefix.split(self.fill_token)

        # For a non-empty `prefix`, normalize SPIECE_UNDERLINE back to spaces and add the leading prefix marker
        if len(prefix) > 0:
            prefix = SPIECE_UNDERLINE + prefix.replace(SPIECE_UNDERLINE, " ")

        # 如果 `suffix` 为 None 或长度小于 1,则仅使用 `prefix` 进行分词
        if suffix is None or len(suffix) < 1:
            tokens = super().tokenize(prefix, **kwargs)
            # 如果 `tokens` 的长度大于 1 且第一个 token 是 `SPIECE_UNDERLINE`,并且第二个 token 是特殊 token 列表中的一部分,则移除第一个 token
            if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
                tokens = tokens[1:]
            return tokens

        # 对 `prefix` 进行分词,包含额外的 `SPIECE_UNDERLINE`
        prefix_tokens = self._tokenize(prefix)

        # 如果 `prefix_id`, `middle_id`, `suffix_id` 有任一为 None,则抛出 ValueError
        if None in (self.prefix_id, self.middle_id, self.suffix_id):
            raise ValueError(
                "The input either includes a `prefix` and a `suffix` used for the infilling task,"
                f"  or can be split on the {self.fill_token} token, creating a suffix and prefix,"
                " but the model does not support `infilling`."
            )

        # 对 `suffix` 进行分词,确保不会影响 CodeLlama sp 模型的结果
        suffix_tokens = self._tokenize(suffix)

        # 根据 `suffix_first` 参数决定返回的 token 排序顺序
        suffix_first = suffix_first if suffix_first is not None else self.suffix_first
        if suffix_first:
            # 格式化为 " <PRE> <SUF>{suf} <MID> {pre}"
            return [self.prefix_token, self.suffix_token] + suffix_tokens + [self.middle_token] + prefix_tokens
        else:
            # 格式化为 " <PRE> {pre} <SUF>{suf} <MID>"
            return [self.prefix_token] + prefix_tokens + [self.suffix_token] + suffix_tokens + [self.middle_token]
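
A usage sketch of the infilling path above: when the input contains `<FILL_ME>`, `tokenize` splits it into a prefix and a suffix and emits the `<PRE>`/`<SUF>`/`<MID>` markers (requires downloading a Code Llama checkpoint such as `codellama/CodeLlama-7b-hf`):

```
from transformers import CodeLlamaTokenizer

tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

text = "def remove_non_ascii(s: str) -> str:\n    return <FILL_ME>\n"
tokens = tokenizer.tokenize(text)
print(tokens[0])   # '▁<PRE>' -- the prefix marker comes first (with suffix_first=False)
print(tokens[-1])  # '▁<MID>' -- the middle marker closes the infilling prompt
```
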

    # 返回经过分词后的字符串
    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        # 使用 sentencepiece 对文本进行编码,输出为字符串类型的 token 列表
        tokens = self.sp_model.encode(text, out_type=str)
        # 如果 `text` 不以 `SPIECE_UNDERLINE` 或空格开头,则直接返回 tokens
        if not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens
        # 在编码字符串前添加 `unk_token`,然后去除 `unk_token`
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 如果 `tokens` 的长度大于等于 `unk_token_length`,则去除前 `unk_token_length` 个 token;否则返回整个 tokens 列表
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
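
The `unk_token` trick in `_tokenize` can also be seen directly on the underlying SentencePiece model; assuming `tokenizer` is the instance from the previous sketch:

```
# With add_dummy_prefix disabled, encoding " Hey" directly would lose the leading space marker,
# so the text is encoded behind the unk token and the first `unk_token_length` pieces are dropped.
pieces = tokenizer.sp_model.encode(tokenizer.unk_token + " Hey", out_type=str)
print(pieces[tokenizer.unk_token_length:])  # e.g. ['▁Hey']
```
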

    # 从词汇表中将 token 转换为其对应的 id
    # 复制自 transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)
    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # 使用 sentencepiece 模型将索引转换为对应的 token 字符串
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # 因为我们手动添加了前缀空格,所以在解码时需要去除
        if tokens[0].startswith(SPIECE_UNDERLINE):
            # 去除第一个 token 的前缀下划线
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        for _, token in enumerate(tokens):
            # 确保特殊 token 不使用 sentencepiece 模型解码
            if token in self.all_special_tokens:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        # 解码剩余的子 token 并添加到输出字符串中
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            # 如果保存目录不存在,则记录错误并返回
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # 构造输出的词汇表文件路径
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # 如果当前词汇表文件路径与输出路径不同,并且当前文件是一个存在的文件,则复制当前文件到输出路径
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # 如果当前词汇表文件不存在,则将 sentencepiece 模型的序列化内容写入输出文件
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        # Build the input sequence, attaching BOS/EOS tokens when configured
        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            # For a sequence pair, append the second sequence with its own BOS/EOS tokens
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output
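
A small sketch of how `build_inputs_with_special_tokens` behaves with the default settings (`add_bos_token=True`, `add_eos_token=False`), using made-up token ids and the same `tokenizer` as above:

```
ids_a = [306, 5360, 3017]   # arbitrary token ids for illustration
ids_b = [372, 338, 2107]

print(tokenizer.build_inputs_with_special_tokens(ids_a))
# [tokenizer.bos_token_id] + ids_a
print(tokenizer.build_inputs_with_special_tokens(ids_a, ids_b))
# [bos] + ids_a + [bos] + ids_b   (no EOS because add_eos_token defaults to False)
```
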

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Initialize special tokens (BOS and EOS) IDs based on tokenizer settings
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        # If only one sequence is provided
        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        
        # For sequence pairs, concatenate masks for both sequences
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )
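
A small self-contained sketch of the mask logic above, assuming a hypothetical tokenizer configured with `add_bos_token=True` and `add_eos_token=False` (the token ids are arbitrary):

```python
# Illustration only: re-implements the mask construction for the assumed flags.
add_bos_token, add_eos_token = True, False

def special_tokens_mask(token_ids_0, token_ids_1=None):
    bos = [1] if add_bos_token else []
    eos = [1] if add_eos_token else []
    if token_ids_1 is None:
        return bos + [0] * len(token_ids_0) + eos
    return bos + [0] * len(token_ids_0) + eos + bos + [0] * len(token_ids_1) + eos

print(special_tokens_mask([318, 1175, 9]))        # [1, 0, 0, 0]
print(special_tokens_mask([318, 1175], [42, 7]))  # [1, 0, 0, 1, 0, 0]
```
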

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Initialize BOS and EOS token IDs based on tokenizer settings
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        # Initialize output list with zeros for the length of the first sequence with added special tokens
        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        # If there is a second sequence, concatenate its token type IDs after the first sequence
        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    # The `default_chat_template` property (copied from
    # transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template) is defined here;
    # its body is omitted in this walkthrough.

    # Return the object's state dict so that the tokenizer can be pickled
    def __getstate__(self):
        # 复制对象的字典属性,确保状态独立于实例
        state = self.__dict__.copy()
        # 将 sp_model 设为 None,因为不能直接序列化 SentencePieceProcessor 对象
        state["sp_model"] = None
        # 获取序列化后的 sp_model_proto 字符串表示,并存入状态字典
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        # 返回最终的状态字典
        return state

    # 根据给定的状态字典来恢复对象的状态
    def __setstate__(self, d):
        # 直接将对象的状态字典设置为传入的状态字典 d
        self.__dict__ = d
        # 使用 sp_model_kwargs 参数重新创建 sp_model 对象
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        # 从序列化后的 proto 字符串加载 sp_model 的状态
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

.\models\code_llama\tokenization_code_llama_fast.py

# 导入所需的库和模块
import os  # 导入操作系统模块
from shutil import copyfile  # 导入复制文件函数
from typing import List, Optional, Tuple  # 导入类型提示相关的工具

from tokenizers import normalizers, processors  # 导入 tokenizers 库中的规范化和处理器模块

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 从本地库导入预训练的快速分词器
from ...utils import is_sentencepiece_available, logging  # 导入判断是否可用 SentencePiece 和日志模块
from ...utils.versions import require_version  # 导入版本要求函数


require_version("tokenizers>=0.13.3")  # 确保 tokenizers 版本在 0.13.3 或以上

if is_sentencepiece_available():
    from .tokenization_code_llama import CodeLlamaTokenizer  # 如果可用,导入 CodeLlamaTokenizer
else:
    CodeLlamaTokenizer = None  # 否则设为 None

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}  # 词汇文件和分词器文件的名称定义

SPIECE_UNDERLINE = "▁"  # 定义特定的空格字符

B_INST, E_INST = "[INST]", "[/INST]"  # 定义实例开始和结束标记
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"  # 定义系统提示的开始和结束标记


# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on


class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
    """
    构建 Llama 快速分词器。基于字节级别的字节对编码。

    这里特别使用了 ByteFallback 和没有规范化。

    ```
    >>> from transformers import CodeLlamaTokenizerFast

    >>> tokenizer = CodeLlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
    >>> tokenizer.encode("Hello this is a test")
    [1, 15043, 445, 338, 263, 1243]
    ```

    如果要更改 `bos_token` 或 `eos_token`,请在初始化模型时指定它们,或调用 `tokenizer.update_post_processor()` 来确保后处理正确执行
    (否则编码序列的第一个标记和最后一个标记的值将不正确)。有关更多详细信息,请查看 [后处理器文档](https://huggingface.co/docs/tokenizers/api/post-processors)。

    This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should
    refer to that superclass for more information regarding those methods.
    """
    # 定义默认的词汇文件名列表,用于加载模型
    vocab_files_names = VOCAB_FILES_NAMES

    # 指定慢速分词器的类,这里使用 CodeLlamaTokenizer
    slow_tokenizer_class = CodeLlamaTokenizer

    # 指定填充的位置在左侧
    padding_side = "left"

    # 指定模型的输入名称列表,包括输入的标识符和注意力掩码
    model_input_names = ["input_ids", "attention_mask"]
    # 初始化函数,用于初始化一个自定义的Tokenizer对象
    def __init__(
        self,
        vocab_file=None,  # 词汇表文件路径,默认为None
        tokenizer_file=None,  # 分词器文件路径,默认为None
        clean_up_tokenization_spaces=False,  # 是否清理分词后的空格,默认为False
        unk_token="<unk>",  # 未知标记,默认为"<unk>"
        bos_token="<s>",  # 开始标记,默认为"<s>"
        eos_token="</s>",  # 结束标记,默认为"</s>"
        prefix_token="▁<PRE>",  # 前缀标记,默认为"▁<PRE>"
        middle_token="▁<MID>",  # 中间标记,默认为"▁<MID>"
        suffix_token="▁<SUF>",  # 后缀标记,默认为"▁<SUF>"
        eot_token="▁<EOT>",  # 结束标记,默认为"▁<EOT>"
        fill_token="<FILL_ME>",  # 填充标记,默认为"<FILL_ME>"
        additional_special_tokens=None,  # 额外的特殊标记列表,默认为None
        add_bos_token=True,  # 是否添加开始标记,默认为True
        add_eos_token=False,  # 是否添加结束标记,默认为False
        use_default_system_prompt=False,  # 是否使用默认系统提示,默认为False
        **kwargs,
    ):
        # 标记需要特别处理的特殊标记
        additional_special_tokens = additional_special_tokens or []
        for token in [prefix_token, middle_token, suffix_token, eot_token]:
            additional_special_tokens += [token] if token is not None else []
        # 记录是否使用默认系统提示
        self.use_default_system_prompt = use_default_system_prompt

        # 调用父类的初始化方法,传递所有参数
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            additional_special_tokens=additional_special_tokens,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            prefix_token=prefix_token,
            middle_token=middle_token,
            suffix_token=suffix_token,
            eot_token=eot_token,
            fill_token=fill_token,
            use_default_system_prompt=use_default_system_prompt,
            **kwargs,
        )
        
        # 初始化是否添加开始标记和结束标记的标志位
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        
        # 更新后处理器
        self.update_post_processor()

        # 记录词汇表文件路径
        self.vocab_file = vocab_file

        # 记录各种特殊标记的值
        self._prefix_token = prefix_token
        self._middle_token = middle_token
        self._suffix_token = suffix_token
        self._eot_token = eot_token
        self.fill_token = fill_token

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # 检查词汇表文件是否存在,从而判断是否可以保存慢速分词器
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    # 从 transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor 复制
    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        """
        # 获取当前的 `bos_token` 和 `bos_token_id`
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        # 如果 `bos_token` 为 None 且需要添加 `bos_token`,则引发数值错误
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        # 获取当前的 `eos_token` 和 `eos_token_id`
        eos = self.eos_token
        eos_token_id = self.eos_token_id
        # 如果 `eos_token` 为 None 且需要添加 `eos_token`,则引发数值错误
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # 构建单句和双句模板,包含 `bos_token` 和 `eos_token`
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        # 准备特殊标记列表,包括 `bos_token` 和 `eos_token`,用于后处理器
        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        
        # 更新 tokenizer 的后处理器使用新的模板和特殊标记
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )
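
To see what the templates built above actually look like, here is a minimal evaluation of the same f-strings outside the class, assuming `bos_token == "<s>"` and the default flags `add_bos_token=True`, `add_eos_token=False`:

```python
bos, eos = "<s>", "</s>"
add_bos_token, add_eos_token = True, False

single = f"{(bos + ':0 ') if add_bos_token else ''}$A:0{(' ' + eos + ':0') if add_eos_token else ''}"
pair = f"{single}{(' ' + bos + ':1') if add_bos_token else ''} $B:1{(' ' + eos + ':1') if add_eos_token else ''}"

print(single)  # <s>:0 $A:0
print(pair)    # <s>:0 $A:0 <s>:1 $B:1
```
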

    @property
    def prefix_token(self):
        return self._prefix_token

    @property
    def prefix_id(self):
        # 如果 `_prefix_token` 为 None,则返回 None,否则将 `_prefix_token` 转换为对应的 id
        if self._prefix_token is None:
            return None
        return self.convert_tokens_to_ids(self.prefix_token)

    @property
    def middle_token(self):
        return self._middle_token

    @property
    def middle_id(self):
        # 如果 `_middle_token` 为 None,则返回 None,否则将 `_middle_token` 转换为对应的 id
        if self._middle_token is None:
            return None
        return self.convert_tokens_to_ids(self.middle_token)

    @property
    def suffix_token(self):
        return self._suffix_token

    @property
    def suffix_id(self):
        # 如果 `_suffix_token` 为 None,则返回 None,否则将 `_suffix_token` 转换为对应的 id
        if self._suffix_token is None:
            return None
        return self.convert_tokens_to_ids(self.suffix_token)

    @property
    def eot_id(self):
        # 如果 `_eot_token` 为 None,则返回 None,否则将 `_eot_token` 转换为对应的 id
        if self._eot_token is None:
            return None
        return self.convert_tokens_to_ids(self.eot_token)

    @property
    def eot_token(self):
        return self._eot_token

    @property
    def add_eos_token(self):
        return self._add_eos_token

    @property
    def add_bos_token(self):
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        # 设置 `_add_eos_token` 的值,然后更新后处理器
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        # 设置 `_add_bos_token` 的值,然后更新后处理器
        self._add_bos_token = value
        self.update_post_processor()
    def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
        """
        Updates the normalizer to ensure the prompt format for `infilling` is respected. The infilling format is as follows:
        if `suffix_first`:
            " <PRE> <SUF>{suf} <MID> {pre}"
        else:
            " <PRE> {pre} <SUF>{suf} <MID>"

        If `reset` is `True`, resets `normalizer` and `post_processor` to their default behaviors:
        normalizer adds a prefix space, post_processor adds a `bos_token`.

        Args:
            reset (bool): Indicates whether to reset the processors.
            suffix_first (bool, optional): Flag indicating the order of suffix and prefix in the format.
            add_special_tokens (bool, optional): Whether to add special tokens.

        Returns:
            None
        """
        # Resetting the processors if `reset` is `True`
        if reset:
            self._tokenizer.normalizer = normalizers.Sequence(
                [
                    normalizers.Prepend(prepend="▁"),  # Add a prefix space if resetting
                    normalizers.Replace(pattern=" ", content="▁"),  # Replace spaces with underscores
                ]
            )
            # Update post processor
            self.update_post_processor()
            return

        # Setting normalizer to replace spaces with underscores
        self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")

        # Building `pair` and `special_tokens` based on `suffix_first` flag
        pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []
        special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else []
        
        if suffix_first:
            # Format as " <PRE> <SUF>{suf} <MID> {pre}"
            pair += [self.prefix_token, self.suffix_token, "$B", self.middle_token, "$A"]
            special_tokens += [
                (self.prefix_token, self.prefix_id),
                (self.suffix_token, self.suffix_id),
                (self.middle_token, self.middle_id),
            ]
        else:
            # Format as " <PRE> {pre} <SUF>{suf} <MID>"
            pair += [self.prefix_token, "$A", self.suffix_token, "$B", self.middle_token]
            special_tokens += [
                (self.prefix_token, self.prefix_id),
                (self.suffix_token, self.suffix_id),
                (self.middle_token, self.middle_id),
            ]

        # Adding `eos_token` if required
        if self.add_eos_token and add_special_tokens:
            pair += [self.eos_token]
            special_tokens += [(self.eos_token, self.eos_token_id)]

        # Setting `post_processor` using TemplateProcessing
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single="$A", pair=pair, special_tokens=special_tokens
        )
    def encode_plus(self, text, text_pair=None, suffix_first=False, add_special_tokens=True, **kwargs):
        # 用于确保输入预处理在 Rust 外完成的一个小技巧
        text_pair = kwargs.pop("suffix", text_pair)
        # 如果存在填充标记并且在文本中找到了该标记但没有提供 text_pair,则将 text 拆分成 text 和 text_pair
        if self.fill_token is not None and self.fill_token in text and text_pair is None:
            text, text_pair = text.split(self.fill_token)

        # 如果 text_pair 为 None 或者长度小于1,则调用父类方法返回编码结果
        if text_pair is None or len(text_pair) < 1:
            return super().encode_plus(text, text_pair, add_special_tokens=add_special_tokens, **kwargs)

        # 如果 self.prefix_id, self.middle_id, self.suffix_id 中有任何一个为 None,则抛出 ValueError 异常
        if None in (self.prefix_id, self.middle_id, self.suffix_id):
            raise ValueError(
                "Then input includes a `prefix` and a `suffix` used for the infilling task,"
                " the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
                f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
            )

        # 设置 infilling 处理器,根据 suffix_first 和 add_special_tokens 参数决定是否添加特殊标记
        self.set_infilling_processor(False, suffix_first=suffix_first, add_special_tokens=add_special_tokens)
        # 调用父类方法编码 text 和 text_pair,并返回结果 tokens
        tokens = super().encode_plus(" " + text, text_pair=text_pair, add_special_tokens=True, **kwargs)
        # 恢复默认的 infilling 处理器设置
        self.set_infilling_processor(True)
        return tokens
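
A hypothetical end-to-end use of this infilling path ("codellama/CodeLlama-7b-hf" is only an example checkpoint and must be available locally or on the Hub for this to run): a prompt containing `<FILL_ME>` is split into prefix/suffix and re-assembled as `" <PRE> {prefix} <SUF>{suffix} <MID>"`.

```python
from transformers import CodeLlamaTokenizerFast

tokenizer = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
prompt = 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result'
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

# The encoded sequence starts with the BOS token followed by the prefix marker.
print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist())[:2])  # expected: ['<s>', '▁<PRE>']
```
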

    # 从 transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary 复制而来
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 如果无法保存慢速 tokenizer 的词汇表,则抛出 ValueError 异常
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # 如果保存路径不是目录,则记录错误信息并返回
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # 确定输出的词汇表文件路径,并复制当前词汇表文件到该路径
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # 如果当前词汇表文件路径与目标路径不同,则执行复制操作
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    # The `default_chat_template` property (copied from
    # transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template) is defined here;
    # its body is omitted in this walkthrough.

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating them and adding the
        special tokens configured on the tokenizer.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        if token_ids_1 is None:
            # Single sequence: surround it with the configured BOS / EOS ids
            return self.bos_token_id + token_ids_0 + self.eos_token_id
        # Sequence pair: both sequences are concatenated between the BOS / EOS ids
        return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id

.\models\code_llama\__init__.py

# 导入所需的模块和函数
from typing import TYPE_CHECKING
# 导入自定义异常类,用于处理缺少可选依赖的情况
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available

# 定义模块的导入结构
_import_structure = {}

# 检查是否安装了 SentencePiece 库,如果未安装则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,将 CodeLlamaTokenizer 添加到模块的导入结构中
    _import_structure["tokenization_code_llama"] = ["CodeLlamaTokenizer"]

# 检查是否安装了 Tokenizers 库,如果未安装则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,将 CodeLlamaTokenizerFast 添加到模块的导入结构中
    _import_structure["tokenization_code_llama_fast"] = ["CodeLlamaTokenizerFast"]

# 如果是类型检查阶段
if TYPE_CHECKING:
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从 tokenization_code_llama 模块中导入 CodeLlamaTokenizer 类
        from .tokenization_code_llama import CodeLlamaTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从 tokenization_code_llama_fast 模块中导入 CodeLlamaTokenizerFast 类
        from .tokenization_code_llama_fast import CodeLlamaTokenizerFast

# 如果不是类型检查阶段,则为模块创建 LazyModule,并将其加入到 sys.modules 中
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\cohere\configuration_cohere.py

# coding=utf-8
# Copyright 2024 Cohere team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Cohere model configuration"""

# 导入预训练配置类 PretrainedConfig 和日志记录工具 logging
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取日志记录器
logger = logging.get_logger(__name__)

# 初始化预训练配置文件映射字典
COHERE_PRETRAINED_CONFIG_ARCHIVE_MAP = {}

# CohereConfig 类,继承自 PretrainedConfig 类,用于存储 CohereModel 的配置信息
class CohereConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    ```
    >>> from transformers import CohereModel, CohereConfig

    >>> # Initializing a Cohere model configuration
    >>> configuration = CohereConfig()

    >>> # Initializing a model from the Cohere configuration
    >>> model = CohereModel(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
    ```
    """

    # 模型类型
    model_type = "cohere"
    # 推理过程中需要忽略的键列表
    keys_to_ignore_at_inference = ["past_key_values"]

    # 初始化方法,定义了各种模型超参数和配置项
    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        logit_scale=0.0625,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Store the model hyper-parameters on the configuration object
        self.vocab_size = vocab_size  # vocabulary size
        self.max_position_embeddings = max_position_embeddings  # maximum number of positions
        self.hidden_size = hidden_size  # hidden size
        self.logit_scale = logit_scale  # logit scaling factor
        self.intermediate_size = intermediate_size  # intermediate (MLP) size
        self.num_hidden_layers = num_hidden_layers  # number of hidden layers
        self.num_attention_heads = num_attention_heads  # number of attention heads

        # For backward compatibility: default the number of key/value heads to the number of attention heads
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads  # number of key/value heads
        self.hidden_act = hidden_act  # hidden activation function
        self.initializer_range = initializer_range  # weight initialization range
        self.layer_norm_eps = layer_norm_eps  # layer-norm epsilon
        self.use_cache = use_cache  # whether to use the KV cache
        self.rope_theta = rope_theta  # RoPE theta
        self.attention_bias = attention_bias  # attention bias
        self.attention_dropout = attention_dropout  # attention dropout

        # Call the parent PretrainedConfig constructor with the special token ids and the embedding-tying flag
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
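
For reference, a tiny illustrative configuration can be built directly; the values below are arbitrary toy sizes, not the defaults of the released Command-R checkpoints:

```python
from transformers import CohereConfig

tiny_config = CohereConfig(
    vocab_size=1000,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    max_position_embeddings=128,
)
print(tiny_config.num_key_value_heads)  # 4 -- falls back to num_attention_heads when not set
```
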

.\models\cohere\modeling_cohere.py

# 定义 CohereLayerNorm 类,用于实现 Cohere 模型中的 LayerNorm 层
class CohereLayerNorm(nn.Module):
    # 初始化函数
    def __init__(self, hidden_size, eps=1e-5, bias=False):
        super().__init__()
        # 定义可学习参数 weight,形状为 hidden_size,初始化为全1张量
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # 如果 bias=True,则定义可学习参数 bias,形状同样为 hidden_size,初始化为全0张量;否则为 None
        self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None
        # 设定 LayerNorm 中的 epsilon 值
        self.variance_epsilon = eps
    # 定义前向传播方法,接收隐藏状态作为输入参数
    def forward(self, hidden_states):
        # 获取输入张量的数据类型
        input_dtype = hidden_states.dtype
        # 将隐藏状态张量转换为 float32 类型
        hidden_states = hidden_states.to(torch.float32)
        # 计算隐藏状态在最后一个维度上的均值
        mean = hidden_states.mean(-1, keepdim=True)
        # 计算隐藏状态在最后一个维度上的方差
        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
        # 根据均值和方差对隐藏状态进行归一化
        hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon)
        # 使用权重对归一化后的隐藏状态进行加权
        hidden_states = self.weight.to(torch.float32) * hidden_states
        # 如果存在偏置项,将偏置项加到隐藏状态上
        if self.bias is not None:
            hidden_states = hidden_states + self.bias.to(torch.float32)
        # 将处理后的隐藏状态张量转回初始输入数据类型
        return hidden_states.to(input_dtype)
# 将 CohereLayerNorm 添加到 ALL_LAYERNORM_LAYERS 列表中
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
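
A quick, illustrative check: with the bias disabled, the normalization above is ordinary LayerNorm (mean-centred, variance-scaled), so it matches `torch.nn.functional.layer_norm` up to the fp32 round trip.

```python
import torch
import torch.nn.functional as F

hidden_size, eps = 16, 1e-5
x = torch.randn(2, 5, hidden_size)
weight = torch.randn(hidden_size)

mean = x.mean(-1, keepdim=True)
variance = (x - mean).pow(2).mean(-1, keepdim=True)
manual = (x - mean) * torch.rsqrt(variance + eps) * weight

print(torch.allclose(manual, F.layer_norm(x, (hidden_size,), weight, None, eps), atol=1e-5))  # True
```
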

class CohereRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # 计算旋转位置嵌入的频率逆向
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # 将 inv_freq 注册为不可训练的缓冲区
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        # 扩展 inv_freq 到与 position_ids 相同的形状
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        # 强制将频率计算结果转为 float32,因为 bfloat16 在长上下文中会丢失精度
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            # 计算旋转角度的余弦和正弦值
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.repeat_interleave(freqs, 2, dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    # 将输入张量 x 拆分和旋转
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
    return rot_x


# 从 transformers.models.llama.modeling_llama.apply_rotary_pos_emb 复制过来的函数
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    # 将 cos 张量在指定维度上扩展维度
    cos = cos.unsqueeze(unsqueeze_dim)
    # 将 sin 张量在指定维度上扩展维度
    sin = sin.unsqueeze(unsqueeze_dim)
    # 计算查询向量的旋转位置嵌入,使用 cos 和 sin 对应元素加权
    q_embed = (q * cos) + (rotate_half(q) * sin)
    # 计算键向量的旋转位置嵌入,使用 cos 和 sin 对应元素加权
    k_embed = (k * cos) + (rotate_half(k) * sin)
    # 返回旋转后的查询向量和键向量作为元组
    return q_embed, k_embed
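
A minimal numeric sketch of the interleaved rotation used above (toy `head_dim=4`, not the model's real size): applying the same per-pair angles to `q` and `k` leaves their dot product unchanged, because each adjacent pair of dimensions is rotated rigidly; only the *relative* angle between two positions shows up in attention scores.

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., ::2], x[..., 1::2]
    return torch.stack([-x2, x1], dim=-1).flatten(-2)

head_dim = 4
q, k = torch.randn(head_dim), torch.randn(head_dim)
theta = torch.rand(head_dim // 2)
cos = torch.repeat_interleave(theta.cos(), 2)
sin = torch.repeat_interleave(theta.sin(), 2)

q_rot = q * cos + rotate_half(q) * sin
k_rot = k * cos + rotate_half(k) * sin
print(torch.allclose(q @ k, q_rot @ k_rot, atol=1e-5))  # True
```
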
# 从 transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere 复制过来的类,用于定义 CohereMLP 模型
class CohereMLP(nn.Module):
    # 初始化函数,接受一个配置对象 config 作为参数
    def __init__(self, config):
        super().__init__()
        # 将配置对象保存到实例中
        self.config = config
        # 设置隐藏层大小为配置中的 hidden_size
        self.hidden_size = config.hidden_size
        # 设置中间层大小为配置中的 intermediate_size
        self.intermediate_size = config.intermediate_size
        # 创建一个线性层,用于门控投影,输入维度为 hidden_size,输出维度为 intermediate_size,没有偏置项
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        # 创建一个线性层,用于上投影,输入维度为 hidden_size,输出维度为 intermediate_size,没有偏置项
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        # 创建一个线性层,用于下投影,输入维度为 intermediate_size,输出维度为 hidden_size,没有偏置项
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        # 根据配置中的激活函数类型选择对应的激活函数,保存到实例中
        self.act_fn = ACT2FN[config.hidden_act]

    # 前向传播函数,接受输入张量 x
    def forward(self, x):
        # 计算门控投影后的结果,应用激活函数,再与上投影结果相乘,然后应用下投影
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        # 返回下投影的结果
        return down_proj


# 从 transformers.models.llama.modeling_llama.repeat_kv 复制过来的函数
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    这是等效于 torch.repeat_interleave(x, dim=1, repeats=n_rep) 的函数。
    将隐藏状态从 (batch, num_key_value_heads, seqlen, head_dim) 扩展为 (batch, num_attention_heads, seqlen, head_dim)
    """
    # 获取隐藏状态张量的形状信息
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    # 如果 n_rep 等于 1,直接返回原始隐藏状态张量
    if n_rep == 1:
        return hidden_states
    # 扩展隐藏状态张量的维度,重复 n_rep 次
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    # 重新整形张量,以得到期望的形状 (batch, num_attention_heads * n_rep, seqlen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
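
A shape-only sketch of the expansion with toy sizes (`n_rep=4`): each key/value head is repeated contiguously, so query heads 0–3 all read KV head 0, heads 4–7 read KV head 1, and so on.

```python
import torch

batch, num_kv_heads, n_rep, seq_len, head_dim = 2, 2, 4, 3, 5
kv = torch.randn(batch, num_kv_heads, seq_len, head_dim)

expanded = kv[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
out = expanded.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

print(out.shape)                          # torch.Size([2, 8, 3, 5])
print(torch.equal(out[:, 0], out[:, 3]))  # True  (both are copies of KV head 0)
print(torch.equal(out[:, 3], out[:, 4]))  # False (head 4 is the first copy of KV head 1)
```
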


# 从 transformers.models.llama.modeling_llama.LlamaAttention Llama->Cohere 复制过来的类,用于定义 CohereAttention 模型
class CohereAttention(nn.Module):
    """来自 'Attention Is All You Need' 论文的多头注意力机制"""

    # 类的文档字符串,描述其为来自论文的多头注意力机制
    # 初始化函数,用于创建一个新的实例
    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        # 调用父类的初始化函数
        super().__init__()
        # 将传入的配置信息和层索引保存到对象中
        self.config = config
        self.layer_idx = layer_idx
        # 如果未传入层索引,则记录警告信息,建议在使用缓存时传入层索引以避免错误
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        # 从配置中获取注意力机制的丢弃率、隐藏层大小、注意力头数等信息
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        # 检查隐藏层大小是否可以被注意力头数整除,否则抛出数值错误
        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        # 初始化线性投影层,用于将隐藏状态映射到注意力头和维度上
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
        
        # 初始化旋转嵌入,这是一个辅助函数,用于增强模型在序列位置上的表示能力
        self._init_rope()

    # 辅助函数,用于初始化旋转嵌入
    def _init_rope(self):
        self.rotary_emb = CohereRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    # 前向传播函数,接受输入的隐藏状态并计算输出
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # 获取输入张量的尺寸信息
        bsz, q_len, _ = hidden_states.size()

        # 使用线性投影层对隐藏状态进行变换,得到查询、键、值的状态
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # 将查询、键、值的状态按照指定维度重新组织,并进行维度转置
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # 获取已存储的过去键值对信息,若存在则更新当前键值对状态
        past_key_value = getattr(self, "past_key_value", past_key_value)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # 对于已存储的过去键值对,根据 RoPE 模型的特定要求进行更新
            # 需要提供 sin 和 cos 参数以及缓存位置信息
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # 将键值对状态分组复制以便多头注意力计算
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # 计算注意力权重
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        # 如果存在注意力掩码,则将其应用于注意力权重
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # 将注意力权重归一化并进行 dropout 处理
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        # 检查注意力输出的尺寸是否符合预期
        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # 调整注意力输出张量的维度顺序,并保证连续性
        attn_output = attn_output.transpose(1, 2).contiguous()

        # 将注意力输出张量重新调整为隐藏状态的形状
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        # 使用输出投影层处理最终的注意力输出
        attn_output = self.o_proj(attn_output)

        # 如果不需要输出注意力权重,则将其置为 None
        if not output_attentions:
            attn_weights = None

        # 返回注意力输出张量、注意力权重张量以及更新后的过去键值对信息(如果有)
        return attn_output, attn_weights, past_key_value
# CohereFlashAttention2 (copied from the Llama flash-attention implementation, with Llama->Cohere) is defined
# here; its class header and `forward` method are omitted in this walkthrough, and only the tail of its
# `_flash_attention_forward` helper follows.
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        # Determine if causal masking is required based on the configuration
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # Temporary workaround for an issue with Flash Attention on RoCm
            # Remove this check once the issue is resolved in future versions
            causal = self.is_causal and query_length != 1

        # Check if there are any padding tokens in the input sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            # Unpad the input based on the attention mask
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            # Compute attention scores for variable-length sequences
            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            # Pad the computed attention scores to match the original input length
            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            # Compute attention scores without considering padding
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output
    # (body of the `_upad_input` helper, whose signature is omitted in this walkthrough)
    # Get the indices of the non-padded tokens, the cumulative sequence lengths and the longest sequence in the batch
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    # 获取输入张量 key_layer 的形状信息
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    # 根据获取的索引重新排列 key_layer 张量的第一个轴,以对应未填充的数据
    key_layer = index_first_axis(
        key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
    )
    # 根据获取的索引重新排列 value_layer 张量的第一个轴,以对应未填充的数据
    value_layer = index_first_axis(
        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
    )

    # 根据查询长度调整查询层的数据
    if query_length == kv_seq_len:
        # 若查询长度等于 key_value 序列长度,则根据获取的索引重新排列查询层的数据
        query_layer = index_first_axis(
            query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
        )
        # 更新当前查询序列长度和批次中最大查询序列长度
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        # 若查询长度为1,则对应的当前查询序列长度为1,且索引为批次索引
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # 这里有一个内存拷贝,这样做效率很低。
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # 如果查询长度不等于 key_value 序列长度且不等于1,则假设存在左填充情况,根据注意力掩码和查询层调整输入数据
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

    # 返回调整后的查询层、键层、值层、查询索引、当前序列长度元组和最大序列长度元组
    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )
# 定义一个名为 CohereSdpaAttention 的类,它继承自 CohereAttention 类
# 该类用于实现 Cohere 模型的自注意力机制,使用 torch.nn.functional.scaled_dot_product_attention
class CohereSdpaAttention(CohereAttention):
    """
    Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `CohereAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # 重写 forward 方法,适应 SDPA API
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        # (the SDPA forward body is omitted in this walkthrough; per the class docstring above, it dispatches to
        # torch.nn.functional.scaled_dot_product_attention instead of the eager matmul + softmax path)
        pass

# 定义一个字典 COHERE_ATTENTION_CLASSES,包含不同的 CohereAttention 类型作为值,以字符串为键
COHERE_ATTENTION_CLASSES = {
    "eager": CohereAttention,
    "flash_attention_2": CohereFlashAttention2,
    "sdpa": CohereSdpaAttention,  # 将 CohereSdpaAttention 类作为 sdpa 类型的实现
}


class CohereDecoderLayer(nn.Module):
    # CohereDecoderLayer 类,继承自 nn.Module
    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # 初始化 self_attn 属性,根据配置选择不同的注意力机制实现类
        self.self_attn = COHERE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)

        # 初始化 mlp 属性为 CohereMLP 类的实例
        self.mlp = CohereMLP(config)
        # 初始化 input_layernorm 属性为 CohereLayerNorm 类的实例,用于层归一化
        self.input_layernorm = CohereLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # 定义 forward 方法,实现模型的前向传播
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            # 发出警告信息,提示 `padding_mask` 参数已经不推荐使用
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        # 将输入状态作为残差连接的起始点
        residual = hidden_states

        # Layer normalization,对输入状态进行归一化处理
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        # 调用自注意力机制进行处理,获取注意力加权的输出、注意力权重及可能的过去键值状态
        hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        # Fully Connected Layer
        # 通过多层感知机(MLP)进行全连接层的处理
        hidden_states_mlp = self.mlp(hidden_states)

        # 将残差、注意力加权输出和MLP输出相加得到最终的隐藏状态表示
        hidden_states = residual + hidden_states_attention + hidden_states_mlp

        # 构建输出元组
        outputs = (hidden_states,)

        # 如果需要返回注意力权重,则添加到输出元组中
        if output_attentions:
            outputs += (self_attn_weights,)

        # 如果需要返回缓存状态,则添加到输出元组中
        if use_cache:
            outputs += (present_key_value,)

        # 返回最终的输出元组
        return outputs
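
To make the parallel residual explicit, here is a toy sketch with stand-in callables in place of the real `CohereAttention` / `CohereMLP` modules: both branches read the same layer-normalized input, and their outputs are summed with the untouched residual (unlike the sequential Llama decoder layer, which normalizes again between attention and MLP).

```python
import torch

hidden_states = torch.randn(1, 3, 8)
input_layernorm = torch.nn.LayerNorm(8)
self_attn = lambda x: 0.1 * x  # placeholder for the attention output
mlp = lambda x: 0.2 * x        # placeholder for the MLP output

residual = hidden_states
normed = input_layernorm(hidden_states)
output = residual + self_attn(normed) + mlp(normed)
print(output.shape)  # torch.Size([1, 3, 8])
```
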
"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`CohereConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

@add_start_docstrings(
    "The bare Cohere Model outputting raw hidden-states without any specific head on top.",
    COHERE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Cohere
class CoherePreTrainedModel(PreTrainedModel):
    config_class = CohereConfig  # 设置模型配置类为CohereConfig
    base_model_prefix = "model"  # 模型基本前缀为"model"
    supports_gradient_checkpointing = True  # 支持梯度检查点
    _no_split_modules = ["CohereDecoderLayer"]  # 不分割的模块列表,包括"CohereDecoderLayer"
    _skip_keys_device_placement = ["past_key_values"]  # 跳过设备放置的键列表,包括"past_key_values"
    _supports_flash_attn_2 = True  # 支持flash attention 2
    _supports_sdpa = True  # 支持sdpa
    _supports_cache_class = True  # 支持缓存类操作

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)  # 初始化线性层权重为正态分布
            if module.bias is not None:
                module.bias.data.zero_()  # 如果有偏置,初始化为零
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)  # 初始化嵌入层权重为正态分布
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()  # 如果有填充索引,对应位置初始化为零

    def _setup_cache(self, cache_cls, max_batch_size, max_cache_len: Optional[int] = None):
        if self.config._attn_implementation == "flash_attention_2" and cache_cls == StaticCache:
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        for layer in self.model.layers:
            device = layer.input_layernorm.weight.device
            if hasattr(self.config, "_pre_quantization_dtype"):
                dtype = self.config._pre_quantization_dtype
            else:
                dtype = layer.self_attn.o_proj.weight.dtype
            layer.self_attn.past_key_value = cache_cls(
                self.config, max_batch_size, max_cache_len, device=device, dtype=dtype
            )

    def _reset_cache(self):
        for layer in self.model.layers:
            layer.self_attn.past_key_value = None
# CohereModel 类定义,继承自 CoherePreTrainedModel
class CohereModel(CoherePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]

    Args:
        config: CohereConfig
    """

    # 初始化函数,接受一个 config 对象作为参数
    def __init__(self, config: CohereConfig):
        # 调用父类的初始化函数
        super().__init__(config)
        # 将配置中的 pad_token_id 赋给当前实例的 padding_idx 属性
        self.padding_idx = config.pad_token_id
        # 将配置中的 vocab_size 赋给当前实例的 vocab_size 属性
        self.vocab_size = config.vocab_size

        # 创建一个词嵌入层,参数为 vocab_size(词汇表大小)、hidden_size(隐藏层大小)、padding_idx(填充标记的索引)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        
        # 使用列表推导式创建包含多个 CohereDecoderLayer 对象的层列表,数量为 config.num_hidden_layers
        self.layers = nn.ModuleList(
            [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        
        # 创建一个 CohereLayerNorm 层,参数为 hidden_size(隐藏层大小)、eps(层归一化的 epsilon 值)
        self.norm = CohereLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        # 设置梯度检查点为 False
        self.gradient_checkpointing = False

        # 创建一个因果遮罩(causal mask),用于区分因果和填充遮罩的创建。注意:这不利于 TorchScript、ONNX 和大型 max_position_embeddings 的序列化导出。
        # causal_mask 是一个二维的布尔类型张量,形状为 (config.max_position_embeddings, config.max_position_embeddings)
        causal_mask = torch.full(
            (config.max_position_embeddings, config.max_position_embeddings), fill_value=True, dtype=torch.bool
        )
        # torch.triu(..., diagonal=1) keeps True strictly above the diagonal (the future positions to be masked)
        # and sets the diagonal and everything below it to False
        self.register_buffer("causal_mask", torch.triu(causal_mask, diagonal=1), persistent=False)
        
        # 调用初始化后的处理函数
        self.post_init()
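
What the registered buffer looks like is easiest to see with a toy `max_position_embeddings` of 4; `True` marks the strictly upper-triangular (future) positions that will be masked out:

```python
import torch

causal_mask = torch.full((4, 4), fill_value=True, dtype=torch.bool)
print(torch.triu(causal_mask, diagonal=1))
# tensor([[False,  True,  True,  True],
#         [False, False,  True,  True],
#         [False, False, False,  True],
#         [False, False, False, False]])
```
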

    # 返回词嵌入层 embed_tokens
    def get_input_embeddings(self):
        return self.embed_tokens

    # 设置词嵌入层 embed_tokens 的值
    def set_input_embeddings(self, value):
        self.embed_tokens = value

    # forward 函数重写,对模型进行前向传播
    @add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
# 从 transformers.models.llama.modeling_llama.LlamaForCausalLM 复制而来,将 Llama 替换为 Cohere
class CohereForCausalLM(CoherePreTrainedModel):
    # 定义与 lm_head 权重相关的键列表,用于权重共享
    _tied_weights_keys = ["lm_head.weight"]

    # 初始化方法,接受一个配置对象 config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__(config)
        # 创建 CohereModel 的实例,并保存到 self.model 中
        self.model = CohereModel(config)
        # 从 config 中获取词汇表大小,并保存到 self.vocab_size 中
        self.vocab_size = config.vocab_size
        # 创建一个线性层,用于 LM 头部,将隐藏大小转换为词汇表大小,无偏置
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # 从 config 中获取 logit_scale,并保存到 self.logit_scale 中
        self.logit_scale = config.logit_scale
        # 从 config 中获取 tie_word_embeddings,并保存到 self.tie_word_embeddings 中
        self.tie_word_embeddings = config.tie_word_embeddings
        # 调用后处理初始化方法
        self.post_init()

    # 返回模型的输入嵌入层对象
    def get_input_embeddings(self):
        return self.model.embed_tokens

    # 设置模型的输入嵌入层对象为指定的值
    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    # 返回模型的输出嵌入层对象
    def get_output_embeddings(self):
        return self.lm_head

    # 设置模型的输出嵌入层对象为指定的新嵌入层
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # 设置解码器为指定的 decoder 对象
    def set_decoder(self, decoder):
        self.model = decoder

    # 获取当前使用的解码器对象
    def get_decoder(self):
        return self.model

    # 前向传播方法,接收多个输入参数,详见装饰器中的 COHERE_INPUTS_DOCSTRING 描述
    # 返回类型为 CausalLMOutputWithPast,详见配置类 _CONFIG_FOR_DOC
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        # The forward logic is defined in the full implementation; refer to the model documentation for details
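The body of `forward` is not shown in this excerpt. As a rough sketch (an assumption based on the attributes set in `__init__`, not the verbatim library code), the Cohere-specific twist is that the LM head output is multiplied by `logit_scale` before the usual shifted cross-entropy loss:

```python
import torch
import torch.nn as nn

def cohere_lm_head_sketch(hidden_states, lm_head, logit_scale, labels, vocab_size):
    # Project decoder hidden states to the vocabulary and scale the logits
    logits = lm_head(hidden_states) * logit_scale        # (batch, seq_len, vocab_size)
    # Next-token prediction: position t predicts the token at position t + 1
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss = nn.CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
    return loss, logits
```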

    # Prepare the inputs needed for generation (handles the cache, attention mask, position ids, etc.)
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
    ):
        # The input-preparation logic is defined in the full implementation; refer to the model documentation for details

    # Static method that reorders the cached past key/values to follow the selected beam indices during beam search
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
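A toy demonstration of the reordering above (the tensor shapes are dummy values chosen only for illustration): after beam search selects beams `[2, 0, 0, 1]`, every cached key/value row is moved to follow its beam.

```python
import torch

# two layers, each a (key, value) pair of shape (batch, heads, seq_len, head_dim)
past_key_values = tuple(
    (torch.randn(4, 8, 10, 64), torch.randn(4, 8, 10, 64)) for _ in range(2)
)
beam_idx = torch.tensor([2, 0, 0, 1])
reordered = tuple(
    tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
    for layer_past in past_key_values
)
# beam 1 of the reordered cache now holds what used to be row 0
assert torch.equal(reordered[0][0][1], past_key_values[0][0][0])
```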

.\models\cohere\tokenization_cohere_fast.py

import pickle  # used to serialize and deserialize the backend tokenizer components
from typing import Dict, List, Literal, Union  # type-hint helpers

from tokenizers import processors  # post-processors from the `tokenizers` library

from ...pipelines.conversational import Conversation  # conversation container for chat-style inputs
from ...tokenization_utils_base import BatchEncoding  # batched encoding return type
from ...tokenization_utils_fast import PreTrainedTokenizerFast  # base class for fast tokenizers
from ...utils import logging  # logging utilities
from ...utils.versions import require_version  # helper that enforces dependency versions

require_version("tokenizers>=0.13.3")  # the fast tokenizer needs tokenizers >= 0.13.3

logger = logging.get_logger(__name__)  # module-level logger
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}  # mapping of vocabulary file names

PRETRAINED_VOCAB_FILES_MAP = {
    "tokenizer_file": {
        "Cohere/Command-nightly": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json",
    },  # pretrained vocabulary files: location of tokenizer.json for Cohere/Command-nightly
}

# fmt: off
DEFAULT_SYSTEM_PROMPT = "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere."
DEFAULT_RAG_PREAMBLE = """## Task and Context
You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.

## Style Guide
Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."""
# fmt: on


class CohereTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding.

    This uses notably ByteFallback and NFC normalization.

    ```
    >>> from transformers import AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
    >>> tokenizer.encode("Hello this is a test")
    [5, 28339, 2075, 1801, 1671, 3282]
    ```

    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = None
    # No `max_model_input_sizes`

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        clean_up_tokenization_spaces=False,
        unk_token="<UNK>",
        bos_token="<BOS_TOKEN>",
        eos_token="<|END_OF_TURN_TOKEN|>",
        add_bos_token=True,
        add_eos_token=False,
        use_default_system_prompt=False,
        add_prefix_space=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            use_default_system_prompt=use_default_system_prompt,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.use_default_system_prompt = use_default_system_prompt
        self.vocab_file = vocab_file
        self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
        self.tool_use_template = kwargs.pop("tool_use_template", None)

        # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
        # check this as they were green before.
        # Serialize the current backend tokenizer's pre-tokenizer and decoder state
        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)

        # If add_prefix_space is True, patch the serialized state accordingly
        if add_prefix_space:
            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')

        # Restore the backend tokenizer's pre-tokenizer and decoder from the (possibly patched) state
        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
        self.backend_tokenizer.decoder = pickle.loads(decoder_state)

        self.add_prefix_space = add_prefix_space
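The byte-level patching above works because pickling a `tokenizers` component stores its JSON serialization, which can then be edited as raw bytes. Below is a minimal sketch of the same trick applied to a freshly built byte-level pre-tokenizer; it assumes the pickled payload embeds the `"add_prefix_space"` field in exactly the form the replace expects, which is what the constructor above relies on.

```python
import pickle
from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=False)
state = pickle.dumps(pre_tok)
# flip the flag inside the serialized JSON payload
state = state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
patched_pre_tok = pickle.loads(state)
```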

    # Batch-encode the inputs and return a BatchEncoding object
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        # Pretokenized inputs are only allowed when the tokenizer was built with add_prefix_space=True
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._batch_encode_plus(*args, **kwargs)

    # Encode a single input and return a BatchEncoding object
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        # Pretokenized inputs are only allowed when the tokenizer was built with add_prefix_space=True
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._encode_plus(*args, **kwargs)
    def update_post_processor(self):
        """
        Updates the underlying post-processor with the current `bos_token` and `eos_token`.
        """
        bos = self.bos_token  # beginning-of-sequence token
        bos_token_id = self.bos_token_id  # id of the beginning-of-sequence token
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")  # BOS requested but not defined

        eos = self.eos_token  # end-of-sequence token
        eos_token_id = self.eos_token_id  # id of the end-of-sequence token
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")  # EOS requested but not defined

        # Build the templates for single sequences and sequence pairs
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))  # register the BOS token and its id as a special token
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))  # register the EOS token and its id as a special token
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )  # install the new template post-processor on the backend tokenizer
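For reference, here is what the template built above expands to under the default flags (`add_bos_token=True`, `add_eos_token=False`). The BOS id of 5 is only a stand-in taken from the encoding example in the class docstring, not a guaranteed value.

```python
from tokenizers import processors

bos, bos_token_id = "<BOS_TOKEN>", 5
single = f"{bos}:0 $A:0"                    # "<BOS_TOKEN>:0 $A:0"
pair = f"{single} {bos}:1 $B:1"             # "<BOS_TOKEN>:0 $A:0 <BOS_TOKEN>:1 $B:1"
post_processor = processors.TemplateProcessing(
    single=single, pair=pair, special_tokens=[(bos, bos_token_id)]
)
```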

    @property
    def add_eos_token(self):
        return self._add_eos_token  # whether an EOS token is appended

    @property
    def add_bos_token(self):
        return self._add_bos_token  # whether a BOS token is prepended

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value  # update the flag
        self.update_post_processor()  # rebuild the post-processor to reflect the change

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value  # update the flag
        self.update_post_processor()  # rebuild the post-processor to reflect the change

    def apply_tool_use_template(
        self,
        conversation: Union[List[Dict[str, str]], "Conversation"],
        tools: List[Dict],
        **kwargs,
    ):
        """
        Applies the tool-use template to the given conversation and list of tools.
        """
        # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers

    def apply_grounded_generation_template(
        self,
        conversation: Union[List[Dict[str, str]], "Conversation"],
        documents: List[Dict],
        citation_mode: Literal["fast", "accurate"] = "accurate",
        **kwargs,
    ):
        """
        Applies the grounded-generation (RAG) template to the given conversation and list of documents.
        """
        # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Builds the model input sequence with the special tokens added.

        Args:
        - token_ids_0: token ids of the first sequence
        - token_ids_1: optional token ids of the second sequence

        Returns:
        - the input sequence with special tokens included
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []  # BOS id list, empty when add_bos_token is False
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []  # EOS id list, empty when add_eos_token is False

        output = bos_token_id + token_ids_0 + eos_token_id  # BOS ids + first sequence + EOS ids

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id  # append the second sequence wrapped the same way

        return output  # complete sequence including special tokens
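A quick worked example of the concatenation above, under the default flags and with hypothetical ids (the BOS id 5 is again taken from the docstring example; the remaining ids are made up):

```python
add_bos_token, add_eos_token = True, False
bos_ids = [5] if add_bos_token else []
eos_ids = [255001] if add_eos_token else []   # hypothetical EOS id, unused here

token_ids_0 = [28339, 2075]
token_ids_1 = [1801, 1671]

output = bos_ids + token_ids_0 + eos_ids
output = output + bos_ids + token_ids_1 + eos_ids
print(output)  # [5, 28339, 2075, 5, 1801, 1671]
```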

.\models\cohere\__init__.py

# Copyright and license notice, identifying the code's copyright holders and usage terms
# Copyright 2024 Cohere and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Typing helper used for the lazy-import pattern
from typing import TYPE_CHECKING

# Utilities used to build the lazy module and check optional dependencies
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
)

# Declare the import structure of the module
_import_structure = {
    "configuration_cohere": ["COHERE_PRETRAINED_CONFIG_ARCHIVE_MAP", "CohereConfig"],
}

# Check whether the `tokenizers` library is available
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add tokenization_cohere_fast to the import structure
    _import_structure["tokenization_cohere_fast"] = ["CohereTokenizerFast"]

# Check whether torch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add modeling_cohere to the import structure
    _import_structure["modeling_cohere"] = [
        "CohereForCausalLM",
        "CohereModel",
        "CoherePreTrainedModel",
    ]

# During type checking, import everything eagerly so static analyzers can see the symbols
if TYPE_CHECKING:
    # Configuration classes
    from .configuration_cohere import COHERE_PRETRAINED_CONFIG_ARCHIVE_MAP, CohereConfig

    # Import the fast tokenizer only if `tokenizers` is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_cohere_fast import CohereTokenizerFast

    # Import the model classes only if torch is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_cohere import (
            CohereForCausalLM,
            CohereModel,
            CoherePreTrainedModel,
        )

# At runtime, register a lazy module that imports the submodules on demand
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
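Because of the lazy structure, importing `transformers` itself stays cheap; the Cohere submodules are only loaded when one of their symbols is first accessed. A small usage sketch follows; the tiny hyperparameters are arbitrary, chosen only to keep the randomly initialised model small.

```python
from transformers import CohereConfig, CohereForCausalLM  # resolved lazily on first access

config = CohereConfig(
    vocab_size=1024,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    num_key_value_heads=2,
)
model = CohereForCausalLM(config)  # randomly initialised toy model
```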

.\models\conditional_detr\configuration_conditional_detr.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Conditional DETR model configuration"""

# OrderedDict and Mapping types
from collections import OrderedDict
from typing import Mapping

# `version` helper from the packaging module
from packaging import version

# Configuration-related classes and helpers from the library
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING

# Module-level logger
logger = logging.get_logger(__name__)

# Map of pretrained model configuration files
CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/conditional-detr-resnet-50": (
        "https://huggingface.co/microsoft/conditional-detr-resnet-50/resolve/main/config.json"
    ),
}

# ConditionalDetrConfig, a PretrainedConfig subclass
class ConditionalDetrConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ConditionalDetrModel`]. It is used to instantiate
    a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Conditional DETR
    [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Examples:

    ```
    >>> from transformers import ConditionalDetrConfig, ConditionalDetrModel

    >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration
    >>> configuration = ConditionalDetrConfig()

    >>> # Initializing a model (with random weights) from the microsoft/conditional-detr-resnet-50 style configuration
    >>> model = ConditionalDetrModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Model type identifier
    model_type = "conditional_detr"

    # Keys to ignore at inference time
    keys_to_ignore_at_inference = ["past_key_values"]

    # Attribute map translating common attribute names to the model-specific ones
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }
    # Constructor, builds a new configuration instance
    def __init__(
        self,
        use_timm_backbone=True,  # whether to use timm for the backbone, default True
        backbone_config=None,  # configuration of the backbone, default None
        num_channels=3,  # number of input image channels, default 3
        num_queries=300,  # number of object queries, default 300
        encoder_layers=6,  # number of encoder layers, default 6
        encoder_ffn_dim=2048,  # dimension of the encoder feed-forward layers, default 2048
        encoder_attention_heads=8,  # number of encoder attention heads, default 8
        decoder_layers=6,  # number of decoder layers, default 6
        decoder_ffn_dim=2048,  # dimension of the decoder feed-forward layers, default 2048
        decoder_attention_heads=8,  # number of decoder attention heads, default 8
        encoder_layerdrop=0.0,  # encoder LayerDrop probability, default 0.0
        decoder_layerdrop=0.0,  # decoder LayerDrop probability, default 0.0
        is_encoder_decoder=True,  # whether the model is an encoder-decoder, default True
        activation_function="relu",  # activation function, default "relu"
        d_model=256,  # model (hidden) dimension, default 256
        dropout=0.1,  # global dropout probability, default 0.1
        attention_dropout=0.0,  # attention dropout probability, default 0.0
        activation_dropout=0.0,  # activation dropout probability, default 0.0
        init_std=0.02,  # standard deviation for weight initialization, default 0.02
        init_xavier_std=1.0,  # standard deviation for Xavier initialization, default 1.0
        auxiliary_loss=False,  # whether to use auxiliary losses, default False
        position_embedding_type="sine",  # type of position embedding, default "sine"
        backbone="resnet50",  # backbone network to use, default "resnet50"
        use_pretrained_backbone=True,  # whether to use a pretrained backbone, default True
        backbone_kwargs=None,  # extra keyword arguments for the backbone, default None
        dilation=False,  # whether to use dilated convolutions, default False
        class_cost=2,  # matching cost weight for the class term, default 2
        bbox_cost=5,  # matching cost weight for the bounding-box term, default 5
        giou_cost=2,  # matching cost weight for the GIoU term, default 2
        mask_loss_coefficient=1,  # coefficient of the mask loss, default 1
        dice_loss_coefficient=1,  # coefficient of the Dice loss, default 1
        cls_loss_coefficient=2,  # coefficient of the classification loss, default 2
        bbox_loss_coefficient=5,  # coefficient of the bounding-box loss, default 5
        giou_loss_coefficient=2,  # coefficient of the GIoU loss, default 2
        focal_alpha=0.25,  # alpha parameter of the focal loss, default 0.25
        **kwargs,  # any other keyword arguments
    ):
        ...  # the body of __init__ is omitted in this excerpt

    # Number of attention heads, exposed as a class-level property
    @property
    def num_attention_heads(self) -> int:
        return self.encoder_attention_heads

    # Hidden size, exposed as a class-level property
    @property
    def hidden_size(self) -> int:
        return self.d_model
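Thanks to `attribute_map` and the two properties, the generic attribute names resolve to the Conditional DETR specific ones; with the documented defaults:

```python
from transformers import ConditionalDetrConfig

config = ConditionalDetrConfig()
assert config.hidden_size == config.d_model == 256
assert config.num_attention_heads == config.encoder_attention_heads == 8
```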
# ConditionalDetrOnnxConfig, an OnnxConfig subclass describing the ONNX export
class ConditionalDetrOnnxConfig(OnnxConfig):
    # Minimum torch version required for the ONNX export
    torch_onnx_minimum_version = version.parse("1.11")

    # The `inputs` property returns an ordered mapping describing the model inputs and their dynamic axes
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("pixel_mask", {0: "batch"}),
            ]
        )

    # Absolute tolerance used when validating the exported model
    @property
    def atol_for_validation(self) -> float:
        return 1e-5

    # Default ONNX opset version
    @property
    def default_onnx_opset(self) -> int:
        return 12
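A short look at what the export machinery reads from this class. This is only a sketch; it assumes the `OnnxConfig` constructor is called with a model config, as the base class expects.

```python
from transformers import ConditionalDetrConfig
from transformers.models.conditional_detr.configuration_conditional_detr import (
    ConditionalDetrOnnxConfig,
)

onnx_config = ConditionalDetrOnnxConfig(ConditionalDetrConfig())
print(dict(onnx_config.inputs))
# {'pixel_values': {0: 'batch', 1: 'num_channels', 2: 'height', 3: 'width'}, 'pixel_mask': {0: 'batch'}}
print(onnx_config.atol_for_validation, onnx_config.default_onnx_opset)  # 1e-05 12
```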

.\models\conditional_detr\convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Conditional DETR checkpoints."""


import argparse
import json
from collections import OrderedDict
from pathlib import Path

import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from transformers import (
    ConditionalDetrConfig,
    ConditionalDetrForObjectDetection,
    ConditionalDetrForSegmentation,
    ConditionalDetrImageProcessor,
)
from transformers.utils import logging


logging.set_verbosity_info()  # set the log level to INFO
logger = logging.get_logger(__name__)  # module-level logger

# here we list all keys to be renamed (original name on the left, our name on the right)
rename_keys = []
for i in range(6):
    # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
    rename_keys.append(
        (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
    )
    rename_keys.append(
        (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
    )
    rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
    rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
    rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
    rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
    rename_keys.append(
        (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
    )
    rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias"))
    rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight"))
    rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias"))
    # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
    )
    # Add the following pairs to rename_keys to rename the model parameter keys
    rename_keys.append(
        (
            f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight",
            f"decoder.layers.{i}.encoder_attn.out_proj.weight",
        )
    )
    rename_keys.append(
        (
            f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias",
            f"decoder.layers.{i}.encoder_attn.out_proj.bias",
        )
    )
    rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight"))
    rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias"))
    rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight"))
    rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias"))
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
    )
    rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias"))
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
    )
    rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight"))
    rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias"))

    # Projection weights for the self-attention and cross-attention in the Conditional DETR decoder
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")
    )
    rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight"))
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")
    )
    # The ca_qpos_proj weight is skipped here; only decoder layer 0 has it and it is handled explicitly at the end of rename_keys
    # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight"))
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")
    )
    # Add the following pairs to rename_keys to rename the parameter paths
    rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight"))
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")
    )

    # Add the following pairs to rename_keys to rename the parameter paths
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")
    )
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")
    )
    rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias"))
    rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias"))
    rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias"))

    # Add the following pairs to rename_keys to rename the parameter paths
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")
    )
    # As above, the ca_qpos_proj bias is skipped here; only decoder layer 0 has it and it is handled explicitly further below
    # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias"))
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")
    )
    rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias"))
    rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias"))

    # Add the following pairs to rename_keys to rename the parameter paths
    rename_keys.append(
        (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")
    )
# Key pairs to rename for the remaining model parameters (input projection, query embeddings, heads, and the layer-0 specific keys)
rename_keys.extend(
    [
        ("input_proj.weight", "input_projection.weight"),
        ("input_proj.bias", "input_projection.bias"),
        ("query_embed.weight", "query_position_embeddings.weight"),
        ("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
        ("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
        ("class_embed.weight", "class_labels_classifier.weight"),
        ("class_embed.bias", "class_labels_classifier.bias"),
        ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
        ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
        ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
        ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
        ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
        ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
        ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"),
        ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"),
        ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"),
        ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"),
        ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"),
        ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"),
        ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"),
        ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"),
        ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"),
        ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"),
    ]
)


def rename_key(state_dict, old, new):
    # Pop the old key from the state dict and re-insert its value under the new key
    val = state_dict.pop(old)
    state_dict[new] = val


def rename_backbone_keys(state_dict):
    # Build a new ordered dict holding the renamed backbone keys
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        if "backbone.0.body" in key:
            # Rewrite the key so it matches the new model structure
            new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
            new_state_dict[new_key] = value
        else:
            new_state_dict[key] = value

    return new_state_dict
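A toy run of the two helpers above on a miniature state dict (the tensor values are placeholders; only the key manipulation matters):

```python
from collections import OrderedDict

import torch

toy = OrderedDict(
    {
        "input_proj.weight": torch.zeros(1),
        "backbone.0.body.conv1.weight": torch.zeros(1),
    }
)
rename_key(toy, "input_proj.weight", "input_projection.weight")  # pops and re-inserts under the new name
toy = rename_backbone_keys(toy)
print(list(toy.keys()))
# ['backbone.conv_encoder.model.conv1.weight', 'input_projection.weight']
```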


def read_in_q_k_v(state_dict, is_panoptic=False):
    # Pick the model prefix depending on whether this is the panoptic variant
    prefix = ""
    if is_panoptic:
        prefix = "conditional_detr."

    # Step 1: the Transformer encoder
    # Process the self-attention parameters of each of the 6 encoder layers
    for i in range(6):
        # Pop the fused input projection weight and bias (in PyTorch's MultiHeadAttention this is a single matrix plus bias)
        in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")

        # Slice the fused weights into separate query, key and value projections
        state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
        state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
        state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
        state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
        state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
        state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
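The slicing above relies on PyTorch's fused attention layout: the query, key and value projections are stacked along dimension 0 of a single `(3 * d_model, d_model)` matrix, and with `d_model = 256` the three 256-row slices recover them. A quick shape check:

```python
import torch

d_model = 256
in_proj_weight = torch.randn(3 * d_model, d_model)
in_proj_bias = torch.randn(3 * d_model)

q_w, k_w, v_w = in_proj_weight[:256, :], in_proj_weight[256:512, :], in_proj_weight[-256:, :]
q_b, k_b, v_b = in_proj_bias[:256], in_proj_bias[256:512], in_proj_bias[-256:]
assert q_w.shape == k_w.shape == v_w.shape == (d_model, d_model)
assert q_b.shape == k_b.shape == v_b.shape == (d_model,)
```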
# We will verify our results on an image of cute cats
def prepare_img():
    # URL of the image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Fetch the raw image bytes over HTTP and open them as a PIL image
    im = Image.open(requests.get(url, stream=True).raw)

    # Return the loaded image
    return im


@torch.no_grad()
def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak the model's weights into our CONDITIONAL_DETR structure.
    """

    # 加载默认配置
    config = ConditionalDetrConfig()
    # 设置骨干网络和膨胀属性
    if "resnet101" in model_name:
        config.backbone = "resnet101"
    if "dc5" in model_name:
        config.dilation = True
    is_panoptic = "panoptic" in model_name
    if is_panoptic:
        # 如果是全景分割模型,则设置类别数为 250
        config.num_labels = 250
    else:
        # 如果是检测模型,则设置类别数为 91
        config.num_labels = 91
        # 下载 COCO 检测任务标签映射文件并加载
        repo_id = "huggingface/label-files"
        filename = "coco-detection-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    # 加载图像处理器
    format = "coco_panoptic" if is_panoptic else "coco_detection"
    image_processor = ConditionalDetrImageProcessor(format=format)

    # 准备图像
    img = prepare_img()
    # 使用图像处理器对图像进行编码,返回 PyTorch 张量
    encoding = image_processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    # 记录日志,显示正在转换的模型名称
    logger.info(f"Converting model {model_name}...")

    # 从 torch hub 加载原始模型
    conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval()
    state_dict = conditional_detr.state_dict()
    # 重命名键名
    for src, dest in rename_keys:
        if is_panoptic:
            src = "conditional_detr." + src
        rename_key(state_dict, src, dest)
    state_dict = rename_backbone_keys(state_dict)
    # 针对查询、键和值矩阵进行特殊处理
    read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
    # 重要提示:对于基础模型的每个键名,我们需要添加前缀,因为头模型使用不同的属性
    prefix = "conditional_detr.model." if is_panoptic else "model."
    # 遍历状态字典的复制版本中的所有键
    for key in state_dict.copy().keys():
        # 如果是全景视觉任务
        if is_panoptic:
            # 如果键以"conditional_detr"开头,并且不以"class_labels_classifier"或"bbox_predictor"开头
            if (
                key.startswith("conditional_detr")
                and not key.startswith("class_labels_classifier")
                and not key.startswith("bbox_predictor")
            ):
                # 弹出该键对应的值,并将其添加到新键中,加上前缀"conditional_detr.model"
                val = state_dict.pop(key)
                state_dict["conditional_detr.model" + key[4:]] = val
            # 如果键包含"class_labels_classifier"或"bbox_predictor"
            elif "class_labels_classifier" in key or "bbox_predictor" in key:
                # 弹出该键对应的值,并将其添加到新键中,加上前缀"conditional_detr."
                val = state_dict.pop(key)
                state_dict["conditional_detr." + key] = val
            # 如果键以"bbox_attention"或"mask_head"开头,则跳过此次循环
            elif key.startswith("bbox_attention") or key.startswith("mask_head"):
                continue
            # 否则,弹出该键对应的值,并将其添加到新键中,加上指定前缀
            else:
                val = state_dict.pop(key)
                state_dict[prefix + key] = val
        # 如果不是全景视觉任务
        else:
            # 如果键既不以"class_labels_classifier"开头也不以"bbox_predictor"开头
            if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
                # 弹出该键对应的值,并将其添加到新键中,加上指定前缀
                val = state_dict.pop(key)
                state_dict[prefix + key] = val

    # 最后,根据是否为全景视觉任务创建 HuggingFace 模型并加载状态字典
    model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()
    
    # 将模型推送到 Hub 上的指定仓库
    model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model")
    
    # 验证转换的正确性
    original_outputs = conditional_detr(pixel_values)
    outputs = model(pixel_values)
    assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
    assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
    # 如果是全景视觉任务,还需验证预测的掩膜
    if is_panoptic:
        assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
    
    # 保存模型和图像处理器
    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    image_processor.save_pretrained(pytorch_dump_folder_path)
# Run the conversion only when this script is executed directly (not when imported)
if __name__ == "__main__":
    # Build an argument parser
    parser = argparse.ArgumentParser()

    # Name of the model to convert, defaults to "conditional_detr_resnet50"
    parser.add_argument(
        "--model_name",
        default="conditional_detr_resnet50",
        type=str,
        help="Name of the CONDITIONAL_DETR model you'd like to convert.",
    )

    # Output folder for the converted PyTorch model, defaults to None
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the folder to output PyTorch model.",
    )

    # Parse the command-line arguments
    args = parser.parse_args()

    # Run the conversion with the parsed model name and output path
    convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)