Transformers Source Code Analysis (132)

.\pipelines\pt_utils.py

import numpy as np
import torch
from torch.utils.data import Dataset, IterableDataset

from ..utils.generic import ModelOutput


class PipelineDataset(Dataset):
    def __init__(self, dataset, process, params):
        self.dataset = dataset  # the underlying dataset object
        self.process = process  # the preprocessing function to apply
        self.params = params    # keyword arguments forwarded to `process`

    def __len__(self):
        return len(self.dataset)  # length of the underlying dataset

    def __getitem__(self, i):
        item = self.dataset[i]                         # fetch the element at index i
        processed = self.process(item, **self.params)  # apply the preprocessing function with its parameters
        return processed  # return the processed item


class PipelineIterator(IterableDataset):
    def __init__(self, loader, infer, params, loader_batch_size=None):
        """
        大致相当于

        ```
        for item in loader:
            yield infer(item, **params)
        ```

        参数:
            loader (`torch.utils.data.DataLoader` 或任何迭代器):
                将应用 `infer` 函数的迭代器。
            infer (任何函数):
                要应用于 `loader` 每个元素的函数。
            params (`dict`):
                传递给 `infer` 函数的参数。
            loader_batch_size (`int`, *可选*):
                如果指定,则假设 `loader` 中的项作为批次进行处理,并在此处加载批次,
                使其大致行为为

        ```
        for items in loader:
            for i in loader_batch_size:
                item = items[i]
                yield infer(item, **params)
        ```"""
        self.loader = loader                  # 存储数据加载器对象
        self.infer = infer                    # 存储推断函数
        self.params = params                  # 存储参数
        if loader_batch_size == 1:
            # 省略一些时间通过全部停用
            loader_batch_size = None
        self.loader_batch_size = loader_batch_size  # 存储加载器批次大小

        # 内部记录
        self._loader_batch_index = None   # 加载器批次索引
        self._loader_batch_data = None    # 加载器批次数据

    def __len__(self):
        return len(self.loader)  # number of items in the loader

    def __iter__(self):
        self.iterator = iter(self.loader)  # create an iterator over the loader
        return self

    def loader_batch_item(self):
        """
        Return item located at `loader_batch_index` within the current `loader_batch_data`.
        """
        # If `_loader_batch_data` is a plain torch.Tensor
        if isinstance(self._loader_batch_data, torch.Tensor):
            # Batch data is a simple tensor, just fetch the slice
            result = self._loader_batch_data[self._loader_batch_index].unsqueeze(0)
        else:
            # Batch data is assumed to be a BaseModelOutput (or dict)
            loader_batched = {}
            # Iterate over all items of `_loader_batch_data`
            for k, element in self._loader_batch_data.items():
                # If the element is itself a ModelOutput
                if isinstance(element, ModelOutput):
                    # Convert the ModelOutput to a tuple first
                    element = element.to_tuple()
                    # If the first element of the tuple is a torch.Tensor
                    if isinstance(element[0], torch.Tensor):
                        # Take the correct batch slice of every element and add back a batch dimension of size 1
                        loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element)
                    elif isinstance(element[0], np.ndarray):
                        # Take the correct batch slice of every element and add back a batch dimension of size 1
                        loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element)
                    continue
                # If k is one of {"hidden_states", "past_key_values", "attentions"} and the element is a tuple
                if k in {"hidden_states", "past_key_values", "attentions"} and isinstance(element, tuple):
                    # Those are usually stored as lists of tensors, so they need specific unbatching
                    if isinstance(element[0], torch.Tensor):
                        # Take the correct batch slice of every element and add back a batch dimension of size 1
                        loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element)
                    elif isinstance(element[0], np.ndarray):
                        # Take the correct batch slice of every element and add back a batch dimension of size 1
                        loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element)
                    continue
                # If the element is None, this is usually optional data
                if element is None:
                    loader_batched[k] = None
                elif isinstance(element[self._loader_batch_index], torch.Tensor):
                    # Take the correct batch data, but make it look like batch_size=1
                    # to be compatible with the other methods within transformers
                    loader_batched[k] = element[self._loader_batch_index].unsqueeze(0)
                elif isinstance(element[self._loader_batch_index], np.ndarray):
                    # Take the correct batch data, but make it look like batch_size=1
                    # to be compatible with the other methods within transformers
                    loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0)
                else:
                    # This is typically a list, so no need to `unsqueeze`
                    loader_batched[k] = element[self._loader_batch_index]
            # Recreate the element with the original class so it still looks like a batch_size=1 output
            result = self._loader_batch_data.__class__(loader_batched)
        # Advance `_loader_batch_index` so the next call fetches the next item of the batch
        self._loader_batch_index += 1
        return result

    # Produce the next element of the iterator
    def __next__(self):
        # Are we currently unrolling a batch, with the in-batch index still below the batch size?
        if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
            # We are currently unrolling a batch, just return the current item within that batch
            return self.loader_batch_item()

        # The current batch is exhausted
        # Fetch the next item from the underlying iterator
        item = next(self.iterator)
        # Run inference on that item with the predefined parameters
        processed = self.infer(item, **self.params)
        # We now have a batch of inferred data
        if self.loader_batch_size is not None:
            # Try to infer the size of that batch
            if isinstance(processed, torch.Tensor):
                first_tensor = processed
            else:
                key = list(processed.keys())[0]
                first_tensor = processed[key]
            if isinstance(first_tensor, list):
                observed_batch_size = len(first_tensor)
            else:
                observed_batch_size = first_tensor.shape[0]
            if 0 < observed_batch_size < self.loader_batch_size:
                # This is probably the last batch, so we cannot unroll as many elements
                self.loader_batch_size = observed_batch_size
            # Set the internal index so the batch can be unrolled
            self._loader_batch_data = processed
            self._loader_batch_index = 0
            # Return the current item within the batch
            return self.loader_batch_item()
        else:
            # No unrolling needed
            return processed
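
A minimal sketch (not part of the original file) of how `PipelineIterator` re-slices loader batches back into batch_size=1 items; the toy `DataLoader` and the doubling `infer` function are assumptions made purely for illustration:

```
import torch
from torch.utils.data import DataLoader

from transformers.pipelines.pt_utils import PipelineIterator

data = torch.arange(8).float().unsqueeze(1)  # 8 items of shape (1,)
loader = DataLoader(data, batch_size=4)      # yields batches of shape (4, 1)

def infer(batch):
    return batch * 2  # stand-in for a model forward pass, output is still batched

it = PipelineIterator(loader, infer, params={}, loader_batch_size=4)
for item in it:
    print(item.shape)  # torch.Size([1, 1]) -- each loader batch is unrolled into batch_size=1 items
```
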
class PipelineChunkIterator(PipelineIterator):
    def __init__(self, loader, infer, params, loader_batch_size=None):
        """
        Roughly equivalent to

        ```
        for iterator in loader:
            for item in iterator:
                yield infer(item, **params)
        ```

        Arguments:
            loader (`torch.utils.data.DataLoader` or any iterator):
                The iterator that will be used to apply `infer` on.
            infer (any function):
                The function to apply to each element of `loader`.
            params (`dict`):
                The parameters passed to `infer` along with every item
        """
        super().__init__(loader, infer, params)

    def __iter__(self):
        # Initialize the main iterator over the loader
        self.iterator = iter(self.loader)
        # Initialize subiterator to None
        self.subiterator = None
        return self

    def __next__(self):
        if self.subiterator is None:
            # If subiterator is None, start the preprocessing on the next item
            self.subiterator = self.infer(next(self.iterator), **self.params)
        
        try:
            # Try to retrieve the next processed item from subiterator
            processed = next(self.subiterator)
        except StopIteration:
            # If subiterator is exhausted, move to the next item in the main iterator
            # This is akin to flattening nested iterators into a single sequence
            self.subiterator = self.infer(next(self.iterator), **self.params)
            processed = next(self.subiterator)
        
        return processed


class PipelinePackIterator(PipelineIterator):
    """
    Roughly equivalent to

    ```
    packed =  []
    for item in loader:
        packed.append(item)
        if item["is_last"]:
            yield packed
            packed = []
    ```

    but it also handles cases where `item` are batched (meaning it's a dict of Tensor with first dimension > 1). In
    that case it does

    ```
    packed =  []
    for batch in loader:
        # item is batched
        for item in batch:
            packed.append(item)
            if item["is_last"]:
                yield packed
                packed = []
    ```

    Arguments:
        loader (`torch.utils.data.DataLoader` or any iterator):
            The iterator that will be used to apply `infer` on.
        infer (any function):
            The function to apply to each element of `loader`.
        params (`dict`):
            The parameters passed to `infer` along with every item
        loader_batch_size (`int`, *optional*):
            If specified, the items of `loader` are supposed to come as batch, and are loader_batched here making
            it roughly behave as

    """
    for items in loader:
        for i in loader_batch_size:
            item = items[i]
            yield infer(item, **params)



    def __iter__(self):
        # Create an iterator over the loader
        self.iterator = iter(self.loader)
        return self

    def __next__(self):
        # This is a very similar unbatching mechanism to the one in PipelineIterator.
        # However, we have an extra requirement: the presence of `is_last`.
        # Because everything was flattened by `PipelineChunkIterator`, we need to keep track here of the
        # original `process` boundaries so items can be regrouped, ensuring that `process` and `postprocess`
        # see the same data.

        # This iterator accumulates items (possibly while unbatching) until it hits `is_last`,
        # then passes the accumulated group back to the caller.
        is_last = False
        accumulator = []
        if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
            while self._loader_batch_index < self.loader_batch_size:
                # Fetch the next item from the current loader batch
                item = self.loader_batch_item()
                is_last = item.pop("is_last")
                accumulator.append(item)
                if is_last:
                    return accumulator

        while not is_last:
            # Process the next loader item
            processed = self.infer(next(self.iterator), **self.params)
            if self.loader_batch_size is not None:
                if isinstance(processed, torch.Tensor):
                    first_tensor = processed
                else:
                    key = list(processed.keys())[0]
                    first_tensor = processed[key]
                if isinstance(first_tensor, list):
                    observed_batch_size = len(first_tensor)
                else:
                    observed_batch_size = first_tensor.shape[0]
                if 0 < observed_batch_size < self.loader_batch_size:
                    # This is probably the last batch, so we cannot unroll as many elements.
                    self.loader_batch_size = observed_batch_size
                self._loader_batch_data = processed
                self._loader_batch_index = 0
                while self._loader_batch_index < self.loader_batch_size:
                    # Fetch the next item from the current loader batch
                    item = self.loader_batch_item()
                    is_last = item.pop("is_last")
                    accumulator.append(item)
                    if is_last:
                        return accumulator
            else:
                # Single (unbatched) loader item
                item = processed
                is_last = item.pop("is_last")
                accumulator.append(item)
        return accumulator
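
To make the `is_last` regrouping concrete, here is a small sketch (not from the original file) that drives `PipelinePackIterator` with a hand-written generator and a pass-through `infer`:

```
from transformers.pipelines.pt_utils import PipelinePackIterator

def fake_loader():
    # Two "documents": the first was split into two chunks, the second into one
    yield {"value": 1, "is_last": False}
    yield {"value": 2, "is_last": True}
    yield {"value": 3, "is_last": True}

packer = PipelinePackIterator(fake_loader(), infer=lambda item: item, params={})
for group in packer:
    print(group)
# [{'value': 1}, {'value': 2}]
# [{'value': 3}]
```
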

class KeyDataset(Dataset):
    # Wraps a dataset so that indexing returns only the `key` field of each element
    def __init__(self, dataset: Dataset, key: str):
        self.dataset = dataset  # the wrapped dataset
        self.key = key          # the field to extract from each element

    def __len__(self):
        # Length of the wrapped dataset
        return len(self.dataset)

    def __getitem__(self, i):
        # Return the `key` field of the element at index i
        return self.dataset[i][self.key]


class KeyPairDataset(Dataset):
    # Wraps a dataset so that indexing returns a {"text", "text_pair"} dict built from two fields
    def __init__(self, dataset: Dataset, key1: str, key2: str):
        self.dataset = dataset  # the wrapped dataset
        self.key1 = key1        # field used for "text"
        self.key2 = key2        # field used for "text_pair"

    def __len__(self):
        # Length of the wrapped dataset
        return len(self.dataset)

    def __getitem__(self, i):
        # Return a dict with "text" and "text_pair" keys, taken from key1 and key2 of element i
        return {"text": self.dataset[i][self.key1], "text_pair": self.dataset[i][self.key2]}

.\pipelines\question_answering.py

# inspect is used to introspect the model forward signature
import inspect
# types is used for generator type checks
import types
# warnings handles deprecation warnings
import warnings
# Iterable is used to check whether an object can be iterated over
from collections.abc import Iterable
# Typing helpers for annotations
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import numpy as np

# SQuAD data utilities used to build features from question/context pairs
from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
from ..modelcard import ModelCard
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import (
    PaddingStrategy,
    add_end_docstrings,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    logging,
)
from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args

# Module-level logger
logger = logging.get_logger(__name__)

# Only import the model base classes when type checking
if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

    # If tokenizers is available, import it for the Encoding type annotation
    if is_tokenizers_available():
        import tokenizers

# If TensorFlow is available, import it and the TF question-answering model mapping
if is_tf_available():
    import tensorflow as tf
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES

    # Default Dataset to None (overridden below when PyTorch is available)
    Dataset = None

# If PyTorch is available, import it, the Dataset class, and the PT question-answering model mapping
if is_torch_available():
    import torch
    from torch.utils.data import Dataset
    from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES

def decode_spans(
    start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
) -> Tuple:
    """
    Take the output of any `ModelForQuestionAnswering` and generate probabilities for each span to be the actual
    answer.

    In addition, it filters out some unwanted/impossible cases, such as answers longer than max_answer_len or answers
    whose end position comes before their start position. The topk argument allows returning the k best answer spans.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining which tokens can be part of the answer.
    """
    # Ensure `start` has a batch axis
    if start.ndim == 1:
        start = start[None]

    # Ensure `end` has a batch axis
    if end.ndim == 1:
        end = end[None]

    # Compute the score of each tuple(start, end) to be the real answer
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

    # Remove candidates with end < start or end - start > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    # Decode the start and end indices from the flattened scores
    starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
    # Keep only spans whose start and end both fall on positions allowed by `undesired_tokens`
    desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero())

    # Filter starts and ends down to the allowed spans
    starts = starts[desired_spans]
    ends = ends[desired_spans]

    # Fetch the scores of the remaining candidate spans
    scores = candidates[0, starts, ends]

    # Return the filtered starts, ends and their scores
    return starts, ends, scores
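
A toy example (not in the original file) with four tokens, showing how `decode_spans` scores every (start, end) pair and keeps the best one under the `max_answer_len` constraint:

```
import numpy as np

start = np.array([0.1, 0.6, 0.2, 0.1])      # start probability of each token
end = np.array([0.1, 0.1, 0.7, 0.1])        # end probability of each token
undesired_tokens = np.array([1, 1, 1, 1])   # 1 = the token may belong to the answer

starts, ends, scores = decode_spans(start, end, topk=1, max_answer_len=3, undesired_tokens=undesired_tokens)
print(starts, ends, scores)  # span (1, 2) with score 0.6 * 0.7 = 0.42
```
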
def select_starts_ends(
    start,
    end,
    p_mask,
    attention_mask,
    min_null_score=1000000,
    top_k=1,
    handle_impossible_answer=False,
    max_answer_len=15,
):
    """
    Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses
    `decode_spans()` to generate probabilities for each span to be the actual answer.

    Args:
        start (`np.ndarray`): Individual start logits for each token.
        end (`np.ndarray`): Individual end logits for each token.
        p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer.
        attention_mask (`np.ndarray`): The attention mask generated by the tokenizer.
        min_null_score (`float`): The minimum null (empty) answer score seen so far.
        top_k (`int`): Indicates how many possible answer span(s) to extract from the model output.
        handle_impossible_answer (`bool`): Whether to allow null (empty) answers.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
    """
    # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
    undesired_tokens = np.abs(np.array(p_mask) - 1)

    if attention_mask is not None:
        undesired_tokens = undesired_tokens & attention_mask

    # Generate the mask
    undesired_tokens_mask = undesired_tokens == 0.0

    # Make sure non-context indexes in the tensor cannot contribute to the softmax
    start = np.where(undesired_tokens_mask, -10000.0, start)
    end = np.where(undesired_tokens_mask, -10000.0, end)

    # Normalize logits and spans to retrieve the answer
    start = np.exp(start - start.max(axis=-1, keepdims=True))
    start = start / start.sum()

    end = np.exp(end - end.max(axis=-1, keepdims=True))
    end = end / end.sum()

    if handle_impossible_answer:
        min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item())

    # Mask the CLS token
    start[0, 0] = end[0, 0] = 0.0

    # Use decode_spans to get the starts, ends and scores of the candidate answer spans
    starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, undesired_tokens)
    return starts, ends, scores, min_null_score


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
    internal [`SquadExample`].

    QuestionAnsweringArgumentHandler manages all the possible to create a [`SquadExample`] from the command-line
    supplied arguments.
    """
    def normalize(self, item):
        # If the input is already a SquadExample, return it as-is
        if isinstance(item, SquadExample):
            return item
        # If the input is a dict, validate its keys and values, then build a sample from it
        elif isinstance(item, dict):
            for k in ["question", "context"]:
                # The dict must contain both required keys
                if k not in item:
                    raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")
                # The corresponding value cannot be None
                elif item[k] is None:
                    raise ValueError(f"`{k}` cannot be None")
                # A string value cannot be empty
                elif isinstance(item[k], str) and len(item[k]) == 0:
                    raise ValueError(f"`{k}` cannot be empty")

            # Build the QuestionAnsweringPipeline sample from the dict
            return QuestionAnsweringPipeline.create_sample(**item)
        # Anything that is neither a SquadExample nor a dict is rejected
        raise ValueError(f"{item} argument needs to be of type (SquadExample, dict)")

    def __call__(self, *args, **kwargs):
        # Detect where the actual inputs are coming from
        if args is not None and len(args) > 0:
            # A single positional argument is taken as the inputs directly
            if len(args) == 1:
                inputs = args[0]
            # Two string arguments are interpreted as (question, context)
            elif len(args) == 2 and {type(el) for el in args} == {str}:
                inputs = [{"question": args[0], "context": args[1]}]
            else:
                # Otherwise treat all positional arguments as a list of inputs
                inputs = list(args)
        # The legacy "X" and "data" keyword arguments are also accepted as inputs
        elif "X" in kwargs:
            inputs = kwargs["X"]
        elif "data" in kwargs:
            inputs = kwargs["data"]
        # If "question" and "context" are given, build the inputs according to their types
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str):
                # A list of questions with a single context: broadcast the context over every question
                inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]]
            elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list):
                # Both are lists: pair them up index by index
                if len(kwargs["question"]) != len(kwargs["context"]):
                    raise ValueError("Questions and contexts don't have the same lengths")
                inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])]
            elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str):
                # Both are strings: build a single {question, context} dict
                inputs = [{"question": kwargs["question"], "context": kwargs["context"]}]
            else:
                # Any other combination cannot be interpreted
                raise ValueError("Arguments can't be understood")
        else:
            # Unknown keyword arguments
            raise ValueError(f"Unknown arguments {kwargs}")

        # Generators (and datasets) are returned untouched
        generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,)
        if isinstance(inputs, generator_types):
            return inputs

        # Normalize the inputs
        if isinstance(inputs, dict):
            inputs = [inputs]
        elif isinstance(inputs, Iterable):
            # Copy to avoid overriding arguments
            inputs = list(inputs)
        else:
            # Invalid input type
            raise ValueError(f"Invalid arguments {kwargs}")

        # Normalize every item
        for i, item in enumerate(inputs):
            inputs[i] = self.normalize(item)

        # Return the normalized inputs
        return inputs
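
In other words, the handler accepts several equivalent call forms and turns all of them into a list of `SquadExample` objects. A sketch of those forms (the checkpoint is the one used in the class docstring below):

```
from transformers import pipeline

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

context = "My name is Wolfgang and I live in Berlin"
qa("Where do I live?", context)                                          # two positional strings (question, context)
qa({"question": "Where do I live?", "context": context})                 # a single dict
qa(question=["Where do I live?", "What is my name?"], context=context)   # one context broadcast over many questions
```
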
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
class QuestionAnsweringPipeline(ChunkPipeline):
    """
    Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering
    examples](../task_summary#question-answering) for more information.

    Example:

    ```
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="deepset/roberta-base-squad2")
    >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")
    {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=question-answering).
    """

    default_input_names = "question,context"
    handle_impossible_answer = False

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        **kwargs,
    ):
        """
        Initialize the QuestionAnsweringPipeline object.

        Args:
            model (Union["PreTrainedModel", "TFPreTrainedModel"]): The pre-trained model to use for question answering.
            tokenizer (PreTrainedTokenizer): Tokenizer associated with the model for tokenizing inputs.
            modelcard (Optional[ModelCard]): Model card containing details about the model (optional).
            framework (Optional[str]): The framework for the model, e.g., "tf" or "pt" (optional).
            task (str): Identifier for the task associated with the pipeline (optional).
            **kwargs: Additional keyword arguments passed to parent class constructor.
        """
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            task=task,
            **kwargs,
        )

        # Initialize argument parser for question answering specific arguments
        self._args_parser = QuestionAnsweringArgumentHandler()

        # Check and set the appropriate mapping names based on the framework
        self.check_model_type(
            TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        Create a SquadExample or a list of SquadExamples from given questions and contexts.

        This helper method encapsulates the logic for converting questions and contexts into SquadExample instances.

        Args:
            question (Union[str, List[str]]): The question or list of questions.
            context (Union[str, List[str]]): The context or list of contexts.

        Returns:
            Union[SquadExample, List[SquadExample]]: The corresponding SquadExample or list of SquadExamples.
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    # Clean up and dispatch the call-time parameters to the preprocess and postprocess steps
    def _sanitize_parameters(
        self,
        padding=None,
        topk=None,
        top_k=None,
        doc_stride=None,
        max_answer_len=None,
        max_seq_len=None,
        max_question_len=None,
        handle_impossible_answer=None,
        align_to_words=None,
        **kwargs,
    ):
        # Preprocess parameters: only forward the ones that were explicitly set
        preprocess_params = {}
        if padding is not None:
            preprocess_params["padding"] = padding
        if doc_stride is not None:
            preprocess_params["doc_stride"] = doc_stride
        if max_question_len is not None:
            preprocess_params["max_question_len"] = max_question_len
        if max_seq_len is not None:
            preprocess_params["max_seq_len"] = max_seq_len

        # Postprocess parameters
        postprocess_params = {}
        # `topk` is deprecated in favour of `top_k`: warn and forward the value
        if topk is not None and top_k is None:
            warnings.warn("topk parameter is deprecated, use top_k instead", UserWarning)
            top_k = topk
        # Validate top_k: it must be at least 1
        if top_k is not None:
            if top_k < 1:
                raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
            postprocess_params["top_k"] = top_k
        # Validate max_answer_len: it must be at least 1
        if max_answer_len is not None:
            if max_answer_len < 1:
                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len})")
            postprocess_params["max_answer_len"] = max_answer_len
        # Forward handle_impossible_answer to postprocessing when set
        if handle_impossible_answer is not None:
            postprocess_params["handle_impossible_answer"] = handle_impossible_answer
        # Forward align_to_words to postprocessing when set
        if align_to_words is not None:
            postprocess_params["align_to_words"] = align_to_words
        # Return (preprocess params, forward params, postprocess params)
        return preprocess_params, {}, postprocess_params

    # Run the model forward pass
    def _forward(self, inputs):
        # Retrieve the example carried along with the inputs
        example = inputs["example"]
        # Build the model inputs by picking the keys matching the tokenizer's model input names
        model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
        # Pick the correct forward method depending on the framework
        model_forward = self.model.forward if self.framework == "pt" else self.model.call
        # If the forward method supports a "use_cache" argument, disable the cache
        if "use_cache" in inspect.signature(model_forward).parameters.keys():
            model_inputs["use_cache"] = False
        # Run the forward pass
        output = self.model(**model_inputs)
        # If the output is a dict, return its "start" and "end" logits together with the example and the inputs
        if isinstance(output, dict):
            return {"start": output["start_logits"], "end": output["end_logits"], "example": example, **inputs}
        # Otherwise assume the output is a tuple and use its first two elements as "start" and "end" logits
        else:
            start, end = output[:2]
            return {"start": start, "end": end, "example": example, **inputs}

    # Postprocess the model outputs into answer dicts
    def postprocess(
        self,
        model_outputs,
        top_k=1,                          # number of best answers to return
        handle_impossible_answer=False,   # whether to allow a null (empty) answer
        max_answer_len=15,                # maximum answer length
        align_to_words=True,              # whether to align answer boundaries to word boundaries
    ):
        ...  # body not included in this excerpt

    # Map a (start, end) pair of token indices back to character indices in the original text
    def get_indices(
        self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool
    ) -> Tuple[int, int]:
        # When align_to_words is True, try mapping tokens to words and then words to character spans
        if align_to_words:
            try:
                start_word = enc.token_to_word(s)
                end_word = enc.token_to_word(e)
                start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
                end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
            except Exception:
                # Some tokenizers don't really handle words, so fall back to the token offsets
                start_index = enc.offsets[s][0]
                end_index = enc.offsets[e][1]
        else:
            # Otherwise use the token offsets directly
            start_index = enc.offsets[s][0]
            end_index = enc.offsets[e][1]
        # Return the start and end character indices
        return start_index, end_index

    def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
        """
        When decoding from token probabilities, this method maps token indexes back to actual words in the initial
        context.

        Args:
            text (`str`): The actual context to extract the answer from.
            start (`int`): The answer starting token index.
            end (`int`): The answer end token index.

        Returns:
            Dictionary like `{'answer': str, 'start': int, 'end': int}`
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        # Split the text on spaces and walk through each word and its tokens
        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # If the word's token index falls within the answer span, keep the word
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx  # record the starting character index

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)  # record the ending character index

                words += [word]

            # Stop appending once we go past the answer span
            if token_idx > end:
                break

            # Advance the token index and the character index
            token_idx += len(token)
            chars_idx += len(word) + 1  # +1 for the space

        # Return the answer text together with its start and end character indices
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }
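
Putting the pieces together, a usage sketch showing how the call-time arguments handled by `_sanitize_parameters` reach preprocessing (`doc_stride`) and postprocessing (`top_k`, `max_answer_len`, `handle_impossible_answer`); the checkpoint is the one from the class docstring:

```
from transformers import pipeline

oracle = pipeline("question-answering", model="deepset/roberta-base-squad2")
results = oracle(
    question="Where do I live?",
    context="My name is Wolfgang and I live in Berlin",
    top_k=2,                        # postprocess: keep the 2 best spans
    max_answer_len=10,              # postprocess: cap the answer length (in tokens)
    doc_stride=64,                  # preprocess: overlap between context chunks
    handle_impossible_answer=True,  # postprocess: allow a null (empty) answer
)
print(results)
```
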

.\pipelines\table_question_answering.py

# Standard-library and third-party imports

import collections  # efficient container datatypes
import types  # generator type checks

import numpy as np  # numerical computations

# Shared utilities
from ..utils import (
    add_end_docstrings,  # appends extra documentation to a docstring
    is_tf_available,  # checks whether TensorFlow is installed
    is_torch_available,  # checks whether PyTorch is installed
    requires_backends,  # declares required backend libraries
)

# Local pipeline building blocks
from .base import (
    ArgumentHandler,  # handles argument parsing for pipelines
    Dataset,  # dataset abstraction used by pipelines
    Pipeline,  # the base Pipeline class
    PipelineException,  # exception raised by pipelines
    build_pipeline_init_args,  # builds the shared pipeline init docstring
)

# If PyTorch is available, import it and the relevant model mappings
if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import (
        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,  # seq2seq LM model mapping
        MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,  # table question answering model mapping
    )

# If TensorFlow is available, import it and the relevant model mappings
if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import (
        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,  # TF seq2seq LM model mapping
        TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,  # TF table question answering model mapping
    )


class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    Handles arguments for the TableQuestionAnsweringPipeline
    """
    def __call__(self, table=None, query=None, **kwargs):
        # Returns tqa_pipeline_inputs of shape:
        # [
        #   {"table": pd.DataFrame, "query": List[str]},
        #   ...,
        #   {"table": pd.DataFrame, "query" : List[str]}
        # ]

        # pandas is required for this pipeline
        requires_backends(self, "pandas")
        import pandas as pd

        # The `table` keyword argument is mandatory
        if table is None:
            raise ValueError("Keyword argument `table` cannot be None.")

        # No explicit query was given
        elif query is None:
            # `table` is a dict that already carries both "query" and "table" keys
            if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None:
                tqa_pipeline_inputs = [table]

            # `table` is a non-empty list
            elif isinstance(table, list) and len(table) > 0:
                # Every element of the list must be a dict
                if not all(isinstance(d, dict) for d in table):
                    raise ValueError(
                        f"Keyword argument `table` should be a list of dict, but is {(type(d) for d in table)}"
                    )

                # The first dict must carry both "query" and "table" keys
                if table[0].get("query") is not None and table[0].get("table") is not None:
                    # Use the whole list as tqa_pipeline_inputs
                    tqa_pipeline_inputs = table
                else:
                    raise ValueError(
                        "If keyword argument `table` is a list of dictionaries, each dictionary should have a `table`"
                        f" and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys."
                    )

            # Datasets and generators are passed through untouched
            elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType):
                return table

            # Anything else is invalid
            else:
                raise ValueError(
                    "Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
                    f"is {type(table)})"
                )

        # A query was provided: build a single {table, query} input
        else:
            tqa_pipeline_inputs = [{"table": table, "query": query}]

        # Convert every table to a pandas DataFrame if it is not one already
        for tqa_pipeline_input in tqa_pipeline_inputs:
            if not isinstance(tqa_pipeline_input["table"], pd.DataFrame):
                # A missing table is invalid
                if tqa_pipeline_input["table"] is None:
                    raise ValueError("Table cannot be None.")

                tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"])

        # Return the normalized inputs
        return tqa_pipeline_inputs
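
A sketch (not from the original file) of the input forms this handler normalizes; each form ends up as a list of `{"table": pd.DataFrame, "query": ...}` dicts:

```
import pandas as pd

handler = TableQuestionAnsweringArgumentHandler()

data = {"Repository": ["Transformers", "Datasets"], "Stars": ["36542", "4512"]}

# A dict table plus an explicit query
handler(table=data, query="How many stars does the transformers repository have?")

# A single dict already carrying both keys
handler(table={"table": data, "query": "How many stars does the transformers repository have?"})

# A list of such dicts; plain dict tables are converted to DataFrames automatically
handler(table=[{"table": pd.DataFrame(data), "query": "How many repositories are listed?"}])
```
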
# The decorator appends the shared pipeline init arguments, generated by
# build_pipeline_init_args(has_tokenizer=True), to the class docstring
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
class TableQuestionAnsweringPipeline(Pipeline):
    """
    Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
    PyTorch.

    Example:

    ```
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq")
    >>> table = {
    ...     "Repository": ["Transformers", "Datasets", "Tokenizers"],
    ...     "Stars": ["36542", "4512", "3934"],
    ...     "Contributors": ["651", "77", "34"],
    ...     "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
    ... }
    >>> oracle(query="How many stars does the transformers repository have?", table=table)
    {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"table-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
    See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
    """

    # Default input argument names for this pipeline
    default_input_names = "table,query"

    def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Store the argument parser
        self._args_parser = args_parser

        # Pick the model mapping depending on the framework, including seq2seq models
        if self.framework == "tf":
            mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy()
            mapping.update(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
        else:
            mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy()
            mapping.update(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
        # Check that the model type is supported
        self.check_model_type(mapping)

        # Check whether the model config supports aggregation, and remember the model "type"
        self.aggregate = bool(getattr(self.model.config, "aggregation_labels", None)) and bool(
            getattr(self.model.config, "num_aggregation_labels", None)
        )
        self.type = "tapas" if hasattr(self.model.config, "aggregation_labels") else None

    # Run the model on a whole batch of inputs at once
    def batch_inference(self, **inputs):
        return self.model(**inputs)

    # Clean up and dispatch the call-time parameters to the preprocess and forward steps
    def _sanitize_parameters(self, sequential=None, padding=None, truncation=None, **kwargs):
        preprocess_params = {}
        if padding is not None:
            preprocess_params["padding"] = padding
        if truncation is not None:
            preprocess_params["truncation"] = truncation

        forward_params = {}
        if sequential is not None:
            forward_params["sequential"] = sequential

        # Return (preprocess params, forward params, postprocess params)
        return preprocess_params, forward_params, {}
    # Preprocess the input into the format expected by the model
    def preprocess(self, pipeline_input, sequential=None, padding=True, truncation=None):
        # If no truncation strategy is given, pick a default depending on the model type
        if truncation is None:
            if self.type == "tapas":
                truncation = "drop_rows_to_fit"
            else:
                truncation = "do_not_truncate"

        # Extract the table and the query from the pipeline input
        table, query = pipeline_input["table"], pipeline_input["query"]
        # An empty table is invalid
        if table.empty:
            raise ValueError("table is empty")
        # An empty or missing query is invalid
        if query is None or query == "":
            raise ValueError("query is empty")

        # Tokenize the table and the query into model inputs
        inputs = self.tokenizer(table, query, return_tensors=self.framework, truncation=truncation, padding=padding)
        inputs["table"] = table  # keep the table around for postprocessing
        return inputs

    # Run the model forward pass
    def _forward(self, model_inputs, sequential=False, **generate_kwargs):
        # Pop the table out of the model inputs
        table = model_inputs.pop("table")

        # Pick the inference strategy depending on the model type
        if self.type == "tapas":
            if sequential:
                outputs = self.sequential_inference(**model_inputs)  # sequential (row-by-row) inference
            else:
                outputs = self.batch_inference(**model_inputs)  # batched inference
        else:
            outputs = self.model.generate(**model_inputs, **generate_kwargs)  # seq2seq generation

        # Bundle the model inputs, the table and the raw outputs for postprocessing
        model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs}
        return model_outputs
    # Postprocess the model outputs into answer dicts
    def postprocess(self, model_outputs):
        # Unpack the model inputs, the table and the raw outputs
        inputs = model_outputs["model_inputs"]
        table = model_outputs["table"]
        outputs = model_outputs["outputs"]

        # TAPAS-style models
        if self.type == "tapas":
            # With aggregation
            if self.aggregate:
                # The outputs carry both the cell logits and the aggregation logits
                logits, logits_agg = outputs[:2]
                # Convert the logits into predictions with the tokenizer
                predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg)
                # Split into answer coordinates and aggregation predictions
                answer_coordinates_batch, agg_predictions = predictions
                # Map every prediction to its aggregation label
                aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)}

                # Index of the "no aggregation" label
                no_agg_label_index = self.model.config.no_aggregation_label_index
                # Build an aggregator prefix (e.g. "AVERAGE > ") only for predictions that actually aggregate
                aggregators_prefix = {
                    i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index
                }
            else:
                # Without aggregation, only the cell logits are used
                logits = outputs[0]
                # Convert the logits into predictions with the tokenizer
                predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits)
                # Only answer coordinates are produced
                answer_coordinates_batch = predictions[0]
                # No aggregators in this case
                aggregators = {}
                aggregators_prefix = {}

            # Build the list of answers
            answers = []
            # Walk through the answer coordinates of every query
            for index, coordinates in enumerate(answer_coordinates_batch):
                # Fetch the selected cells from the table
                cells = [table.iat[coordinate] for coordinate in coordinates]
                # Fetch the aggregator and its prefix for this query (empty strings if absent)
                aggregator = aggregators.get(index, "")
                aggregator_prefix = aggregators_prefix.get(index, "")
                # Assemble the answer dict with the cells, their coordinates and the optional aggregator
                answer = {
                    "answer": aggregator_prefix + ", ".join(cells),
                    "coordinates": coordinates,
                    "cells": [table.iat[coordinate] for coordinate in coordinates],
                }
                # Only add the aggregator key when an aggregator was predicted
                if aggregator:
                    answer["aggregator"] = aggregator

                answers.append(answer)

            # An empty answer list is a pipeline error
            if len(answers) == 0:
                raise PipelineException("Empty answer")
        else:
            # For non-TAPAS (seq2seq) models, simply decode the generated ids into answer strings
            answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)]

        # Return all answers if there are several, otherwise just the first one
        return answers if len(answers) > 1 else answers[0]

.\pipelines\text2text_generation.py

import enum  # used to define the ReturnType enumeration
import warnings  # handles deprecation warnings

from ..tokenization_utils import TruncationStrategy  # truncation strategies for tokenization
from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging  # shared utilities
from .base import Pipeline, build_pipeline_init_args  # base Pipeline class and init-args builder

if is_tf_available():  # if TensorFlow is available
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES  # TF seq2seq LM mapping

if is_torch_available():  # if PyTorch is available
    from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES  # PyTorch seq2seq LM mapping

logger = logging.get_logger(__name__)  # module-level logger

class ReturnType(enum.Enum):
    TENSORS = 0  # return token-id tensors
    TEXT = 1  # return decoded text

@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))  # appends the shared pipeline init docstring
class Text2TextGenerationPipeline(Pipeline):
    """
    Pipeline for text to text generation using seq2seq models.

    Example:

    ```
    >>> from transformers import pipeline

    >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap")
    >>> generator(
    ...     "answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google"
    ... )
    [{'generated_text': 'question: Who created the RuPERTa-base?'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
    generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
    text generation parameters in [Text generation strategies](../generation_strategies) and [Text
    generation](text_generation).

    This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"text2text-generation"`.

    The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). For a list of available
    parameters, see the [following
    documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)

    Usage:

    ```
    text2text_generator = pipeline("text2text-generation")
    text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
    ```
    """

    # Used in the return key of the pipeline.
    return_name = "generated"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Check the model type against the mapping of the active framework
        self.check_model_type(
            TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
        )

    # Clean up and dispatch the call-time parameters to the preprocess, forward and postprocess steps
    def _sanitize_parameters(
        self,
        return_tensors=None,
        return_text=None,
        return_type=None,
        clean_up_tokenization_spaces=None,
        truncation=None,
        stop_sequence=None,
        **generate_kwargs,
    ):
        preprocess_params = {}
        if truncation is not None:
            preprocess_params["truncation"] = truncation
        # Every remaining keyword argument is passed to `generate`
        forward_params = generate_kwargs

        postprocess_params = {}
        # If return_tensors is set but return_type is not, derive return_type from it
        if return_tensors is not None and return_type is None:
            return_type = ReturnType.TENSORS if return_tensors else ReturnType.TEXT
        # Forward return_type to postprocessing when set
        if return_type is not None:
            postprocess_params["return_type"] = return_type

        # Forward clean_up_tokenization_spaces to postprocessing when set
        if clean_up_tokenization_spaces is not None:
            postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces

        # If a stop_sequence is given, encode it and use its first token as eos_token_id
        if stop_sequence is not None:
            stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False)
            if len(stop_sequence_ids) > 1:
                # Multi-token stop sequences are not supported yet; warn and use the first token only
                warnings.warn(
                    "Stopping on a multiple token sequence is not yet supported on transformers. The first token of"
                    " the stop sequence will be used as the stop sequence string in the interim."
                )
            generate_kwargs["eos_token_id"] = stop_sequence_ids[0]

        # Return (preprocess params, forward params, postprocess params)
        return preprocess_params, forward_params, postprocess_params

    def check_inputs(self, input_length: int, min_length: int, max_length: int):
        """
        Checks whether there might be something wrong with given input with regard to the model.
        """
        return True

    def _parse_and_tokenize(self, *args, truncation):
        # Fetch the model-specific prefix from the config (empty string if unset)
        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
        # For batched (list) inputs, the tokenizer must define a pad_token_id
        if isinstance(args[0], list):
            if self.tokenizer.pad_token_id is None:
                raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
            # Prepend the prefix to every element and enable padding
            args = ([prefix + arg for arg in args[0]],)
            padding = True

        # For a single string input, prepend the prefix and disable padding
        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            padding = False
        else:
            # Anything else is an invalid input format
            raise ValueError(
                f" `args[0]`: {args[0]} have the wrong format. The should be either of type `str` or type `list`"
            )
        # Tokenize the inputs with the chosen padding, truncation and tensor framework
        inputs = self.tokenizer(*args, padding=padding, truncation=truncation, return_tensors=self.framework)
        # Drop "token_type_ids" if present (it is not a valid generate argument)
        if "token_type_ids" in inputs:
            del inputs["token_type_ids"]
        return inputs
    def __call__(self, *args, **kwargs):
        r"""
        Generate the output text(s) using text(s) given as inputs.

        Args:
            args (`str` or `List[str]`):
                Input text for the encoder.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (`bool`, *optional*, defaults to `True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the potential extra spaces in the text output.
            truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
                The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE`
                (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's
                max_length instead of throwing an error down the line.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./model#generative-models)).

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:

            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
              ids of the generated text.
        """

        # Let the parent class handle the inputs and produce the results
        result = super().__call__(*args, **kwargs)

        # If the input was a list of strings and every result is a single-element list, flatten the results
        if (
            isinstance(args[0], list)
            and all(isinstance(el, str) for el in args[0])
            and all(len(res) == 1 for res in result)
        ):
            return [res[0] for res in result]

        # Otherwise return the results as-is
        return result

    def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs):
        # Parse and tokenize the input text with the requested truncation strategy
        inputs = self._parse_and_tokenize(inputs, truncation=truncation, **kwargs)
        # Return the preprocessed inputs
        return inputs
    # Run the model forward pass: generate output ids from the tokenized inputs
    def _forward(self, model_inputs, **generate_kwargs):
        # Read the input batch size and sequence length depending on the framework
        if self.framework == "pt":
            in_b, input_length = model_inputs["input_ids"].shape
        elif self.framework == "tf":
            # Use the TensorFlow shape API and convert to a NumPy array
            in_b, input_length = tf.shape(model_inputs["input_ids"]).numpy()

        # Check the input length against the requested min/max generation lengths
        self.check_inputs(
            input_length,
            generate_kwargs.get("min_length", self.model.config.min_length),
            generate_kwargs.get("max_length", self.model.config.max_length),
        )
        # Generate the output token ids
        output_ids = self.model.generate(**model_inputs, **generate_kwargs)
        # Total number of generated sequences (batch size * sequences per input)
        out_b = output_ids.shape[0]
        # Reshape to (batch size, sequences per input, sequence length) depending on the framework
        if self.framework == "pt":
            output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:])
        elif self.framework == "tf":
            output_ids = tf.reshape(output_ids, (in_b, out_b // in_b, *output_ids.shape[1:]))
        # Return the generated ids
        return {"output_ids": output_ids}

    # Postprocess the generated ids into text or tensors, depending on return_type
    def postprocess(self, model_outputs, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False):
        # List of processed records
        records = []
        # Walk through every generated sequence for the first input
        for output_ids in model_outputs["output_ids"][0]:
            # Build a different record depending on the requested return type
            if return_type == ReturnType.TENSORS:
                record = {f"{self.return_name}_token_ids": output_ids}
            elif return_type == ReturnType.TEXT:
                # Decode the token ids into text, optionally cleaning up tokenization spaces
                record = {
                    f"{self.return_name}_text": self.tokenizer.decode(
                        output_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                }
            records.append(record)
        # Return the list of records
        return records
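
A usage sketch (the checkpoint is the one from the class docstring above) showing how the sanitized parameters flow through: `max_length` is forwarded to `generate`, while `return_tensors` and `clean_up_tokenization_spaces` only affect postprocessing:

```
from transformers import pipeline

generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap")

text = "answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google"

# Decoded text (the default): [{'generated_text': '...'}]
print(generator(text, max_length=32, clean_up_tokenization_spaces=True))

# Raw token ids instead of text: [{'generated_token_ids': tensor([...])}]
print(generator(text, return_tensors=True))
```
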
# The decorator appends the shared pipeline init arguments to the class docstring
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
class SummarizationPipeline(Text2TextGenerationPipeline):
    """
    Summarize news articles and other documents.

    This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"summarization"`.

    The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is
    currently, '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date
    list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list
    of available parameters, see the [following
    documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)

    Usage:

    ```
    # use bart in pytorch
    summarizer = pipeline("summarization")
    summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)

    # use t5 in tf
    summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf")
    summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
    ```
    """

    # 定义用于标识返回摘要的键名
    return_name = "summary"

    def __call__(self, *args, **kwargs):
        r"""
        Summarize the text(s) given as inputs.

        Args:
            documents (*str* or `List[str]`):
                One or several articles (or one list of articles) to summarize.
            return_text (`bool`, *optional*, defaults to `True`):
                Whether or not to include the decoded texts in the outputs
            return_tensors (`bool`, *optional*, defaults to `False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./model#generative-models)).

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:

            - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input.
            - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
              ids of the summary.
        """
        # 调用父类的 `__call__` 方法,传递所有参数和关键字参数
        return super().__call__(*args, **kwargs)
    def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool:
        """
        Checks whether there might be something wrong with given input with regard to the model.
        """
        # 检查最大长度是否小于最小长度,如果是则记录警告日志
        if max_length < min_length:
            logger.warning(f"Your min_length={min_length} must be inferior than your max_length={max_length}.")

        # 检查输入长度是否小于最大长度,如果是则记录警告日志
        if input_length < max_length:
            logger.warning(
                f"Your max_length is set to {max_length}, but your input_length is only {input_length}. Since this is "
                "a summarization task, where outputs shorter than the input are typically wanted, you might "
                f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length//2})"
            )
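To avoid the second warning above, you can measure the tokenized input length yourself and pick `max_length` relative to it; a sketch (the checkpoint name is just an example and the exact summary will vary):

```
from transformers import AutoTokenizer, pipeline

checkpoint = "sshleifer/distilbart-cnn-12-6"  # example summarization checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
summarizer = pipeline("summarization", model=checkpoint, tokenizer=tokenizer)

text = "An apple a day, keeps the doctor away"
input_length = len(tokenizer(text)["input_ids"])

# keep max_length below the input length so check_inputs() stays silent
summary = summarizer(text, min_length=2, max_length=max(4, input_length // 2))
print(summary[0]["summary_text"])
```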
# 使用装饰器为类添加文档字符串,并调用函数`build_pipeline_init_args`作为参数
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
# 定义一个翻译管道类,继承自`Text2TextGenerationPipeline`
class TranslationPipeline(Text2TextGenerationPipeline):
    """
    Translates from one language to another.

    This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"translation_xx_to_yy"`.

    The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation).
    For a list of available parameters, see the [following
    documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)

    Usage:

    ```
    en_fr_translator = pipeline("translation_en_to_fr")
    en_fr_translator("How old are you?")
    ```
    """

    # 定义一个类变量,表示此管道的返回键
    return_name = "translation"

    # 检查输入长度是否符合要求的方法
    def check_inputs(self, input_length: int, min_length: int, max_length: int):
        # 如果输入长度超过最大长度的90%,发出警告信息
        if input_length > 0.9 * max_length:
            logger.warning(
                f"Your input_length: {input_length} is bigger than 0.9 * max_length: {max_length}. You might consider "
                "increasing your max_length manually, e.g. translator('...', max_length=400)"
            )
        # 返回True表示检查通过
        return True

    # 数据预处理方法,根据条件使用不同的处理方式
    def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None):
        # 如果存在`_build_translation_inputs`方法,则调用该方法构建翻译输入
        if getattr(self.tokenizer, "_build_translation_inputs", None):
            return self.tokenizer._build_translation_inputs(
                *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang
            )
        else:
            # 否则调用父类的`_parse_and_tokenize`方法进行分析和标记化处理
            return super()._parse_and_tokenize(*args, truncation=truncation)

    # 参数清理方法,用于处理源语言和目标语言的参数,并返回处理后的参数字典
    def _sanitize_parameters(self, src_lang=None, tgt_lang=None, **kwargs):
        # 调用父类的方法,获取预处理、前向和后处理的参数字典
        preprocess_params, forward_params, postprocess_params = super()._sanitize_parameters(**kwargs)
        # 如果指定了源语言,则将其添加到预处理参数中
        if src_lang is not None:
            preprocess_params["src_lang"] = src_lang
        # 如果指定了目标语言,则将其添加到预处理参数中
        if tgt_lang is not None:
            preprocess_params["tgt_lang"] = tgt_lang
        # 如果既未指定源语言也未指定目标语言,则尝试从任务标识中解析出语言信息
        if src_lang is None and tgt_lang is None:
            # 向后兼容性,优先使用直接参数
            task = kwargs.get("task", self.task)
            items = task.split("_")
            if task and len(items) == 4:
                # translation, XX, to YY 格式
                preprocess_params["src_lang"] = items[1]
                preprocess_params["tgt_lang"] = items[3]
        # 返回清理后的参数字典
        return preprocess_params, forward_params, postprocess_params
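The fallback branch above recovers the language pair from a task identifier of the form `translation_XX_to_YY`; a quick sketch of that parsing:

```
# e.g. the task id used when loading pipeline("translation_en_to_fr")
task = "translation_en_to_fr"
items = task.split("_")            # ['translation', 'en', 'to', 'fr']
if len(items) == 4:
    src_lang, tgt_lang = items[1], items[3]
print(src_lang, tgt_lang)          # en fr
```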
    # 重写 `__call__` 方法,使其能够将输入的文本进行翻译
    def __call__(self, *args, **kwargs):
        # 翻译输入的文本
        r"""
        Translate the text(s) given as inputs.

        Args:
            args (`str` or `List[str]`):
                Texts to be translated.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (`bool`, *optional*, defaults to `True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the potential extra spaces in the text output.
            src_lang (`str`, *optional*):
                The language of the input. Might be required for multilingual models. Will not have any effect for
                single pair translation models
            tgt_lang (`str`, *optional*):
                The language of the desired output. Might be required for multilingual models. Will not have any effect
                for single pair translation models
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./model#generative-models)).

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:

            - **translation_text** (`str`, present when `return_text=True`) -- The translation.
            - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The
              token ids of the translation.
        """
        # 调用父类的 `__call__` 方法,并传入参数
        return super().__call__(*args, **kwargs)

.\pipelines\text_classification.py

import inspect  # 导入inspect模块,用于获取对象信息
import warnings  # 导入warnings模块,用于处理警告信息
from typing import Dict  # 从typing模块导入Dict类型提示

import numpy as np  # 导入NumPy库,用于数值计算

from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available  # 导入自定义模块和函数
from .base import GenericTensor, Pipeline, build_pipeline_init_args  # 从本地模块导入指定类和函数

if is_tf_available():  # 如果TensorFlow可用,则导入相关模型映射名称
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES

if is_torch_available():  # 如果PyTorch可用,则导入相关模型映射名称
    from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES


def sigmoid(_outputs):  # 定义sigmoid函数,接受一个参数_outputs
    return 1.0 / (1.0 + np.exp(-_outputs))  # 返回sigmoid函数的计算结果


def softmax(_outputs):  # 定义softmax函数,接受一个参数_outputs
    maxes = np.max(_outputs, axis=-1, keepdims=True)  # 计算_outputs在最后一个轴上的最大值,并保持维度
    shifted_exp = np.exp(_outputs - maxes)  # 计算_outputs减去最大值后的指数值
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)  # 返回softmax归一化后的结果
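Subtracting the row-wise maximum before exponentiating keeps `np.exp` from overflowing on large logits without changing the result; a quick check:

```
import numpy as np

logits = np.array([1000.0, 1001.0, 1002.0])   # naive np.exp(logits) would overflow to inf

maxes = np.max(logits, axis=-1, keepdims=True)
shifted_exp = np.exp(logits - maxes)
print(shifted_exp / shifted_exp.sum(axis=-1, keepdims=True))
# [0.09003057 0.24472847 0.66524096]
```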


class ClassificationFunction(ExplicitEnum):  # 定义一个枚举类ClassificationFunction
    SIGMOID = "sigmoid"  # 枚举项:sigmoid
    SOFTMAX = "softmax"  # 枚举项:softmax
    NONE = "none"  # 枚举项:none


@add_end_docstrings(  # 使用add_end_docstrings装饰器,添加文档字符串
    build_pipeline_init_args(has_tokenizer=True),  # 调用build_pipeline_init_args函数生成初始化参数文档
    r"""
        return_all_scores (`bool`, *optional*, defaults to `False`):
            Whether to return all prediction scores or just the one of the predicted class.
        function_to_apply (`str`, *optional*, defaults to `"default"`):
            The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:

            - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
              has several labels, will apply the softmax function on the output.
            - `"sigmoid"`: Applies the sigmoid function on the output.
            - `"softmax"`: Applies the softmax function on the output.
            - `"none"`: Does not apply any function on the output.""",
)
class TextClassificationPipeline(Pipeline):  # 定义TextClassificationPipeline类,继承自Pipeline类
    """
    Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
    examples](../task_summary#sequence-classification) for more information.

    Example:

    ```
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
    >>> classifier("This movie is disgustingly good !")
    [{'label': 'POSITIVE', 'score': 1.0}]

    >>> classifier("Director tried too much.")
    [{'label': 'NEGATIVE', 'score': 0.996}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).

    If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
    over the results. If there is a single label, the pipeline will run a sigmoid over the result.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
    """
    """
    the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
    """

    return_all_scores = False  # 初始化一个布尔变量,表示是否返回所有分数,默认为 False
    function_to_apply = ClassificationFunction.NONE  # 初始化一个枚举变量,表示应用的分类函数,默认为 NONE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.check_model_type(
            TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
        )
        # 初始化函数,调用父类的初始化方法,并根据框架类型检查模型类型

    def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs):
        # 使用 "" 作为默认参数是因为我们将在用户代码中使用 `top_k=None` 来表示"没有 top_k"
        preprocess_params = tokenizer_kwargs  # any remaining keyword arguments are tokenizer kwargs and are forwarded to preprocess

        postprocess_params = {}  # 初始化后处理参数的字典
        if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
            return_all_scores = self.model.config.return_all_scores
            # 如果模型配置有 `return_all_scores` 属性且用户没有提供 return_all_scores 参数,则使用模型配置的值

        if isinstance(top_k, int) or top_k is None:
            postprocess_params["top_k"] = top_k  # 设置后处理参数中的 top_k
            postprocess_params["_legacy"] = False  # 设置后处理参数中的 _legacy 属性为 False
        elif return_all_scores is not None:
            warnings.warn(
                "`return_all_scores` is now deprecated,  if want a similar functionality use `top_k=None` instead of"
                " `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.",
                UserWarning,
            )
            if return_all_scores:
                postprocess_params["top_k"] = None  # 如果 return_all_scores 为 True,则将 top_k 设置为 None
            else:
                postprocess_params["top_k"] = 1  # 如果 return_all_scores 为 False,则将 top_k 设置为 1

        if isinstance(function_to_apply, str):
            function_to_apply = ClassificationFunction[function_to_apply.upper()]
            # 如果 function_to_apply 是字符串,则将其转换为大写并尝试从 ClassificationFunction 枚举中获取对应的值

        if function_to_apply is not None:
            postprocess_params["function_to_apply"] = function_to_apply
            # 如果 function_to_apply 不为 None,则将其添加到后处理参数中的 function_to_apply 中

        return preprocess_params, {}, postprocess_params
        # 返回预处理参数、空字典和后处理参数
    def __call__(self, inputs, **kwargs):
        """
        Classify the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]` or `Dict[str]`, or `List[Dict[str]]`):
                One or several texts to classify. In order to use text pairs for your classification, you can send a
                dictionary containing `{"text", "text_pair"}` keys, or a list of those.
            top_k (`int`, *optional*, defaults to `1`):
                How many results to return.
            function_to_apply (`str`, *optional*, defaults to `"default"`):
                The function to apply to the model outputs in order to retrieve the scores. Accepts four different
                values:

                If this argument is not specified, then it will apply the following functions according to the number
                of labels:

                - If the model has a single label, will apply the sigmoid function on the output.
                - If the model has several labels, will apply the softmax function on the output.

                Possible values are:

                - `"sigmoid"`: Applies the sigmoid function on the output.
                - `"softmax"`: Applies the softmax function on the output.
                - `"none"`: Does not apply any function on the output.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.

            If `top_k` is used, one such dictionary is returned per label.
        """
        # Ensure inputs are treated as a tuple, even if initially a single string
        inputs = (inputs,)
        # Call the superclass's __call__ method to perform the classification
        result = super().__call__(*inputs, **kwargs)
        # TODO try and retrieve it in a nicer way from _sanitize_parameters.
        # Check if 'top_k' is not in kwargs to determine legacy behavior
        _legacy = "top_k" not in kwargs
        # If inputs are a single string and _legacy is True, return result as a list
        if isinstance(inputs[0], str) and _legacy:
            # This pipeline is odd, and returns a list when a single item is processed
            return [result]
        else:
            # Otherwise, return the result as it is
            return result
    # 预处理方法,将输入转换为模型所需的张量字典
    def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
        # 确定返回的张量类型由框架决定
        return_tensors = self.framework
        # 如果输入是字典类型,则使用标记器处理并返回结果
        if isinstance(inputs, dict):
            return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
        # 如果输入是列表且符合特定条件(用于处理文本对),则继续使用旧有的路径兼容处理
        elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
            return self.tokenizer(
                text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
            )
        # 如果输入是列表但不符合上述条件,则抛出数值错误,提示不支持的输入方式
        elif isinstance(inputs, list):
            raise ValueError(
                "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
                ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
            )
        # 对于其他类型的输入,使用标记器处理并返回结果
        return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
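A sketch of the dictionary form handled by the first branch above, which is how sentence pairs are meant to be passed (the NLI checkpoint and the exact label/score are only examples):

```
from transformers import pipeline

classifier = pipeline("text-classification", model="roberta-large-mnli")

# preprocess() forwards the dict as tokenizer(text=..., text_pair=...)
result = classifier({"text": "A soccer game with multiple males playing.",
                     "text_pair": "Some men are playing a sport."})
print(result)   # e.g. {'label': 'ENTAILMENT', 'score': 0.98...}
```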

    # 内部方法,根据模型输入调用模型的前向传播方法
    def _forward(self, model_inputs):
        # 对于 `XXXForSequenceClassification` 类型的模型,即使支持 `use_cache=True`,也不应该使用
        model_forward = self.model.forward if self.framework == "pt" else self.model.call
        # 检查模型的前向传播方法签名,如果支持 `use_cache` 参数,则设为 False
        if "use_cache" in inspect.signature(model_forward).parameters.keys():
            model_inputs["use_cache"] = False
        # 调用模型的前向传播方法并返回结果
        return self.model(**model_inputs)
    def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
        # `_legacy` 用于确定我们是在运行裸管道和向后兼容模式,还是在运行带有 `pipeline(..., top_k=1)` 的更自然结果包含的列表。
        # 在 `set_parameters` 之前的默认值
        
        # 如果未指定应用的函数,则根据模型配置确定默认的应用函数
        if function_to_apply is None:
            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
                function_to_apply = ClassificationFunction.SIGMOID
            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
                function_to_apply = ClassificationFunction.SOFTMAX
            elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
                function_to_apply = self.model.config.function_to_apply
            else:
                function_to_apply = ClassificationFunction.NONE

        # 获取模型输出的 logits,并将其转换为 numpy 数组
        outputs = model_outputs["logits"][0]
        outputs = outputs.numpy()

        # 根据指定的函数应用对输出进行转换
        if function_to_apply == ClassificationFunction.SIGMOID:
            scores = sigmoid(outputs)
        elif function_to_apply == ClassificationFunction.SOFTMAX:
            scores = softmax(outputs)
        elif function_to_apply == ClassificationFunction.NONE:
            scores = outputs
        else:
            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

        # 如果 `top_k` 为 1 并且 `_legacy` 为 True,则返回最高分的标签和分数
        if top_k == 1 and _legacy:
            return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}

        # 否则,构建包含所有标签及其分数的字典列表
        dict_scores = [
            {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
        ]
        
        # 如果不是 `_legacy` 模式,则根据分数降序排序字典列表,并根据 `top_k` 进行截断
        if not _legacy:
            dict_scores.sort(key=lambda x: x["score"], reverse=True)
            if top_k is not None:
                dict_scores = dict_scores[:top_k]
        
        # 返回最终的标签及其分数的字典列表
        return dict_scores
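Passing `top_k` switches off the legacy single-dict return and yields the sorted list built above; a sketch using the checkpoint from the class docstring (scores are approximate):

```
from transformers import pipeline

classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# top_k=None disables truncation, so every label comes back, sorted by score
print(classifier("This movie is disgustingly good !", top_k=None))
# e.g. [{'label': 'POSITIVE', 'score': 0.9998...}, {'label': 'NEGATIVE', 'score': 0.0001...}]
```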

.\pipelines\text_generation.py

import enum  # 导入枚举类型的模块
import warnings  # 导入警告模块
from typing import Dict  # 导入字典类型的类型提示

from ..utils import add_end_docstrings, is_tf_available, is_torch_available  # 导入自定义工具函数和判断TensorFlow、PyTorch是否可用的函数
from .base import Pipeline, build_pipeline_init_args  # 导入基础类Pipeline和构建初始化参数的函数


if is_torch_available():  # 如果PyTorch可用
    from ..models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES  # 导入PyTorch自动模型命名映射

if is_tf_available():  # 如果TensorFlow可用
    import tensorflow as tf  # 导入TensorFlow模块

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES  # 导入TensorFlow自动模型命名映射


class ReturnType(enum.Enum):  # 定义返回类型枚举类
    TENSORS = 0  # 返回张量类型
    NEW_TEXT = 1  # 返回新文本类型
    FULL_TEXT = 2  # 返回完整文本类型


class Chat:  # 聊天类定义
    """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
    to this format because the rest of the pipeline code tends to assume that lists of messages are
    actually a batch of samples rather than messages in the same conversation."""

    def __init__(self, messages: Dict):  # 初始化方法,接收消息字典作为参数
        for message in messages:  # 遍历消息字典中的每个消息
            if not ("role" in message and "content" in message):  # 检查消息中是否包含必要的'role'和'content'键
                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
        self.messages = messages  # 将消息字典赋值给实例变量self.messages
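A sketch of the list-of-dicts chat format that `Chat` validates; each message must carry a "role" and a "content" key:

```
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about pipelines."},
]

chat = Chat(messages)        # raises ValueError if any dict is missing a key
print(len(chat.messages))    # 2
```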


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))  # 添加尾部文档字符串,调用build_pipeline_init_args函数,声明带有分词器的初始化参数
class TextGenerationPipeline(Pipeline):  # 文本生成管道类,继承自基础类Pipeline
    """
    Language generation pipeline using any `ModelWithLMHead`. This pipeline predicts the words that will follow a
    specified text prompt. It can also accept one or more chats. Each chat takes the form of a list of dicts,
    where each dict contains "role" and "content" keys.

    Example:

    ```
    >>> from transformers import pipeline

    >>> generator = pipeline(model="openai-community/gpt2")
    >>> generator("I can't believe you did such a ", do_sample=False)
    [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}]

    >>> # These parameters will return suggestions, and only the newly created text making it easier for prompting suggestions.
    >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False)
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
    generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
    text generation parameters in [Text generation strategies](../generation_strategies) and [Text
    generation](text_generation).

    This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"text-generation"`.

    The models that this pipeline can use are models that have been trained with an autoregressive language modeling
    objective, which includes the uni-directional models in the library (e.g. openai-community/gpt2). See the list of available models
    on [huggingface.co/models](https://huggingface.co/models?filter=text-generation).
    """
    # 定义一个用于 XLNet 和 TransformerXL 模型的前缀文本,以帮助处理短提示
    XL_PREFIX = """
    In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
    voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
    Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
    and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
    accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
    begging for his blessing. <eod> </s> <eos>
    """

    # 初始化方法,继承自父类构造函数,并检查模型类型
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 根据框架类型选择相应的映射名称列表,然后检查模型类型
        self.check_model_type(
            TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        )
        # 如果预处理参数中不存在 "prefix"
        if "prefix" not in self._preprocess_params:
            # 设置默认值为 None
            prefix = None
            # 如果模型配置中的 prefix 不为 None,则将其赋给 prefix
            if self.model.config.prefix is not None:
                prefix = self.model.config.prefix
            # 如果 prefix 仍为 None,并且模型类名在指定列表中
            if prefix is None and self.model.__class__.__name__ in [
                "XLNetLMHeadModel",
                "TransfoXLLMHeadModel",
                "TFXLNetLMHeadModel",
                "TFTransfoXLLMHeadModel",
            ]:
                # 对于 XLNet 和 TransformerXL 模型,使用预先定义的 XL_PREFIX 作为 prefix
                prefix = self.XL_PREFIX
            # 如果最终确定了 prefix 的值
            if prefix is not None:
                # 重新计算与 prefix 相关的一些生成参数
                preprocess_params, forward_params, _ = self._sanitize_parameters(prefix=prefix, **self._forward_params)
                # 更新预处理参数和前向参数
                self._preprocess_params = {**self._preprocess_params, **preprocess_params}
                self._forward_params = {**self._forward_params, **forward_params}

    # 根据指定参数进行参数清理和更新的内部方法
    def _sanitize_parameters(
        self,
        return_full_text=None,
        return_tensors=None,
        return_text=None,
        return_type=None,
        clean_up_tokenization_spaces=None,
        prefix=None,
        handle_long_generation=None,
        stop_sequence=None,
        add_special_tokens=False,
        truncation=None,
        padding=False,
        max_length=None,
        **generate_kwargs,
        ):
            # 定义预处理参数字典,包括特殊标记添加、截断、填充和最大长度等设置
            preprocess_params = {
                "add_special_tokens": add_special_tokens,
                "truncation": truncation,
                "padding": padding,
                "max_length": max_length,
            }
            # 如果设置了最大长度,将其添加到生成参数中
            if max_length is not None:
                generate_kwargs["max_length"] = max_length

            # 如果设置了前缀,将其加入预处理参数中
            if prefix is not None:
                preprocess_params["prefix"] = prefix
            # 如果前缀不为空,则通过分词器处理前缀输入并设置前缀长度
            if prefix:
                prefix_inputs = self.tokenizer(
                    prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework
                )
                generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1]

            # 如果设置了处理长生成文本的选项,验证选项的有效性
            if handle_long_generation is not None:
                if handle_long_generation not in {"hole"}:
                    raise ValueError(
                        f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected"
                        " [None, 'hole']"
                    )
                preprocess_params["handle_long_generation"] = handle_long_generation

            # 将生成参数更新到预处理参数中
            preprocess_params.update(generate_kwargs)
            forward_params = generate_kwargs

            # 定义后处理参数字典
            postprocess_params = {}
            # 如果设置了返回全文和返回类型为空,则根据返回全文和返回文本的互斥关系设置返回类型
            if return_full_text is not None and return_type is None:
                if return_text is not None:
                    raise ValueError("`return_text` is mutually exclusive with `return_full_text`")
                if return_tensors is not None:
                    raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
                return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
            # 如果设置了返回张量并且返回类型为空,则设置返回类型为张量
            if return_tensors is not None and return_type is None:
                if return_text is not None:
                    raise ValueError("`return_text` is mutually exclusive with `return_tensors`")
                return_type = ReturnType.TENSORS
            # 如果设置了返回类型,则加入后处理参数中
            if return_type is not None:
                postprocess_params["return_type"] = return_type
            # 如果设置了清理分词空格选项,则加入后处理参数中
            if clean_up_tokenization_spaces is not None:
                postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces

            # 如果设置了停止序列,则编码停止序列并设置生成参数中的结束标记 ID
            if stop_sequence is not None:
                stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False)
                if len(stop_sequence_ids) > 1:
                    warnings.warn(
                        "Stopping on a multiple token sequence is not yet supported on transformers. The first token of"
                        " the stop sequence will be used as the stop sequence string in the interim."
                    )
                generate_kwargs["eos_token_id"] = stop_sequence_ids[0]

            # 返回预处理参数、前向参数和后处理参数
            return preprocess_params, forward_params, postprocess_params

    # Override `_parse_and_tokenize` to allow for non-standard language-modeling tokenizer arguments
    def _parse_and_tokenize(self, *args, **kwargs):
        """
        Parse arguments and tokenize
        """
        # 解析参数
        # 如果模型的类名在特定列表中,则更新kwargs以添加一个标志
        if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
            kwargs.update({"add_space_before_punct_symbol": True})

        # 调用父类方法,将解析后的参数和标记化处理
        return super()._parse_and_tokenize(*args, **kwargs)

    def preprocess(
        self,
        prompt_text,
        prefix="",
        handle_long_generation=None,
        add_special_tokens=False,
        truncation=None,
        padding=False,
        max_length=None,
        **generate_kwargs,
    ):
        # 如果prompt_text是Chat类型的对象,则应用特定的tokenizer方法
        if isinstance(prompt_text, Chat):
            inputs = self.tokenizer.apply_chat_template(
                prompt_text.messages,
                truncation=truncation,
                padding=padding,
                max_length=max_length,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors=self.framework,
            )
        else:
            # 否则,使用tokenizer对prompt_text进行标记化处理
            inputs = self.tokenizer(
                prefix + prompt_text,
                truncation=truncation,
                padding=padding,
                max_length=max_length,
                add_special_tokens=add_special_tokens,
                return_tensors=self.framework,
            )
        
        # 将原始的prompt_text存储在inputs中
        inputs["prompt_text"] = prompt_text

        # 处理长生成情况下的特殊处理
        if handle_long_generation == "hole":
            cur_len = inputs["input_ids"].shape[-1]
            # 根据generate_kwargs获取最大新增token数或长度
            if "max_new_tokens" in generate_kwargs:
                new_tokens = generate_kwargs["max_new_tokens"]
            else:
                new_tokens = generate_kwargs.get("max_length", self.model.config.max_length) - cur_len
                if new_tokens < 0:
                    raise ValueError("We cannot infer how many new tokens are expected")
            
            # 如果当前长度加上新token数超过了tokenizer的最大长度限制
            if cur_len + new_tokens > self.tokenizer.model_max_length:
                keep_length = self.tokenizer.model_max_length - new_tokens
                if keep_length <= 0:
                    raise ValueError(
                        "We cannot use `hole` to handle this generation the number of desired tokens exceeds the"
                        " models max length"
                    )

                # 裁剪input_ids和attention_mask以保持长度在tokenizer的最大长度内
                inputs["input_ids"] = inputs["input_ids"][:, -keep_length:]
                if "attention_mask" in inputs:
                    inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:]

        # 返回处理后的inputs
        return inputs
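A worked numeric example of the `"hole"` trimming above (all numbers are illustrative):

```
# Suppose the tokenizer caps sequences at 1024 tokens, the prompt already has 1000
# tokens, and the caller asked for 200 new tokens:
model_max_length = 1024      # tokenizer.model_max_length
cur_len = 1000               # inputs["input_ids"].shape[-1]
new_tokens = 200             # max_new_tokens

if cur_len + new_tokens > model_max_length:
    keep_length = model_max_length - new_tokens   # 824
    # the prompt is cut down to its last keep_length tokens:
    # inputs["input_ids"] = inputs["input_ids"][:, -keep_length:]
print(keep_length)           # 824
```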
    # 定义一个方法 `_forward`,用于执行模型的前向推理
    def _forward(self, model_inputs, **generate_kwargs):
        # 从模型输入中获取输入的 token IDs
        input_ids = model_inputs["input_ids"]
        # 获取注意力掩码,如果不存在则设为 None
        attention_mask = model_inputs.get("attention_mask", None)
        
        # 允许空的提示文本
        # 如果输入的 token IDs 的第二维度为 0,则将 input_ids 和 attention_mask 设为 None,并设置 in_b 为 1
        if input_ids.shape[1] == 0:
            input_ids = None
            attention_mask = None
            in_b = 1
        else:
            # 否则,in_b 等于输入 token IDs 的第一维度的大小
            in_b = input_ids.shape[0]
        
        # 从模型输入中弹出提示文本
        prompt_text = model_inputs.pop("prompt_text")

        # 如果有前缀,则可能需要调整生成长度。
        # 在不永久修改 generate_kwargs 的情况下进行调整,因为一些参数可能来自管道的初始化。
        # 弹出并获取前缀长度
        prefix_length = generate_kwargs.pop("prefix_length", 0)
        if prefix_length > 0:
            # 检查是否存在 max_new_tokens 参数或者在 generate_kwargs 中的 generation_config 中存在 max_new_tokens 参数
            has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
                "generation_config" in generate_kwargs
                and generate_kwargs["generation_config"].max_new_tokens is not None
            )
            # 如果没有 max_new_tokens 参数,则将 max_length 设置为 generate_kwargs 中的 max_length 或者模型配置中的 max_length,并增加 prefix_length
            if not has_max_new_tokens:
                generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length
                generate_kwargs["max_length"] += prefix_length
            
            # 检查是否存在 min_new_tokens 参数或者在 generate_kwargs 中的 generation_config 中存在 min_new_tokens 参数
            has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
                "generation_config" in generate_kwargs
                and generate_kwargs["generation_config"].min_new_tokens is not None
            )
            # 如果没有 min_new_tokens 参数,并且存在 min_length 参数,则将 min_length 增加 prefix_length
            if not has_min_new_tokens and "min_length" in generate_kwargs:
                generate_kwargs["min_length"] += prefix_length

        # 使用模型生成方法生成序列,传入 input_ids 和 attention_mask,以及其他生成参数
        generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
        # 获取生成序列的第一维度的大小
        out_b = generated_sequence.shape[0]
        
        # 根据框架类型进行形状调整
        if self.framework == "pt":  # 如果框架是 PyTorch
            generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
        elif self.framework == "tf":  # 如果框架是 TensorFlow
            generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
        
        # 返回包含生成序列、输入的 token IDs 和提示文本的字典
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
    def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, clean_up_tokenization_spaces=True):
        # 获取生成的文本序列
        generated_sequence = model_outputs["generated_sequence"][0]
        # 获取输入的 token IDs
        input_ids = model_outputs["input_ids"]
        # 获取提示文本
        prompt_text = model_outputs["prompt_text"]
        # 将生成的序列转换为 numpy 数组,再转换为 Python 列表
        generated_sequence = generated_sequence.numpy().tolist()
        # 初始化记录列表
        records = []
        # 遍历生成的序列
        for sequence in generated_sequence:
            if return_type == ReturnType.TENSORS:
                # 如果返回类型是 TENSORS,则记录生成的 token IDs
                record = {"generated_token_ids": sequence}
            elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
                # 解码生成的文本
                text = self.tokenizer.decode(
                    sequence,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                )

                # 如果 input_ids 为空,则使用的是 XLNet 或 Transfo-XL 模型,需要移除 PADDING prompt
                if input_ids is None:
                    prompt_length = 0
                else:
                    prompt_length = len(
                        self.tokenizer.decode(
                            input_ids[0],
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                        )
                    )

                # 移除提示长度对应的文本
                all_text = text[prompt_length:]
                # 如果返回类型是 FULL_TEXT
                if return_type == ReturnType.FULL_TEXT:
                    # 如果提示文本是字符串,则将其与生成的文本合并
                    if isinstance(prompt_text, str):
                        all_text = prompt_text + all_text
                    # 如果提示文本是 Chat 类型,则将其消息与生成的文本合并
                    elif isinstance(prompt_text, Chat):
                        all_text = prompt_text.messages + [{"role": "assistant", "content": all_text}]

                # 创建记录包含生成的文本
                record = {"generated_text": all_text}
            # 将记录加入记录列表
            records.append(record)

        # 返回所有记录
        return records
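A usage sketch contrasting the two text return types handled above (gpt2 is the checkpoint from the class docstring; the generated continuation itself will vary):

```
from transformers import pipeline

generator = pipeline("text-generation", model="openai-community/gpt2")
prompt = "I can't believe you did such a "

# return_full_text=False maps to ReturnType.NEW_TEXT, so postprocess() strips the prompt
new_only = generator(prompt, max_new_tokens=10, return_full_text=False)
full = generator(prompt, max_new_tokens=10)   # default ReturnType.FULL_TEXT: prompt + continuation

print(full[0]["generated_text"].startswith(prompt))       # True
print(new_only[0]["generated_text"].startswith(prompt))   # normally False: only the new text
```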

.\pipelines\text_to_audio.py

# 导入必要的模块和函数
from typing import List, Union
from ..utils import is_torch_available
from .base import Pipeline

# 如果 torch 可用,导入特定模型的映射和 SpeechT5HifiGan
if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
    from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan

# 默认的声码器模型标识符
DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"

class TextToAudioPipeline(Pipeline):
    """
    文本到音频生成管道,使用任意 `AutoModelForTextToWaveform` 或 `AutoModelForTextToSpectrogram` 模型。
    该管道从输入文本生成音频文件,并可选地接收其他条件输入。

    Example:

    ```
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    了解如何使用管道的基础知识,参见[pipeline tutorial](../pipeline_tutorial)

    <Tip>

    可以通过使用 [`TextToAudioPipeline.__call__.forward_params`] 或 [`TextToAudioPipeline.__call__.generate_kwargs`] 来指定传递给模型的参数。

    Example:

    ```
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

    >>> # 通过使用较高的温度添加随机性来增强音乐生成,并设置最大音乐长度
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    目前可以通过 [`pipeline`] 加载此管道,使用以下任务标识符:"text-to-speech" 或 "text-to-audio"。

    查看 [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech) 上可用模型列表。
    """
    # 初始化函数,接受任意数量的位置参数和关键字参数
    def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
        # 调用父类的初始化函数
        super().__init__(*args, **kwargs)

        # 如果使用 TensorFlow 框架,则抛出数值错误异常,因为 TextToAudioPipeline 只能在 PyTorch 中使用
        if self.framework == "tf":
            raise ValueError("The TextToAudioPipeline is only available in PyTorch.")

        # 初始化属性 vocoder 为 None
        self.vocoder = None
        # 如果模型的类在 MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING 的值之中
        if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values():
            # 根据 DEFAULT_VOCODER_ID 加载 SpeechT5HifiGan 模型,并放置在模型所在的设备上
            self.vocoder = (
                SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device)
                if vocoder is None  # 如果未提供 vocoder 参数,则使用默认的 vocoder
                else vocoder  # 否则使用传入的 vocoder
            )

        # 初始化属性 sampling_rate 为传入的 sampling_rate 参数
        self.sampling_rate = sampling_rate
        # 如果 vocoder 不为 None,则设置 sampling_rate 为 vocoder 配置的 sampling_rate
        if self.vocoder is not None:
            self.sampling_rate = self.vocoder.config.sampling_rate

        # 如果 sampling_rate 仍为 None,则从模型的配置和生成配置中获取 sampling_rate
        if self.sampling_rate is None:
            # 获取模型的配置
            config = self.model.config
            # 获取模型的生成配置(如果存在)
            gen_config = self.model.__dict__.get("generation_config", None)
            # 如果生成配置存在,则更新模型配置
            if gen_config is not None:
                config.update(gen_config.to_dict())

            # 尝试从配置中的多个可能的属性名中获取 sampling_rate
            for sampling_rate_name in ["sample_rate", "sampling_rate"]:
                sampling_rate = getattr(config, sampling_rate_name, None)
                # 如果成功获取到 sampling_rate,则将其赋值给 self.sampling_rate,并结束循环
                if sampling_rate is not None:
                    self.sampling_rate = sampling_rate

    # 预处理函数,接受文本输入和其他关键字参数
    def preprocess(self, text, **kwargs):
        # 如果 text 是字符串,则转换为单元素列表
        if isinstance(text, str):
            text = [text]

        # 如果模型的类型是 "bark"
        if self.model.config.model_type == "bark":
            # 创建一个新的关键字参数字典 new_kwargs
            new_kwargs = {
                # 设置 max_length 为模型生成配置中的 max_input_semantic_length,最大长度默认为 256
                "max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
                "add_special_tokens": False,  # 不添加特殊标记
                "return_attention_mask": True,  # 返回注意力掩码
                "return_token_type_ids": False,  # 不返回 token 类型 IDs
                "padding": "max_length",  # 使用 "max_length" 进行填充
            }

            # 优先使用传入的 kwargs 更新 new_kwargs
            new_kwargs.update(kwargs)

            # 将 kwargs 指向 new_kwargs
            kwargs = new_kwargs

        # 使用 tokenizer 对文本进行处理,返回 PyTorch 张量表示的输出
        output = self.tokenizer(text, **kwargs, return_tensors="pt")

        return output
    # 定义私有方法 `_forward`,用于执行模型的前向推断过程
    def _forward(self, model_inputs, **kwargs):
        # 需要确保一些关键字参数处于正确的设备上
        kwargs = self._ensure_tensor_on_device(kwargs, device=self.device)
        # 获取前向推断所需的参数
        forward_params = kwargs["forward_params"]
        # 获取生成过程的关键字参数
        generate_kwargs = kwargs["generate_kwargs"]

        # 如果模型支持生成操作
        if self.model.can_generate():
            # 确保生成过程的关键字参数处于正确的设备上
            generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device)

            # 生成过程的参数优先级高于前向推断的参数
            forward_params.update(generate_kwargs)

            # 调用模型的生成方法
            output = self.model.generate(**model_inputs, **forward_params)
        else:
            # 如果不支持生成操作,则使用前向推断的参数调用模型
            if len(generate_kwargs):
                # 抛出数值错误,提醒用户使用前向推断模型时应使用 forward_params 而不是 generate_kwargs
                raise ValueError(
                    f"""You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty.
                                 For forward-only TTA models, please use `forward_params` instead of
                                 `generate_kwargs`. For reference, here are the `generate_kwargs` used here:
                                 {generate_kwargs.keys()}"""
                )
            # 使用前向推断的参数调用模型,并取第一个输出
            output = self.model(**model_inputs, **forward_params)[0]

        # 如果存在声码器,将输出转换为波形
        if self.vocoder is not None:
            # 将输出作为频谱图输入声码器,得到波形作为最终输出
            output = self.vocoder(output)

        # 返回最终输出
        return output

    # 重载 `__call__` 方法,允许对象被调用,用于从文本生成语音/音频
    def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
        """
        从输入文本生成语音/音频。详细信息请参阅 [`TextToAudioPipeline`] 文档。

        Args:
            text_inputs (`str` or `List[str]`):
                要生成的文本或文本列表。
            forward_params (`dict`, *可选*):
                传递给模型生成/前向方法的参数。`forward_params` 总是传递给底层模型。

        Return:
            `dict` 或 `list` of `dict`: 返回的字典包含两个键值对:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- 生成的音频波形。
            - **sampling_rate** (`int`) -- 生成的音频波形的采样率。
        """
        return super().__call__(text_inputs, **forward_params)

    # 定义私有方法 `_sanitize_parameters`,用于清理和规范化输入参数
    def _sanitize_parameters(
        self,
        preprocess_params=None,
        forward_params=None,
        generate_kwargs=None,
    ):
        # 定义一个包含参数的字典,包括前向参数和生成参数
        params = {
            "forward_params": forward_params if forward_params else {},  # 如果前向参数存在则使用,否则使用空字典
            "generate_kwargs": generate_kwargs if generate_kwargs else {},  # 如果生成参数存在则使用,否则使用空字典
        }

        # 如果预处理参数为None,则将其设为空字典
        if preprocess_params is None:
            preprocess_params = {}
        postprocess_params = {}  # 初始化后处理参数为空字典

        # 返回预处理参数、params字典和后处理参数
        return preprocess_params, params, postprocess_params

    def postprocess(self, waveform):
        # 定义一个空的输出字典
        output_dict = {}

        # 将音波数据转换为CPU上的浮点数数组,并存入输出字典中的"audio"键
        output_dict["audio"] = waveform.cpu().float().numpy()
        output_dict["sampling_rate"] = self.sampling_rate  # 将采样率存入输出字典的"sampling_rate"键

        # 返回填充了音频数据和采样率的输出字典
        return output_dict
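The returned dict pairs a float waveform with its sampling rate, so writing it to disk is straightforward; a sketch assuming `scipy` is installed (checkpoint and filename are just examples):

```
import scipy.io.wavfile
from transformers import pipeline

pipe = pipeline("text-to-speech", model="suno/bark-small")
output = pipe("Hey it's HuggingFace on the phone!")

# postprocess() returns {"audio": np.ndarray, "sampling_rate": int}
scipy.io.wavfile.write(
    "bark_out.wav",
    rate=output["sampling_rate"],
    data=output["audio"].squeeze(),   # drop the channel axis for a mono file
)
```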

.\pipelines\token_classification.py

import types
import warnings
from typing import List, Optional, Tuple, Union

import numpy as np

# 导入BasicTokenizer类,用于处理文本的基本分词操作
from ..models.bert.tokenization_bert import BasicTokenizer
# 导入必要的工具函数和类
from ..utils import (
    ExplicitEnum,
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
)
# 导入处理文本和数据的基础类和函数
from .base import ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args

# 如果TensorFlow可用,导入相关模型和函数
if is_tf_available():
    import tensorflow as tf
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES

# 如果PyTorch可用,导入相关模型和函数
if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES


class TokenClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for token classification.
    """

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        # 根据输入类型处理输入数据
        if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0:
            inputs = list(inputs)
            batch_size = len(inputs)
        elif isinstance(inputs, str):
            inputs = [inputs]
            batch_size = 1
        elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
            return inputs, None
        else:
            raise ValueError("At least one input is required.")

        # 处理偏移映射(offset_mapping)参数
        offset_mapping = kwargs.get("offset_mapping")
        if offset_mapping:
            if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
                offset_mapping = [offset_mapping]
            if len(offset_mapping) != batch_size:
                raise ValueError("offset_mapping should have the same batch size as the input")
        
        # 返回处理后的输入数据和偏移映射
        return inputs, offset_mapping
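A quick sketch of how this handler normalizes its inputs (run with the class above in scope):

```
handler = TokenClassificationArgumentHandler()

# A bare string is wrapped into a one-element batch; offset_mapping stays None
inputs, offset_mapping = handler("My name is Sarah")
print(inputs, offset_mapping)              # ['My name is Sarah'] None

# For a batch, a per-sample offset_mapping must match the batch size
inputs, offset_mapping = handler(["a b", "c d"], offset_mapping=[[(0, 1), (2, 3)], [(0, 1), (2, 3)]])
print(len(inputs), len(offset_mapping))    # 2 2
```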


class AggregationStrategy(ExplicitEnum):
    """All the valid aggregation strategies for TokenClassificationPipeline"""

    # 定义TokenClassificationPipeline的有效聚合策略
    NONE = "none"
    SIMPLE = "simple"
    FIRST = "first"
    AVERAGE = "average"
    MAX = "max"


@add_end_docstrings(
    build_pipeline_init_args(has_tokenizer=True),
        r"""
        ignore_labels (`List[str]`, defaults to `["O"]`):
            A list of labels to ignore.
        grouped_entities (`bool`, *optional*, defaults to `False`):
            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
            same entity together in the predictions or not.
        stride (`int`, *optional*):
            If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
            model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
            value of this argument defines the number of overlapping tokens between chunks. In other words, the model
            will shift forward by `tokenizer.model_max_length - stride` tokens each step.
        aggregation_strategy (`str`, *optional*, defaults to `"none"`):
            The strategy to fuse (or not) tokens based on the model prediction.

                - "none" : Will simply not do any aggregation and simply return raw results from the model
                - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
                  I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D",
                  "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as
                  different entities. On word based languages, we might end up splitting words undesirably : Imagine
                  Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
                  "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
                  that support that meaning, which is basically tokens separated by a space). These mitigations will
                  only work on real words, "New york" might still be tagged with two different entities.
                - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
                  end up with different tags. Words will simply use the tag of the first token of the word when there
                  is ambiguity.
                - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
                  cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
                  label is applied.
                - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
                  end up with different tags. Word entity will simply be the token with the maximum score.
        """
)
class TokenClassificationPipeline(ChunkPipeline):
    """
    Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
    examples](../task_summary#named-entity-recognition) for more information.

    Example:

    ```
    >>> from transformers import pipeline

    >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
    >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal"
    >>> tokens = token_classifier(sentence)
    >>> tokens
    [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}]

    >>> token = tokens[0]
    >>> # Start and end provide an easy way to highlight words in the original text.
    >>> sentence[token["start"] : token["end"]]
    ' jean-baptiste'

    >>> # Some models use the same idea to do part of speech.
    >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple")
    >>> syntaxer("My name is Sarah and I live in London")
    [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous).

    The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
    """

    default_input_names = "sequences"

    def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 检查并设置模型类型,根据框架不同选择不同的模型映射名称
        self.check_model_type(
            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
        )

        # 初始化基本分词器,不进行大小写转换
        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        # 使用指定的参数解析器
        self._args_parser = args_parser
    # 定义一个方法 `_sanitize_parameters`,用于处理和清理输入的参数,确保它们符合预期格式
    def _sanitize_parameters(
        self,
        ignore_labels=None,  # 忽略的标签列表,可以为 None
        grouped_entities: Optional[bool] = None,  # 是否对实体进行分组的标志,可以为 None
        ignore_subwords: Optional[bool] = None,  # 是否忽略子词的标志,可以为 None
        aggregation_strategy: Optional[AggregationStrategy] = None,  # 聚合策略,可以为 None
        offset_mapping: Optional[List[Tuple[int, int]]] = None,  # 偏移映射的列表,可以为 None
        stride: Optional[int] = None,  # 步幅,可以为 None
    ):

        """
        实现 `__call__` 方法,用于对给定的文本输入进行令牌分类。

        Args:
            inputs (`str` or `List[str]`):
                一个或多个文本(或文本列表)用于令牌分类。

        Return:
            A list or a list of list of `dict`: 每个结果都作为一个字典列表返回(每个输入的每个令牌,或者如果此管道是
            使用了聚合策略实例化,则每个实体都对应一个字典)具有以下键:

            - **word** (`str`) -- 被分类的令牌/单词。这是通过解码所选令牌获得的。如果要获得原始句子中的确切字符串,请使用 `start` 和 `end`。
            - **score** (`float`) -- `entity` 的相应概率。
            - **entity** (`str`) -- 预测的令牌/单词的实体(当 *aggregation_strategy* 不是 `"none"` 时命名为 *entity_group*)。
            - **index** (`int`, 仅在 `aggregation_strategy="none"` 时存在) -- 句子中对应令牌的索引。
            - **start** (`int`, *可选*) -- 句子中对应实体的起始索引。仅在 tokenizer 中可用偏移时存在。
            - **end** (`int`, *可选*) -- 句子中对应实体的结束索引。仅在 tokenizer 中可用偏移时存在。
        """
        
        # 使用 `_args_parser` 方法解析输入,并获取偏移映射
        _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
        
        # 如果存在偏移映射,则将其添加到 kwargs 中
        if offset_mapping:
            kwargs["offset_mapping"] = offset_mapping
        
        # 调用父类的 `__call__` 方法,执行实际的令牌分类任务,并返回结果
        return super().__call__(inputs, **kwargs)
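A usage sketch for this `__call__` (the NER checkpoint is only an example; entity values shown are approximate):

```
from transformers import pipeline

token_classifier = pipeline(
    "token-classification",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    stride=16,   # overlap between chunks when the text is longer than model_max_length
)

entities = token_classifier("My name is Sarah and I live in London")
print(entities[0])
# e.g. {'entity_group': 'PER', 'score': 0.99..., 'word': 'Sarah', 'start': 11, 'end': 16}
```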
    # 对输入句子进行预处理,返回生成器对象,每次生成一个模型输入字典
    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        # 提取预处理参数中的 tokenizer_params,并从 preprocess_params 中移除
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        # 根据模型的最大长度和是否启用截断来确定是否截断输入句子
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        # 使用 Tokenizer 对句子进行处理,返回模型输入字典
        inputs = self.tokenizer(
            sentence,
            return_tensors=self.framework,  # 返回张量格式由 self.framework 决定
            truncation=truncation,  # 是否截断输入句子
            return_special_tokens_mask=True,  # 返回特殊 token 掩码
            return_offsets_mapping=self.tokenizer.is_fast,  # 返回偏移映射(如果 Tokenizer 支持)
            **tokenizer_params,  # 其他 tokenizer 参数
        )
        # 移除字典中的 "overflow_to_sample_mapping" 键值对
        inputs.pop("overflow_to_sample_mapping", None)
        # 计算分块数量
        num_chunks = len(inputs["input_ids"])

        # 遍历每个分块,生成模型输入字典
        for i in range(num_chunks):
            if self.framework == "tf":
                # 如果使用 TensorFlow 框架,对每个值张量进行扩展维度
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                # 如果使用其他框架,对每个值张量进行 unsqueeze 操作
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            # 如果提供了偏移映射,则将其添加到模型输入中
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            # 将句子添加到模型输入中(仅在第一个分块时添加)
            model_inputs["sentence"] = sentence if i == 0 else None
            # 指示当前分块是否为最后一个分块
            model_inputs["is_last"] = i == num_chunks - 1

            # 使用生成器的 yield 返回模型输入字典
            yield model_inputs

    # 私有方法:模型的前向传播过程
    def _forward(self, model_inputs):
        # 提取模型输入字典中的特殊 token 掩码
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        # 提取模型输入字典中的偏移映射(如果存在)
        offset_mapping = model_inputs.pop("offset_mapping", None)
        # 提取模型输入字典中的句子
        sentence = model_inputs.pop("sentence")
        # 提取模型输入字典中的 is_last 标志
        is_last = model_inputs.pop("is_last")

        # 根据框架类型选择不同的前向传播方式
        if self.framework == "tf":
            # 如果使用 TensorFlow 框架,调用模型的前向传播,返回 logits
            logits = self.model(**model_inputs)[0]
        else:
            # 如果使用其他框架,调用模型的前向传播,获取输出
            output = self.model(**model_inputs)
            # 如果输出为字典,则从中提取 logits;否则,假设输出为 logits
            logits = output["logits"] if isinstance(output, dict) else output[0]

        # 返回包含各种信息的字典,包括 logits、特殊 token 掩码、偏移映射、句子和 is_last 标志
        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "is_last": is_last,
            **model_inputs,  # 将其余模型输入字典内容一并返回
        }
    # 对模型输出进行后处理,根据指定策略聚合结果
    def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None):
        # 如果未提供忽略标签,则默认忽略 "O" 标签
        if ignore_labels is None:
            ignore_labels = ["O"]
        # 存储所有实体的列表
        all_entities = []
        # 遍历所有模型的输出
        for model_outputs in all_outputs:
            # 获取模型预测的 logits,并转换为 NumPy 数组
            logits = model_outputs["logits"][0].numpy()
            # 获取句子文本,假设是所有输出中的第一个句子
            sentence = all_outputs[0]["sentence"]
            # 获取模型输入的 token IDs
            input_ids = model_outputs["input_ids"][0]
            # 获取偏移映射,如果存在的话,转换为 NumPy 数组
            offset_mapping = (
                model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
            )
            # 获取特殊 token 掩码,并转换为 NumPy 数组
            special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()

            # 对 logits 进行 softmax 处理,得到每个标签的概率分数
            maxes = np.max(logits, axis=-1, keepdims=True)
            shifted_exp = np.exp(logits - maxes)
            scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

            # 如果使用 TensorFlow 框架,将 input_ids 和 offset_mapping 转换为 NumPy 数组
            if self.framework == "tf":
                input_ids = input_ids.numpy()
                offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None

            # 调用 gather_pre_entities 方法,获取预测的实体信息
            pre_entities = self.gather_pre_entities(
                sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
            )
            # 调用 aggregate 方法,根据指定策略聚合实体
            grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
            # 过滤掉在 ignore_labels 中的实体或实体组
            entities = [
                entity
                for entity in grouped_entities
                if entity.get("entity", None) not in ignore_labels
                and entity.get("entity_group", None) not in ignore_labels
            ]
            # 将过滤后的实体列表添加到 all_entities 中
            all_entities.extend(entities)
        # 如果输出包含多个部分(chunks),则对实体进行重叠处理
        num_chunks = len(all_outputs)
        if num_chunks > 1:
            all_entities = self.aggregate_overlapping_entities(all_entities)
        # 返回所有处理后的实体列表
        return all_entities

    # 对重叠的实体进行聚合处理
    def aggregate_overlapping_entities(self, entities):
        # 如果实体列表为空,直接返回空列表
        if len(entities) == 0:
            return entities
        # 按照实体的起始位置进行排序
        entities = sorted(entities, key=lambda x: x["start"])
        # 存储聚合后的实体列表
        aggregated_entities = []
        # 初始化前一个实体为列表中的第一个实体
        previous_entity = entities[0]
        # 遍历所有实体进行聚合处理
        for entity in entities:
            # 如果当前实体的起始位置在前一个实体的范围内
            if previous_entity["start"] <= entity["start"] < previous_entity["end"]:
                # 比较当前实体和前一个实体的长度,选择长度更长或得分更高的实体
                current_length = entity["end"] - entity["start"]
                previous_length = previous_entity["end"] - previous_entity["start"]
                if current_length > previous_length:
                    previous_entity = entity
                elif current_length == previous_length and entity["score"] > previous_entity["score"]:
                    previous_entity = entity
            else:
                # 将前一个实体添加到聚合列表中,并更新为当前实体
                aggregated_entities.append(previous_entity)
                previous_entity = entity
        # 添加最后一个实体到聚合列表中
        aggregated_entities.append(previous_entity)
        # 返回聚合后的实体列表
        return aggregated_entities
    def gather_pre_entities(
        self,
        sentence: str,
        input_ids: np.ndarray,
        scores: np.ndarray,
        offset_mapping: Optional[List[Tuple[int, int]]],
        special_tokens_mask: np.ndarray,
        aggregation_strategy: AggregationStrategy,
    ) -> List[dict]:
        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
        # 初始化空列表,用于存储预实体字典
        pre_entities = []
        
        # 遍历每个索引和对应的 token_scores
        for idx, token_scores in enumerate(scores):
            # 过滤掉特殊 token
            if special_tokens_mask[idx]:
                continue
            
            # 将输入 token ID 转换为词汇
            word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
            
            # 如果提供了偏移映射,则获取起始和结束索引
            if offset_mapping is not None:
                start_ind, end_ind = offset_mapping[idx]
                
                # 处理非整数类型的索引(通常出现在 PyTorch 框架中)
                if not isinstance(start_ind, int):
                    if self.framework == "pt":
                        start_ind = start_ind.item()
                        end_ind = end_ind.item()
                
                # 根据偏移映射从原始句子中获取参考词
                word_ref = sentence[start_ind:end_ind]
                
                # 检查是否是子词(针对 BPE 类型的 tokenizer)
                if getattr(self.tokenizer, "_tokenizer", None) and getattr(
                    self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
                ):
                    # 这是一个 BPE、词感知型 tokenizer,有正确的方式来融合 token
                    is_subword = len(word) != len(word_ref)
                else:
                    # 这是一个回退启发式方法,对于文本和标点混合的情况可能无法正确识别为 "word"。
                    # 非词感知型模型在这种情况下通常无法做得更好。
                    if aggregation_strategy in {
                        AggregationStrategy.FIRST,
                        AggregationStrategy.AVERAGE,
                        AggregationStrategy.MAX,
                    }:
                        warnings.warn(
                            "Tokenizer does not support real words, using fallback heuristic",
                            UserWarning,
                        )
                    is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]
                
                # 如果输入 token 是未知标记,使用参考词替换并设置 is_subword 为 False
                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                    word = word_ref
                    is_subword = False
            else:
                # 如果没有提供偏移映射,则将索引设置为 None,并且 is_subword 设置为 False
                start_ind = None
                end_ind = None
                is_subword = False
            
            # 创建预实体字典
            pre_entity = {
                "word": word,
                "scores": token_scores,
                "start": start_ind,
                "end": end_ind,
                "index": idx,
                "is_subword": is_subword,
            }
            
            # 将预实体字典添加到预实体列表中
            pre_entities.append(pre_entity)
        
        # 返回所有预实体的列表
        return pre_entities
    # 根据预先提供的实体列表和聚合策略,返回聚合后的实体列表
    def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        # 检查聚合策略是否为NONE或SIMPLE
        if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
            # 初始化一个空列表来存储聚合后的实体
            entities = []
            # 遍历预先提供的实体列表
            for pre_entity in pre_entities:
                # 获取具有最高分数的实体索引
                entity_idx = pre_entity["scores"].argmax()
                # 获取该实体的分数
                score = pre_entity["scores"][entity_idx]
                # 创建新的实体字典,包含实体名称、分数、索引、单词、起始位置和结束位置
                entity = {
                    "entity": self.model.config.id2label[entity_idx],  # 实体名称
                    "score": score,                                     # 实体分数
                    "index": pre_entity["index"],                       # 实体索引
                    "word": pre_entity["word"],                         # 实体单词
                    "start": pre_entity["start"],                       # 实体起始位置
                    "end": pre_entity["end"],                           # 实体结束位置
                }
                # 将新创建的实体添加到实体列表中
                entities.append(entity)
        else:
            # 使用指定的聚合策略对实体列表进行聚合
            entities = self.aggregate_words(pre_entities, aggregation_strategy)

        # 如果聚合策略为NONE,则直接返回实体列表
        if aggregation_strategy == AggregationStrategy.NONE:
            return entities
        # 否则,调用方法将实体列表按某种方式分组并返回
        return self.group_entities(entities)

    # 根据给定的实体列表和聚合策略,返回聚合后的单个实体字典
    def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
        # 将实体列表中的单词转换为字符串形式
        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
        
        # 根据聚合策略选择不同的聚合方式
        if aggregation_strategy == AggregationStrategy.FIRST:
            # 对于FIRST策略,选择第一个实体的分数最高的标签作为聚合后的实体
            scores = entities[0]["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.MAX:
            # 对于MAX策略,选择分数最高的实体作为聚合后的实体
            max_entity = max(entities, key=lambda entity: entity["scores"].max())
            scores = max_entity["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.AVERAGE:
            # 对于AVERAGE策略,计算所有实体分数的平均值,并选择平均分数最高的实体作为聚合后的实体
            scores = np.stack([entity["scores"] for entity in entities])
            average_scores = np.nanmean(scores, axis=0)
            entity_idx = average_scores.argmax()
            entity = self.model.config.id2label[entity_idx]
            score = average_scores[entity_idx]
        else:
            # 若聚合策略不是NONE、SIMPLE、FIRST、MAX、AVERAGE中的任何一种,则抛出异常
            raise ValueError("Invalid aggregation_strategy")
        
        # 创建新的聚合后的实体字典,包含实体名称、分数、单词、起始位置和结束位置
        new_entity = {
            "entity": entity,                   # 实体名称
            "score": score,                     # 实体分数
            "word": word,                       # 实体单词
            "start": entities[0]["start"],      # 第一个实体的起始位置
            "end": entities[-1]["end"],         # 最后一个实体的结束位置
        }
        # 返回聚合后的实体字典
        return new_entity
    # 覆盖不同意的单词,强制在单词边界上达成一致的聚合策略
    def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        """
        Override tokens from a given word that disagree to force agreement on word boundaries.

        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        """
        # 检查聚合策略是否为 NONE 或 SIMPLE,这两种策略无效
        if aggregation_strategy in {
            AggregationStrategy.NONE,
            AggregationStrategy.SIMPLE,
        }:
            raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")

        # 存储聚合后的单词实体列表
        word_entities = []
        # 初始化单词组列表
        word_group = None
        # 遍历实体列表
        for entity in entities:
            # 如果当前单词组为空,则初始化为当前实体
            if word_group is None:
                word_group = [entity]
            # 如果当前实体是子词,则添加到当前单词组
            elif entity["is_subword"]:
                word_group.append(entity)
            else:
                # 否则,对当前单词组进行聚合并添加到结果列表中
                word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
                word_group = [entity]
        # 处理最后一个单词组
        if word_group is not None:
            word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
        return word_entities

    # 将相邻的具有相同预测实体的标记组合在一起
    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """
        # 获取实体组中第一个实体的标记,去掉可能存在的 B- 或 I- 前缀
        entity = entities[0]["entity"].split("-", 1)[-1]
        # 计算实体组的分数均值
        scores = np.nanmean([entity["score"] for entity in entities])
        # 提取实体组中的单词列表
        tokens = [entity["word"] for entity in entities]

        # 构建实体组字典
        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    # 获取实体名称的标签类型(B- 或 I-)和实体标签
    def get_tag(self, entity_name: str) -> Tuple[str, str]:
        if entity_name.startswith("B-"):
            bi = "B"
            tag = entity_name[2:]
        elif entity_name.startswith("I-"):
            bi = "I"
            tag = entity_name[2:]
        else:
            # 如果不以 B- 或 I- 开头,默认为 I- 类型(用于连续实体)
            bi = "I"
            tag = entity_name
        return bi, tag
    # 定义一个方法,用于将具有相同实体预测的相邻标记组合在一起
    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """

        # 初始化一个空列表,用于存储最终的实体组
        entity_groups = []
        # 初始化一个空列表,用于存储分解后的实体组
        entity_group_disagg = []

        # 遍历预测的每个实体
        for entity in entities:
            # 如果分解后的实体组为空,直接将当前实体加入其中
            if not entity_group_disagg:
                entity_group_disagg.append(entity)
                continue

            # 获取当前实体和上一个实体的类型标签和前缀信息
            bi, tag = self.get_tag(entity["entity"])
            last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])

            # 如果当前实体的类型标签和前一个实体相同且是相邻的,并且不是"B"类型,将其加入分解后的实体组
            if tag == last_tag and bi != "B":
                entity_group_disagg.append(entity)
            else:
                # 如果当前实体和前一个实体不同,将分解后的实体组聚合为一个完整的实体组,并重新开始新的分解组
                entity_groups.append(self.group_sub_entities(entity_group_disagg))
                entity_group_disagg = [entity]

        # 处理最后剩余的分解后的实体组
        if entity_group_disagg:
            entity_groups.append(self.group_sub_entities(entity_group_disagg))

        # 返回最终的实体组列表
        return entity_groups
# 将 TokenClassificationPipeline 类的别名设置为 NerPipeline
NerPipeline = TokenClassificationPipeline
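
The class above only shows the internal flow, so here is a minimal usage sketch of the `token-classification` pipeline. The checkpoint name and the example sentence are assumptions used purely for illustration; `aggregation_strategy="simple"` corresponds to `AggregationStrategy.SIMPLE` and exercises the `aggregate` / `group_entities` path analyzed above.

```
from transformers import pipeline

# Assumed NER checkpoint and example sentence, used only for illustration.
ner = pipeline(
    "token-classification",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

for entity in ner("Hugging Face is based in New York City."):
    # With an aggregation strategy other than "none", each dict carries the keys built by
    # group_sub_entities(): entity_group, score, word, start and end.
    print(entity["entity_group"], entity["word"], round(float(entity["score"]), 3))
```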

.\pipelines\video_classification.py

# 导入所需模块和类
from io import BytesIO  # 导入 BytesIO 类
from typing import List, Union  # 导入 List 和 Union 类型

import requests  # 导入 requests 模块

from ..utils import add_end_docstrings, is_decord_available, is_torch_available, logging, requires_backends  # 导入自定义模块和函数
from .base import Pipeline, build_pipeline_init_args  # 从 base 模块中导入 Pipeline 和 build_pipeline_init_args 函数

# 如果 decord 可用,则导入相应模块
if is_decord_available():
    import numpy as np  # 导入 numpy 模块
    from decord import VideoReader  # 从 decord 模块中导入 VideoReader 类

# 如果 torch 可用,则从自动模型中导入 MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES 变量
if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES  # 从自动模型中导入 MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES 变量

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


# 添加文档字符串的装饰器
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class VideoClassificationPipeline(Pipeline):
    """
    Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a
    video.

    This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"video-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=video-classification).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)  # 调用父类的初始化方法
        requires_backends(self, "decord")  # 确保 decord 可用
        self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)  # 检查模型类型

    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None):
        # 对参数进行预处理
        preprocess_params = {}
        if frame_sampling_rate is not None:
            preprocess_params["frame_sampling_rate"] = frame_sampling_rate
        if num_frames is not None:
            preprocess_params["num_frames"] = num_frames

        # 对参数进行后处理
        postprocess_params = {}
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        return preprocess_params, {}, postprocess_params  # 返回处理后的参数
    def __call__(self, videos: Union[str, List[str]], **kwargs):
        """
        将标签分配给作为输入传递的视频。

        Args:
            videos (`str`, `List[str]`):
                管道处理三种类型的视频:

                - 包含指向视频的 HTTP 链接的字符串
                - 包含指向视频的本地路径的字符串

                管道接受单个视频或视频批处理,必须作为字符串传递。
                批处理中的所有视频必须具有相同的格式:全部是 HTTP 链接或全部是本地路径。
            top_k (`int`, *可选*, 默认为 5):
                管道将返回的顶部标签数。如果提供的数字高于模型配置中可用的标签数,将默认为标签数。
            num_frames (`int`, *可选*, 默认为 `self.model.config.num_frames`):
                从视频中抽样的帧数。如果未提供,则默认为模型配置中指定的帧数。
            frame_sampling_rate (`int`, *可选*, 默认为 1):
                用于从视频中选择帧的采样率。如果未提供,则默认为 1,即每帧都将被使用。

        Return:
            包含结果的字典或字典列表。如果输入为单个视频,则返回一个字典;如果输入为多个视频,则返回相应的字典列表。

            字典包含以下键:

            - **label** (`str`) -- 模型识别的标签。
            - **score** (`int`) -- 模型为该标签分配的分数。
        """
        return super().__call__(videos, **kwargs)

    def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
        """
        预处理视频以用于模型输入。

        Args:
            video (`str` or `BytesIO`):
                视频的路径或 BytesIO 对象。
            num_frames (`int`, *可选*):
                从视频中抽样的帧数。如果未提供,则默认为 self.model.config.num_frames。
            frame_sampling_rate (`int`, *可选*, 默认为 1):
                用于从视频中选择帧的采样率。

        Returns:
            模型输入的字典表示形式。

        Raises:
            ValueError: 如果视频格式不支持或无法识别。
        """
        if num_frames is None:
            num_frames = self.model.config.num_frames

        if video.startswith("http://") or video.startswith("https://"):
            # 如果视频是一个 HTTP/HTTPS 链接,则从网络获取视频内容
            video = BytesIO(requests.get(video).content)

        # 创建视频阅读器对象
        videoreader = VideoReader(video)
        videoreader.seek(0)  # 将视频的读取位置设置为起始位置

        start_idx = 0
        end_idx = num_frames * frame_sampling_rate - 1
        # 生成需要抽样的帧索引
        indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)

        # 从视频中获取指定索引的帧
        video = videoreader.get_batch(indices).asnumpy()
        video = list(video)

        # 使用图像处理器将视频帧处理为模型输入
        model_inputs = self.image_processor(video, return_tensors=self.framework)
        return model_inputs

    def _forward(self, model_inputs):
        """
        将模型输入传递给模型并获取模型输出。

        Args:
            model_inputs:
                模型的输入数据字典。

        Returns:
            模型的输出结果。
        """
        model_outputs = self.model(**model_inputs)
        return model_outputs
    # Post-process the model outputs and return the top_k predictions
    def postprocess(self, model_outputs, top_k=5):
        # If top_k exceeds the number of labels in the model config, cap it at that number
        if top_k > self.model.config.num_labels:
            top_k = self.model.config.num_labels

        # Post-process according to the selected framework
        if self.framework == "pt":
            # For PyTorch outputs, apply softmax over the logits to get a probability distribution
            probs = model_outputs.logits.softmax(-1)[0]
            # Take the top_k scores and their label ids
            scores, ids = probs.topk(top_k)
        else:
            # Raise an error for any framework other than PyTorch
            raise ValueError(f"Unsupported framework: {self.framework}")

        # Convert the scores and label ids to plain lists
        scores = scores.tolist()
        ids = ids.tolist()

        # Return a list of dicts pairing each score with its label
        return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]

.\pipelines\visual_question_answering.py

from typing import Union

from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
# 导入必要的模块和函数

from .base import Pipeline, build_pipeline_init_args
# 从当前目录下的base模块导入Pipeline类和build_pipeline_init_args函数

if is_vision_available():
    from PIL import Image
    # 如果PIL库可用,则从PIL模块导入Image类

    from ..image_utils import load_image
    # 导入load_image函数,从上一级目录中的image_utils模块

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
    # 如果torch可用,则从models.auto.modeling_auto模块导入MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器对象

@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
# 使用装饰器为VisualQuestionAnsweringPipeline类添加文档字符串,调用build_pipeline_init_args函数生成参数

class VisualQuestionAnsweringPipeline(Pipeline):
    """
    Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
    available in PyTorch.

    Example:

    ```
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa")
    >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
    >>> oracle(question="What is she wearing ?", image=image_url)
    [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}]

    >>> oracle(question="What is she wearing ?", image=image_url, top_k=1)
    [{'score': 0.948, 'answer': 'hat'}]

    >>> oracle(question="Is this a person ?", image=image_url, top_k=1)
    [{'score': 0.993, 'answer': 'yes'}]

    >>> oracle(question="Is this a man ?", image=image_url, top_k=1)
    [{'score': 0.996, 'answer': 'no'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifiers: `"visual-question-answering", "vqa"`.

    The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
    the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
    """
    # Visual Question Answering Pipeline类的文档字符串,描述了使用AutoModelForVisualQuestionAnswering的视觉问答流水线

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 调用父类Pipeline的初始化方法,传递所有位置参数和关键字参数

        self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES)
        # 调用当前对象的check_model_type方法,检查模型类型是否匹配MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES

    def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs):
        preprocess_params, postprocess_params = {}, {}
        # 初始化预处理和后处理参数字典

        if padding is not None:
            preprocess_params["padding"] = padding
        # 如果padding参数不为None,则将其加入预处理参数字典中

        if truncation is not None:
            preprocess_params["truncation"] = truncation
        # 如果truncation参数不为None,则将其加入预处理参数字典中

        if timeout is not None:
            preprocess_params["timeout"] = timeout
        # 如果timeout参数不为None,则将其加入预处理参数字典中

        if top_k is not None:
            postprocess_params["top_k"] = top_k
        # 如果top_k参数不为None,则将其加入后处理参数字典中

        return preprocess_params, {}, postprocess_params
        # 返回预处理参数字典、空字典和后处理参数字典作为元组的形式
    def __call__(self, image: Union["Image.Image", str], question: str = None, **kwargs):
        r"""
        Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
        below:

        - `pipeline(image=image, question=question)`
        - `pipeline({"image": image, "question": question})`
        - `pipeline([{"image": image, "question": question}])`
        - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`

        Args:
            image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. If given a single image, it can be
                broadcasted to multiple questions.
            question (`str`, `List[str]`):
                The question(s) asked. If given a single question, it can be broadcasted to multiple images.
            top_k (`int`, *optional*, defaults to 5):
                The number of top labels that will be returned by the pipeline. If the provided number is higher than
                the number of labels available in the model configuration, it will default to the number of labels.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.
        Return:
            A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:

            - **label** (`str`) -- The label identified by the model.
            - **score** (`int`) -- The score attributed by the model for that label.
        """
        if isinstance(image, (Image.Image, str)) and isinstance(question, str):
            # If `image` is a PIL.Image or a string and `question` is a string, assemble a single input dict
            inputs = {"image": image, "question": question}
        else:
            """
            Otherwise the following formats are supported:
            - {"image": image, "question": question}
            - [{"image": image, "question": question}]
            - Generators and datasets
            """
            # Use the given input as `inputs` directly
            inputs = image
        # Delegate to the parent class to process the inputs and return the results
        results = super().__call__(inputs, **kwargs)
        return results

    def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
        # Load the image, honoring the timeout setting
        image = load_image(inputs["image"], timeout=timeout)
        # Tokenize the question text and build the model inputs
        model_inputs = self.tokenizer(
            inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
        )
        # Extract image features with the image processor and merge them into the model inputs
        image_features = self.image_processor(images=image, return_tensors=self.framework)
        model_inputs.update(image_features)
        return model_inputs
    # 定义一个私有方法 `_forward`,用于模型推理过程中的前向传播
    def _forward(self, model_inputs, **generate_kwargs):
        # 如果模型支持生成任务
        if self.model.can_generate():
            # 调用模型的生成方法,生成模型输出
            model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
        else:
            # 否则,调用模型的正常推理方法
            model_outputs = self.model(**model_inputs)
        # 返回模型的输出结果
        return model_outputs

    # 定义后处理方法 `postprocess`,用于处理模型输出并返回结果
    def postprocess(self, model_outputs, top_k=5):
        # 如果模型支持生成任务
        if self.model.can_generate():
            # 对每个模型输出的标识符进行解码,生成答案字符串,并去除特殊标记
            return [
                {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()}
                for output_ids in model_outputs
            ]
        else:
            # 如果 `top_k` 大于模型配置的标签数量,则将其设置为标签数量
            if top_k > self.model.config.num_labels:
                top_k = self.model.config.num_labels

            # 根据不同框架进行后处理
            if self.framework == "pt":
                # 计算模型输出的概率,并进行逻辑斯蒂处理,取第一个元素
                probs = model_outputs.logits.sigmoid()[0]
                # 获取概率最高的 `top_k` 个分数及其对应的标识符
                scores, ids = probs.topk(top_k)
            else:
                # 如果框架不支持,则抛出错误
                raise ValueError(f"Unsupported framework: {self.framework}")

            # 将分数和标识符转换为列表形式,并构建结果列表
            scores = scores.tolist()
            ids = ids.tolist()
            return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]

.\pipelines\zero_shot_audio_classification.py

# 导入所需模块和库
from collections import UserDict  # 导入 UserDict 类,用于自定义字典类型
from typing import Union  # 导入 Union 用于支持多种类型的注解

import numpy as np  # 导入 numpy 库,用于数值计算
import requests  # 导入 requests 库,用于发送 HTTP 请求

# 从相对路径导入工具函数和模块
from ..utils import (
    add_end_docstrings,  # 导入函数 add_end_docstrings,用于添加文档字符串
    logging,  # 导入 logging 模块,用于记录日志
)
# 从本地模块中导入音频分类相关函数
from .audio_classification import ffmpeg_read  # 导入音频处理函数 ffmpeg_read
from .base import Pipeline, build_pipeline_init_args  # 从基础模块导入 Pipeline 类和构建初始化参数函数 build_pipeline_init_args

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True))
# 使用装饰器 add_end_docstrings,为类添加结尾文档字符串,并指定初始化参数的特性
class ZeroShotAudioClassificationPipeline(Pipeline):
    """
    Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
    provide an audio and a set of `candidate_labels`.

    Example:
    ```
    >>> from transformers import pipeline
    >>> from datasets import load_dataset

    >>> dataset = load_dataset("ashraq/esc50")
    >>> audio = next(iter(dataset["train"]["audio"]))["array"]
    >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
    [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
    classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-audio-classification"`. See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)  # 调用父类的初始化方法

        if self.framework != "pt":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")
        # 检查框架是否为 PyTorch,如果不是则抛出 ValueError 异常
        # 暂时没有特定的 FOR_XXX 可用
    # 继承父类的 __call__ 方法,用于给传入的音频分配标签

    def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
        """
        Assign labels to the audio(s) passed as inputs.

        Args:
            audios (`str`, `List[str]`, `np.array` or `List[np.array]`):
                The pipeline handles three types of inputs:
                - A string containing a http link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in numpy
            candidate_labels (`List[str]`):
                The candidate labels for this audio
            hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
                The sentence used in conjunction with *candidate_labels* to attempt the audio classification by
                replacing the placeholder with the candidate_labels. Then likelihood is estimated by using
                logits_per_audio
        Return:
            A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the
            following keys:
            - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`.
            - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1).
        """
        # 调用父类的 __call__ 方法,处理传入的音频数据和其他参数
        return super().__call__(audios, **kwargs)

    def _sanitize_parameters(self, **kwargs):
        # 初始化预处理参数字典
        preprocess_params = {}
        
        # 如果参数中包含 candidate_labels,将其添加到预处理参数中
        if "candidate_labels" in kwargs:
            preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
        
        # 如果参数中包含 hypothesis_template,将其添加到预处理参数中
        if "hypothesis_template" in kwargs:
            preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]

        # 返回预处理参数字典和空字典(用于其他参数)
        return preprocess_params, {}, {}
    # 对音频进行预处理,将音频转换为字节流或从 URL 下载音频内容
    def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."):

        # 如果音频参数是字符串类型且以 "http://" 或 "https://" 开头,则下载远程音频内容
        if isinstance(audio, str):
            if audio.startswith("http://") or audio.startswith("https://"):
                # 实际需要检查协议是否存在,否则无法使用像 http_huggingface_co.png 这样的本地文件
                audio = requests.get(audio).content
            else:
                # 否则假定为本地文件路径,以二进制形式读取音频内容
                with open(audio, "rb") as f:
                    audio = f.read()

        # 如果音频是字节流,则使用特征提取器的采样率将其解码为 numpy 数组
        if isinstance(audio, bytes):
            audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate)

        # 检查音频是否为 numpy 数组
        if not isinstance(audio, np.ndarray):
            raise ValueError("We expect a numpy ndarray as input")
        
        # 检查音频是否为单通道音频
        if len(audio.shape) != 1:
            raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline")

        # 使用特征提取器提取音频特征,转换为 PyTorch 张量输入
        inputs = self.feature_extractor(
            [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
        )
        
        # 将候选标签添加到输入中
        inputs["candidate_labels"] = candidate_labels
        
        # 根据模板生成假设序列
        sequences = [hypothesis_template.format(x) for x in candidate_labels]
        
        # 使用分词器处理文本输入,返回适当的张量类型
        text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True)
        
        # 将文本输入添加到输入字典中
        inputs["text_inputs"] = [text_inputs]
        
        # 返回处理后的输入字典
        return inputs

    # 私有方法:模型前向推断
    def _forward(self, model_inputs):
        # 弹出候选标签
        candidate_labels = model_inputs.pop("candidate_labels")
        
        # 弹出文本输入
        text_inputs = model_inputs.pop("text_inputs")
        
        # 如果文本输入是 UserDict 类型,则获取其第一个元素
        if isinstance(text_inputs[0], UserDict):
            text_inputs = text_inputs[0]
        else:
            # 否则为批处理情况,获取其第一个元素的第一个元素
            text_inputs = text_inputs[0][0]
        
        # 使用模型进行推断,传入文本输入和其他模型输入
        outputs = self.model(**text_inputs, **model_inputs)
        
        # 构建模型输出字典
        model_outputs = {
            "candidate_labels": candidate_labels,
            "logits": outputs.logits_per_audio,
        }
        
        # 返回模型输出
        return model_outputs

    # 后处理方法:处理模型输出,生成最终结果
    def postprocess(self, model_outputs):
        # 弹出候选标签
        candidate_labels = model_outputs.pop("candidate_labels")
        
        # 获取 logits
        logits = model_outputs["logits"][0]

        # 如果使用 PyTorch 框架,则对 logits 进行 softmax 处理,得到概率分数
        if self.framework == "pt":
            probs = logits.softmax(dim=0)
            scores = probs.tolist()
        else:
            # 不支持的框架类型
            raise ValueError("`tf` framework not supported.")

        # 根据分数排序候选标签,生成结果列表
        result = [
            {"score": score, "label": candidate_label}
            for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
        ]
        
        # 返回最终结果
        return result
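
To illustrate the raw-waveform path of `preprocess` above (a single-channel numpy array at the feature extractor's sampling rate), here is a sketch that feeds a synthetic sine wave into the pipeline; the sine wave is only a stand-in for real audio.

```
import numpy as np

from transformers import pipeline

classifier = pipeline("zero-shot-audio-classification", model="laion/clap-htsat-unfused")

# Build one second of a 440 Hz tone at the expected sampling rate (stand-in for real audio).
sampling_rate = classifier.feature_extractor.sampling_rate
waveform = np.sin(2 * np.pi * 440.0 * np.linspace(0.0, 1.0, sampling_rate)).astype(np.float32)

result = classifier(
    waveform,
    candidate_labels=["Sound of a dog", "Sound of a siren"],
    hypothesis_template="This is a sound of {}.",
)
print(result)  # [{"score": ..., "label": ...}, ...] sorted by descending score
```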

.\pipelines\zero_shot_classification.py

import inspect  # 导入 inspect 模块,用于获取对象的信息
from typing import List, Union  # 引入类型提示中的 List 和 Union 类型

import numpy as np  # 导入 NumPy 库,用于数值计算

from ..tokenization_utils import TruncationStrategy  # 导入相对路径下的 tokenization_utils 模块中的 TruncationStrategy 类
from ..utils import add_end_docstrings, logging  # 导入相对路径下的 utils 模块中的 add_end_docstrings 和 logging

from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args  # 从当前目录下的 base 模块中导入 ArgumentHandler、ChunkPipeline 和 build_pipeline_init_args 类

logger = logging.get_logger(__name__)  # 获取当前模块的 logger 对象


class ZeroShotClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for zero-shot for text classification by turning each possible label into an NLI
    premise/hypothesis pair.
    """

    def _parse_labels(self, labels):
        # 将 labels 转换为列表形式,如果 labels 是字符串则按逗号分隔并去除空白项
        if isinstance(labels, str):
            labels = [label.strip() for label in labels.split(",") if label.strip()]
        return labels

    def __call__(self, sequences, labels, hypothesis_template):
        # 处理调用,验证输入的 sequences 和 labels,确保至少有一个 label 和一个 sequence
        if len(labels) == 0 or len(sequences) == 0:
            raise ValueError("You must include at least one label and at least one sequence.")
        if hypothesis_template.format(labels[0]) == hypothesis_template:
            raise ValueError(
                (
                    'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
                    "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
                ).format(hypothesis_template)
            )

        if isinstance(sequences, str):
            sequences = [sequences]

        sequence_pairs = []
        for sequence in sequences:
            # 生成每个 sequence 和对应 label 格式化后的假设/前提对
            sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels])

        return sequence_pairs, sequences


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
class ZeroShotClassificationPipeline(ChunkPipeline):
    """
    NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural
    language inference) tasks. Equivalent of `text-classification` pipelines, but these models don't require a
    hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is
    **much** more flexible.

    Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
    pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate
    label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model
    config's :attr:*~transformers.PretrainedConfig.label2id*.

    Example:

    ```
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="facebook/bart-large-mnli")
    >>> oracle(
    ...     "I have a problem with my iphone that needs to be resolved asap!!",
    ...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
    ... )
    {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}

    >>> oracle(
    ...     "I have a problem with my iphone that needs to be resolved asap!!",
    ...     candidate_labels=["english", "german"],
    ... )
    {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['english', 'german'], 'scores': [0.814, 0.186]}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-classification"`.

    The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date
    list of available models on [huggingface.co/models](https://huggingface.co/models?search=nli).
    """

    # The docstring example above shows zero-shot classification with `oracle`: the input is a text sequence plus a
    # list of candidate labels, and the output contains the predicted labels with their confidence scores.

    def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
        self._args_parser = args_parser
        super().__init__(*args, **kwargs)
        if self.entailment_id == -1:
            logger.warning(
                "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to "
                "-1. Define a descriptive label2id mapping in the model config to ensure correct outputs."
            )


# 初始化方法,接受参数解析器并调用父类的初始化方法。
# 如果 entailment_id 为 -1,记录警告信息,指示无法从模型配置中的 label2id 映射中确定 'entailment' 标签的 id。
# 建议在模型配置中定义一个描述性的 label2id 映射以确保正确的输出。


    @property
    def entailment_id(self):
        for label, ind in self.model.config.label2id.items():
            if label.lower().startswith("entail"):
                return ind
        return -1


# 属性方法,用于获取模型配置中与 'entail' 开头的标签对应的 id。
# 如果找不到符合条件的标签,则返回 -1。


    # Private helper that parses and tokenizes the sequence pairs. It takes the sequence pairs plus padding,
    # special-token and truncation settings.
    def _parse_and_tokenize(
        self, sequence_pairs, padding=True, add_special_tokens=True, truncation=TruncationStrategy.ONLY_FIRST, **kwargs
    ):
        """
        Parse arguments and tokenize only_first so that hypothesis (label) is not truncated
        """
        # 将返回的张量设置为框架的默认值
        return_tensors = self.framework
        # 如果当前分词器不支持填充操作
        if self.tokenizer.pad_token is None:
            # 为不支持填充的分词器设置 `pad_token` 为 `eos_token`
            logger.error(
                "Tokenizer was not supporting padding necessary for zero-shot, attempting to use "
                " `pad_token=eos_token`"
            )
            self.tokenizer.pad_token = self.tokenizer.eos_token
        try:
            # 使用分词器对序列对进行分词处理,包括特殊标记、返回张量、填充和截断设置
            inputs = self.tokenizer(
                sequence_pairs,
                add_special_tokens=add_special_tokens,
                return_tensors=return_tensors,
                padding=padding,
                truncation=truncation,
            )
        except Exception as e:
            # 如果出现异常且异常信息包含 "too short"
            if "too short" in str(e):
                # 分词器可能会报告我们想要截断到一个甚至不会被输入达到的值。
                # 在这种情况下,我们不希望进行截断。
                # 看起来没有更好的方法来捕获这个异常。
                
                # 以 `DO_NOT_TRUNCATE` 策略再次尝试使用分词器对序列对进行处理
                inputs = self.tokenizer(
                    sequence_pairs,
                    add_special_tokens=add_special_tokens,
                    return_tensors=return_tensors,
                    padding=padding,
                    truncation=TruncationStrategy.DO_NOT_TRUNCATE,
                )
            else:
                # 如果异常不符合上述条件,则抛出该异常
                raise e

        # 返回分词后的输入
        return inputs

    def _sanitize_parameters(self, **kwargs):
        # 如果 `multi_class` 参数不为 None,则将其重命名为 `multi_label`
        if kwargs.get("multi_class", None) is not None:
            kwargs["multi_label"] = kwargs["multi_class"]
            logger.warning(
                "The `multi_class` argument has been deprecated and renamed to `multi_label`. "
                "`multi_class` will be removed in a future version of Transformers."
            )
        preprocess_params = {}
        # 如果参数中包含 "candidate_labels",则解析标签并存储到预处理参数中
        if "candidate_labels" in kwargs:
            preprocess_params["candidate_labels"] = self._args_parser._parse_labels(kwargs["candidate_labels"])
        # 如果参数中包含 "hypothesis_template",则存储到预处理参数中
        if "hypothesis_template" in kwargs:
            preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]

        postprocess_params = {}
        # 如果参数中包含 "multi_label",则存储到后处理参数中
        if "multi_label" in kwargs:
            postprocess_params["multi_label"] = kwargs["multi_label"]
        # 返回预处理参数、空字典和后处理参数
        return preprocess_params, {}, postprocess_params

    def __call__(
        self,
        sequences: Union[str, List[str]],
        *args,
        **kwargs,
    ):
        """
        Classify the sequence(s) given as inputs using a zero-shot classification pipeline.

        Args:
            sequences (`str` or `List[str]`):
                The sequence(s) to classify, will be truncated if the model input is too large.
            candidate_labels (`str` or `List[str]`):
                The set of possible class labels to classify each sequence into. Can be a single label, a string of
                comma-separated labels, or a list of labels.
            hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`):
                The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
                similar syntax for the candidate label to be inserted into the template. For example, the default
                template is `"This example is {}."` With the candidate label `"sports"`, this would be fed into the
                model like `"<cls> sequence to classify <sep> This example is sports . <sep>"`. The default template
                works well in many cases, but it may be worthwhile to experiment with different templates depending on
                the task setting.
            multi_label (`bool`, *optional*, defaults to `False`):
                Whether or not multiple candidate labels can be true. If `False`, the scores are normalized such that
                the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered
                independent and probabilities are normalized for each candidate by doing a softmax of the entailment
                score vs. the contradiction score.

        Return:
            A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:

            - **sequence** (`str`) -- The sequence for which this is the output.
            - **labels** (`List[str]`) -- The labels sorted by order of likelihood.
            - **scores** (`List[float]`) -- The probabilities for each of the labels.
        """
        if len(args) == 0:
            # If no positional arguments (`args`) are provided, do nothing
            pass
        elif len(args) == 1 and "candidate_labels" not in kwargs:
            # If exactly one positional argument is provided and `candidate_labels` is not already in keyword arguments (`kwargs`), set it to the provided argument
            kwargs["candidate_labels"] = args[0]
        else:
            # Raise a ValueError if extra arguments (`args`) are present or if `candidate_labels` is already specified in `kwargs`
            raise ValueError(f"Unable to understand extra arguments {args}")

        # Call the superclass's `__call__` method with `sequences` and other keyword arguments (`kwargs`)
        return super().__call__(sequences, **kwargs)

    def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
        # Parse input arguments and prepare for processing
        sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template)

        # Iterate over candidate labels and corresponding sequence pairs
        for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)):
            # Parse and tokenize the sequence pair for model input
            model_input = self._parse_and_tokenize([sequence_pair])

            # Yield a dictionary containing processed information
            yield {
                "candidate_label": candidate_label,
                "sequence": sequences[0],  # Assuming sequences contains only one element
                "is_last": i == len(candidate_labels) - 1,  # Flag indicating if it's the last iteration
                **model_input,  # Include parsed and tokenized model input
            }
    # 定义一个方法 `_forward`,用于执行模型的前向推理过程
    def _forward(self, inputs):
        # 从输入中获取候选标签
        candidate_label = inputs["candidate_label"]
        # 从输入中获取序列数据
        sequence = inputs["sequence"]
        # 创建一个字典,包含模型所需的输入数据,使用 tokenizer 支持的模型输入名称作为键
        model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
        # 根据框架选择适当的模型前向推理函数
        model_forward = self.model.forward if self.framework == "pt" else self.model.call
        # 如果模型前向推理函数支持 `use_cache` 参数,则设为 False
        if "use_cache" in inspect.signature(model_forward).parameters.keys():
            model_inputs["use_cache"] = False
        # 执行模型推理,并获取输出
        outputs = self.model(**model_inputs)

        # 构建模型输出字典,包括候选标签、序列和是否最后一个输入的标志,以及模型的其他输出
        model_outputs = {
            "candidate_label": candidate_label,
            "sequence": sequence,
            "is_last": inputs["is_last"],
            **outputs,
        }
        # 返回模型的输出
        return model_outputs

    # 定义一个后处理方法 `postprocess`,用于处理模型的输出结果
    def postprocess(self, model_outputs, multi_label=False):
        # 从模型输出中提取候选标签列表和序列列表
        candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
        sequences = [outputs["sequence"] for outputs in model_outputs]
        # 提取模型输出中的 logits,并拼接成一个 numpy 数组
        logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
        # 获取 logits 的维度信息
        N = logits.shape[0]
        n = len(candidate_labels)
        # 计算序列的数量
        num_sequences = N // n
        # 将 logits 重塑成三维数组
        reshaped_outputs = logits.reshape((num_sequences, n, -1))

        if multi_label or len(candidate_labels) == 1:
            # 对每个标签独立进行 entailment vs. contradiction 的 softmax 处理
            entailment_id = self.entailment_id
            contradiction_id = -1 if entailment_id == 0 else 0
            entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]]
            scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
            # 提取 entailment 的概率分数
            scores = scores[..., 1]
        else:
            # 对所有候选标签的 "entailment" logits 进行 softmax 处理
            entail_logits = reshaped_outputs[..., self.entailment_id]
            scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)

        # 获取 top K 概率最高的标签索引
        top_inds = list(reversed(scores[0].argsort()))
        # 返回后处理的结果,包括序列、按概率排序的标签和对应的分数列表
        return {
            "sequence": sequences[0],
            "labels": [candidate_labels[i] for i in top_inds],
            "scores": scores[0, top_inds].tolist(),
        }
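
The `postprocess` method above normalizes scores differently depending on `multi_label`. The sketch below contrasts the two modes; the checkpoint and the sequence reuse the docstring example, and only the `multi_label=True` call is an added assumption.

```
from transformers import pipeline

oracle = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
sequence = "I have a problem with my iphone that needs to be resolved asap!!"
labels = ["urgent", "not urgent", "phone", "tablet", "computer"]

# Default mode: a softmax over the entailment logits of all labels, so the scores sum to 1.
print(oracle(sequence, candidate_labels=labels))

# multi_label=True: each label is scored independently via an entailment-vs-contradiction softmax.
print(oracle(sequence, candidate_labels=labels, multi_label=True))
```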

.\pipelines\zero_shot_image_classification.py

# Import the required modules and functions
from collections import UserDict  # UserDict is used for custom dict-like containers
from typing import List, Union  # List and Union for type hints

# Import helper functions and classes from the utils module one level up
from ..utils import (
    add_end_docstrings,  # adds trailing documentation to the class docstring
    is_tf_available,  # checks whether TensorFlow can be used
    is_torch_available,  # checks whether PyTorch can be used
    is_vision_available,  # checks whether vision support can be used
    logging,  # logging utilities
    requires_backends,  # checks backend dependencies
)

# Import the Pipeline class and build_pipeline_init_args from the local base module
from .base import Pipeline, build_pipeline_init_args

# If vision support is available
if is_vision_available():
    # Import the Image module from PIL for image handling
    from PIL import Image

    # Import load_image from image_utils for loading image data
    from ..image_utils import load_image

# If PyTorch is available
if is_torch_available():
    # Import torch for deep-learning computations
    import torch

    # Import the PyTorch model mapping names from models.auto
    from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES

# If TensorFlow is available
if is_tf_available():
    # Import the TensorFlow model mapping names from models.auto
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES

    # Import the numerically stable softmax from tf_utils for probability computation
    from ..tf_utils import stable_softmax

# Logger for the current module
logger = logging.get_logger(__name__)

# 使用装饰器add_end_docstrings为ZeroShotImageClassificationPipeline类添加文档字符串
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ZeroShotImageClassificationPipeline(Pipeline):
    """
    Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you
    provide an image and a set of `candidate_labels`.

    Example:

    ```
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="google/siglip-so400m-patch14-384")
    >>> classifier(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["animals", "humans", "landscape"],
    ... )
    [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}]

    >>> classifier(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["black and white", "photorealist", "painting"],
    ... )
    [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-image-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification).
    """

    # 初始化函数,继承自Pipeline类
    def __init__(self, **kwargs):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 检查当前实例是否满足视觉后端的依赖
        requires_backends(self, "vision")
        
        # 根据当前框架选择适当的模型映射名称字典,用于后续任务
        self.check_model_type(
            TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
        )
    def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwargs):
        """
        将标签分配给作为输入传递的图像。

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                处理三种类型的图像:

                - 包含指向图像的 http 链接的字符串
                - 包含指向本地图像路径的字符串
                - 直接加载到 PIL 中的图像

            candidate_labels (`List[str]`):
                此图像的候选标签列表

            hypothesis_template (`str`, *可选*, 默认为 `"This is a photo of {}"`):
                与 *candidate_labels* 结合使用的句子,通过将占位符替换为 candidate_labels 尝试图像分类。
                然后使用 logits_per_image 估算可能性。

            timeout (`float`, *可选*, 默认为 None):
                从网络获取图像的最长等待时间(以秒为单位)。如果为 None,则不设置超时,调用可能会永远阻塞。

        Return:
            包含结果的字典列表,每个提议的标签一个字典。字典包含以下键:

            - **label** (`str`) -- 模型识别的标签之一。它是建议的 `candidate_label` 之一。
            - **score** (`float`) -- 模型为该标签分配的分数(介于0和1之间)。
        """
        return super().__call__(images, **kwargs)

    def _sanitize_parameters(self, **kwargs):
        preprocess_params = {}
        if "candidate_labels" in kwargs:
            preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]
        if "hypothesis_template" in kwargs:
            preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]

        return preprocess_params, {}, {}

    def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None):
        """
        预处理图像及其相关参数。

        Args:
            image: 图像数据
            candidate_labels (`List[str]`, optional): 图像的候选标签
            hypothesis_template (`str`, optional, defaults to `"This is a photo of {}."`):
                用于替换占位符生成假设句子的模板
            timeout (`float`, optional): 从网络获取图像的最长等待时间(以秒为单位)

        Returns:
            inputs: 包含预处理后数据的字典
        """
        image = load_image(image, timeout=timeout)  # 加载图像数据
        inputs = self.image_processor(images=[image], return_tensors=self.framework)  # 处理图像数据
        inputs["candidate_labels"] = candidate_labels  # 设置候选标签
        sequences = [hypothesis_template.format(x) for x in candidate_labels]  # 根据模板生成假设句子序列
        padding = "max_length" if self.model.config.model_type == "siglip" else True  # 根据模型类型设置填充方式
        text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding)  # 对假设句子序列进行tokenize
        inputs["text_inputs"] = [text_inputs]  # 设置文本输入
        return inputs
    # 定义一个方法用于模型推断,接收模型输入
    def _forward(self, model_inputs):
        # 弹出输入中的候选标签
        candidate_labels = model_inputs.pop("candidate_labels")
        # 弹出输入中的文本数据
        text_inputs = model_inputs.pop("text_inputs")
        
        # 如果文本输入的第一个元素是 UserDict 类型的对象
        if isinstance(text_inputs[0], UserDict):
            # 将文本输入重新赋值为第一个元素(UserDict对象)
            text_inputs = text_inputs[0]
        else:
            # 如果不是 UserDict 对象,则为批处理情况,取第一个元素的第一个元素
            # (这里假设 text_inputs 是一个二重嵌套列表,第一个元素是批处理的列表)
            text_inputs = text_inputs[0][0]

        # 使用模型进行推断,传入文本输入和模型输入
        outputs = self.model(**text_inputs, **model_inputs)

        # 构建模型输出字典,包括候选标签和模型的 logits
        model_outputs = {
            "candidate_labels": candidate_labels,
            "logits": outputs.logits_per_image,
        }
        return model_outputs

    # 定义一个方法用于后处理模型输出
    def postprocess(self, model_outputs):
        # 弹出模型输出中的候选标签
        candidate_labels = model_outputs.pop("candidate_labels")
        # Take the logits for the (single) image in the batch
        logits = model_outputs["logits"][0]

        # 根据不同的框架和模型类型进行处理概率
        if self.framework == "pt" and self.model.config.model_type == "siglip":
            # 对 logits 应用 sigmoid 函数,并在最后一个维度上进行压缩
            probs = torch.sigmoid(logits).squeeze(-1)
            # 将概率转换为列表
            scores = probs.tolist()
            # 如果 scores 不是列表,则转换为列表
            if not isinstance(scores, list):
                scores = [scores]
        elif self.framework == "pt":
            # 对 logits 应用 softmax 函数,并在最后一个维度上进行压缩
            probs = logits.softmax(dim=-1).squeeze(-1)
            # 将概率转换为列表
            scores = probs.tolist()
            # 如果 scores 不是列表,则转换为列表
            if not isinstance(scores, list):
                scores = [scores]
        elif self.framework == "tf":
            # 对 logits 应用稳定的 softmax 函数,并在最后一个维度上进行处理
            probs = stable_softmax(logits, axis=-1)
            # 将概率转换为 numpy 数组,再转换为列表
            scores = probs.numpy().tolist()
        else:
            # 如果框架不支持,则引发异常
            raise ValueError(f"Unsupported framework: {self.framework}")

        # 将概率分数与候选标签组成字典列表,并按分数降序排列
        result = [
            {"score": score, "label": candidate_label}
            for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
        ]
        return result
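
The branching in `postprocess` reflects how the two model families score labels: SigLIP scores each label independently with a sigmoid, while CLIP-style models make the labels compete through a softmax. A minimal sketch of that difference on made-up logits (the tensor values below are illustrative, not taken from any model):

```
import torch

# Hypothetical logits_per_image row for three candidate labels (values are made up)
logits = torch.tensor([2.1, 0.3, -1.5])
candidate_labels = ["animals", "humans", "landscape"]

# CLIP-style scoring: softmax over the labels, scores sum to 1
clip_scores = logits.softmax(dim=-1).tolist()

# SigLIP scoring: independent sigmoid per label, scores do not need to sum to 1
siglip_scores = torch.sigmoid(logits).tolist()

# Same ranking logic as the pipeline's postprocess
print([
    {"score": score, "label": label}
    for score, label in sorted(zip(clip_scores, candidate_labels), key=lambda x: -x[0])
])
```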

.\pipelines\zero_shot_object_detection.py

from typing import Any, Dict, List, Union  # 导入需要的类型提示模块

from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends  # 导入自定义工具函数和模块
from .base import ChunkPipeline, build_pipeline_init_args  # 导入基础类和初始化函数构建器


if is_vision_available():  # 如果视觉处理模块可用
    from PIL import Image  # 导入PIL图像处理库中的Image模块
    from ..image_utils import load_image  # 从自定义图像处理工具中导入加载图像的函数

if is_torch_available():  # 如果PyTorch可用
    import torch  # 导入PyTorch模块
    from transformers.modeling_outputs import BaseModelOutput  # 导入模型输出基类
    from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES  # 导入零样本对象检测模型映射名称

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))  # 添加文档字符串的装饰器,指定初始化参数为具有图像处理器
class ZeroShotObjectDetectionPipeline(ChunkPipeline):  # 定义零样本对象检测流水线,继承自ChunkPipeline基类
    """
    Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
    objects when you provide an image and a set of `candidate_labels`.

    Example:

    ```
    >>> from transformers import pipeline

    >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
    >>> detector(
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ...     candidate_labels=["cat", "couch"],
    ... )
    [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]

    >>> detector(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["head", "bird"],
    ... )
    [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-object-detection"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
    """

    def __init__(self, **kwargs):  # 定义初始化方法,接受任意关键字参数
        super().__init__(**kwargs)  # 调用父类的初始化方法,传递所有接收到的关键字参数

        if self.framework == "tf":  # 如果当前框架是TensorFlow
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")  # 抛出错误,表明该类只在PyTorch中可用

        requires_backends(self, "vision")  # 确保必要的后端模块可用,这里要求视觉处理模块可用
        self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES)  # 检查当前模型类型是否符合零样本对象检测模型的映射名称

    def __call__(  # 定义对象实例可调用的方法
        self,
        image: Union[str, "Image.Image", List[Dict[str, Any]]],  # 图像参数可以是字符串、PIL图像对象或包含字典的列表
        candidate_labels: Union[str, List[str]] = None,  # 候选标签可以是字符串或字符串列表,默认为None
        **kwargs,  # 允许接收额外的关键字参数
    ):
        # Assign labels and bounding boxes to the image(s) passed as inputs
        # ("text_queries" is accepted as a legacy alias of `candidate_labels`)
        if "text_queries" in kwargs:
            candidate_labels = kwargs.pop("text_queries")

        if isinstance(image, (str, Image.Image)):
            # A single image: pack it together with its labels into the
            # {"image": ..., "candidate_labels": ...} dict expected by `preprocess`
            inputs = {"image": image, "candidate_labels": candidate_labels}
        else:
            # Already a (list of) dict(s) in the {"image": ..., "candidate_labels": ...} format
            inputs = image
        return super().__call__(inputs, **kwargs)

    # Sanitize / dispatch the pipeline parameters
    def _sanitize_parameters(self, **kwargs):
        # Preprocessing parameters
        preprocess_params = {}
        # Forward the timeout (if any) to the preprocessing step
        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]

        # Postprocessing parameters
        postprocess_params = {}
        # Forward the score threshold (if any) to the postprocessing step
        if "threshold" in kwargs:
            postprocess_params["threshold"] = kwargs["threshold"]
        # Forward top_k (if any) to the postprocessing step
        if "top_k" in kwargs:
            postprocess_params["top_k"] = kwargs["top_k"]

        # Return the preprocessing params, an empty dict for _forward, and the postprocessing params
        return preprocess_params, {}, postprocess_params

    # Preprocessing: one item is yielded per candidate label (this is a ChunkPipeline)
    def preprocess(self, inputs, timeout=None):
        # Load the image, with an optional timeout
        image = load_image(inputs["image"], timeout=timeout)
        # Retrieve the candidate labels
        candidate_labels = inputs["candidate_labels"]
        # If the candidate labels were given as a single string, split on commas
        if isinstance(candidate_labels, str):
            candidate_labels = candidate_labels.split(",")

        # Target size tensor (height, width), used later to rescale the predicted boxes
        target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)

        # Yield one model input per candidate label
        for i, candidate_label in enumerate(candidate_labels):
            # Tokenize the candidate label
            text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
            # Process the image
            image_features = self.image_processor(image, return_tensors=self.framework)

            # Yield a dict with the "is_last" flag, the target size, the candidate label and the model features
            yield {
                "is_last": i == len(candidate_labels) - 1,
                "target_size": target_size,
                "candidate_label": candidate_label,
                **text_inputs,
                **image_features,
            }

    # Forward pass for a single (image, candidate label) chunk
    def _forward(self, model_inputs):
        # Pop the bookkeeping entries that must not be passed to the model
        target_size = model_inputs.pop("target_size")
        candidate_label = model_inputs.pop("candidate_label")
        is_last = model_inputs.pop("is_last")

        # Run the model on the remaining inputs
        outputs = self.model(**model_inputs)

        # Re-attach the bookkeeping entries to the model outputs
        model_outputs = {"target_size": target_size, "candidate_label": candidate_label, "is_last": is_last, **outputs}
        return model_outputs

    # Postprocessing: merge the per-label outputs into a single, sorted list of detections
    def postprocess(self, model_outputs, threshold=0.1, top_k=None):
        results = []

        # Iterate over the output of each (image, candidate label) forward pass
        for model_output in model_outputs:
            # Retrieve the candidate label for this chunk
            label = model_output["candidate_label"]
            # Wrap the raw dict into a BaseModelOutput for the image processor
            model_output = BaseModelOutput(model_output)

            # Rescale and filter the detections with the image processor
            outputs = self.image_processor.post_process_object_detection(
                outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
            )[0]

            # Build one {"score", "label", "box"} dict per detection above the threshold
            for index in outputs["scores"].nonzero():
                score = outputs["scores"][index].item()
                box = self._get_bounding_box(outputs["boxes"][index][0])

                result = {"score": score, "label": label, "box": box}
                results.append(result)

        # Sort the detections by descending score
        results = sorted(results, key=lambda x: x["score"], reverse=True)
        # If top_k was specified, keep only the top_k detections
        if top_k:
            results = results[:top_k]

        return results
    # 定义一个方法 `_get_bounding_box`,用于将列表 [xmin, xmax, ymin, ymax] 转换为包含这些坐标的字典
    def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
        """
        Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
        """
        # 检查当前所用的深度学习框架是否为 PyTorch,若不是则抛出 ValueError 异常
        if self.framework != "pt":
            raise ValueError("The ZeroShotObjectDetectionPipeline is only available in PyTorch.")
        # 将输入的 box 张量转换为整数列表,并将其转换为 Python 中的标准列表形式
        xmin, ymin, xmax, ymax = box.int().tolist()
        # 创建包含坐标的字典 bbox,键为坐标名,值为对应的坐标值
        bbox = {
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
        }
        # 返回坐标字典 bbox
        return bbox
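
Putting the pieces together: `preprocess` yields one chunk per candidate label, `_forward` runs the detector once per chunk, and `postprocess` merges, thresholds and sorts the detections. A usage sketch (the checkpoint is the one from the class docstring; it is downloaded on first use, and the exact scores and boxes depend on it):

```
from transformers import pipeline

detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
results = detector(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["cat", "couch"],
    threshold=0.2,  # routed to postprocess by _sanitize_parameters
    top_k=3,        # keep at most the 3 highest-scoring detections
)
for detection in results:
    print(detection["label"], round(detection["score"], 3), detection["box"])
```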

.\pipelines\__init__.py

# 导入所需的模块和函数

import json  # 导入处理 JSON 数据的模块
import os  # 导入操作系统相关的功能模块
import warnings  # 导入警告处理模块
from pathlib import Path  # 导入处理路径的模块 Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union  # 导入类型提示相关的功能

from huggingface_hub import model_info  # 从 huggingface_hub 模块导入 model_info

# 从不同模块中导入所需的类和函数
from ..configuration_utils import PretrainedConfig  # 导入预训练配置类
from ..dynamic_module_utils import get_class_from_dynamic_module  # 导入从动态模块获取类的函数
from ..feature_extraction_utils import PreTrainedFeatureExtractor  # 导入预训练特征提取器类
from ..image_processing_utils import BaseImageProcessor  # 导入基础图像处理器类
from ..models.auto.configuration_auto import AutoConfig  # 导入自动配置类
from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor  # 导入自动特征提取映射和自动特征提取器类
from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor  # 导入自动图像处理映射和自动图像处理器类
from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage  # 导入自动深度估计模型和自动图像转换模型
from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer  # 导入自动分词映射和自动分词器类
from ..tokenization_utils import PreTrainedTokenizer  # 导入预训练分词器类
from ..utils import (
    CONFIG_NAME,  # 导入配置文件名常量
    HUGGINGFACE_CO_RESOLVE_ENDPOINT,  # 导入 Hugging Face 协作解决端点常量
    cached_file,  # 导入缓存文件函数
    extract_commit_hash,  # 导入提取提交哈希函数
    find_adapter_config_file,  # 导入查找适配器配置文件函数
    is_kenlm_available,  # 导入检查 kenlm 是否可用函数
    is_offline_mode,  # 导入检查是否离线模式函数
    is_peft_available,  # 导入检查 peft 是否可用函数
    is_pyctcdecode_available,  # 导入检查 pyctcdecode 是否可用函数
    is_tf_available,  # 导入检查是否有 TensorFlow 函数
    is_torch_available,  # 导入检查是否有 PyTorch 函数
    logging,  # 导入日志记录模块
)

# 从不同子模块导入具体的任务流水线类
from .audio_classification import AudioClassificationPipeline  # 导入音频分类任务流水线类
from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline  # 导入自动语音识别任务流水线类
from .base import (  # 从基础模块导入多个类和函数
    ArgumentHandler,  # 导入参数处理器类
    CsvPipelineDataFormat,  # 导入 CSV 数据格式流水线类
    JsonPipelineDataFormat,  # 导入 JSON 数据格式流水线类
    PipedPipelineDataFormat,  # 导入管道数据格式流水线类
    Pipeline,  # 导入任务流水线基类
    PipelineDataFormat,  # 导入任务流水线数据格式基类
    PipelineException,  # 导入任务流水线异常类
    PipelineRegistry,  # 导入任务流水线注册表类
    get_default_model_and_revision,  # 导入获取默认模型和版本函数
    infer_framework_load_model,  # 导入推断框架加载模型函数
)

# 从不同子模块导入特定任务流水线类
from .conversational import Conversation, ConversationalPipeline  # 导入对话任务流水线类
from .depth_estimation import DepthEstimationPipeline  # 导入深度估计任务流水线类
from .document_question_answering import DocumentQuestionAnsweringPipeline  # 导入文档问答任务流水线类
from .feature_extraction import FeatureExtractionPipeline  # 导入特征提取任务流水线类
from .fill_mask import FillMaskPipeline  # 导入填充掩码任务流水线类
from .image_classification import ImageClassificationPipeline  # 导入图像分类任务流水线类
from .image_feature_extraction import ImageFeatureExtractionPipeline  # 导入图像特征提取任务流水线类
from .image_segmentation import ImageSegmentationPipeline  # 导入图像分割任务流水线类
from .image_to_image import ImageToImagePipeline  # 导入图像到图像任务流水线类
from .image_to_text import ImageToTextPipeline  # 导入图像到文本任务流水线类
from .mask_generation import MaskGenerationPipeline  # 导入生成掩码任务流水线类
from .object_detection import ObjectDetectionPipeline  # 导入对象检测任务流水线类
from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline  # 导入问答任务流水线相关类和函数
# 导入表格问答模块中的参数处理器和管道
from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
# 导入文本到文本生成模块中的摘要生成管道、文本到文本生成管道和翻译管道
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
# 导入文本分类模块中的文本分类管道
from .text_classification import TextClassificationPipeline
# 导入文本生成模块中的文本生成管道
from .text_generation import TextGenerationPipeline
# 导入文本到音频模块中的文本到音频管道
from .text_to_audio import TextToAudioPipeline
# 导入标记分类模块中的聚合策略、命名实体识别管道、标记分类参数处理器和标记分类管道
from .token_classification import (
    AggregationStrategy,
    NerPipeline,
    TokenClassificationArgumentHandler,
    TokenClassificationPipeline,
)
# 导入视频分类模块中的视频分类管道
from .video_classification import VideoClassificationPipeline
# 导入视觉问答模块中的视觉问答管道
from .visual_question_answering import VisualQuestionAnsweringPipeline
# 导入零样本音频分类模块中的零样本音频分类管道
from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline
# 导入零样本分类模块中的零样本分类参数处理器和零样本分类管道
from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
# 导入零样本图像分类模块中的零样本图像分类管道
from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
# 导入零样本目标检测模块中的零样本目标检测管道
from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline

# 如果 TensorFlow 可用,则导入相关模块和类
if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForCausalLM,
        TFAutoModelForImageClassification,
        TFAutoModelForMaskedLM,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForSeq2SeqLM,
        TFAutoModelForSequenceClassification,
        TFAutoModelForTableQuestionAnswering,
        TFAutoModelForTokenClassification,
        TFAutoModelForVision2Seq,
        TFAutoModelForZeroShotImageClassification,
    )

# 如果 PyTorch 可用,则导入相关模块和类
if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import (
        AutoModel,
        AutoModelForAudioClassification,
        AutoModelForCausalLM,
        AutoModelForCTC,
        AutoModelForDocumentQuestionAnswering,
        AutoModelForImageClassification,
        AutoModelForImageSegmentation,
        AutoModelForMaskedLM,
        AutoModelForMaskGeneration,
        AutoModelForObjectDetection,
        AutoModelForQuestionAnswering,
        AutoModelForSemanticSegmentation,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        AutoModelForSpeechSeq2Seq,
        AutoModelForTableQuestionAnswering,
        AutoModelForTextToSpectrogram,
        AutoModelForTextToWaveform,
        AutoModelForTokenClassification,
        AutoModelForVideoClassification,
        AutoModelForVision2Seq,
        AutoModelForVisualQuestionAnswering,
        AutoModelForZeroShotImageClassification,
        AutoModelForZeroShotObjectDetection,
    )

# 如果支持类型检查,则导入必要的模块
if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel
    from ..tokenization_utils_fast import PreTrainedTokenizerFast

# 获取日志记录器并命名空间化
logger = logging.get_logger(__name__)

# 注册所有支持的任务别名
TASK_ALIASES = {
    "sentiment-analysis": "text-classification",  # 情感分析任务的别名为文本分类
    "ner": "token-classification",  # 命名实体识别任务的别名为标记分类
    "vqa": "visual-question-answering",  # 视觉问答任务的别名为视觉问答
    "text-to-speech": "text-to-audio",  # 文本转语音任务的别名为文本到音频
}
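
Aliases are resolved to their canonical task name before the lookup in `SUPPORTED_TASKS`; the actual resolution happens inside `PipelineRegistry.check_task`, but the mapping itself is a plain dictionary lookup:

```
task = "ner"
task = TASK_ALIASES.get(task, task)  # -> "token-classification"
assert task in SUPPORTED_TASKS
```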
# 支持的任务及其配置信息字典,每个任务对应一个字典条目
SUPPORTED_TASKS = {
    # 音频分类任务
    "audio-classification": {
        # 实现类为 AudioClassificationPipeline
        "impl": AudioClassificationPipeline,
        # TensorFlow 空元组,无特定的 TensorFlow 模型
        "tf": (),
        # 如果 Torch 可用,包含 AutoModelForAudioClassification 类
        "pt": (AutoModelForAudioClassification,) if is_torch_available() else (),
        # 默认模型为 wav2vec2-base-superb-ks,版本为 "372e048"
        "default": {"model": {"pt": ("superb/wav2vec2-base-superb-ks", "372e048")}},
        # 类型为音频
        "type": "audio",
    },
    # 自动语音识别任务
    "automatic-speech-recognition": {
        # 实现类为 AutomaticSpeechRecognitionPipeline
        "impl": AutomaticSpeechRecognitionPipeline,
        # TensorFlow 空元组,无特定的 TensorFlow 模型
        "tf": (),
        # 如果 Torch 可用,包含 AutoModelForCTC 和 AutoModelForSpeechSeq2Seq 类
        "pt": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_torch_available() else (),
        # 默认模型为 wav2vec2-base-960h,版本为 "55bb623"
        "default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "55bb623")}},
        # 类型为多模态
        "type": "multimodal",
    },
    # 文本转音频任务
    "text-to-audio": {
        # 实现类为 TextToAudioPipeline
        "impl": TextToAudioPipeline,
        # TensorFlow 空元组,无特定的 TensorFlow 模型
        "tf": (),
        # 如果 Torch 可用,包含 AutoModelForTextToWaveform 和 AutoModelForTextToSpectrogram 类
        "pt": (AutoModelForTextToWaveform, AutoModelForTextToSpectrogram) if is_torch_available() else (),
        # 默认模型为 bark-small,版本为 "645cfba"
        "default": {"model": {"pt": ("suno/bark-small", "645cfba")}},
        # 类型为文本
        "type": "text",
    },
    # 特征提取任务
    "feature-extraction": {
        # 实现类为 FeatureExtractionPipeline
        "impl": FeatureExtractionPipeline,
        # 如果 TensorFlow 可用,包含 TFAutoModel 类
        "tf": (TFAutoModel,) if is_tf_available() else (),
        # 如果 Torch 可用,包含 AutoModel 类
        "pt": (AutoModel,) if is_torch_available() else (),
        # 默认模型为 distilbert-base-cased,版本为 "935ac13",同时支持 TensorFlow 和 Torch
        "default": {
            "model": {
                "pt": ("distilbert/distilbert-base-cased", "935ac13"),
                "tf": ("distilbert/distilbert-base-cased", "935ac13"),
            }
        },
        # 类型为多模态
        "type": "multimodal",
    },
    # 文本分类任务
    "text-classification": {
        # 实现类为 TextClassificationPipeline
        "impl": TextClassificationPipeline,
        # 如果 TensorFlow 可用,包含 TFAutoModelForSequenceClassification 类
        "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
        # 如果 Torch 可用,包含 AutoModelForSequenceClassification 类
        "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
        # 默认模型为 distilbert-base-uncased-finetuned-sst-2-english,版本为 "af0f99b",同时支持 TensorFlow 和 Torch
        "default": {
            "model": {
                "pt": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "af0f99b"),
                "tf": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "af0f99b"),
            },
        },
        # 类型为文本
        "type": "text",
    },
    # 标记分类任务
    "token-classification": {
        # 实现类为 TokenClassificationPipeline
        "impl": TokenClassificationPipeline,
        # 如果 TensorFlow 可用,包含 TFAutoModelForTokenClassification 类
        "tf": (TFAutoModelForTokenClassification,) if is_tf_available() else (),
        # 如果 Torch 可用,包含 AutoModelForTokenClassification 类
        "pt": (AutoModelForTokenClassification,) if is_torch_available() else (),
        # 默认模型为 bert-large-cased-finetuned-conll03-english,版本为 "f2482bf",同时支持 TensorFlow 和 Torch
        "default": {
            "model": {
                "pt": ("dbmdz/bert-large-cased-finetuned-conll03-english", "f2482bf"),
                "tf": ("dbmdz/bert-large-cased-finetuned-conll03-english", "f2482bf"),
            },
        },
        # 类型为文本
        "type": "text",
    },
    # 问答任务
    "question-answering": {
        # 实现类为 QuestionAnsweringPipeline
        "impl": QuestionAnsweringPipeline,
        # 如果 TensorFlow 可用,包含 TFAutoModelForQuestionAnswering 类
        "tf": (TFAutoModelForQuestionAnswering,) if is_tf_available() else (),
        # 如果 Torch 可用,包含 AutoModelForQuestionAnswering 类
        "pt": (AutoModelForQuestionAnswering,) if is_torch_available() else (),
        # 默认模型为 distilbert-base-cased-distilled-squad,版本为 "626af31",同时支持 TensorFlow 和 Torch
        "default": {
            "model": {
                "pt": ("distilbert/distilbert-base-cased-distilled-squad", "626af31"),
                "tf": ("distilbert/distilbert-base-cased-distilled-squad", "626af31"),
            },
        },
        # 类型为文本
        "type": "text",
    },
    # 定义 table-question-answering 任务配置项
    "table-question-answering": {
        # 使用 TableQuestionAnsweringPipeline 处理该任务
        "impl": TableQuestionAnsweringPipeline,
        # 如果有 Torch 可用,则提供 Torch 模型
        "pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (),
        # 如果有 TensorFlow 可用,则提供 TensorFlow 模型
        "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (),
        # 默认模型设定
        "default": {
            "model": {
                # Torch 模型及其版本
                "pt": ("google/tapas-base-finetuned-wtq", "69ceee2"),
                # TensorFlow 模型及其版本
                "tf": ("google/tapas-base-finetuned-wtq", "69ceee2"),
            },
        },
        # 任务类型为文本处理
        "type": "text",
    },
    
    # 定义 visual-question-answering 任务配置项
    "visual-question-answering": {
        # 使用 VisualQuestionAnsweringPipeline 处理该任务
        "impl": VisualQuestionAnsweringPipeline,
        # 如果有 Torch 可用,则提供 Torch 模型
        "pt": (AutoModelForVisualQuestionAnswering,) if is_torch_available() else (),
        # TensorFlow 模型部分为空,表示无 TensorFlow 模型
        "tf": (),
        # 默认模型设定
        "default": {
            "model": {
                # Torch 模型及其版本
                "pt": ("dandelin/vilt-b32-finetuned-vqa", "4355f59"),
            },
        },
        # 任务类型为多模态处理
        "type": "multimodal",
    },
    
    # 定义 document-question-answering 任务配置项
    "document-question-answering": {
        # 使用 DocumentQuestionAnsweringPipeline 处理该任务
        "impl": DocumentQuestionAnsweringPipeline,
        # 如果有 Torch 可用,则提供 Torch 模型
        "pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (),
        # TensorFlow 模型部分为空,表示无 TensorFlow 模型
        "tf": (),
        # 默认模型设定
        "default": {
            "model": {
                # Torch 模型及其版本
                "pt": ("impira/layoutlm-document-qa", "52e01b3"),
            },
        },
        # 任务类型为多模态处理
        "type": "multimodal",
    },
    
    # 定义 fill-mask 任务配置项
    "fill-mask": {
        # 使用 FillMaskPipeline 处理该任务
        "impl": FillMaskPipeline,
        # 如果有 TensorFlow 可用,则提供 TensorFlow 模型
        "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
        # 如果有 Torch 可用,则提供 Torch 模型
        "pt": (AutoModelForMaskedLM,) if is_torch_available() else (),
        # 默认模型设定
        "default": {
            "model": {
                # Torch 模型及其版本
                "pt": ("distilbert/distilroberta-base", "ec58a5b"),
                # TensorFlow 模型及其版本
                "tf": ("distilbert/distilroberta-base", "ec58a5b"),
            }
        },
        # 任务类型为文本处理
        "type": "text",
    },
    
    # 定义 summarization 任务配置项
    "summarization": {
        # 使用 SummarizationPipeline 处理该任务
        "impl": SummarizationPipeline,
        # 如果有 TensorFlow 可用,则提供 TensorFlow 模型
        "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
        # 如果有 Torch 可用,则提供 Torch 模型
        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
        # 默认模型设定
        "default": {
            "model": {
                # Torch 模型及其版本
                "pt": ("sshleifer/distilbart-cnn-12-6", "a4f8f3e"),
                # TensorFlow 模型及其版本
                "tf": ("google-t5/t5-small", "d769bba")
            }
        },
        # 任务类型为文本处理
        "type": "text",
    },
    
    # translation 任务是特殊情况,参数化为 SRC 和 TGT 语言
    "translation": {
        # 使用 TranslationPipeline 处理该任务
        "impl": TranslationPipeline,
        # 如果有 TensorFlow 可用,则提供 TensorFlow 模型
        "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
        # 如果有 Torch 可用,则提供 Torch 模型
        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
        # 默认模型设定
        "default": {
            # 设定不同的 SRC 和 TGT 语言对应的模型
            ("en", "fr"): {"model": {"pt": ("google-t5/t5-base", "686f1db"), "tf": ("google-t5/t5-base", "686f1db")}},
            ("en", "de"): {"model": {"pt": ("google-t5/t5-base", "686f1db"), "tf": ("google-t5/t5-base", "686f1db")}},
            ("en", "ro"): {"model": {"pt": ("google-t5/t5-base", "686f1db"), "tf": ("google-t5/t5-base", "686f1db")}},
        },
        # 任务类型为文本处理
        "type": "text",
    },
    "text2text-generation": {  # 文本到文本生成任务配置
        "impl": Text2TextGenerationPipeline,  # 使用 Text2TextGenerationPipeline 类实现
        "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),  # 如果 TensorFlow 可用,使用 TFAutoModelForSeq2SeqLM 模型
        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),  # 如果 PyTorch 可用,使用 AutoModelForSeq2SeqLM 模型
        "default": {"model": {"pt": ("google-t5/t5-base", "686f1db"), "tf": ("google-t5/t5-base", "686f1db")}},  # 默认模型配置
        "type": "text",  # 任务类型为文本生成
    },
    "text-generation": {  # 文本生成任务配置
        "impl": TextGenerationPipeline,  # 使用 TextGenerationPipeline 类实现
        "tf": (TFAutoModelForCausalLM,) if is_tf_available() else (),  # 如果 TensorFlow 可用,使用 TFAutoModelForCausalLM 模型
        "pt": (AutoModelForCausalLM,) if is_torch_available() else (),  # 如果 PyTorch 可用,使用 AutoModelForCausalLM 模型
        "default": {"model": {"pt": ("openai-community/gpt2", "6c0e608"), "tf": ("openai-community/gpt2", "6c0e608")}},  # 默认模型配置
        "type": "text",  # 任务类型为文本生成
    },
    "zero-shot-classification": {  # 零样本分类任务配置
        "impl": ZeroShotClassificationPipeline,  # 使用 ZeroShotClassificationPipeline 类实现
        "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),  # 如果 TensorFlow 可用,使用 TFAutoModelForSequenceClassification 模型
        "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),  # 如果 PyTorch 可用,使用 AutoModelForSequenceClassification 模型
        "default": {  # 默认配置
            "model": {  # 模型配置
                "pt": ("facebook/bart-large-mnli", "c626438"),  # PyTorch 使用 Facebook BART 大型 MNLI 模型
                "tf": ("FacebookAI/roberta-large-mnli", "130fb28"),  # TensorFlow 使用 Facebook RoBERTa 大型 MNLI 模型
            },
            "config": {  # 额外配置
                "pt": ("facebook/bart-large-mnli", "c626438"),  # PyTorch 使用相同的 BART 大型 MNLI 模型
                "tf": ("FacebookAI/roberta-large-mnli", "130fb28"),  # TensorFlow 使用相同的 RoBERTa 大型 MNLI 模型
            },
        },
        "type": "text",  # 任务类型为文本分类
    },
    "zero-shot-image-classification": {  # 零样本图像分类任务配置
        "impl": ZeroShotImageClassificationPipeline,  # 使用 ZeroShotImageClassificationPipeline 类实现
        "tf": (TFAutoModelForZeroShotImageClassification,) if is_tf_available() else (),  # 如果 TensorFlow 可用,使用 TFAutoModelForZeroShotImageClassification 模型
        "pt": (AutoModelForZeroShotImageClassification,) if is_torch_available() else (),  # 如果 PyTorch 可用,使用 AutoModelForZeroShotImageClassification 模型
        "default": {  # 默认配置
            "model": {  # 模型配置
                "pt": ("openai/clip-vit-base-patch32", "f4881ba"),  # PyTorch 使用 OpenAI CLIP-ViT Base 模型
                "tf": ("openai/clip-vit-base-patch32", "f4881ba"),  # TensorFlow 使用相同的 CLIP-ViT Base 模型
            }
        },
        "type": "multimodal",  # 任务类型为多模态
    },
    "zero-shot-audio-classification": {  # 零样本音频分类任务配置
        "impl": ZeroShotAudioClassificationPipeline,  # 使用 ZeroShotAudioClassificationPipeline 类实现
        "tf": (),  # TensorFlow 不适用于此任务,设为空元组
        "pt": (AutoModel,) if is_torch_available() else (),  # 如果 PyTorch 可用,使用 AutoModel 模型
        "default": {  # 默认配置
            "model": {  # 模型配置
                "pt": ("laion/clap-htsat-fused", "973b6e5"),  # PyTorch 使用 Laion CLAP-HTSAT-Fused 模型
            }
        },
        "type": "multimodal",  # 任务类型为多模态
    },
    "conversational": {  # 对话生成任务配置
        "impl": ConversationalPipeline,  # 使用 ConversationalPipeline 类实现
        "tf": (TFAutoModelForSeq2SeqLM, TFAutoModelForCausalLM) if is_tf_available() else (),  # 如果 TensorFlow 可用,使用 TFAutoModelForSeq2SeqLM 和 TFAutoModelForCausalLM 模型
        "pt": (AutoModelForSeq2SeqLM, AutoModelForCausalLM) if is_torch_available() else (),  # 如果 PyTorch 可用,使用 AutoModelForSeq2SeqLM 和 AutoModelForCausalLM 模型
        "default": {  # 默认配置
            "model": {"pt": ("microsoft/DialoGPT-medium", "8bada3b"), "tf": ("microsoft/DialoGPT-medium", "8bada3b")}  # 使用 Microsoft DialoGPT 中等模型
        },
        "type": "text",  # 任务类型为文本生成
    },
        # 图像分类任务的配置
        "image-classification": {
            # 实现图像分类任务的流水线
            "impl": ImageClassificationPipeline,
            # TensorFlow 可用时的模型配置,包含自动图像分类模型
            "tf": (TFAutoModelForImageClassification,) if is_tf_available() else (),
            # PyTorch 可用时的模型配置,包含自动图像分类模型
            "pt": (AutoModelForImageClassification,) if is_torch_available() else (),
            # 默认模型配置
            "default": {
                "model": {
                    # PyTorch 的默认模型为 VIT-base-patch16-224,版本为 5dca96d
                    "pt": ("google/vit-base-patch16-224", "5dca96d"),
                    # TensorFlow 的默认模型为 VIT-base-patch16-224,版本为 5dca96d
                    "tf": ("google/vit-base-patch16-224", "5dca96d"),
                }
            },
            # 任务类型为图像处理
            "type": "image",
        },
        # 图像特征提取任务的配置
        "image-feature-extraction": {
            # 实现图像特征提取任务的流水线
            "impl": ImageFeatureExtractionPipeline,
            # TensorFlow 可用时的模型配置,包含自动模型
            "tf": (TFAutoModel,) if is_tf_available() else (),
            # PyTorch 可用时的模型配置,包含自动模型
            "pt": (AutoModel,) if is_torch_available() else (),
            # 默认模型配置
            "default": {
                "model": {
                    # PyTorch 的默认模型为 VIT-base-patch16-224,版本为 29e7a1e183
                    "pt": ("google/vit-base-patch16-224", "29e7a1e183"),
                    # TensorFlow 的默认模型为 VIT-base-patch16-224,版本为 29e7a1e183
                    "tf": ("google/vit-base-patch16-224", "29e7a1e183"),
                }
            },
            # 任务类型为图像处理
            "type": "image",
        },
        # 图像分割任务的配置
        "image-segmentation": {
            # 实现图像分割任务的流水线
            "impl": ImageSegmentationPipeline,
            # TensorFlow 可用时的模型配置为空元组,表示不可用
            "tf": (),
            # PyTorch 可用时的模型配置,包含自动目标分割和语义分割模型
            "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
            # 默认模型配置,PyTorch 的默认模型为 DETR-resnet-50-panoptic,版本为 fc15262
            "default": {"model": {"pt": ("facebook/detr-resnet-50-panoptic", "fc15262")}},
            # 任务类型为多模态处理
            "type": "multimodal",
        },
        # 图像到文本任务的配置
        "image-to-text": {
            # 实现图像到文本任务的流水线
            "impl": ImageToTextPipeline,
            # TensorFlow 可用时的模型配置,包含自动视觉到序列模型
            "tf": (TFAutoModelForVision2Seq,) if is_tf_available() else (),
            # PyTorch 可用时的模型配置,包含自动视觉到序列模型
            "pt": (AutoModelForVision2Seq,) if is_torch_available() else (),
            # 默认模型配置,PyTorch 的默认模型为 VIT-GPT2-COCO-en,版本为 65636df
            "default": {
                "model": {
                    "pt": ("ydshieh/vit-gpt2-coco-en", "65636df"),
                    "tf": ("ydshieh/vit-gpt2-coco-en", "65636df"),
                }
            },
            # 任务类型为多模态处理
            "type": "multimodal",
        },
        # 目标检测任务的配置
        "object-detection": {
            # 实现目标检测任务的流水线
            "impl": ObjectDetectionPipeline,
            # TensorFlow 可用时的模型配置为空元组,表示不可用
            "tf": (),
            # PyTorch 可用时的模型配置,包含自动目标检测模型
            "pt": (AutoModelForObjectDetection,) if is_torch_available() else (),
            # 默认模型配置,PyTorch 的默认模型为 DETR-resnet-50,版本为 2729413
            "default": {"model": {"pt": ("facebook/detr-resnet-50", "2729413")}},
            # 任务类型为多模态处理
            "type": "multimodal",
        },
        # 零样本目标检测任务的配置
        "zero-shot-object-detection": {
            # 实现零样本目标检测任务的流水线
            "impl": ZeroShotObjectDetectionPipeline,
            # TensorFlow 可用时的模型配置为空元组,表示不可用
            "tf": (),
            # PyTorch 可用时的模型配置,包含自动零样本目标检测模型
            "pt": (AutoModelForZeroShotObjectDetection,) if is_torch_available() else (),
            # 默认模型配置,PyTorch 的默认模型为 OWL-ViT-base-patch32,版本为 17740e1
            "default": {"model": {"pt": ("google/owlvit-base-patch32", "17740e1")}},
            # 任务类型为多模态处理
            "type": "multimodal",
        },
        # 深度估计任务的配置
        "depth-estimation": {
            # 实现深度估计任务的流水线
            "impl": DepthEstimationPipeline,
            # TensorFlow 可用时的模型配置为空元组,表示不可用
            "tf": (),
            # PyTorch 可用时的模型配置,包含自动深度估计模型
            "pt": (AutoModelForDepthEstimation,) if is_torch_available() else (),
            # 默认模型配置,PyTorch 的默认模型为 DPT-large,版本为 e93beec
            "default": {"model": {"pt": ("Intel/dpt-large", "e93beec")}},
            # 任务类型为图像处理
            "type": "image",
        },
        # 视频分类任务的配置
        "video-classification": {
            # 实现视频分类任务的流水线
            "impl": VideoClassificationPipeline,
            # TensorFlow 可用时的模型配置为空元组,表示不可用
            "tf": (),
            # PyTorch 可用时的模型配置,包含自动视频分类模型
            "pt": (AutoModelForVideoClassification,) if is_torch_available() else (),
            # 默认模型配置,PyTorch 的默认模型为 VideoMae-base-finetuned-kinetics,版本为 4800870
            "default": {"model": {"pt": ("MCG-NJU/videomae-base-finetuned-kinetics", "4800870")}},
            # 任务类型为视频处理
            "type": "video",
        },
    # "mask-generation"任务配置
    "mask-generation": {
        # 使用MaskGenerationPipeline作为实现
        "impl": MaskGenerationPipeline,
        # TensorFlow环境下不需要额外模型
        "tf": (),
        # 如果有PyTorch环境,使用AutoModelForMaskGeneration作为模型
        "pt": (AutoModelForMaskGeneration,) if is_torch_available() else (),
        # 默认模型配置,使用Facebook的"facebook/sam-vit-huge"模型
        "default": {"model": {"pt": ("facebook/sam-vit-huge", "997b15")}},
        # 任务类型为多模态处理
        "type": "multimodal",
    },
    
    # "image-to-image"任务配置
    "image-to-image": {
        # 使用ImageToImagePipeline作为实现
        "impl": ImageToImagePipeline,
        # TensorFlow环境下不需要额外模型
        "tf": (),
        # 如果有PyTorch环境,使用AutoModelForImageToImage作为模型
        "pt": (AutoModelForImageToImage,) if is_torch_available() else (),
        # 默认模型配置,使用"caidas/swin2SR-classical-sr-x2-64"模型
        "default": {"model": {"pt": ("caidas/swin2SR-classical-sr-x2-64", "4aaedcb")}},
        # 任务类型为图像处理
        "type": "image",
    },
}
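
Every `SUPPORTED_TASKS` entry bundles the pipeline implementation, the per-framework auto-model classes and a pinned default checkpoint. A small sketch of how one entry is read (the same lookups are performed by `get_default_model_and_revision` when no model is supplied):

```
entry = SUPPORTED_TASKS["text-classification"]

pipeline_cls = entry["impl"]        # TextClassificationPipeline
pt_model_classes = entry["pt"]      # (AutoModelForSequenceClassification,) when torch is installed
default_model, revision = entry["default"]["model"]["pt"]
# -> ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "af0f99b")
```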

# 初始化空集合,用于存放没有特征提取器的任务
NO_FEATURE_EXTRACTOR_TASKS = set()
# 初始化空集合,用于存放没有图像处理器的任务
NO_IMAGE_PROCESSOR_TASKS = set()
# 初始化空集合,用于存放没有分词器的任务
NO_TOKENIZER_TASKS = set()

# 下面这些模型配置是特殊的,它们是通用的,适用于多种任务,意味着任何分词器/特征提取器都可能用于给定的模型,
# 因此我们无法使用静态定义的 TOKENIZER_MAPPING 和 FEATURE_EXTRACTOR_MAPPING 来查看模型是否定义了这些对象。
MULTI_MODEL_AUDIO_CONFIGS = {"SpeechEncoderDecoderConfig"}
MULTI_MODEL_VISION_CONFIGS = {"VisionEncoderDecoderConfig", "VisionTextDualEncoderConfig"}

# 遍历 SUPPORTED_TASKS 中的任务及其值
for task, values in SUPPORTED_TASKS.items():
    if values["type"] == "text":
        # 如果任务类型为文本,将其添加到没有特征提取器的任务集合中
        NO_FEATURE_EXTRACTOR_TASKS.add(task)
        # 如果任务类型为文本,将其添加到没有图像处理器的任务集合中
        NO_IMAGE_PROCESSOR_TASKS.add(task)
    elif values["type"] in {"image", "video"}:
        # 如果任务类型为图像或视频,将其添加到没有分词器的任务集合中
        NO_TOKENIZER_TASKS.add(task)
    elif values["type"] in {"audio"}:
        # 如果任务类型为音频,将其添加到没有分词器的任务集合中
        NO_TOKENIZER_TASKS.add(task)
        # 如果任务类型为音频,将其添加到没有图像处理器的任务集合中
        NO_IMAGE_PROCESSOR_TASKS.add(task)
    elif values["type"] != "multimodal":
        # 如果任务类型不是多模态,抛出异常,说明不支持的任务类型
        raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")

# 创建管道注册对象,使用支持的任务和任务别名作为参数
PIPELINE_REGISTRY = PipelineRegistry(supported_tasks=SUPPORTED_TASKS, task_aliases=TASK_ALIASES)


def get_supported_tasks() -> List[str]:
    """
    返回支持的任务列表。
    """
    return PIPELINE_REGISTRY.get_supported_tasks()
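
Usage sketch: the registry exposes the full task list (canonical names plus aliases), which is what powers the error message shown when an unknown task is requested:

```
from transformers.pipelines import get_supported_tasks

tasks = get_supported_tasks()
print(len(tasks), tasks[:3])  # sorted task identifiers, e.g. starting with "audio-classification"
```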


def get_task(model: str, token: Optional[str] = None, **deprecated_kwargs) -> str:
    """
    根据模型和令牌返回任务字符串,支持废弃的参数。
    """
    # 弹出废弃的参数 use_auth_token,并赋值给 use_auth_token
    use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
    
    # 如果 use_auth_token 不为 None,发出废弃警告信息
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        # 如果 token 不为 None,引发值错误,说明同时指定了 token 和 use_auth_token 参数
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        # 将 use_auth_token 赋值给 token
        token = use_auth_token

    # 如果处于离线模式,引发运行时错误,说明不能在离线模式下自动推断任务
    if is_offline_mode():
        raise RuntimeError("You cannot infer task automatically within `pipeline` when using offline mode")
    
    # 尝试获取模型信息,如果出现异常,引发运行时错误
    try:
        info = model_info(model, token=token)
    except Exception as e:
        raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}")
    
    # 如果信息中没有 pipeline_tag 属性,引发运行时错误,说明模型没有正确设置 pipeline_tag 来自动推断任务
    if not info.pipeline_tag:
        raise RuntimeError(
            f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically"
        )
    
    # 如果 info 的 library_name 属性不是 "transformers",引发运行时错误,说明该模型应该使用其他库而不是 transformers
    if getattr(info, "library_name", "transformers") != "transformers":
        raise RuntimeError(f"This model is meant to be used with {info.library_name} not with transformers")
    
    # 返回从 info 中推断的 pipeline_tag 作为任务
    task = info.pipeline_tag
    return task
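
A usage sketch for `get_task` (it needs network access, since the task is read from the `pipeline_tag` of the model card on the Hub; the value below is the tag currently set for that checkpoint, not something guaranteed by this code):

```
from transformers.pipelines import get_task

task = get_task("facebook/bart-large-mnli")
print(task)  # expected: "zero-shot-classification"
```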


def check_task(task: str) -> Tuple[str, Dict, Any]:
    """
    检查传入的任务字符串,验证其正确性,并返回默认的管道和模型类,以及默认模型(如果存在)。
    """
    Args:
        task (`str`):
            指定要返回的流水线的任务。目前接受的任务包括:

            - `"audio-classification"`
            - `"automatic-speech-recognition"`
            - `"conversational"`
            - `"depth-estimation"`
            - `"document-question-answering"`
            - `"feature-extraction"`
            - `"fill-mask"`
            - `"image-classification"`
            - `"image-feature-extraction"`
            - `"image-segmentation"`
            - `"image-to-text"`
            - `"image-to-image"`
            - `"object-detection"`
            - `"question-answering"`
            - `"summarization"`
            - `"table-question-answering"`
            - `"text2text-generation"`
            - `"text-classification"`(别名为 `"sentiment-analysis"` 可用)
            - `"text-generation"`
            - `"text-to-audio"`(别名为 `"text-to-speech"` 可用)
            - `"token-classification"`(别名为 `"ner"` 可用)
            - `"translation"`
            - `"translation_xx_to_yy"`
            - `"video-classification"`
            - `"visual-question-answering"`(别名为 `"vqa"` 可用)
            - `"zero-shot-classification"`
            - `"zero-shot-image-classification"`
            - `"zero-shot-object-detection"`

    Returns:
        返回一个元组,包含标准化后的任务名称 `normalized_task`(去除了别名和选项)、任务默认设置字典 `task_defaults`,以及一些额外的任务选项 `task_options`(对于像 "translation_XX_to_YY" 这样带参数的任务)。

    """
    return PIPELINE_REGISTRY.check_task(task)
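
A usage sketch for `check_task`, showing alias resolution and the returned defaults (`task_options` is only populated for parameterized tasks such as `"translation_XX_to_YY"`):

```
from transformers.pipelines import check_task

normalized_task, task_defaults, task_options = check_task("sentiment-analysis")
print(normalized_task)        # "text-classification"
print(task_defaults["impl"])  # <class '...TextClassificationPipeline'>
print(task_options)           # None
```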
def clean_custom_task(task_info):
    import transformers  # 导入transformers库

    # 检查任务信息中是否包含实现信息,如果没有则抛出运行时错误
    if "impl" not in task_info:
        raise RuntimeError("This model introduces a custom pipeline without specifying its implementation.")
    
    pt_class_names = task_info.get("pt", ())  # 获取pt_class_names,如果不存在则默认为空元组
    if isinstance(pt_class_names, str):
        pt_class_names = [pt_class_names]  # 如果pt_class_names是字符串,转换为列表
    # 将pt_class_names中每个类名对应的类对象存入task_info["pt"]中
    task_info["pt"] = tuple(getattr(transformers, c) for c in pt_class_names)
    
    tf_class_names = task_info.get("tf", ())  # 获取tf_class_names,如果不存在则默认为空元组
    if isinstance(tf_class_names, str):
        tf_class_names = [tf_class_names]  # 如果tf_class_names是字符串,转换为列表
    # 将tf_class_names中每个类名对应的类对象存入task_info["tf"]中
    task_info["tf"] = tuple(getattr(transformers, c) for c in tf_class_names)
    
    return task_info, None  # 返回更新后的task_info和None作为第二个返回值
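
A sketch of what `clean_custom_task` does with a custom-pipeline entry as it would appear under `custom_pipelines` in a repository's `config.json` (the entry below is hypothetical):

```
from transformers.pipelines import clean_custom_task

task_info = {
    "impl": "my_module.MyCustomPipeline",        # left untouched, resolved later via get_class_from_dynamic_module
    "pt": "AutoModelForSequenceClassification",  # a class *name* in the config ...
    "tf": (),
}
cleaned, _ = clean_custom_task(task_info)
# ... becomes a tuple of real classes from the `transformers` namespace:
# cleaned["pt"] == (transformers.AutoModelForSequenceClassification,)
```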


def pipeline(
    task: str = None,
    model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
    feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
    image_processor: Optional[Union[str, BaseImageProcessor]] = None,
    framework: Optional[str] = None,
    revision: Optional[str] = None,
    use_fast: bool = True,
    token: Optional[Union[str, bool]] = None,
    device: Optional[Union[int, str, "torch.device"]] = None,
    device_map=None,
    torch_dtype=None,
    trust_remote_code: Optional[bool] = None,
    model_kwargs: Dict[str, Any] = None,
    pipeline_class: Optional[Any] = None,
    **kwargs,
) -> Pipeline:
    """
    Utility factory method to build a [`Pipeline`].

    Pipelines are made of:

        - A [tokenizer](tokenizer) in charge of mapping raw textual input to token.
        - A [model](model) to make predictions from the inputs.
        - Some (optional) post processing for enhancing model's output.

    Returns:
        [`Pipeline`]: A suitable pipeline for the task.

    Examples:

    ```
    >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

    >>> # Sentiment analysis pipeline
    >>> analyzer = pipeline("sentiment-analysis")

    >>> # Question answering pipeline, specifying the checkpoint identifier
    >>> oracle = pipeline(
    ...     "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="google-bert/bert-base-cased"
    ... )

    >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
    ```"""
    if model_kwargs is None:
        model_kwargs = {}
    
    # 确保只将use_auth_token作为一个关键字参数传递(以前可以将其传递给model_kwargs,为了保持向后兼容性)
    use_auth_token = model_kwargs.pop("use_auth_token", None)
    # 如果 use_auth_token 参数不为 None,则发出警告,提醒该参数在 Transformers v5 版本中将被移除,建议使用 `token` 参数代替
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        # 如果 token 参数也不为 None,则抛出 ValueError,说明同时指定了 `token` 和 `use_auth_token` 参数,应只设置 `token` 参数
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        # 将 use_auth_token 的值赋给 token 参数
        token = use_auth_token

    # 从 kwargs 字典中弹出 code_revision 和 _commit_hash 参数的值
    code_revision = kwargs.pop("code_revision", None)
    commit_hash = kwargs.pop("_commit_hash", None)

    # 创建 hub_kwargs 字典,用于存储 revision、token、trust_remote_code 和 _commit_hash 参数的值
    hub_kwargs = {
        "revision": revision,
        "token": token,
        "trust_remote_code": trust_remote_code,
        "_commit_hash": commit_hash,
    }

    # 如果既未指定 task 参数也未指定 model 参数,则抛出 RuntimeError,说明无法实例化 Pipeline
    if task is None and model is None:
        raise RuntimeError(
            "Impossible to instantiate a pipeline without either a task or a model "
            "being specified. "
            "Please provide a task class or a model"
        )

    # 如果未指定 model 参数但指定了 tokenizer 参数,则抛出 RuntimeError,说明无法实例化 Pipeline
    if model is None and tokenizer is not None:
        raise RuntimeError(
            "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer"
            " may not be compatible with the default model. Please provide a PreTrainedModel class or a"
            " path/identifier to a pretrained model when providing tokenizer."
        )

    # 如果未指定 model 参数但指定了 feature_extractor 参数,则抛出 RuntimeError,说明无法实例化 Pipeline
    if model is None and feature_extractor is not None:
        raise RuntimeError(
            "Impossible to instantiate a pipeline with feature_extractor specified but not the model as the provided"
            " feature_extractor may not be compatible with the default model. Please provide a PreTrainedModel class"
            " or a path/identifier to a pretrained model when providing feature_extractor."
        )

    # 如果 model 参数的类型是 Path 对象,则将其转换为字符串类型
    if isinstance(model, Path):
        model = str(model)

    # 如果 commit_hash 参数为 None
    if commit_hash is None:
        # 预先训练的模型名或路径名为 None
        pretrained_model_name_or_path = None
        # 如果 config 参数是字符串类型,则将其赋值给 pretrained_model_name_or_path
        if isinstance(config, str):
            pretrained_model_name_or_path = config
        # 如果 config 参数为 None 且 model 参数为字符串类型,则将 model 参数赋值给 pretrained_model_name_or_path
        elif config is None and isinstance(model, str):
            pretrained_model_name_or_path = model

        # 如果 config 参数不是 PretrainedConfig 类型且 pretrained_model_name_or_path 不为 None
        if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None:
            # 首先调用配置文件 (可能不存在) 获取 commit hash
            resolved_config_file = cached_file(
                pretrained_model_name_or_path,
                CONFIG_NAME,
                _raise_exceptions_for_gated_repo=False,
                _raise_exceptions_for_missing_entries=False,
                _raise_exceptions_for_connection_errors=False,
                **hub_kwargs,
            )
            # 从配置文件中提取 commit hash,更新 hub_kwargs 中的 _commit_hash 参数
            hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash)
        else:
            # 否则,从 config 对象中获取 _commit_hash 属性的值,更新 hub_kwargs 中的 _commit_hash 参数
            hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None)

    # 配置是最原始的信息项。
    # 如有需要则实例化配置
    # 如果配置是字符串,则根据预训练模型配置自动生成配置对象
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(
            config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs
        )
        # 更新 hub_kwargs 中的 _commit_hash
        hub_kwargs["_commit_hash"] = config._commit_hash
    # 如果配置为 None 且模型路径是字符串
    elif config is None and isinstance(model, str):
        # 如果 PEFT 可用,检查模型路径中是否存在适配器文件
        if is_peft_available():
            # 在模型路径中查找适配器配置文件,不包括 `trust_remote_code` 参数
            _hub_kwargs = {k: v for k, v in hub_kwargs.items() if k != "trust_remote_code"}
            maybe_adapter_path = find_adapter_config_file(
                model,
                token=hub_kwargs["token"],
                revision=hub_kwargs["revision"],
                _commit_hash=hub_kwargs["_commit_hash"],
            )

            # 如果找到适配器路径,则加载适配器配置文件中的基础模型名称或路径
            if maybe_adapter_path is not None:
                with open(maybe_adapter_path, "r", encoding="utf-8") as f:
                    adapter_config = json.load(f)
                    model = adapter_config["base_model_name_or_path"]

        # 根据模型路径加载自动配置对象
        config = AutoConfig.from_pretrained(
            model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs
        )
        # 更新 hub_kwargs 中的 _commit_hash
        hub_kwargs["_commit_hash"] = config._commit_hash

    # 自定义任务字典初始化为空
    custom_tasks = {}
    # 如果配置对象不为空且存在自定义流水线,则获取自定义流水线任务
    if config is not None and len(getattr(config, "custom_pipelines", {})) > 0:
        custom_tasks = config.custom_pipelines
        # 如果任务为 None 且不禁止远程代码,则尝试自动推断任务
        if task is None and trust_remote_code is not False:
            # 如果只有一个自定义任务,则自动选择该任务
            if len(custom_tasks) == 1:
                task = list(custom_tasks.keys())[0]
            else:
                # 如果存在多个自定义任务,则抛出运行时错误,要求手动选择任务
                raise RuntimeError(
                    "We can't infer the task automatically for this model as there are multiple tasks available. Pick "
                    f"one in {', '.join(custom_tasks.keys())}"
                )

    # 如果任务仍为 None 且模型不为空,则尝试获取任务
    if task is None and model is not None:
        # 如果模型不是字符串,则抛出运行时错误
        if not isinstance(model, str):
            raise RuntimeError(
                "Inferring the task automatically requires to check the hub with a model_id defined as a `str`. "
                f"{model} is not a valid model_id."
            )
        # 根据模型 ID 和 token 获取任务
        task = get_task(model, token)

    # 获取任务后的处理流程
    if task in custom_tasks:
        # 标准化任务名称
        normalized_task = task
        # 清理自定义任务,获取目标任务和任务选项
        targeted_task, task_options = clean_custom_task(custom_tasks[task])
        # 如果未指定流水线类,则根据情况抛出 ValueError
        if pipeline_class is None:
            # 如果不信任远程代码,则要求设置 `trust_remote_code=True` 以移除错误
            if not trust_remote_code:
                raise ValueError(
                    "Loading this pipeline requires you to execute the code in the pipeline file in that"
                    " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
                    " set the option `trust_remote_code=True` to remove this error."
                )
            # 从动态模块中获取类引用
            class_ref = targeted_task["impl"]
            pipeline_class = get_class_from_dynamic_module(
                class_ref,
                model,
                code_revision=code_revision,
                **hub_kwargs,
            )
    else:
        # 检查任务并返回标准化的任务、目标任务和任务选项
        normalized_task, targeted_task, task_options = check_task(task)
        # 如果未指定流水线类,则使用目标任务的实现类作为默认流水线类
        if pipeline_class is None:
            pipeline_class = targeted_task["impl"]

    # 如果未提供模型,则使用任务的默认模型、配置和分词器
    if model is None:
        # 获取任务的默认模型及其修订版本
        model, default_revision = get_default_model_and_revision(targeted_task, framework, task_options)
        # 如果未指定修订版本,则使用默认修订版本
        revision = revision if revision is not None else default_revision
        # 记录警告信息,指出未提供模型,使用默认模型和修订版本
        logger.warning(
            f"No model was supplied, defaulted to {model} and revision"
            f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n"
            "Using a pipeline without specifying a model name and revision in production is not recommended."
        )
        # 如果未提供配置且模型名称为字符串,则从预训练模型中创建配置对象
        if config is None and isinstance(model, str):
            config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs)
            # 将配置的提交哈希记录到 hub_kwargs 中
            hub_kwargs["_commit_hash"] = config._commit_hash

    # 如果设备映射不为空,则处理相关参数
    if device_map is not None:
        # 如果模型参数中已包含 device_map,抛出错误
        if "device_map" in model_kwargs:
            raise ValueError(
                'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those'
                " arguments might conflict, use only one.)"
            )
        # 如果同时指定了 device 和 device_map,则发出警告
        if device is not None:
            logger.warning(
                "Both `device` and `device_map` are specified. `device` will override `device_map`. You"
                " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`."
            )
        # 将 device_map 添加到模型参数中
        model_kwargs["device_map"] = device_map

    # 如果 torch 数据类型不为空,则处理相关参数
    if torch_dtype is not None:
        # 如果模型参数中已包含 torch_dtype,抛出错误
        if "torch_dtype" in model_kwargs:
            raise ValueError(
                'You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those'
                " arguments might conflict, use only one.)"
            )
        # 如果 torch_dtype 是字符串且存在于 torch 模块中,则转换成相应的 torch 数据类型
        if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype):
            torch_dtype = getattr(torch, torch_dtype)
        # 将 torch_dtype 添加到模型参数中
        model_kwargs["torch_dtype"] = torch_dtype

    # Keep the original model identifier around (None when a model object was passed),
    # since `model` is replaced below by the loaded model instance
    model_name = model if isinstance(model, str) else None

    # If the model is given as a string (or no framework was set yet), infer the framework and load the model
    if isinstance(model, str) or framework is None:
        # 定义模型类别(TensorFlow 或 PyTorch)并根据模型加载相应的框架和模型
        model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]}
        framework, model = infer_framework_load_model(
            model,
            model_classes=model_classes,
            config=config,
            framework=framework,
            task=task,
            **hub_kwargs,
            **model_kwargs,
        )

    # 获取模型的配置信息
    model_config = model.config
    # 将配置的提交哈希记录到 hub_kwargs 中
    hub_kwargs["_commit_hash"] = model.config._commit_hash
    # 判断是否需要加载分词器
    load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
    # 判断是否需要加载特征提取器
    load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
    # 检查是否需要加载图像处理器,条件为模型配置在图像处理器映射中或者图像处理器不为空
    load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None

    # 如果传入的`model`(`PretrainedModel`的实例而不是字符串),并且`image_processor`或`feature_extractor`为空,
    # 则加载将失败。这在某些视觉任务中特别发生,当使用`pipeline()`函数时传入`model`和其中一个`image_processor`或`feature_extractor`时。
    # TODO: 我们需要使`NO_IMAGE_PROCESSOR_TASKS`和`NO_FEATURE_EXTRACTOR_TASKS`更加健壮,以避免这种问题。
    # 这段代码仅用于临时使CI通过。
    if load_image_processor and load_feature_extractor:
        load_feature_extractor = False

    # 如果`tokenizer`为空,并且不需要加载`tokenizer`,并且`normalized_task`不在`NO_TOKENIZER_TASKS`中,
    # 并且`model_config`的类名在`MULTI_MODEL_AUDIO_CONFIGS`或`MULTI_MODEL_VISION_CONFIGS`中,
    # 则尝试强制加载`tokenizer`。
    if (
        tokenizer is None
        and not load_tokenizer
        and normalized_task not in NO_TOKENIZER_TASKS
        # 使用类名来避免导入真实类。
        and (
            model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS
            or model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS
        )
    ):
        load_tokenizer = True

    # If `image_processor` is None, no image processor is scheduled to be loaded, `normalized_task` is not in
    # `NO_IMAGE_PROCESSOR_TASKS`, and the class name of `model_config` is in `MULTI_MODEL_VISION_CONFIGS`,
    # try to force-load the image processor.
    if (
        image_processor is None
        and not load_image_processor
        and normalized_task not in NO_IMAGE_PROCESSOR_TASKS
        # Use class names to avoid importing the real classes.
        and model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS
    ):
        load_image_processor = True

    # If `feature_extractor` is None, no feature extractor is scheduled to be loaded, `normalized_task` is not in
    # `NO_FEATURE_EXTRACTOR_TASKS`, and the class name of `model_config` is in `MULTI_MODEL_AUDIO_CONFIGS`,
    # try to force-load the feature extractor.
    if (
        feature_extractor is None
        and not load_feature_extractor
        and normalized_task not in NO_FEATURE_EXTRACTOR_TASKS
        # Use class names to avoid importing the real classes.
        and model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS
    ):
        load_feature_extractor = True

    # If the task is in `NO_TOKENIZER_TASKS`, no tokenizer is needed.
    if task in NO_TOKENIZER_TASKS:
        load_tokenizer = False

    # If the task is in `NO_FEATURE_EXTRACTOR_TASKS`, no feature extractor is needed.
    if task in NO_FEATURE_EXTRACTOR_TASKS:
        load_feature_extractor = False

    # If the task is in `NO_IMAGE_PROCESSOR_TASKS`, no image processor is needed.
    if task in NO_IMAGE_PROCESSOR_TASKS:
        load_image_processor = False
    # If a tokenizer needs to be loaded
    if load_tokenizer:
        # Try to infer the tokenizer from the model or config name (if given as a string)
        if tokenizer is None:
            # If model_name is a string, try using it as the tokenizer identifier
            if isinstance(model_name, str):
                tokenizer = model_name
            # If config is a string, try using it as the tokenizer identifier
            elif isinstance(config, str):
                tokenizer = config
            else:
                # There is no way to guess which tokenizer to use here
                raise Exception(
                    "Impossible to guess which tokenizer to use. "
                    "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
                )

        # Instantiate the tokenizer if needed
        if isinstance(tokenizer, (str, tuple)):
            if isinstance(tokenizer, tuple):
                # For a tuple, the format is (tokenizer name, {kwargs})
                use_fast = tokenizer[1].pop("use_fast", use_fast)
                tokenizer_identifier = tokenizer[0]
                tokenizer_kwargs = tokenizer[1]
            else:
                tokenizer_identifier = tokenizer
                tokenizer_kwargs = model_kwargs.copy()
                tokenizer_kwargs.pop("torch_dtype", None)

            # Build the AutoTokenizer instance from the given arguments
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs
            )
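            # Illustrative example (not part of the original source): passing
            # `tokenizer=("bert-base-uncased", {"use_fast": False})` takes the tuple branch above,
            # popping `use_fast` from the per-tokenizer kwargs, while a plain string reuses a copy
            # of `model_kwargs` (minus `torch_dtype`) as the tokenizer kwargs.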

    # If an image processor needs to be loaded
    if load_image_processor:
        # Try to infer the image processor from the model or config name (if given as a string)
        if image_processor is None:
            # If model_name is a string, try using it as the image processor identifier
            if isinstance(model_name, str):
                image_processor = model_name
            # If config is a string, try using it as the image processor identifier
            elif isinstance(config, str):
                image_processor = config
            # For backward compatibility, if feature_extractor is an instance of BaseImageProcessor, use it as the image processor
            elif feature_extractor is not None and isinstance(feature_extractor, BaseImageProcessor):
                image_processor = feature_extractor
            else:
                # There is no way to guess which image processor to use here
                raise Exception(
                    "Impossible to guess which image processor to use. "
                    "Please provide a PreTrainedImageProcessor class or a path/identifier "
                    "to a pretrained image processor."
                )

        # Instantiate the image processor if needed
        if isinstance(image_processor, (str, tuple)):
            # Build the AutoImageProcessor instance from the given arguments
            image_processor = AutoImageProcessor.from_pretrained(
                image_processor, _from_pipeline=task, **hub_kwargs, **model_kwargs
            )
    # If a feature extractor needs to be loaded
    if load_feature_extractor:
        # Try to infer the feature extractor from the model or config name (if given as a string)
        if feature_extractor is None:
            # If model_name is a string, use it as the feature extractor identifier
            if isinstance(model_name, str):
                feature_extractor = model_name
            # If config is a string, use it as the feature extractor identifier
            elif isinstance(config, str):
                feature_extractor = config
            else:
                # There is no way to guess the right feature extractor here
                raise Exception(
                    "Impossible to guess which feature extractor to use. "
                    "Please provide a PreTrainedFeatureExtractor class or a path/identifier "
                    "to a pretrained feature extractor."
                )

        # Instantiate the feature extractor if it is given as a string or a tuple
        if isinstance(feature_extractor, (str, tuple)):
            feature_extractor = AutoFeatureExtractor.from_pretrained(
                feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs
            )

            # If the feature extractor's processor comes with a language model and the model name is a string
            if (
                feature_extractor._processor_class
                and feature_extractor._processor_class.endswith("WithLM")
                and isinstance(model_name, str)
            ):
                try:
                    import kenlm  # to trigger an `ImportError` if not installed
                    from pyctcdecode import BeamSearchDecoderCTC

                    # If the model name points to a local directory or file
                    if os.path.isdir(model_name) or os.path.isfile(model_name):
                        decoder = BeamSearchDecoderCTC.load_from_dir(model_name)
                    else:
                        # Glob pattern for the language model files and the alphabet file name
                        language_model_glob = os.path.join(
                            BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*"
                        )
                        alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
                        allow_patterns = [language_model_glob, alphabet_filename]
                        # Load the decoder for this model name from the HF Hub
                        decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_patterns=allow_patterns)

                    # Add the decoder to the kwargs
                    kwargs["decoder"] = decoder
                except ImportError as e:
                    # If the `decoder` could not be loaded, log a warning and default to raw CTC
                    logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}")
                    # If kenlm is not installed
                    if not is_kenlm_available():
                        logger.warning("Try to install `kenlm`: `pip install kenlm")

                    # If pyctcdecode is not installed
                    if not is_pyctcdecode_available():
                        logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")

    # If the task is translation and the model config defines task-specific params
    if task == "translation" and model.config.task_specific_params:
        # Iterate over the model config's task-specific params
        for key in model.config.task_specific_params:
            # If the key starts with "translation"
            if key.startswith("translation"):
                # Set the task to that key and emit a warning
                task = key
                warnings.warn(
                    f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"',
                    UserWarning,
                )
                break
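        # Illustrative example (not part of the original source): with a T5-style config whose
        # `task_specific_params` contain e.g. "translation_en_to_de", a bare `task="translation"`
        # is remapped to that key here and a `UserWarning` is emitted.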

    # If a tokenizer was resolved, add it to the kwargs
    if tokenizer is not None:
        kwargs["tokenizer"] = tokenizer
    # If a feature extractor was resolved, add it to the kwargs dict
    if feature_extractor is not None:
        kwargs["feature_extractor"] = feature_extractor

    # If a torch dtype was provided, add it to the kwargs dict
    if torch_dtype is not None:
        kwargs["torch_dtype"] = torch_dtype

    # If an image processor was resolved, add it to the kwargs dict
    if image_processor is not None:
        kwargs["image_processor"] = image_processor

    # If a device was provided, add it to the kwargs dict
    if device is not None:
        kwargs["device"] = device

    # Build and return a new pipeline_class instance with the given arguments and kwargs
    return pipeline_class(model=model, framework=framework, task=task, **kwargs)
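
A minimal usage sketch of the factory walked through above (the checkpoint name and input text are illustrative, not part of the source):

```python
from transformers import pipeline

# The call below exercises the code above: resolve the config, infer the framework and load
# the model, load the tokenizer, then instantiate the task-specific pipeline class.
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",  # illustrative Hub checkpoint
    torch_dtype="float32",  # string dtypes are resolved to torch dtypes before loading
)
print(classifier("Transformers pipelines are easy to use."))
```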