Transformers Source Code Analysis (131)

.\pipelines\audio_classification.py

# Subprocess management, used to invoke ffmpeg
import subprocess
# Union for type hints
from typing import Union

import numpy as np
import requests

# Utility helpers and availability checks from the package-level utils
from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, logging
# Pipeline base class and the init-args docstring builder from base.py
from .base import Pipeline, build_pipeline_init_args

# Import the model mapping names only when Torch is available
if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES

logger = logging.get_logger(__name__)


def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
    """
    Helper function to read an audio file through ffmpeg.
    """
    # Target sampling rate as a string
    ar = f"{sampling_rate}"
    # Single audio channel (mono)
    ac = "1"
    # Output format: 32-bit little-endian float PCM
    format_for_conversion = "f32le"
    # Build the ffmpeg command line
    ffmpeg_command = [
        "ffmpeg",
        "-i",        # input file
        "pipe:0",    # read input from stdin
        "-ac",       # number of audio channels
        ac,
        "-ar",       # audio sampling rate
        ar,
        "-f",        # output format
        format_for_conversion,
        "-hide_banner",  # hide the ffmpeg banner
        "-loglevel",     # keep the log quiet
        "quiet",
        "pipe:1",    # write output to stdout
    ]

    try:
        # Start the ffmpeg subprocess, feeding bpayload on stdin and capturing stdout
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    except FileNotFoundError:
        # ffmpeg is not installed or not on PATH
        raise ValueError("ffmpeg was not found but is required to load audio files from filename")

    # Send the payload to ffmpeg and collect the decoded stream
    output_stream = ffmpeg_process.communicate(bpayload)
    out_bytes = output_stream[0]

    # Interpret the raw bytes as a float32 waveform
    audio = np.frombuffer(out_bytes, np.float32)
    # An empty buffer means ffmpeg could not decode the payload
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")

    # Return the decoded waveform
    return audio
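
# --- Illustration (not part of the original file): a minimal usage sketch,
# assuming ffmpeg is installed and "sample.wav" is a hypothetical local file.
# ffmpeg_read accepts the raw bytes of any format ffmpeg can parse and returns
# a mono float32 waveform resampled to the requested rate:
#
#   with open("sample.wav", "rb") as f:
#       waveform = ffmpeg_read(f.read(), sampling_rate=16000)
#   print(waveform.dtype, waveform.shape)  # float32, (num_samples,)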


# Decorator that appends the shared pipeline init docstring (with feature extractor)
@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True))
class AudioClassificationPipeline(Pipeline):
    """
    Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
    raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
    formats.

    Example:

    ```
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
    >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
    [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
    ```

    Learn the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).

    This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"audio-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
    """

    def __init__(self, *args, **kwargs):
        # Default top_k of 5; may be overridden at call time or by the model config
        kwargs["top_k"] = 5
        super().__init__(*args, **kwargs)

        # This pipeline is implemented for PyTorch only
        if self.framework != "pt":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

        # Ensure the model is a registered audio-classification architecture
        self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES)

    # Classify the sequence(s) given as inputs
    def __call__(
        self,
        inputs: Union[np.ndarray, bytes, str],
        **kwargs,
    ):
        """
        Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more
        information.

        Args:
            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
                The inputs is either :
                    - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
                      to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
                    - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
                      same way.
                    - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
                        Raw audio at the correct sampling rate (no further check will be done)
                    - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
                      pipeline do the resampling. The dict must be either be in the format `{"sampling_rate": int,
                      "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or
                      `"array"` is used to denote the raw audio waveform.
            top_k (`int`, *optional*, defaults to None):
                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
                higher than the number of labels available in the model configuration, it will default to the number of
                labels.

        Return:
            A list of `dict` with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.
        """
        # Delegate to the parent __call__, which runs preprocess/_forward/postprocess
        return super().__call__(inputs, **kwargs)

    # Split call-time parameters; only `top_k` (a postprocess parameter) is accepted
    def _sanitize_parameters(self, top_k=None, **kwargs):
        postprocess_params = {}
        if top_k is not None:
            # Cap top_k at the number of labels in the model config
            if top_k > self.model.config.num_labels:
                top_k = self.model.config.num_labels
            postprocess_params["top_k"] = top_k

        # No preprocess or forward parameters; only the postprocess params are returned
        return {}, {}, postprocess_params
    # Preprocess the input into model-ready features
    def preprocess(self, inputs):
        if isinstance(inputs, str):
            if inputs.startswith("http://") or inputs.startswith("https://"):
                # A URL: fetch the raw bytes over the network
                inputs = requests.get(inputs).content
            else:
                # Otherwise treat the string as a local file path and read its bytes
                with open(inputs, "rb") as f:
                    inputs = f.read()

        if isinstance(inputs, bytes):
            # Decode the byte payload with ffmpeg at the feature extractor's sampling rate
            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

        if isinstance(inputs, dict):
            # A dict input must provide "sampling_rate" and either "raw" or "array"
            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
                raise ValueError(
                    "When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a "
                    '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
                    "containing the sampling_rate associated with that array"
                )

            # Prefer the "raw" key; otherwise fall back to "array"
            _inputs = inputs.pop("raw", None)
            if _inputs is None:
                # Remove "path" (unused, e.g. coming from `datasets`) before reading "array"
                inputs.pop("path", None)
                _inputs = inputs.pop("array", None)
            # Sampling rate of the provided audio
            in_sampling_rate = inputs.pop("sampling_rate")
            inputs = _inputs
            # Resample if the provided rate differs from the feature extractor's rate
            if in_sampling_rate != self.feature_extractor.sampling_rate:
                import torch

                if is_torchaudio_available():
                    from torchaudio import functional as F
                else:
                    raise ImportError(
                        "torchaudio is required to resample audio samples in AudioClassificationPipeline. "
                        "The torchaudio package can be installed through: `pip install torchaudio`."
                    )

                # Resample with torchaudio and convert back to a numpy array
                inputs = F.resample(
                    torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
                ).numpy()

        # At this point the input must be a 1-D numpy waveform
        if not isinstance(inputs, np.ndarray):
            raise ValueError("We expect a numpy ndarray as input")
        if len(inputs.shape) != 1:
            raise ValueError("We expect a single channel audio input for AudioClassificationPipeline")

        # Extract model features from the waveform
        processed = self.feature_extractor(
            inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
        )
        return processed

    # Run the model's forward pass
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs

    # Turn logits into the top_k labels with their probabilities
    def postprocess(self, model_outputs, top_k=5):
        # Softmax over the logits of the single input
        probs = model_outputs.logits[0].softmax(-1)
        # Highest top_k probabilities and their label ids
        scores, ids = probs.topk(top_k)

        # Convert tensors to plain Python lists
        scores = scores.tolist()
        ids = ids.tolist()

        # Map label ids to names via the model config's id2label
        labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]

        return labels
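
A quick sketch of the dict-input path handled in `preprocess` above: raw audio recorded at an arbitrary rate can be passed along with its `sampling_rate`, and the pipeline resamples it through torchaudio. This assumes `torchaudio` is installed; the waveform below is synthetic noise, purely for illustration.

```
import numpy as np
from transformers import pipeline

classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
# One second of noise sampled at 8 kHz; the pipeline resamples it to 16 kHz
audio = np.random.randn(8000).astype(np.float32)
print(classifier({"sampling_rate": 8000, "raw": audio}, top_k=2))
```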

.\pipelines\audio_utils.py

# Copyright 2023 The HuggingFace Team. All rights reserved.
import datetime
import platform
import subprocess
from typing import Optional, Tuple, Union

import numpy as np


def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
    """
    Helper function to read an audio file through ffmpeg.
    """
    # Target sampling rate as a string
    ar = f"{sampling_rate}"
    # Single audio channel (mono)
    ac = "1"
    # Output format: 32-bit little-endian float PCM
    format_for_conversion = "f32le"
    # Build the ffmpeg command line
    ffmpeg_command = [
        "ffmpeg",                # the ffmpeg binary
        "-i", "pipe:0",          # read input from stdin
        "-ac", ac,               # number of audio channels
        "-ar", ar,               # sampling rate
        "-f", format_for_conversion,  # output format
        "-hide_banner",          # hide the ffmpeg banner
        "-loglevel", "quiet",    # suppress log output
        "pipe:1",                # write output to stdout
    ]

    try:
        # Launch ffmpeg, feed it the payload on stdin, and capture stdout
        with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
            output_stream = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        # ffmpeg is not installed or not on PATH
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    out_bytes = output_stream[0]   # decoded audio bytes from ffmpeg
    audio = np.frombuffer(out_bytes, np.float32)   # interpret the bytes as float32 samples

    # An empty buffer means the soundfile could not be decoded
    if audio.shape[0] == 0:
        raise ValueError(
            "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
            "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
            "URL, ensure that the URL is the full address to **download** the audio file."
        )
    return audio   # the decoded waveform


def ffmpeg_microphone(
    sampling_rate: int,
    chunk_length_s: float,
    format_for_conversion: str = "f32le",
):
    """
    Helper function to read raw microphone data.
    """
    # Target sampling rate as a string
    ar = f"{sampling_rate}"
    # Single audio channel (mono)
    ac = "1"

    # Bytes per sample for the chosen output format
    if format_for_conversion == "s16le":
        size_of_sample = 2   # 16-bit integer samples, 2 bytes each
    elif format_for_conversion == "f32le":
        size_of_sample = 4   # 32-bit float samples, 4 bytes each
    else:
        raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")

    # Pick the capture backend and input device for the current OS
    system = platform.system()
    if system == "Linux":
        format_ = "alsa"     # ALSA on Linux
        input_ = "default"   # default input device
    elif system == "Darwin":
        format_ = "avfoundation"   # AVFoundation on macOS
        input_ = ":0"        # default audio input device
    elif system == "Windows":
        format_ = "dshow"    # DirectShow on Windows
        input_ = _get_microphone_name()  # name of the connected microphone

    # Build the ffmpeg command line
    ffmpeg_command = [
        "ffmpeg",                # the ffmpeg binary
        "-f", format_,           # input format (OS-specific capture backend)
        "-i", input_,            # input source (capture device)
        "-ac", ac,               # number of audio channels
        "-ar", ar,               # sampling rate
        "-f", format_for_conversion,  # output format
        "-fflags", "nobuffer",   # disable buffering for lower latency
        "-hide_banner",          # hide the ffmpeg banner
        "-loglevel", "quiet",    # suppress log output
        "pipe:1",                # write output to stdout
    ]

    # Length of each chunk in bytes
    chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample

    # Stream fixed-size chunks out of the ffmpeg process
    iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)

    # Re-yield each raw chunk from the stream
    for item in iterator:
        yield item


def ffmpeg_microphone_live(
    sampling_rate: int,
    chunk_length_s: float,
    stream_chunk_s: Optional[int] = None,
    stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
    format_for_conversion: str = "f32le",
):
    """
    Helper function to read audio from the microphone in (near) real time.
    """
    # Use stream_chunk_s as the raw read size when provided, otherwise chunk_length_s
    if stream_chunk_s is not None:
        chunk_s = stream_chunk_s
    else:
        chunk_s = chunk_length_s

    # Open the microphone stream at the requested rate and format
    microphone = ffmpeg_microphone(sampling_rate, chunk_s, format_for_conversion=format_for_conversion)

    # Pick the numpy dtype and bytes-per-sample for the chosen format
    if format_for_conversion == "s16le":
        dtype = np.int16
        size_of_sample = 2
    elif format_for_conversion == "f32le":
        dtype = np.float32
        size_of_sample = 4
    else:
        raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")

    # Default stride is one sixth of the chunk length
    if stride_length_s is None:
        stride_length_s = chunk_length_s / 6

    # Chunk length converted from seconds to bytes
    chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample

    # A single number means the same stride on both sides
    if isinstance(stride_length_s, (int, float)):
        stride_length_s = [stride_length_s, stride_length_s]

    # Left and right strides converted from seconds to bytes
    stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
    stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample

    # Wall-clock reference used to detect when we fall behind the stream
    audio_time = datetime.datetime.now()

    # Duration covered by each raw read
    delta = datetime.timedelta(seconds=chunk_s)
    # Re-chunk the raw byte stream into overlapping windows
    for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
        # Reinterpret the raw bytes as a numpy array
        item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
        # Convert the stride from bytes to samples
        item["stride"] = (
            item["stride"][0] // size_of_sample,
            item["stride"][1] // size_of_sample,
        )
        # Attach the sampling rate
        item["sampling_rate"] = sampling_rate
        # Advance the expected stream time
        audio_time += delta
        if datetime.datetime.now() > audio_time + 10 * delta:
            # We're running late!! Skip this chunk
            continue
        yield item
def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
    """
    Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunk to
    get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
    """
    acc = b""  # accumulator for the raw bytes coming off the iterator
    stride_left, stride_right = stride  # unpack the left/right strides
    if stride_left + stride_right >= chunk_len:
        # The strides must leave room for the chunk to advance
        raise ValueError(
            f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
        )
    _stride_left = 0  # the very first chunk has no left stride
    for raw in iterator:
        acc += raw  # accumulate the incoming bytes
        if stream and len(acc) < chunk_len:
            # In streaming mode, emit what we have so far as a partial chunk
            stride = (_stride_left, 0)
            yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
        else:
            while len(acc) >= chunk_len:
                # We are flushing the accumulator: emit a full chunk with its strides
                stride = (_stride_left, stride_right)
                item = {"raw": acc[:chunk_len], "stride": stride}
                if stream:
                    item["partial"] = False
                yield item
                _stride_left = stride_left  # subsequent chunks carry the left stride
                acc = acc[chunk_len - stride_left - stride_right :]  # keep the overlap, drop the rest
    # Last chunk: emit the remainder if it extends past the left stride
    if len(acc) > stride_left:
        item = {"raw": acc, "stride": (_stride_left, 0)}
        if stream:
            item["partial"] = False
        yield item
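
# --- Illustration (not part of the original file): chunking 10 bytes into
# overlapping windows. With chunk_len=6 and stride=(1, 1), the window advances
# by chunk_len - stride_left - stride_right = 4 bytes per step:
#
#   list(chunk_bytes_iter(iter([b"0123456789"]), chunk_len=6, stride=(1, 1)))
#   # [{'raw': b'012345', 'stride': (0, 1)},
#   #  {'raw': b'456789', 'stride': (1, 1)},
#   #  {'raw': b'89', 'stride': (1, 0)}]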


def _ffmpeg_stream(ffmpeg_command, buflen: int):
    """
    Internal function to create the generator of data through ffmpeg
    """
    bufsize = 2**24  # 16MB pipe buffer
    try:
        with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
            while True:
                raw = ffmpeg_process.stdout.read(buflen)  # read one buffer-sized block from ffmpeg
                if raw == b"":  # an empty read means the stream has ended
                    break
                yield raw  # emit the raw block
    except FileNotFoundError as error:
        # ffmpeg must be installed to stream audio
        raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error


def _get_microphone_name():
    """
    Retrieve the microphone name on Windows.
    """
    # ffmpeg command that lists DirectShow capture devices
    command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""]

    try:
        # ffmpeg prints the device list on stderr
        ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8")
        # Keep only the lines describing audio devices
        microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line]

        if microphone_lines:
            # Parse the device name out of the first matching line
            microphone_name = microphone_lines[0].split('"')[1]
            print(f"Using microphone: {microphone_name}")
            return f"audio={microphone_name}"  # the device string ffmpeg expects
    except FileNotFoundError:
        print("ffmpeg was not found. Please install it or make sure it is in your system PATH.")

    # Fall back to the default audio device
    return "default"

.\pipelines\automatic_speech_recognition.py

# defaultdict for grouping values by key
from collections import defaultdict
# Type-hint helpers
from typing import TYPE_CHECKING, Dict, Optional, Union

import numpy as np
import requests

from ..tokenization_utils import PreTrainedTokenizer
from ..utils import is_torch_available, is_torchaudio_available, logging
# ffmpeg-based audio decoding helper
from .audio_utils import ffmpeg_read
# Base class for pipelines that process inputs in chunks
from .base import ChunkPipeline

# Imports used only for type checking
if TYPE_CHECKING:
    from pyctcdecode import BeamSearchDecoderCTC

    from ..feature_extraction_sequence_utils import SequenceFeatureExtractor
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)

# Torch-only imports: the speech seq2seq model mapping
if is_torch_available():
    import torch
    from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES


def rescale_stride(stride, ratio):
    """
    Rescales the stride values from audio space to tokens/logits space.

    (160_000, 16_000, 16_000) -> (2000, 200, 200) for instance.
    """
    new_strides = []
    # Each stride is an (input_n, left, right) triple in audio samples
    for input_n, left, right in stride:
        # Total length in token/logit space
        token_n = int(round(input_n * ratio))
        # Left stride rescaled proportionally
        left = int(round(left / input_n * token_n))
        # Right stride rescaled proportionally
        right = int(round(right / input_n * token_n))
        new_stride = (token_n, left, right)
        new_strides.append(new_stride)

    return new_strides
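
# --- Illustration (not part of the original file): the docstring example above,
# with ratio = 0.0125 (e.g. 2000 logit frames for 160_000 audio samples):
#
#   rescale_stride([(160_000, 16_000, 16_000)], ratio=0.0125)
#   # [(2000, 200, 200)]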


def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
    """
    Iterates over chunks of input data, yielding processed chunks.

    inputs: numpy array, the input data to be chunked
    feature_extractor: SequenceFeatureExtractor, object for extracting features from chunks
    chunk_len: int, length of each chunk
    stride_left: int, left stride length
    stride_right: int, right stride length
    dtype: optional, data type to convert processed chunks

    Yields dictionaries containing processed chunk data and metadata.
    """
    inputs_len = inputs.shape[0]
    # Each step advances by chunk_len minus the overlapping strides
    step = chunk_len - stride_left - stride_right
    for chunk_start_idx in range(0, inputs_len, step):
        chunk_end_idx = chunk_start_idx + chunk_len
        # Slice the current chunk out of the input
        chunk = inputs[chunk_start_idx:chunk_end_idx]
        # Extract features for the chunk
        processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
        # Optionally cast the features to the requested dtype
        if dtype is not None:
            processed = processed.to(dtype=dtype)
        # The first chunk has no left stride
        _stride_left = 0 if chunk_start_idx == 0 else stride_left
        # The chunk is the last one if it reaches (or overruns) the end of the input
        is_last = chunk_end_idx > inputs_len if stride_right > 0 else chunk_end_idx >= inputs_len
        # The last chunk has no right stride
        _stride_right = 0 if is_last else stride_right

        # Actual chunk length (the final chunk may be shorter)
        chunk_len = chunk.shape[0]
        stride = (chunk_len, _stride_left, _stride_right)
        # Only yield chunks that extend past their left stride
        if chunk.shape[0] > _stride_left:
            yield {"is_last": is_last, "stride": stride, **processed}
        # Stop once the last chunk has been yielded
        if is_last:
            break


def _fast_find_longest_common_sequence(sequence_left, sequence_right):
    # Lengths of the two sequences
    seq_len_left = len(sequence_left)
    seq_len_right = len(sequence_right)
    # DP table holding the length of the common run ending at each (i, j)
    counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
    longest = 0
    # Fill the table
    for i in range(seq_len_left):
        for j in range(seq_len_right):
            # Extend the common run when the elements match
            if sequence_left[i] == sequence_right[j]:
                previous_counter = counter[i][j] + 1
                counter[i + 1][j + 1] = previous_counter
                # Track the longest run seen so far
                if previous_counter > longest:
                    longest = previous_counter

    # Convert the table to a numpy array to locate the longest run
    counter = np.array(counter)
    # Starting indices of the longest common run in each sequence (-1 when there is none)
    index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1
    index_right = np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1
    return index_left, index_right, longest
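
# --- Illustration (not part of the original file): locating the overlap
# between two token sequences. The function returns the start index of the
# longest common run in each sequence, plus its length:
#
#   _fast_find_longest_common_sequence([1, 2, 3, 4], [3, 4, 5])
#   # (2, 0, 2) -> the run [3, 4] starts at index 2 on the left, index 0 on the right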


def _find_longest_common_sequence(sequences, tokenizer):
    # TODO  Use a faster algorithm; this can probably be done in O(n) with a
    # suffix array, although that may get tedious because of the error tolerance.
    # We actually have a very nice property here: the full sequence must contain
    # these subsequences in order. The algorithm should also be tolerant to errors.
    # Token ids of the first sequence, with special tokens stripped
    sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids]
    for new_seq in sequences[1:]:
        # Token ids of the next sequence, with special tokens stripped
        new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids]

        index = 0
        max_ = 0.0
        # Slide the new sequence against the tail of the merged sequence
        for i in range(1, len(new_sequence) + 1):
            # epsilon favors longer perfect matches
            eps = i / 10000.0
            # Count matching positions and score the overlap
            matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i]))
            matching = matches / i + eps
            # Keep the best-scoring overlap with more than one match
            if matches > 1 and matching > max_:
                index = i
                max_ = matching
        # Append the non-overlapping part of the new sequence
        sequence.extend(new_sequence[index:])
    return np.array(sequence)


class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
    """
    Pipeline that aims at extracting spoken text contained within some audio.

    The input can be either a raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to
    support multiple audio formats.

    Example:

    ```
    >>> from transformers import pipeline

    >>> transcriber = pipeline(model="openai/whisper-base")
    >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
    {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fattened sauce.'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    Arguments:
        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            [`PreTrainedModel`] (PyTorch) or [`TFPreTrainedModel`] (TensorFlow).
        feature_extractor ([`SequenceFeatureExtractor`]):
            The feature extractor that will be used by the pipeline to encode the waveform for the model.
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            [`PreTrainedTokenizer`].
        decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
            PyCTCDecode's BeamSearchDecoderCTC can be passed for language-model-boosted decoding. See
            [`Wav2Vec2ProcessorWithLM`] for more information.
        chunk_length_s (`float`, *optional*, defaults to 0):
            The input length (in seconds) for each chunk. If `chunk_length_s = 0`, chunking is disabled (default).

            <Tip>

            For more information on how to effectively use `chunk_length_s`, please have a look at the ASR chunking
            blog post.

            </Tip>

        stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
            The length of the stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This
            enables the model to *see* more context and infer letters better than without this context, but the
            pipeline discards the stride bits at the end to make the final reconstitution as perfect as possible.

            <Tip>

            For more information on how to effectively use `stride_length_s`, please have a look at the ASR chunking
            blog post.

            </Tip>

        framework (`str`, *optional*):
            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
            installed. If no framework is specified, the default is the one currently installed. If no framework is
            specified and both frameworks are installed, the framework of the model is used, or PyTorch if no model is
            provided.
        device (Union[`int`, `torch.device`], *optional*):
            Device ordinal for CPU/GPU support. Setting this to `None` will leverage CPU, a positive integer will run
            the model on the associated CUDA device id.
        torch_dtype (Union[`int`, `torch.dtype`], *optional*):
            The data type (dtype) of the computation. Setting this to `None` will use float32 precision. Set to
            `torch.float16` or `torch.bfloat16` to use the corresponding half-precision dtype instead.
    """

    # Constructor: configures the model, processors, and pipeline type
    def __init__(
        self,
        model: "PreTrainedModel",  # pretrained model instance
        feature_extractor: Union["SequenceFeatureExtractor", str] = None,  # feature extractor object or name
        tokenizer: Optional[PreTrainedTokenizer] = None,  # optional pretrained tokenizer
        decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None,  # optional CTC decoder object or name
        device: Union[int, "torch.device"] = None,  # device ordinal or torch device
        torch_dtype: Optional[Union[str, "torch.dtype"]] = None,  # optional computation dtype
        **kwargs,
    ):
        # Set the pipeline type so that pre/postprocess parameters can be validated
        if model.config.model_type == "whisper":
            self.type = "seq2seq_whisper"  # Whisper models get their own type
        elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
            self.type = "seq2seq"  # encoder-decoder speech models
        elif (
            feature_extractor._processor_class  # the feature extractor declares a processor class
            and feature_extractor._processor_class.endswith("WithLM")  # the processor uses a language model
            and decoder is not None  # and a decoder was provided
        ):
            self.decoder = decoder
            self.type = "ctc_with_lm"  # CTC decoding boosted by a language model
        else:
            self.type = "ctc"  # plain CTC decoding

        # Initialize the base pipeline with model, tokenizer, feature extractor, and device settings
        super().__init__(model, tokenizer, feature_extractor, device=device, torch_dtype=torch_dtype, **kwargs)

    # Entry point for transcription requests
    def __call__(
        self,
        inputs: Union[np.ndarray, bytes, str],
        **kwargs,
    ):
        ...  # body omitted in this excerpt

    # Normalizes call-time parameters into preprocess/forward/postprocess kwargs
    def _sanitize_parameters(
        self,
        chunk_length_s=None,  # chunk length in seconds
        stride_length_s=None,  # stride length in seconds
        ignore_warning=None,  # whether to silence the chunking warning
        decoder_kwargs=None,  # kwargs for the CTC decoder
        return_timestamps=None,  # whether to return timestamps
        return_language=None,  # whether to return the detected language
        generate_kwargs=None,  # kwargs forwarded to `generate`
        max_new_tokens=None,  # maximum number of newly generated tokens
    ):
        ...  # body omitted in this excerpt

    # Post-processes model outputs into the final transcription
    def postprocess(
        self,
        model_outputs,
        decoder_kwargs: Optional[Dict] = None,  # kwargs for the CTC decoder
        return_timestamps=None,  # whether to return timestamps
        return_language=None,  # whether to return the detected language
    ):
        ...  # body omitted in this excerpt


def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
    """
    Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
    `WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
    iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
    processed. We need to make sure to offset the timestamps tokens by the `time` in order for the tokenizer to
    properly compute the final `offset`.
    """
    # Index of the first timestamp token: every id above "<|notimestamps|>" is a timestamp
    timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
    items = []
    # Approximate token-to-time ratio (~0.2 seconds per token), used to offset timestamps
    time_precision = feature_extractor.chunk_length / max_source_positions
    time = 0  # running start time of the current chunk
    result = []
    # (the chunk-merging loop that fills `items` is omitted in this excerpt)
    for i in range(len(items)):
        result += items[i].tolist()  # flatten the merged chunks into one token list
    return result  # the merged token sequence
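
To see chunking in action with the pipeline above, `chunk_length_s` (and optionally `stride_length_s`) can be passed at call time; a sketch reusing the checkpoint and demo file from the class docstring:

```
from transformers import pipeline

transcriber = pipeline(model="openai/whisper-base")
out = transcriber(
    "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
    chunk_length_s=30,
)
print(out["text"])
```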

.\pipelines\base.py

# Standard library and third-party imports
import collections  # container datatypes
import csv  # CSV reading/writing
import importlib  # dynamic module loading
import json  # JSON handling
import os  # operating-system interfaces
import pickle  # Python object serialization
import sys  # interpreter-level utilities
import traceback  # exception traceback formatting
import types  # runtime type utilities
import warnings  # warning management
from abc import ABC, abstractmethod  # abstract base classes
from collections import UserDict  # dict subclassing helper
from contextlib import contextmanager  # context-manager decorator
from os.path import abspath, exists  # path helpers
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union  # type hints

from ..dynamic_module_utils import custom_object_save  # saving custom (dynamic) objects
from ..feature_extraction_utils import PreTrainedFeatureExtractor  # feature extractor base
from ..image_processing_utils import BaseImageProcessor  # image processor base
from ..modelcard import ModelCard  # model card class
from ..models.auto.configuration_auto import AutoConfig  # automatic configuration
from ..tokenization_utils import PreTrainedTokenizer  # pretrained tokenizer base
from ..utils import (  # assorted helpers from utils
    ModelOutput,  # model output container
    add_end_docstrings,  # docstring decorator
    infer_framework,  # framework inference helper
    is_tf_available,  # TensorFlow availability check
    is_torch_available,  # PyTorch availability check
    is_torch_cuda_available,  # CUDA availability check
    is_torch_npu_available,  # NPU availability check
    is_torch_xpu_available,  # XPU availability check
    logging,  # logging utilities
)

# Generic tensor type covering nested lists of framework tensors
GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]

# TensorFlow-only imports
if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TFAutoModel

# PyTorch-only imports
if is_torch_available():
    import torch
    from torch.utils.data import DataLoader, Dataset

    from ..models.auto.modeling_auto import AutoModel

    # Re-export for backward compatibility
    from .pt_utils import KeyDataset
else:
    Dataset = None  # placeholders when PyTorch is unavailable
    KeyDataset = None

# Model classes imported only for type checking
if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)


# Collate function that performs no collation (batch_size must be 1)
def no_collate_fn(items):
    if len(items) != 1:
        raise ValueError("This collate_fn is meant to be used with batch_size=1")
    return items[0]


# Pad one key of a batch of items to a common length
def _pad(items, key, padding_value, padding_side):
    batch_size = len(items)  # number of items in the batch
    # Only tensors are padded; anything else is returned as a plain list
    if isinstance(items[0][key], torch.Tensor):
        # Shape and rank of the first item's tensor
        shape = items[0][key].shape
        dim = len(shape)

        # Image tensors ("pixel_values"/"image") share a shape, so no padding is needed
        if key in ["pixel_values", "image"]:
            # B, C, H, W: concatenate along the batch dimension
            return torch.cat([item[key] for item in items], dim=0)
        # 4-D "input_features" are batched mel spectrograms; concatenate directly
        elif dim == 4 and key == "input_features":
            # this is probably a mel spectrogram batched
            return torch.cat([item[key] for item in items], dim=0)

        # Longest and shortest sequence lengths in the batch
        max_length = max(item[key].shape[1] for item in items)
        min_length = min(item[key].shape[1] for item in items)
        dtype = items[0][key].dtype

        # Allocate a padded tensor matching the batch rank
        if dim == 2:
            if max_length == min_length:
                # All lengths match, so concatenation is safe (e.g. ImageGPT may
                # provide no padding value, but the sizes should still line up)
                return torch.cat([item[key] for item in items], dim=0)
            # (batch_size, max_length) filled with the padding value
            tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
        elif dim == 3:
            # (batch_size, max_length, feature_dim) filled with the padding value
            tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
        elif dim == 4:
            # (batch_size, max_length, H, W) filled with the padding value
            tensor = torch.zeros((batch_size, max_length, shape[-2], shape[-1]), dtype=dtype) + padding_value

        # Copy each item into the padded tensor on the requested side
        for i, item in enumerate(items):
            if dim == 2:
                if padding_side == "left":
                    tensor[i, -len(item[key][0]) :] = item[key][0].clone()
                else:
                    tensor[i, : len(item[key][0])] = item[key][0].clone()
            elif dim == 3:
                if padding_side == "left":
                    tensor[i, -len(item[key][0]) :, :] = item[key][0].clone()
                else:
                    tensor[i, : len(item[key][0]), :] = item[key][0].clone()
            elif dim == 4:
                if padding_side == "left":
                    tensor[i, -len(item[key][0]) :, :, :] = item[key][0].clone()
                else:
                    tensor[i, : len(item[key][0]), :, :] = item[key][0].clone()

        # Return the padded batch tensor
        return tensor
    else:
        # Non-tensor values are simply gathered into a list
        return [item[key] for item in items]
# Build a collate function that pads batches using the tokenizer and/or feature extractor
def pad_collate_fn(tokenizer, feature_extractor):
    # Padding side of the tokenizer, if any
    t_padding_side = None
    # Padding side of the feature extractor, if any
    f_padding_side = None

    # Batching is impossible without at least one of tokenizer/feature_extractor
    if tokenizer is None and feature_extractor is None:
        raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")

    if tokenizer is not None:
        # A tokenizer without a pad token cannot pad batches
        if tokenizer.pad_token_id is None:
            raise ValueError(
                "Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
                "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
            )
        else:
            # Use the tokenizer's pad token id and padding side
            t_padding_value = tokenizer.pad_token_id
            t_padding_side = tokenizer.padding_side

    if feature_extractor is not None:
        # Use the feature extractor's padding value and side, when defined
        f_padding_value = getattr(feature_extractor, "padding_value", None)
        f_padding_side = getattr(feature_extractor, "padding_side", None)

    # The tokenizer and feature extractor must agree on the padding side
    if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side:
        raise ValueError(
            f"The feature extractor, and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}"
        )

    # Default to right padding, overridden by whichever processor defines a side
    padding_side = "right"
    if t_padding_side is not None:
        padding_side = t_padding_side
    if f_padding_side is not None:
        padding_side = f_padding_side

    # The actual collate function applied to each batch
    def inner(items):
        keys = set(items[0].keys())
        # All items in the batch must share the same keys
        for item in items:
            if set(item.keys()) != keys:
                raise ValueError(
                    f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} !="
                    f" {keys})"
                )

        # Pad every key with a padding value appropriate to its meaning
        padded = {}
        for key in keys:
            if key in {"input_ids"}:
                # input_ids pad with the tokenizer's pad token when available,
                # otherwise with the feature extractor's padding value
                if tokenizer is None and feature_extractor is not None:
                    _padding_value = f_padding_value
                else:
                    _padding_value = t_padding_value
            elif key in {"input_values", "pixel_values", "input_features"}:
                # Feature-type keys use the feature extractor's padding value
                _padding_value = f_padding_value
            elif key in {"p_mask", "special_tokens_mask"}:
                # Mask keys pad with 1 (meaning "masked out")
                _padding_value = 1
            elif key in {"attention_mask", "token_type_ids"}:
                # Attention and token-type masks pad with 0
                _padding_value = 0
            else:
                # Default padding value for anything else
                _padding_value = 0
            # Pad this key across the batch
            padded[key] = _pad(items, key, _padding_value, padding_side)

        return padded

    return inner
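
# --- Illustration (not part of the original file): right-padding two
# "input_ids" rows of lengths 3 and 1 with pad value 0. Each item holds a
# (1, seq_len) tensor, as pipeline preprocessing produces:
#
#   items = [{"input_ids": torch.tensor([[5, 6, 7]])},
#            {"input_ids": torch.tensor([[8]])}]
#   _pad(items, "input_ids", padding_value=0, padding_side="right")
#   # tensor([[5, 6, 7],
#   #         [8, 0, 0]])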


def infer_framework_load_model(
    model,
    config: AutoConfig,
    model_classes: Optional[Dict[str, Tuple[type]]] = None,
    task: Optional[str] = None,
    framework: Optional[str] = None,
    **model_kwargs,
):
    """
    模型加载函数,根据不同的框架和任务加载模型

    Parameters:
    - model: 加载的模型实例
    - config: 自动配置对象
    - model_classes: 模型类别的字典,可选
    - task: 任务名称,可选
    - framework: 框架名称,可选
    - **model_kwargs: 其他模型相关的参数

    """
    # 检查是否安装了 TensorFlow 和 PyTorch,如果都没有安装则引发运行时错误
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    # If `model` is a string, it is a checkpoint name to load
    if isinstance(model, str):
        # Record the originating task in the model kwargs
        model_kwargs["_from_pipeline"] = task
        # Candidate model classes to try
        class_tuple = ()
        # Torch is usable when installed and the framework is "pt" or unspecified
        look_pt = is_torch_available() and framework in {"pt", None}
        # TensorFlow is usable when installed and the framework is "tf" or unspecified
        look_tf = is_tf_available() and framework in {"tf", None}

        # Start from the explicitly provided model classes, if any
        if model_classes:
            if look_pt:
                class_tuple = class_tuple + model_classes.get("pt", (AutoModel,))
            if look_tf:
                class_tuple = class_tuple + model_classes.get("tf", (TFAutoModel,))

        # Add the classes named in config.architectures, if any
        if config.architectures:
            classes = []
            for architecture in config.architectures:
                # Look the architecture class up on the transformers module
                transformers_module = importlib.import_module("transformers")
                if look_pt:
                    # The PyTorch class carries the architecture name as-is
                    _class = getattr(transformers_module, architecture, None)
                    if _class is not None:
                        classes.append(_class)
                if look_tf:
                    # The TensorFlow class carries a "TF" prefix
                    _class = getattr(transformers_module, f"TF{architecture}", None)
                    if _class is not None:
                        classes.append(_class)
            class_tuple = class_tuple + tuple(classes)

        # No candidate classes at all: the model cannot be loaded
        if len(class_tuple) == 0:
            raise ValueError(f"Pipeline cannot infer suitable model classes from {model}")

        # Collect the traceback of every failed load attempt
        all_traceback = {}
        for model_class in class_tuple:
            kwargs = model_kwargs.copy()
            # A ".h5" checkpoint loaded with PyTorch needs `from_tf=True`
            if framework == "pt" and model.endswith(".h5"):
                kwargs["from_tf"] = True
                logger.warning(
                    "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                    "Trying to load the model with PyTorch."
                )
            # A ".bin" checkpoint loaded with TensorFlow needs `from_pt=True`
            elif framework == "tf" and model.endswith(".bin"):
                kwargs["from_pt"] = True
                logger.warning(
                    "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                    "Trying to load the model with Tensorflow."
                )

            try:
                # Try to load the checkpoint with this class
                model = model_class.from_pretrained(model, **kwargs)
                # Put the model in evaluation mode when supported
                if hasattr(model, "eval"):
                    model = model.eval()
                # Stop after the first successful load
                break
            except (OSError, ValueError):
                # Record the failure and try the next class
                all_traceback[model_class.__name__] = traceback.format_exc()

        # If `model` is still a string, every load attempt failed
        if isinstance(model, str):
            error = ""
            for class_name, trace in all_traceback.items():
                error += f"while loading with {class_name}, an error is thrown:\n{trace}\n"
            raise ValueError(
                f"Could not load model {model} with any of the following classes: {class_tuple}. See the original errors:\n\n{error}\n"
            )

    # Infer the framework from the model class when not specified
    if framework is None:
        framework = infer_framework(model.__class__)
    # Return the framework together with the loaded model
    return framework, model


def infer_framework_from_model(
    model,
    model_classes: Optional[Dict[str, Tuple[type]]] = None,
    task: Optional[str] = None,
    framework: Optional[str] = None,
    **model_kwargs,
):
    """
    Infer the framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).

    If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model`
    is actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't
    want to instantiate the model twice, this model is returned for use by the pipeline.

    If both frameworks are installed and available for `model`, PyTorch is selected.

    Args:
        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model to infer the framework from. If `str`, a checkpoint name.
        model_classes (dictionary `str` to `type`, *optional*):
            A mapping framework to class.
        task (`str`):
            The task defining which pipeline will be returned.
        model_kwargs:
            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
            **model_kwargs)` function.

    Returns:
        `Tuple`: A tuple framework, model.
    """
    if isinstance(model, str):
        # `model` is a checkpoint name: load its configuration
        config = AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs)
    else:
        # Otherwise take the configuration from the instantiated model
        config = model.config
    return infer_framework_load_model(
        model, config, model_classes=model_classes, _from_pipeline=task, task=task, framework=framework, **model_kwargs
    )


def get_framework(model, revision: Optional[str] = None):
    """
    Select the framework (TensorFlow or PyTorch) to use.

    Args:
        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
            If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
            the model name). If no specific model is provided, defaults to using PyTorch.
    """
    # `get_framework` is deprecated in favor of `infer_framework_from_model`
    warnings.warn(
        "`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.",
        FutureWarning,
    )
    if not is_tf_available() and not is_torch_available():
        # Neither TensorFlow 2.0 nor PyTorch is installed
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    # A string model is a checkpoint name that still needs loading
    if isinstance(model, str):
        # Only PyTorch is available
        if is_torch_available() and not is_tf_available():
            model = AutoModel.from_pretrained(model, revision=revision)
        # Only TensorFlow is available
        elif is_tf_available() and not is_torch_available():
            model = TFAutoModel.from_pretrained(model, revision=revision)
        else:
            # Both are available: prefer PyTorch, fall back to TensorFlow on OSError
            try:
                model = AutoModel.from_pretrained(model, revision=revision)
            except OSError:
                model = TFAutoModel.from_pretrained(model, revision=revision)

    # Infer the framework from the model class
    framework = infer_framework(model.__class__)
    return framework


def get_default_model_and_revision(
    targeted_task: Dict, framework: Optional[str], task_options: Optional[Any]
) -> Union[str, Tuple[str, str]]:
    """
    Select a default model to use for a given task. Defaults to pytorch if ambiguous.

    Args:
        targeted_task (`Dict`):
            Dictionary representing the given task, that should contain default models

        framework (`str`, None):
            "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.

        task_options (`Any`, None):
            Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
            translation task.

    Returns:
        `str`: The model string representing the default model for this pipeline.
    """
    # If only torch is available, the framework must be "pt"
    if is_torch_available() and not is_tf_available():
        framework = "pt"
    # If only tensorflow is available, the framework must be "tf"
    elif is_tf_available() and not is_torch_available():
        framework = "tf"

    # Default settings for the targeted task
    defaults = targeted_task["default"]
    if task_options:
        # Task options (e.g. language pairs) select a sub-entry of the defaults
        if task_options not in defaults:
            raise ValueError(f"The task does not provide any default models for options {task_options}")
        default_models = defaults[task_options]["model"]
    elif "model" in defaults:
        # No options: take the plain default model entry
        default_models = targeted_task["default"]["model"]
    else:
        # Reaching this point usually means the task defaults were mis-specified
        # XXX This error message needs to be made more generic if more tasks are parametrized
        raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')

    # Default to PyTorch when the framework is still undetermined
    if framework is None:
        framework = "pt"

    # Return the default model for the chosen framework
    return default_models[framework]
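
# --- Illustration (not part of the original file): a minimal targeted_task
# dict as consumed above (the checkpoint names are hypothetical; real defaults
# are ("model_id", "revision") tuples):
#
#   targeted_task = {"default": {"model": {"pt": ("some-org/pt-ckpt", "main"),
#                                          "tf": ("some-org/tf-ckpt", "main")}}}
#   get_default_model_and_revision(targeted_task, framework=None, task_options=None)
#   # ('some-org/pt-ckpt', 'main')  (assuming PyTorch is installed)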


class PipelineException(Exception):
    """
    Raised by a [`Pipeline`] when handling __call__.

    Args:
        task (`str`): The task of the pipeline.
        model (`str`): The model used by the pipeline.
        reason (`str`): The error message to display.
    """

    def __init__(self, task: str, model: str, reason: str):
        super().__init__(reason)

        # Keep the task and model around for error reporting
        self.task = task
        self.model = model


class ArgumentHandler(ABC):
    """
    Base interface for handling arguments for each [`~pipelines.Pipeline`].
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        # Abstract: each pipeline implements its own argument handling
        raise NotImplementedError()


class PipelineDataFormat:
    """
    Base class for all the pipeline supported data format both for reading and writing. Supported data formats
    currently includes:

    - JSON
    - CSV
    - stdin/stdout (pipe)

    `PipelineDataFormat` also includes some utilities to work with multi-columns like mapping from datasets columns to
    pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
    """

    # 管道支持的数据格式的基类,包括读取和写入支持的数据格式
    # 当前支持的数据格式包括 JSON、CSV、标准输入/输出(管道)
    # `PipelineDataFormat` 还包括一些工具函数,用于处理多列数据,例如从数据集列映射到管道关键字参数的格式 `dataset_kwarg_1=dataset_column_1`
    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
    """



    # 支持的数据格式
    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite: bool = False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        # Columns to read; a single empty column when unspecified
        self.column = column.split(",") if column is not None else [""]
        # Whether multiple columns were requested
        self.is_multi_columns = len(self.column) > 1

        # Multi-column specs may map pipeline kwargs to dataset columns via "kwarg=column"
        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        # Refuse to overwrite an existing output path unless allowed
        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError(f"{self.output_path} already exists on disk")

        # The input path must exist
        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError(f"{self.input_path} doesnt exist on disk")

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: Union[dict, List[dict]]):
        """
        Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].

        Args:
            data (`dict` or list of `dict`): The data to store.
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as a pickle-formatted binary data on the disk.

        Args:
            data (`dict` or list of `dict`): The data to store.

        Returns:
            `str`: Path where the data has been saved.
        """
        # Strip the extension off the output path
        path, _ = os.path.splitext(self.output_path)
        # Build the path of the pickle file
        binary_path = os.path.extsep.join((path, "pickle"))

        # Dump the data as a binary pickle
        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        # Return the path the data was saved to
        return binary_path

    @staticmethod
    def from_str(
        format: str,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ) -> "PipelineDataFormat":
        """
        Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`.

        Args:
            format (`str`):
                The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`.
            output_path (`str`, *optional*):
                Where to save the outgoing data.
            input_path (`str`, *optional*):
                Where to look for the input data.
            column (`str`, *optional*):
                The column to read.
            overwrite (`bool`, *optional*, defaults to `False`):
                Whether or not to overwrite the `output_path`.

        Returns:
            [`~pipelines.PipelineDataFormat`]: The proper data format.
        """
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError(f"Unknown reader {format} (Available reader are json/csv/pipe)")
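
# --- Illustration (not part of the original file): instantiating a reader for
# a hypothetical CSV file whose "text" column feeds the pipeline:
#
#   fmt = PipelineDataFormat.from_str(
#       "csv", output_path="out.csv", input_path="in.csv", column="text"
#   )
#   for item in fmt:  # yields the "text" value of each row
#       ...

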
class CsvPipelineDataFormat(PipelineDataFormat):
    """
    Support for pipelines using CSV data format.

    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
    """

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ):
        # Delegate to the parent constructor
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        # Open the input file and create a CSV dict reader
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}  # several columns: yield a dict
                else:
                    yield row[self.column[0]]  # otherwise yield the single requested column

    def save(self, data: List[dict]):
        """
        Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].

        Args:
            data (`List[dict]`): The data to store.
        """
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()  # 写入头部
                writer.writerows(data)  # 写入数据
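
A hypothetical round trip through this format (the file names and results below are placeholders):

```
fmt = CsvPipelineDataFormat(
    output_path="predictions.csv", input_path="input.csv", column="text", overwrite=True
)
# Pretend each input row produced one classification result:
rows = [{"label": "POSITIVE", "score": 0.98} for _ in fmt]
fmt.save(rows)  # writes "label,score" as a header, then one row per dict
```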


class JsonPipelineDataFormat(PipelineDataFormat):
    """
    Support for pipelines using JSON file format.

    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
    """

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)  # delegate to the parent constructor

        with open(input_path, "r") as f:
            self._entries = json.load(f)  # load the JSON data into _entries

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}  # several columns: yield a dict
            else:
                yield entry[self.column[0]]  # otherwise yield the single requested column

    def save(self, data: dict):
        """
        Save the provided data object in a json file.

        Args:
            data (`dict`): The data to store.
        """
        with open(self.output_path, "w") as f:
            json.dump(data, f)  # 将数据存储为 JSON 格式


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process. For multi columns data, columns should be separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
    """
    def __iter__(self):
        """
        Iterate over input lines from stdin.

        Yields:
            - If the line contains tabs (`\t`):
                - If `self.column` is defined, yield a dictionary mapping column names to line values.
                - Otherwise, yield a tuple of line values.
            - If no tabs are present, yield the entire line.
        """
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:
                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        """
        Print the provided data.

        Args:
            data (`dict`): The data to be printed.
        """
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save binary data to the specified output path.

        Args:
            data (Union[dict, List[dict]]): The binary data to be saved.

        Returns:
            str: The output path where the data was saved.
        
        Raises:
            KeyError: If `self.output_path` is `None`, indicating a missing output path.
        """
        if self.output_path is None:
            raise KeyError(
                "When using piped input on pipeline outputting large object requires an output file path. "
                "Please provide such output path through --output argument."
            )

        return super().save_binary(data)
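
When input arrives on stdin the format is typically driven as `cat data.tsv | python run.py`. A sketch; the mapping string is illustrative:

```
fmt = PipedPipelineDataFormat(output_path=None, input_path=None, column="question=q,context=c")
for payload in fmt:
    # With the mapping above, each tab-separated line becomes
    # {"question": <first field>, "context": <second field>}.
    print(payload)  # a real driver would run a pipeline here and fmt.save() the result
```
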
class _ScikitCompat(ABC):
    """
    Interface layer for the Scikit and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        # Abstract method: subclasses implement the data-transformation logic
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        # Abstract method: subclasses implement the prediction logic
        raise NotImplementedError()


def build_pipeline_init_args(
    has_tokenizer: bool = False,
    has_feature_extractor: bool = False,
    has_image_processor: bool = False,
    supports_binary_output: bool = True,
) -> str:
    docstring = r"""
    Arguments:
        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow."""
    if has_tokenizer:
        docstring += r"""
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            [`PreTrainedTokenizer`]."""
    if has_feature_extractor:
        docstring += r"""
        feature_extractor ([`SequenceFeatureExtractor`]):
            The feature extractor that will be used by the pipeline to encode data for the model. This object inherits from
            [`SequenceFeatureExtractor`]."""
    if has_image_processor:
        docstring += r"""
        image_processor ([`BaseImageProcessor`]):
            The image processor that will be used by the pipeline to encode data for the model. This object inherits from
            [`BaseImageProcessor`]."""
    docstring += r"""
        modelcard (`str` or [`ModelCard`], *optional*):
            Model card attributed to the model for this pipeline.
        framework (`str`, *optional*):
            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
            installed.
            
            If no framework is specified, will default to the one currently installed. If no framework is specified and
            both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
            provided.
        task (`str`, defaults to `""`):
            A task-identifier for the pipeline.
        num_workers (`int`, *optional*, defaults to 8):
            When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the number of
            workers to be used.
        batch_size (`int`, *optional*, defaults to 1):
            When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the size of
            the batch to use, for inference this is not always beneficial, please read [Batching with
            pipelines](https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching) .
        args_parser ([`~pipelines.ArgumentHandler`], *optional*):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (`int`, *optional*, defaults to -1):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
            the associated CUDA device id. You can pass native `torch.device` or a `str` too
        torch_dtype (`str` or `torch.dtype`, *optional*):
            Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
            (`torch.float16`, `torch.bfloat16`, ... or `"auto"`)
    """
    # If binary output is supported, append the corresponding description
    if supports_binary_output:
        docstring += r"""
        binary_output (`bool`, *optional*, defaults to `False`):
            Flag indicating if the output the pipeline should happen in a serialized format (i.e., pickle) or as
            the raw output data e.g. text."""
    # Return the assembled docstring
    return docstring


# Build the shared init-arguments docstring with every component and binary output enabled
PIPELINE_INIT_ARGS = build_pipeline_init_args(
    has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True
)
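
The fragments produced here are attached to pipeline classes via `add_end_docstrings`. A sketch of the mechanism; the subclass is hypothetical, for illustration only:

```
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
class MyPipeline(Pipeline):  # hypothetical subclass
    """My task-specific pipeline."""

print(MyPipeline.__doc__)  # the class docstring followed by the shared arguments block
```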

# If the current environment supports Torch, import the pipeline utility classes
if is_torch_available():
    from transformers.pipelines.pt_utils import (
        PipelineChunkIterator,  # chunk-wise iteration over pipeline inputs
        PipelineDataset,  # dataset wrapper for pipeline preprocessing
        PipelineIterator,  # plain pipeline iteration
        PipelinePackIterator,  # iteration that regroups chunked outputs
    )


# Build the Pipeline class with the matching docstring attached
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_feature_extractor=True, has_image_processor=True))
class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
    operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument (see below).

    Some pipelines, such as [`FeatureExtractionPipeline`] (`'feature-extraction'`), output large tensor objects
    as nested-lists. In order to avoid dumping such large structures as textual data we provide the `binary_output`
    constructor argument. If set to `True`, the output will be stored in the pickle format.
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: Optional[PreTrainedTokenizer] = None,
        feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
        image_processor: Optional[BaseImageProcessor] = None,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: Union[int, "torch.device"] = None,
        torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
        binary_output: bool = False,
        **kwargs,
    ):
        """
        Initialize a pipeline object.

        Parameters:
        - model (Union["PreTrainedModel", "TFPreTrainedModel"]): The pretrained model to use.
        - tokenizer (Optional[PreTrainedTokenizer]): Tokenizer used to preprocess the inputs.
        - feature_extractor (Optional[PreTrainedFeatureExtractor]): Feature extractor for inputs.
        - image_processor (Optional[BaseImageProcessor]): Image processor for image inputs.
        - modelcard (Optional[ModelCard]): ModelCard describing the model's attributes.
        - framework (Optional[str]): The framework where the model is implemented (e.g., 'pt' for PyTorch).
        - task (str): The task associated with the pipeline.
        - args_parser (ArgumentHandler): Custom argument handler for parsing pipeline arguments.
        - device (Union[int, "torch.device"]): Device where the model will be run (CPU/GPU).
        - torch_dtype (Optional[Union[str, "torch.dtype"]]): Data type used in PyTorch models.
        - binary_output (bool): Whether to output results in binary (pickle) format.
        - **kwargs: Additional keyword arguments passed to the pipeline.

        Notes:
        - This constructor initializes a pipeline object with the specified parameters.
        - It supports various preprocessing and postprocessing operations for different tasks.
        - The 'binary_output' flag controls whether outputs are stored in binary format.
        """
        super().__init__()
        # Initialize the pipeline object with the provided parameters
        self.model = model
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.image_processor = image_processor
        self.modelcard = modelcard
        self.framework = framework
        self.task = task
        self.args_parser = args_parser
        self.device = device
        self.torch_dtype = torch_dtype
        self.binary_output = binary_output
        self.kwargs = kwargs

    def save_pretrained(self, save_directory: str, safe_serialization: bool = True):
        """
        Save the pipeline's model and tokenizer.

        Args:
            save_directory (`str`):
                A path to the directory where to save. It will be created if it doesn't exist.
            safe_serialization (`bool`):
                Whether to save the model using `safetensors` or the traditional way for PyTorch or TensorFlow.
        """
        # If the save path is an existing file, log an error and bail out
        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return
        # Create the target directory (no-op if it already exists)
        os.makedirs(save_directory, exist_ok=True)

        # If the object carries a `_registered_impl` attribute, save the custom pipeline info and code
        if hasattr(self, "_registered_impl"):
            # Copy the registered pipeline info
            pipeline_info = self._registered_impl.copy()
            custom_pipelines = {}
            # Walk the pipeline info
            for task, info in pipeline_info.items():
                # Keep only the entries that point at this very class
                if info["impl"] != self.__class__:
                    continue

                info = info.copy()
                module_name = info["impl"].__module__
                last_module = module_name.split(".")[-1]
                # Turn the class into a fully qualified "module.ClassName" string
                info["impl"] = f"{last_module}.{info['impl'].__name__}"
                # Store the supported classes for each framework as tuples of names
                info["pt"] = tuple(c.__name__ for c in info["pt"])
                info["tf"] = tuple(c.__name__ for c in info["tf"])

                custom_pipelines[task] = info
            # Attach the custom pipeline info to the model config
            self.model.config.custom_pipelines = custom_pipelines
            # Save the pipeline's custom code and objects
            custom_object_save(self, save_directory)

        # Save the model itself to the target directory
        self.model.save_pretrained(save_directory, safe_serialization=safe_serialization)

        # If there is a tokenizer, save it as well
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory)

        # Same for the feature extractor...
        if self.feature_extractor is not None:
            self.feature_extractor.save_pretrained(save_directory)

        # ...the image processor...
        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory)

        # ...and the model card
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)
    # Context manager allowing framework-agnostic tensor allocation on the user-specified device.
    # `device_placement` is written as a generator, so it is wrapped with `contextlib.contextmanager`
    # to be usable in a `with` block.
    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.

        Returns:
            Context manager

        Examples:

        ```
        # Explicitly ask for tensor allocation on CUDA device :0
        pipe = pipeline(..., device=0)
        with pipe.device_placement():
            # Every framework specific tensor allocation will be done on the request device
            output = pipe(...)
        ```"""
        # TensorFlow branch
        if self.framework == "tf":
            # Pin tensor allocation to the requested CPU or GPU device via tf.device
            with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
                # Hand control back to the caller
                yield
        else:
            # PyTorch branch
            # Check whether the device is a CUDA device
            if self.device.type == "cuda":
                # Pin tensor allocation to the requested CUDA device
                with torch.cuda.device(self.device):
                    # Hand control back to the caller
                    yield
            else:
                # Any other device type: just yield control back to the caller
                yield

    # Make sure PyTorch tensors live on the specified device
    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.

        Args:
            inputs (keyword arguments that should be `torch.Tensor`, the rest is ignored):
                The tensors to place on `self.device`.
            Recursive on lists **only**.

        Return:
            `Dict[str, torch.Tensor]`: The same as `inputs` but on the proper device.
        """
        # Delegate to _ensure_tensor_on_device, placing the inputs on the pipeline's device
        return self._ensure_tensor_on_device(inputs, self.device)

    # Internal helper: recursively move tensors to the given device
    def _ensure_tensor_on_device(self, inputs, device):
        # ModelOutput: recurse into each item
        if isinstance(inputs, ModelOutput):
            return ModelOutput(
                {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
            )
        # dict: recurse into each value
        elif isinstance(inputs, dict):
            return {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
        # UserDict: recurse into each value
        elif isinstance(inputs, UserDict):
            return UserDict({name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()})
        # list: recurse into each element
        elif isinstance(inputs, list):
            return [self._ensure_tensor_on_device(item, device) for item in inputs]
        # tuple: recurse into each element
        elif isinstance(inputs, tuple):
            return tuple([self._ensure_tensor_on_device(item, device) for item in inputs])
        # torch.Tensor: move it (upcasting half precision when landing on CPU)
        elif isinstance(inputs, torch.Tensor):
            # float16/bfloat16 tensors are upcast to float32 when moved to CPU
            if device == torch.device("cpu") and inputs.dtype in {torch.float16, torch.bfloat16}:
                inputs = inputs.float()
            return inputs.to(device)
        else:
            # Anything else is returned untouched
            return inputs
    def check_model_type(self, supported_models: Union[List[str], dict]):
        """
        Check if the model class is supported by the pipeline.

        Args:
            supported_models (`List[str]` or `dict`):
                A list of supported models, or a mapping whose values are model classes.
        """
        if not isinstance(supported_models, list):  # Build the list from a model mapping
            supported_models_names = []
            for _, model_name in supported_models.items():
                # The mapping can now contain tuples of models for the same configuration.
                if isinstance(model_name, tuple):
                    supported_models_names.extend(list(model_name))
                else:
                    supported_models_names.append(model_name)
            if hasattr(supported_models, "_model_mapping"):
                # Models registered as extra content may also come as tuples
                for _, model in supported_models._model_mapping._extra_content.items():
                    if isinstance(model, tuple):
                        supported_models_names.extend([m.__name__ for m in model])
                    else:
                        supported_models_names.append(model.__name__)
            supported_models = supported_models_names
        if self.model.__class__.__name__ not in supported_models:
            logger.error(
                f"The model '{self.model.__class__.__name__}' is not supported for {self.task}. Supported models are"
                f" {supported_models}."
            )

    @abstractmethod
    def _sanitize_parameters(self, **pipeline_parameters):
        """
        _sanitize_parameters will be called with any excess named arguments from either `__init__` or `__call__`.
        It should return three dictionaries of the resolved parameters used by the various `preprocess`, `forward`
        and `postprocess` methods. Do not fill the dictionaries if the caller didn't specify the kwargs; this lets
        you keep defaults in the function signatures, which is more natural.

        It is not meant to be called directly: it is invoked automatically from `__init__` and `__call__`, which
        resolve the final parameters.
        """
        raise NotImplementedError("_sanitize_parameters not implemented")

    @abstractmethod
    def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
        """
        preprocess takes the pipeline-specific `input_` and returns a dictionary containing everything `_forward`
        needs to run properly. It should contain at least one tensor, but may contain arbitrary other items.
        """
        raise NotImplementedError("preprocess not implemented")
    # Abstract method: run the model's forward pass on the dictionary prepared by `preprocess`
    @abstractmethod
    def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
        """
        _forward will receive the prepared dictionary from `preprocess` and run it on the model. This method might
        involve the GPU or the CPU and should be agnostic to it. Isolating this function is the reason for `preprocess`
        and `postprocess` to exist, so that the hot path, this method generally can run as fast as possible.

        It is not meant to be called directly, `forward` is preferred. It is basically the same but contains additional
        code surrounding `_forward` making sure tensors and models are on the same device, disabling the training part
        of the code (leading to faster inference).
        """
        raise NotImplementedError("_forward not implemented")

    # Abstract method: turn the raw `_forward` outputs into a friendlier format
    @abstractmethod
    def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
        """
        Postprocess will receive the raw outputs of the `_forward` method, generally tensors, and reformat them into
        something more friendly. Generally it will output a list or a dict or results (containing just strings and
        numbers).
        """
        raise NotImplementedError("postprocess not implemented")

    # Return a context manager that disables gradient computation during inference
    def get_inference_context(self):
        return torch.no_grad

    # Framework-aware forward pass: keeps model and tensors on the same device and disables the
    # training-specific code paths to speed up inference
    def forward(self, model_inputs, **forward_params):
        with self.device_placement():
            if self.framework == "tf":
                model_inputs["training"] = False
                model_outputs = self._forward(model_inputs, **forward_params)
            elif self.framework == "pt":
                inference_context = self.get_inference_context()
                with inference_context():
                    model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
                    model_outputs = self._forward(model_inputs, **forward_params)
                    model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
            else:
                raise ValueError(f"Framework {self.framework} is not supported")
        return model_outputs

    # Build the data iterator used for loading and preprocessing during inference
    def get_iterator(
        self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
    ):
        # Check whether the input has a length
        if isinstance(inputs, collections.abc.Sized):
            # Sized inputs can be wrapped in a PipelineDataset
            dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
        else:
            # Purely iterable input: multiple workers would iterate it incorrectly
            if num_workers > 1:
                logger.warning(
                    "For iterable dataset using num_workers>1 is likely to result"
                    " in errors since everything is iterable, setting `num_workers=1`"
                    " to guarantee correctness."
                )
                # Force num_workers to 1 for correctness
                num_workers = 1
            # Wrap the iterable in a PipelineIterator
            dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
        # If TOKENIZERS_PARALLELISM is not set in the environment
        if "TOKENIZERS_PARALLELISM" not in os.environ:
            # Disable tokenizer parallelism, since the DataLoader already multithreads
            logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
        # TODO hack by collating feature_extractor and image_processor
        # Fall back to the image processor when no feature extractor is set
        feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
        # Pick no_collate_fn for batch_size 1, otherwise a padding collate function
        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
        # Build the DataLoader
        dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
        # Wrap the dataloader in a PipelineIterator running `forward`
        model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
        # Wrap once more to run `postprocess`, producing the final iterator
        final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
        # Return the final iterator
        return final_iterator
    # __call__ makes the object callable; takes `inputs` plus extra positional args
    def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
        # Extra positional args are logged and ignored
        if args:
            logger.warning(f"Ignoring args : {args}")

        # Resolve num_workers: fall back to the instance attribute _num_workers, else 0
        if num_workers is None:
            if self._num_workers is None:
                num_workers = 0
            else:
                num_workers = self._num_workers

        # Resolve batch_size: fall back to the instance attribute _batch_size, else 1
        if batch_size is None:
            if self._batch_size is None:
                batch_size = 1
            else:
                batch_size = self._batch_size

        # Split the incoming kwargs into preprocess, forward and postprocess parameters
        preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)

        # Fuse the __init__-time parameters with the __call__-time ones, without mutating the former
        preprocess_params = {**self._preprocess_params, **preprocess_params}
        forward_params = {**self._forward_params, **forward_params}
        postprocess_params = {**self._postprocess_params, **postprocess_params}

        # Bump the call counter
        self.call_count += 1
        # After 10 sequential calls on a CUDA device under PyTorch, advise the user to pass a dataset
        if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
            logger.warning_once(
                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
                " dataset",
                UserWarning,
            )

        # Classify the input: a Dataset (when available), a generator, or a list
        is_dataset = Dataset is not None and isinstance(inputs, Dataset)
        is_generator = isinstance(inputs, types.GeneratorType)
        is_list = isinstance(inputs, list)

        # Any of the three counts as iterable
        is_iterable = is_dataset or is_generator or is_list

        # The iterator path is available under PyTorch for datasets, generators and lists
        can_use_iterator = self.framework == "pt" and (is_dataset or is_generator or is_list)

        # For list inputs on the iterator path, materialize the iterator into a list of outputs
        if is_list:
            if can_use_iterator:
                final_iterator = self.get_iterator(
                    inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
                )
                outputs = list(final_iterator)
                return outputs
            else:
                # Otherwise fall back to run_multi for list inputs
                return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
        # On the iterator path with non-list input, return the iterator itself
        elif can_use_iterator:
            return self.get_iterator(
                inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
            )
        # Other iterables are handled lazily via iterate()
        elif is_iterable:
            return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
        # A single input to a ChunkPipeline under PyTorch still goes through the iterator machinery
        elif self.framework == "pt" and isinstance(self, ChunkPipeline):
            return next(
                iter(
                    self.get_iterator(
                        [inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params
                    )
                )
            )
        # Everything else is a single input handled by run_single
        else:
            return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
    # Run the model over a list of inputs by calling `run_single` on each item
    def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):
        # Return one processed result per input
        return [self.run_single(item, preprocess_params, forward_params, postprocess_params) for item in inputs]

    # Preprocess, forward and postprocess a single input, returning the final result
    def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
        # Turn the raw input into model-ready inputs
        model_inputs = self.preprocess(inputs, **preprocess_params)
        # Run inference on the preprocessed inputs
        model_outputs = self.forward(model_inputs, **forward_params)
        # Turn the raw model outputs into the final result
        outputs = self.postprocess(model_outputs, **postprocess_params)
        # Return the processed result
        return outputs

    # Lazily run the model over an iterable of inputs, yielding one result at a time
    def iterate(self, inputs, preprocess_params, forward_params, postprocess_params):
        # This function should probably be renamed `get_iterator`; it is a quick, simple stopgap.
        for input_ in inputs:
            # Each input goes through `run_single`; results are yielded as they are produced
            yield self.run_single(input_, preprocess_params, forward_params, postprocess_params)
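
The split across `_sanitize_parameters`, `preprocess`, `_forward` and `postprocess` is easiest to see in a toy subclass. A sketch only; the task and the `threshold` parameter are made up for illustration:

```
class ScoreThresholdPipeline(Pipeline):
    def _sanitize_parameters(self, threshold=None, **kwargs):
        postprocess_kwargs = {}
        if threshold is not None:  # only forward what the caller actually set
            postprocess_kwargs["threshold"] = threshold
        return {}, {}, postprocess_kwargs

    def preprocess(self, text, **kwargs):
        return self.tokenizer(text, return_tensors=self.framework)

    def _forward(self, model_inputs, **kwargs):
        return self.model(**model_inputs)

    def postprocess(self, model_outputs, threshold=0.5, **kwargs):
        scores = model_outputs.logits.softmax(-1)[0]
        best = scores.argmax().item()
        return {"label": best, "score": scores[best].item(), "confident": scores[best].item() >= threshold}
```

Calling `pipe("some text", threshold=0.9)` routes `threshold` to `postprocess` via `_sanitize_parameters`, while `run_single` chains the three stages exactly as above.

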
# ChunkPipeline: a Pipeline whose preprocess step yields several chunks per input
class ChunkPipeline(Pipeline):
    # Override run_single to accumulate the outputs of every chunk
    def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
        # Collected model outputs, one entry per chunk
        all_outputs = []

        # preprocess() is a generator here: iterate over the chunks it yields
        for model_inputs in self.preprocess(inputs, **preprocess_params):
            # Run the forward pass on this chunk
            model_outputs = self.forward(model_inputs, **forward_params)
            # Collect the chunk's outputs
            all_outputs.append(model_outputs)

        # Postprocess the accumulated outputs into the final result
        outputs = self.postprocess(all_outputs, **postprocess_params)
        return outputs

    # Build the data iterator for chunked inference
    def get_iterator(
        self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
    ):
        # If TOKENIZERS_PARALLELISM is unset, disable tokenizer parallelism
        if "TOKENIZERS_PARALLELISM" not in os.environ:
            logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
            os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # With more than one worker, warn and fall back to a single worker for correctness
        if num_workers > 1:
            logger.warning(
                "For ChunkPipeline using num_workers>0 is likely to result in errors since everything is iterable,"
                " setting `num_workers=1` to guarantee correctness."
            )
            num_workers = 1

        # Wrap the inputs and the (chunking) preprocess function in a PipelineChunkIterator
        dataset = PipelineChunkIterator(inputs, self.preprocess, preprocess_params)

        # Pick the collate function depending on the batch size
        feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)

        # Build the DataLoader
        dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)

        # PipelinePackIterator regroups per-chunk forward outputs back into per-input packs
        model_iterator = PipelinePackIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)

        # Wrap once more to run `postprocess`, producing the final iterator
        final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
        return final_iterator
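
A sketch of what a concrete chunk pipeline can look like. The sliding-window logic and all names here are illustrative (and assume PyTorch); the `is_last` flag mirrors the convention the pack iterator uses to know when one input's chunks are complete:

```
class SlidingWindowPipeline(ChunkPipeline):
    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    def preprocess(self, text, window=384, stride=128, **kwargs):
        ids = self.tokenizer(text)["input_ids"]
        for start in range(0, max(len(ids) - window, 0) + 1, stride):
            chunk = ids[start : start + window]
            yield {
                "input_ids": torch.tensor([chunk]),
                "is_last": start + window >= len(ids),
            }

    def _forward(self, model_inputs, **kwargs):
        is_last = model_inputs.pop("is_last")
        return {"logits": self.model(**model_inputs).logits, "is_last": is_last}

    def postprocess(self, all_outputs, **kwargs):
        # all_outputs is the per-chunk list accumulated by run_single
        return torch.cat([out["logits"] for out in all_outputs], dim=1)
```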


# PipelineRegistry: manages the supported tasks and their aliases
class PipelineRegistry:
    # Takes the dict of supported tasks and the dict of task aliases
    def __init__(self, supported_tasks: Dict[str, Any], task_aliases: Dict[str, str]) -> None:
        self.supported_tasks = supported_tasks  # the supported-tasks dict
        self.task_aliases = task_aliases  # the task-aliases dict

    # List the names of every supported task
    def get_supported_tasks(self) -> List[str]:
        # Combine the task names with the alias names
        supported_task = list(self.supported_tasks.keys()) + list(self.task_aliases.keys())
        supported_task.sort()  # sort the names
        return supported_task  # return the sorted list
    # Resolve aliases, then look the task up (or parse dynamic "translation_XX_to_YY" tasks)
    def check_task(self, task: str) -> Tuple[str, Dict, Any]:
        if task in self.task_aliases:
            task = self.task_aliases[task]

        # Known task: return its name, its configuration, and no extra parameters
        if task in self.supported_tasks:
            targeted_task = self.supported_tasks[task]
            return task, targeted_task, None

        # Tasks starting with "translation" are parsed into ("translation", config, (src, tgt))
        if task.startswith("translation"):
            tokens = task.split("_")
            if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
                targeted_task = self.supported_tasks["translation"]
                task = "translation"
                return task, targeted_task, (tokens[1], tokens[3])
            # Malformed translation task: point the user at the expected format
            raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")

        # Unknown task: list what is available, including the translation format
        raise KeyError(
            f"Unknown task {task}, available tasks are {self.get_supported_tasks() + ['translation_XX_to_YY']}"
        )

    # Register a new pipeline for a task, with its model classes, defaults and type
    def register_pipeline(
        self,
        task: str,
        pipeline_class: type,
        pt_model: Optional[Union[type, Tuple[type]]] = None,
        tf_model: Optional[Union[type, Tuple[type]]] = None,
        default: Optional[Dict] = None,
        type: Optional[str] = None,
    ) -> None:
        # An already-registered task is overwritten, with a warning
        if task in self.supported_tasks:
            logger.warning(f"{task} is already registered. Overwriting pipeline for task {task}...")

        # Normalize the PyTorch model classes to a tuple
        if pt_model is None:
            pt_model = ()
        elif not isinstance(pt_model, tuple):
            pt_model = (pt_model,)

        # Normalize the TensorFlow model classes to a tuple
        if tf_model is None:
            tf_model = ()
        elif not isinstance(tf_model, tuple):
            tf_model = (tf_model,)

        # Build the task implementation dict: implementation class plus PyTorch and TensorFlow models
        task_impl = {"impl": pipeline_class, "pt": pt_model, "tf": tf_model}

        # A default given as bare 'pt'/'tf' entries is wrapped under a "model" key
        if default is not None:
            if "model" not in default and ("pt" in default or "tf" in default):
                default = {"model": default}
            task_impl["default"] = default

        # Record the task type when given
        if type is not None:
            task_impl["type"] = type

        # Register the implementation and bind it to the pipeline class
        self.supported_tasks[task] = task_impl
        pipeline_class._registered_impl = {task: task_impl}

    # Return the supported tasks and their configuration as a dict
    def to_dict(self):
        return self.supported_tasks
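
A sketch of registering a custom task through the module-level registry. The task name, pipeline class, and repo id below are placeholders:

```
from transformers import AutoModelForSequenceClassification
from transformers.pipelines import PIPELINE_REGISTRY

PIPELINE_REGISTRY.register_pipeline(
    "pair-classification",                      # hypothetical task name
    pipeline_class=PairClassificationPipeline,  # hypothetical Pipeline subclass
    pt_model=AutoModelForSequenceClassification,
    default={"pt": ("my-org/my-model", "main")},  # placeholder repo id and revision
    type="text",
)

# Dynamic translation tasks are resolved by check_task:
task, targeted, params = PIPELINE_REGISTRY.check_task("translation_en_to_fr")
# task == "translation", params == ("en", "fr")
```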

.\pipelines\conversational.py

# Standard-library and typing imports
import uuid  # unique identifiers for conversations
import warnings  # deprecation warnings
from typing import Any, Dict, List, Union  # type hints

# Relative imports from the package
from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
# Base class and the init-args docstring builder
from .base import Pipeline, build_pipeline_init_args

# Import TensorFlow when available
if is_tf_available():
    import tensorflow as tf

# Import PyTorch when available
if is_torch_available():
    import torch

# Module logger
logger = logging.get_logger(__name__)


class Conversation:
    """
    Utility class containing a conversation and its history. This class is meant to be used as an input to the
    [`ConversationalPipeline`]. The conversation contains several utility functions to manage the addition of new user
    inputs and generated model responses.

    Arguments:
        messages (Union[str, List[Dict[str, str]]], *optional*):
            The initial messages to start the conversation, either a string, or a list of dicts containing "role" and
            "content" keys. If a string is passed, it is interpreted as a single message with the "user" role.
        conversation_id (`uuid.UUID`, *optional*):
            Unique identifier for the conversation. If not provided, a random UUID4 id will be assigned to the
            conversation.

    Usage:

    ```
    conversation = Conversation("Going to the movies tonight - any suggestions?")
    conversation.add_message({"role": "assistant", "content": "The Big lebowski."})
    conversation.add_message({"role": "user", "content": "Is it good?"})
    ```

    """

    def __init__(
        self, messages: Union[str, List[Dict[str, str]]] = None, conversation_id: uuid.UUID = None, **deprecated_kwargs
    ):
        # If no conversation id was provided, generate a fresh UUID
        if not conversation_id:
            conversation_id = uuid.uuid4()

        # If no messages were provided, fall back to the deprecated "text" kwarg and build a
        # single user message from it
        if messages is None:
            text = deprecated_kwargs.pop("text", None)
            if text is not None:
                messages = [{"role": "user", "content": text}]
            else:
                messages = []
        elif isinstance(messages, str):
            # A bare string becomes a single user message
            messages = [{"role": "user", "content": messages}]

        # Handle legacy arguments - new code should avoid past_user_inputs and generated_responses entirely
        # Start with zero processed user inputs
        self._num_processed_user_inputs = 0
        generated_responses = deprecated_kwargs.pop("generated_responses", None)
        past_user_inputs = deprecated_kwargs.pop("past_user_inputs", None)

        # generated_responses without past_user_inputs is an error
        if generated_responses is not None and past_user_inputs is None:
            raise ValueError("generated_responses cannot be passed without past_user_inputs!")

        # If past_user_inputs were passed, interleave them with the responses into a legacy message list
        if past_user_inputs is not None:
            legacy_messages = []
            if generated_responses is None:
                generated_responses = []

            # The two lists may differ in length, so iterate up to the longer one
            for i in range(max([len(past_user_inputs), len(generated_responses)])):
                if i < len(past_user_inputs):
                    legacy_messages.append({"role": "user", "content": past_user_inputs[i]})
                if i < len(generated_responses):
                    legacy_messages.append({"role": "assistant", "content": generated_responses[i]})

            # Prepend the legacy messages to the current ones
            messages = legacy_messages + messages

        # Store the conversation's UUID and messages
        self.uuid = conversation_id
        self.messages = messages

    # Equality: two Conversations match on id or on content
    def __eq__(self, other):
        if not isinstance(other, Conversation):
            return False
        return self.uuid == other.uuid or self.messages == other.messages

    # Append a message to the conversation
    def add_message(self, message: Dict[str, str]):
        # The message must contain exactly the "role" and "content" keys
        if not set(message.keys()) == {"role", "content"}:
            raise ValueError("Message should contain only 'role' and 'content' keys!")

        # Only the 'user', 'assistant' and 'system' roles are accepted
        if message["role"] not in ("user", "assistant", "system"):
            raise ValueError("Only 'user', 'assistant' and 'system' roles are supported for now!")

        # Store the message
        self.messages.append(message)
    def add_user_input(self, text: str, overwrite: bool = False):
        """
        Add a user input to the conversation for the next round. This is a legacy method that assumes that inputs must
        alternate user/assistant/user/assistant, and so will not add multiple user messages in succession. We recommend
        just using `add_message` with role "user" instead.
        """
        # Check whether the conversation already ends with an unprocessed user message
        if len(self) > 0 and self[-1]["role"] == "user":
            # With overwrite=True, replace the pending user input
            if overwrite:
                logger.warning(
                    f'User input added while unprocessed input was existing: "{self[-1]["content"]}" was overwritten '
                    f'with: "{text}".'
                )
                # Replace the pending user input with the new text
                self[-1]["content"] = text
            else:
                logger.warning(
                    f'User input added while unprocessed input was existing: "{self[-1]["content"]}" new input '
                    f'ignored: "{text}". Set `overwrite` to True to overwrite unprocessed user input'
                )
        else:
            # Otherwise simply append the new user message
            self.messages.append({"role": "user", "content": text})

    def append_response(self, response: str):
        """
        This is a legacy method. We recommend just using `add_message` with an appropriate role instead.
        """
        # Append the bot's reply as an assistant message
        self.messages.append({"role": "assistant", "content": response})

    def mark_processed(self):
        """
        This is a legacy method, as the Conversation no longer distinguishes between processed and unprocessed user
        input. We set a counter here to keep behaviour mostly backward-compatible, but in general you should just read
        the messages directly when writing new code.
        """
        # Mark every user message seen so far as processed
        self._num_processed_user_inputs = len(self._user_messages)

    def __iter__(self):
        # Iterator protocol: iterate over the messages
        for message in self.messages:
            yield message

    def __getitem__(self, item):
        # Index access to the messages
        return self.messages[item]

    def __setitem__(self, key, value):
        # Index assignment to the messages
        self.messages[key] = value

    def __len__(self):
        # Number of messages in the conversation
        return len(self.messages)

    def __repr__(self):
        """
        Generates a string representation of the conversation.

        Returns:
            `str`:

        Example:
            Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user: Going to the movies tonight - any suggestions?
            bot: The Big Lebowski
        """
        # String form: the conversation id followed by each message's role and content
        output = f"Conversation id: {self.uuid}\n"
        for message in self.messages:
            output += f"{message['role']}: {message['content']}\n"
        return output

    def iter_texts(self):
        # Iterate over (is_user, content) tuples, one per message.
        # Kept for backward compatibility; prefer accessing conversation.messages directly.
        for message in self.messages:
            yield message["role"] == "user", message["content"]

    @property
    # Legacy property returning the content of every user message.
    # Prefer accessing conversation.messages directly.
    def _user_messages(self):
        return [message["content"] for message in self.messages if message["role"] == "user"]

    @property
    # Legacy property returning past user inputs; prefer reading conversation.messages directly.
    # The modern class does not track which messages were "processed". Historically the most recent
    # user message had to go through mark_processed() before it appeared here: the class kept a
    # single-message buffer of input not yet replied to. That buffer is gone, but this property
    # mimics the old behaviour for backward compatibility.
    def past_user_inputs(self):
        if not self._user_messages:
            return []
        # With no pending (unprocessed) final user message, every user input counts as "past";
        # otherwise the most recent one is still the "new" input and is excluded.
        if self.messages[-1]["role"] != "user" or self._num_processed_user_inputs == len(self._user_messages):
            return self._user_messages
        return self._user_messages[:-1]

    @property
    # Legacy property returning the content of every generated assistant response.
    # Prefer accessing conversation.messages directly.
    def generated_responses(self):
        return [message["content"] for message in self.messages if message["role"] == "assistant"]

    @property
    # Legacy property returning the most recent user input.
    # Prefer accessing conversation.messages directly.
    def new_user_input(self):
        return self._user_messages[-1]
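
A short sketch of how the legacy properties track the message list (expected values shown as comments):

```
conv = Conversation("Hi there!")
conv.append_response("Hello! How can I help?")
conv.add_user_input("Tell me a joke.")

conv.new_user_input      # "Tell me a joke."
conv.past_user_inputs    # ["Hi there!"] - the final user turn is still pending
conv.generated_responses # ["Hello! How can I help?"]

conv.mark_processed()
conv.past_user_inputs    # ["Hi there!", "Tell me a joke."]
```
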
@add_end_docstrings(
    build_pipeline_init_args(has_tokenizer=True),
    r"""
        min_length_for_response (`int`, *optional*, defaults to 32):
            The minimum length (in number of tokens) for a response.""",
)
class ConversationalPipeline(Pipeline):
    """
    Multi-turn conversational pipeline.

    Example:

    ```
    >>> from transformers import pipeline, Conversation
    # Any model with a chat template can be used in a ConversationalPipeline.

    >>> chatbot = pipeline(model="facebook/blenderbot-400M-distill")
    >>> # Conversation objects initialized with a string will treat it as a user message
    >>> conversation = Conversation("I'm looking for a movie - what's your favourite one?")
    >>> conversation = chatbot(conversation)
    >>> conversation.messages[-1]["content"]
    "I don't really have a favorite movie, but I do like action movies. What about you?"

    >>> conversation.add_message({"role": "user", "content": "That's interesting, why do you like action movies?"})
    >>> conversation = chatbot(conversation)
    >>> conversation.messages[-1]["content"]
    " I think it's just because they're so fast-paced and action-fantastic."
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This conversational pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"conversational"`.

    This pipeline can be used with any model that has a [chat
    template](https://huggingface.co/docs/transformers/chat_templating) set.
    """

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "`ConversationalPipeline` is now deprecated, and the functionality has been moved to the standard `text-generation` pipeline, which now accepts lists of message dicts as well as strings. This class will be removed in v4.42.",
            DeprecationWarning,
        )
        super().__init__(*args, **kwargs)
        # Check if tokenizer does not have a pad token ID, set pad_token to eos_token
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _sanitize_parameters(self, min_length_for_response=None, clean_up_tokenization_spaces=None, **generate_kwargs):
        """
        Prepares and sanitizes generation parameters for text generation.

        Args:
            min_length_for_response (int, optional): Minimum length of response in tokens.
            clean_up_tokenization_spaces (bool, optional): Whether to clean up tokenization spaces.
            **generate_kwargs: Additional keyword arguments for text generation.

        Returns:
            tuple: Three dictionaries containing pre-process, forward, and post-process parameters.
        """
        preprocess_params = {}
        forward_params = {}
        postprocess_params = {}

        if min_length_for_response is not None:
            preprocess_params["min_length_for_response"] = min_length_for_response

        if "max_length" in generate_kwargs:
            forward_params["max_length"] = generate_kwargs["max_length"]
            # self.max_length = generate_kwargs.get("max_length", self.model.config.max_length)
        if clean_up_tokenization_spaces is not None:
            postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces

        if generate_kwargs:
            forward_params.update(generate_kwargs)

        return preprocess_params, forward_params, postprocess_params
    def __call__(self, conversations: Union[List[Dict], Conversation, List[Conversation]], num_workers=0, **kwargs):
        """
        Generate responses for the conversation(s) given as inputs.

        Args:
            conversations (a [`Conversation`] or a list of [`Conversation`]):
                Conversation to generate responses for. Inputs can also be passed as a list of dictionaries with `role`
                and `content` keys - in this case, they will be converted to `Conversation` objects automatically.
                Multiple conversations in either format may be passed as a list.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./model#generative-models)).

        Returns:
            [`Conversation`] or a list of [`Conversation`]: Conversation(s) with updated generated responses for those
            containing a new user input.
        """
        # XXX: num_workers==0 is required to be backward compatible
        # Otherwise the threads will require a Conversation copy.
        # This will definitely hinder performance on GPU, but has to be opted
        # in because of this BC change.
        # A list whose first element is a dict is a single conversation: wrap it
        if isinstance(conversations, list) and isinstance(conversations[0], dict):
            conversations = Conversation(conversations)
        # A list of lists is several conversations: wrap each one
        elif isinstance(conversations, list) and isinstance(conversations[0], list):
            conversations = [Conversation(conv) for conv in conversations]
        # Let the parent __call__ generate the responses
        outputs = super().__call__(conversations, num_workers=num_workers, **kwargs)
        # A single-element list of outputs is unwrapped; otherwise return the list
        if isinstance(outputs, list) and len(outputs) == 1:
            return outputs[0]
        return outputs

    def preprocess(self, conversation: Conversation, min_length_for_response=32) -> Dict[str, Any]:
        """
        Preprocesses the conversation to generate model inputs.

        Args:
            conversation (`Conversation`): Conversation object containing role and content information.
            min_length_for_response (`int`, *optional*, defaults to `32`):
                Minimum length required for the model to generate a response.

        Returns:
            Dict[str, Any]: Dictionary containing input_ids (tokenized input) and the original conversation object.
        """
        # Apply the chat template to produce the token ids for generation
        input_ids = self.tokenizer.apply_chat_template(conversation, add_generation_prompt=True)

        # Convert input_ids to the tensor type of the active framework
        if self.framework == "pt":
            input_ids = torch.LongTensor([input_ids])
        elif self.framework == "tf":
            input_ids = tf.constant([input_ids])

        return {"input_ids": input_ids, "conversation": conversation}

    def _forward(self, model_inputs, **generate_kwargs):
        """
        Perform forward pass through the model to generate output IDs.

        Args:
            model_inputs (Dict[str, Any]): Dictionary containing input_ids (token IDs) and conversation object.
            generate_kwargs: Additional keyword arguments passed to the generate method of the model.

        Returns:
            Dict[str, Any]: Dictionary containing output_ids (generated token IDs) and conversation object.
        """
        # Length of the input token ids
        n = model_inputs["input_ids"].shape[1]
        # Pop the conversation object out of the model inputs
        conversation = model_inputs.pop("conversation")
        # Default to 256 new tokens when neither max_length nor max_new_tokens was given
        if "max_length" not in generate_kwargs and "max_new_tokens" not in generate_kwargs:
            generate_kwargs["max_new_tokens"] = 256
        # Generate the output token ids
        output_ids = self.model.generate(**model_inputs, **generate_kwargs)
        # Encoder-decoder models start their output right after the BOS token;
        # decoder-only models echo the prompt, so the reply starts at position n
        if self.model.config.is_encoder_decoder:
            start_position = 1
        else:
            start_position = n
        # Return the generated token ids together with the conversation object
        return {"output_ids": output_ids[:, start_position:], "conversation": conversation}
    # Postprocess the model outputs into an updated Conversation
    def postprocess(self, model_outputs, clean_up_tokenization_spaces=True):
        # Token ids of the generated text
        output_ids = model_outputs["output_ids"]
        # Decode the ids into text, skipping special tokens and optionally cleaning up spaces
        answer = self.tokenizer.decode(
            output_ids[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        )
        # Append the generated assistant reply to the conversation
        conversation = model_outputs["conversation"]
        conversation.add_message({"role": "assistant", "content": answer})
        # Return the updated conversation
        return conversation
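
Since the class is deprecated in favor of `text-generation`, the modern equivalent of the docstring example is roughly the following sketch (the model id is a placeholder for any chat-templated causal LM):

```
from transformers import pipeline

generator = pipeline("text-generation", model="my-org/chat-model")  # placeholder model id
chat = [{"role": "user", "content": "I'm looking for a movie - what's your favourite one?"}]
result = generator(chat)
# result[0]["generated_text"] is the message list with the assistant's reply appended
```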

.\pipelines\depth_estimation.py

# Typing imports
from typing import List, Union

# numpy under its usual alias
import numpy as np

# Utility helpers from the package
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends

# Base class and the init-args docstring builder
from .base import Pipeline, build_pipeline_init_args

# PyTorch-only imports
if is_torch_available():
    import torch

    # Model mapping for depth estimation
    from ..models.auto.modeling_auto import MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES

# Module logger
logger = logging.get_logger(__name__)

# DepthEstimationPipeline, decorated with the shared init-args docstring
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class DepthEstimationPipeline(Pipeline):
    """
    Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image.

    Example:

    ```
    >>> from transformers import pipeline

    >>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
    >>> output = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
    >>> # This is a tensor with the values being the depth expressed in meters for each pixel
    >>> output["predicted_depth"].shape
    torch.Size([1, 384, 384])
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)


    This depth estimation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"depth-estimation"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=depth-estimation).
    """

    # Constructor
    def __init__(self, *args, **kwargs):
        # Initialize via the parent Pipeline constructor
        super().__init__(*args, **kwargs)
        # Require the vision backend, raising if it is unavailable
        requires_backends(self, "vision")
        # Check the model type against the depth-estimation model mapping
        self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES)

    # Predict the depth of the input image(s)
    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
        """
        Predict the depth(s) of the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                images.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
            dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
            the images.

            The dictionaries contain the following keys:

            - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`.
            - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`.
        """
        return super().__call__(images, **kwargs)

    # Route the timeout argument into the preprocess parameter dict
    def _sanitize_parameters(self, timeout=None, **kwargs):
        preprocess_params = {}
        if timeout is not None:
            preprocess_params["timeout"] = timeout
        return preprocess_params, {}, {}

    # Preprocess the input image and return the model inputs
    def preprocess(self, image, timeout=None):
        image = load_image(image, timeout)  # accepts a URL, a local path, or a PIL image
        self.image_size = image.size  # remember the original size for postprocessing
        model_inputs = self.image_processor(images=image, return_tensors=self.framework)
        return model_inputs

    # Run the forward pass and return the raw model outputs
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs

    # Turn the raw outputs into the predicted depth tensor and a formatted depth image
    def postprocess(self, model_outputs):
        predicted_depth = model_outputs.predicted_depth
        # Bicubically upsample the prediction back to the original image size;
        # PIL sizes are (width, height), hence the [::-1] to get (height, width)
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1), size=self.image_size[::-1], mode="bicubic", align_corners=False
        )
        output = prediction.squeeze().cpu().numpy()  # drop extra dims and move to numpy
        formatted = (output * 255 / np.max(output)).astype("uint8")  # rescale to 0-255
        depth = Image.fromarray(formatted)  # grayscale PIL image
        output_dict = {}
        output_dict["predicted_depth"] = predicted_depth  # raw depth tensor
        output_dict["depth"] = depth  # formatted depth image
        return output_dict
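
To make the flow concrete, here is a minimal usage sketch; it assumes the `LiheYoung/depth-anything-base-hf` checkpoint from the class docstring and network access:

```
from transformers import pipeline

depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
result = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
print(result["predicted_depth"].shape)  # raw torch.Tensor from postprocess()
result["depth"].save("depth.png")       # the 0-255 grayscale PIL.Image built in postprocess()
```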

.\pipelines\document_question_answering.py

# Regular expressions for parsing decoded sequences
import re
# Type hints
from typing import List, Optional, Tuple, Union

import numpy as np

# Shared utilities and availability checks
from ..utils import (
    ExplicitEnum,
    add_end_docstrings,
    is_pytesseract_available,
    is_torch_available,
    is_vision_available,
    logging,
)

# Chunked pipeline base class and init-args helper
from .base import ChunkPipeline, build_pipeline_init_args
# Span-selection helper shared with the (extractive) question answering pipeline
from .question_answering import select_starts_ends

# When vision support is available, import PIL and the image loading helper
if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

# When PyTorch is available, import torch and the document QA model mapping
if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES

TESSERACT_LOADED = False
# Flip the flag and import pytesseract when it is installed
if is_pytesseract_available():
    TESSERACT_LOADED = True
    import pytesseract

logger = logging.get_logger(__name__)

# normalize_box() and apply_tesseract() are derived from apply_tesseract in
# models/layoutlmv3/feature_extraction_layoutlmv3.py. They are copied (rather than imported) here to
# avoid an unnecessary dependency, since this pipeline may evolve independently of layoutlmv3.

def normalize_box(box, width, height):
    """根据图像宽度和高度,归一化边界框的坐标值,并返回归一化后的边界框列表。"""
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]
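
A quick arithmetic check of the 0-1000 normalization, with a made-up box and image size:

```
# For a 640x480 image, a box at (64, 48, 320, 240) maps onto the 0-1000 grid:
print(normalize_box([64, 48, 320, 240], width=640, height=480))
# -> [100, 100, 500, 500]
```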

def apply_tesseract(image: "Image.Image", lang: Optional[str], tesseract_config: Optional[str]):
    """Apply Tesseract OCR to a document image, returning the recognized words and normalized bounding boxes."""
    # Run OCR
    data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

    # Filter out empty words and their corresponding coordinates
    irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
    words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
    left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
    top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
    width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
    height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]

    # Convert (left, top, width, height) into (left, top, right, bottom) boxes
    actual_boxes = []
    for x, y, w, h in zip(left, top, width, height):
        actual_box = [x, y, x + w, y + h]
        actual_boxes.append(actual_box)

    # Get the image dimensions
    image_width, image_height = image.size

    # Normalize every box onto the 0-1000 grid
    normalized_boxes = []
    for box in actual_boxes:
        normalized_boxes.append(normalize_box(box, image_width, image_height))

    # Sanity check: every word must have exactly one bounding box
    if len(words) != len(normalized_boxes):
        raise ValueError("Not as many words as there are bounding boxes")

    return words, normalized_boxes
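
For reference, a hypothetical call; it assumes pytesseract plus the Tesseract binary are installed and that an `invoice.png` file exists on disk:

```
from PIL import Image

words, boxes = apply_tesseract(Image.open("invoice.png"), lang="eng", tesseract_config="")
print(words[0], boxes[0])  # a recognized word and its [left, top, right, bottom] on the 0-1000 grid
```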
class ModelType(ExplicitEnum):
    LayoutLM = "layoutlm"
    LayoutLMv2andv3 = "layoutlmv2andv3"
    VisionEncoderDecoder = "vision_encoder_decoder"


@add_end_docstrings(build_pipeline_init_args(has_image_processor=True, has_tokenizer=True))
class DocumentQuestionAnsweringPipeline(ChunkPipeline):
    # TODO: Update task_summary docs to include an example with document QA and then update the first sentence
    """
    Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are
    similar to the (extractive) question answering pipeline; however, the pipeline takes an image (and optional OCR'd
    words/boxes) as input instead of text context.

    Example:

    ```
    >>> from transformers import pipeline

    >>> document_qa = pipeline(model="impira/layoutlm-document-qa")
    >>> document_qa(
    ...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
    ...     question="What is the invoice number?",
    ... )
    [{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This document question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"document-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a document question answering task.
    See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=document-question-answering).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # A fast tokenizer is required; raise if a slow tokenizer was provided
        if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"):
            raise ValueError(
                "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer "
                f"(`{self.tokenizer.__class__.__name__}`) is provided."
            )

        # VisionEncoderDecoder configs (currently only Donut) are handled generatively
        if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig":
            self.model_type = ModelType.VisionEncoderDecoder
            if self.model.config.encoder.model_type != "donut-swin":
                raise ValueError("Currently, the only supported VisionEncoderDecoder model is Donut")
        else:
            # Otherwise the model must be a registered document QA architecture
            self.check_model_type(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES)
            # LayoutLM gets its own branch; v2 and v3 share a code path
            if self.model.config.__class__.__name__ == "LayoutLMConfig":
                self.model_type = ModelType.LayoutLM
            else:
                self.model_type = ModelType.LayoutLMv2andv3
    # Sort keyword arguments into preprocess and postprocess parameter dicts
    def _sanitize_parameters(
        self,
        padding=None,
        doc_stride=None,
        max_question_len=None,
        lang: Optional[str] = None,
        tesseract_config: Optional[str] = None,
        max_answer_len=None,
        max_seq_len=None,
        top_k=None,
        handle_impossible_answer=None,
        timeout=None,
        **kwargs,
    ):
        preprocess_params, postprocess_params = {}, {}

        if padding is not None:
            preprocess_params["padding"] = padding
        if doc_stride is not None:
            preprocess_params["doc_stride"] = doc_stride
        if max_question_len is not None:
            preprocess_params["max_question_len"] = max_question_len
        if max_seq_len is not None:
            preprocess_params["max_seq_len"] = max_seq_len
        if lang is not None:
            preprocess_params["lang"] = lang
        if tesseract_config is not None:
            preprocess_params["tesseract_config"] = tesseract_config
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if top_k is not None:
            if top_k < 1:
                raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
            postprocess_params["top_k"] = top_k
        if max_answer_len is not None:
            if max_answer_len < 1:
                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}")
            postprocess_params["max_answer_len"] = max_answer_len
        if handle_impossible_answer is not None:
            postprocess_params["handle_impossible_answer"] = handle_impossible_answer

        return preprocess_params, {}, postprocess_params

    # Entry point: accepts an image (or path/URL), a question, and optional pre-computed word boxes
    def __call__(
        self,
        image: Union["Image.Image", str],
        question: Optional[str] = None,
        word_boxes: Tuple[str, List[float]] = None,
        **kwargs,
    ):
        """
        Answer the question given as input by using the document. Accepts an image (or URL/path), an
        optional question string, and optional pre-computed `word_boxes` to skip the OCR step.
        """
        # Bundle the pieces into a single dict input for the ChunkPipeline machinery
        if isinstance(question, str):
            inputs = {"question": question, "image": image}
            if word_boxes is not None:
                inputs["word_boxes"] = word_boxes
        else:
            inputs = image
        return super().__call__(inputs, **kwargs)

    # Preprocessing: loads the image, runs OCR (unless word_boxes are supplied), tokenizes the
    # question, and chunks long documents according to doc_stride and max_seq_len
    def preprocess(
        self,
        input,
        padding="do_not_pad",  # no padding by default
        doc_stride=None,
        max_seq_len=None,
        word_boxes: Tuple[str, List[float]] = None,
        lang=None,  # OCR language
        tesseract_config="",  # extra Tesseract configuration
        timeout=None,
    ):
        ...  # body elided in this excerpt

    # Forward pass: run generation for Donut, or a standard forward for extractive models
    def _forward(self, model_inputs, **generate_kwargs):
        # Pop the bookkeeping fields that the model itself must not receive
        p_mask = model_inputs.pop("p_mask", None)
        word_ids = model_inputs.pop("word_ids", None)
        words = model_inputs.pop("words", None)
        is_last = model_inputs.pop("is_last", False)

        if self.model_type == ModelType.VisionEncoderDecoder:
            model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
        else:
            model_outputs = self.model(**model_inputs)

        # Re-attach the bookkeeping fields so postprocess can map logits back to words
        model_outputs = dict(model_outputs.items())
        model_outputs["p_mask"] = p_mask
        model_outputs["word_ids"] = word_ids
        model_outputs["words"] = words
        model_outputs["attention_mask"] = model_inputs.get("attention_mask", None)
        model_outputs["is_last"] = is_last
        return model_outputs
    # Dispatch to the model-type-specific postprocessing, then rank the answers
    def postprocess(self, model_outputs, top_k=1, **kwargs):
        if self.model_type == ModelType.VisionEncoderDecoder:
            # Donut: decode each generated sequence individually
            answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs]
        else:
            # Extractive models: select answer spans from the logits
            answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs)

        # Sort answers by descending score and keep the top_k best
        answers = sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k]
        return answers

    # Postprocess a single VisionEncoderDecoder (Donut) output
    def postprocess_encoder_decoder_single(self, model_outputs, **kwargs):
        # Decode the generated token ids back to text
        sequence = self.tokenizer.batch_decode(model_outputs["sequences"])[0]

        # TODO: A lot of this logic is specific to Donut and should probably be handled in the tokenizer
        # (see https://github.com/huggingface/transformers/pull/18414/files#r961747408 for more context).

        # Strip the eos and pad tokens from the decoded sequence
        sequence = sequence.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
        # Remove the first task start token with a regex
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
        ret = {
            "answer": None,
        }

        # Extract the content of <s_answer>...</s_answer> as the answer
        answer = re.search(r"<s_answer>(.*)</s_answer>", sequence)
        if answer is not None:
            ret["answer"] = answer.group(1).strip()
        return ret
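
A tiny standalone illustration of the two regex steps above, run on a made-up Donut-style decoded sequence:

```
import re

seq = "<s_docvqa><s_question> what is the invoice number?</s_question><s_answer> us-001</s_answer>"
seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # drops only the first task-start token
match = re.search(r"<s_answer>(.*)</s_answer>", seq)
print(match.group(1).strip())  # 'us-001'
```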

    # Postprocess extractive QA outputs into ranked answer spans
    def postprocess_extractive_qa(
        self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs
    ):
        # Start from a large positive "null answer" score; select_starts_ends lowers it per chunk
        min_null_score = 1000000  # large and positive
        answers = []
        for output in model_outputs:
            words = output["words"]

            # Pick candidate start/end positions and update the running null score
            starts, ends, scores, min_null_score = select_starts_ends(
                start=output["start_logits"],
                end=output["end_logits"],
                p_mask=output["p_mask"],
                attention_mask=output["attention_mask"].numpy()
                if output.get("attention_mask", None) is not None
                else None,
                min_null_score=min_null_score,
                top_k=top_k,
                handle_impossible_answer=handle_impossible_answer,
                max_answer_len=max_answer_len,
            )
            word_ids = output["word_ids"]
            for start, end, score in zip(starts, ends, scores):
                word_start, word_end = word_ids[start], word_ids[end]
                if word_start is not None and word_end is not None:
                    # Map the token span back to the OCR words and record the answer
                    answers.append(
                        {
                            "score": float(score),
                            "answer": " ".join(words[word_start : word_end + 1]),
                            "start": word_start,
                            "end": word_end,
                        }
                    )

        # When impossible answers are allowed, add the empty answer with the null score
        if handle_impossible_answer:
            answers.append({"score": min_null_score, "answer": "", "start": 0, "end": 0})

        return answers
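
As noted in the preprocessing comments, OCR can be skipped entirely by passing `word_boxes`. A hedged sketch (the words and boxes are made up; boxes live on the same 0-1000 grid as `normalize_box` above, and `document_qa` is the pipeline from the class docstring):

```
document_qa(
    image="invoice.png",
    question="What is the invoice number?",
    word_boxes=[("Invoice", [100, 50, 300, 80]), ("us-001", [320, 50, 480, 80])],
)
```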

.\pipelines\feature_extraction.py

from typing import Dict

from ..utils import add_end_docstrings
from .base import GenericTensor, Pipeline, build_pipeline_init_args

# Append the shared init-args documentation plus pipeline-specific parameter docs
@add_end_docstrings(
    build_pipeline_init_args(has_tokenizer=True, supports_binary_output=False),
    r"""
        tokenize_kwargs (`dict`, *optional*):
            Additional dictionary of keyword arguments passed along to the tokenizer.
        return_tensors (`bool`, *optional*):
            If `True`, returns a tensor according to the specified framework, otherwise returns a list.""",
)
# FeatureExtractionPipeline inherits directly from Pipeline
class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
    transformer, which can be used as features in downstream tasks.

    Example:

    ```
    >>> from transformers import pipeline

    >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction")
    >>> result = extractor("This is a simple test.", return_tensors=True)
    >>> result.shape  # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string.
    torch.Size([1, 8, 768])
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
    `"feature-extraction"`.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    [huggingface.co/models](https://huggingface.co/models).
    """

    # Sort keyword arguments into tokenizer (preprocess) and postprocess parameters
    def _sanitize_parameters(self, truncation=None, tokenize_kwargs=None, return_tensors=None, **kwargs):
        if tokenize_kwargs is None:
            tokenize_kwargs = {}

        if truncation is not None:
            # Refuse conflicting definitions of truncation
            if "truncation" in tokenize_kwargs:
                raise ValueError(
                    "truncation parameter defined twice (given as keyword argument as well as in tokenize_kwargs)"
                )
            tokenize_kwargs["truncation"] = truncation

        preprocess_params = tokenize_kwargs

        postprocess_params = {}
        if return_tensors is not None:
            postprocess_params["return_tensors"] = return_tensors

        return preprocess_params, {}, postprocess_params

    # Tokenize the inputs into framework-specific tensors
    def preprocess(self, inputs, **tokenize_kwargs) -> Dict[str, GenericTensor]:
        model_inputs = self.tokenizer(inputs, return_tensors=self.framework, **tokenize_kwargs)
        return model_inputs

    # Forward pass through the base model
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs

    # Return either raw tensors or nested Python lists, depending on return_tensors
    def postprocess(self, model_outputs, return_tensors=False):
        # [0] is the first available tensor, i.e. logits or last_hidden_state
        if return_tensors:
            return model_outputs[0]
        if self.framework == "pt":
            return model_outputs[0].tolist()
        elif self.framework == "tf":
            return model_outputs[0].numpy().tolist()

    def __call__(self, *args, **kwargs):
        """
        Extract the features of the input(s).

        Args:
            args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.

        Return:
            A nested list of `float`: The features computed by the model.
        """
        # Delegate to the parent __call__, which drives preprocess/_forward/postprocess
        return super().__call__(*args, **kwargs)
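
Without `return_tensors=True` the pipeline returns nested Python lists; a small sketch of converting them back into an array, reusing the `extractor` from the class docstring:

```
import numpy as np

features = extractor("This is a simple test.")  # nested lists: [batch][token][hidden_dim]
arr = np.array(features)
print(arr.shape)  # (1, sequence_length, hidden_dimension), e.g. (1, 8, 768) for BERT-base
```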

.\pipelines\fill_mask.py

from typing import Dict  # type hints

import numpy as np  # numerical helpers for postprocessing

from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
from .base import GenericTensor, Pipeline, PipelineException, build_pipeline_init_args


if is_tf_available():
    import tensorflow as tf

    # Numerically stable softmax for TensorFlow outputs
    from ..tf_utils import stable_softmax


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


# Append the shared init-args documentation plus pipeline-specific parameter docs
@add_end_docstrings(
    build_pipeline_init_args(has_tokenizer=True),
    r"""
        top_k (`int`, defaults to 5):
            The number of predictions to return.
        targets (`str` or `List[str]`, *optional*):
            When passed, the model will limit the scores to the passed targets instead of looking up in the whole
            vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
            token will be used (with a warning, and that might be slower).
        tokenizer_kwargs (`dict`, *optional*):
            Additional dictionary of keyword arguments passed along to the tokenizer.""",
)
class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling
    examples](../task_summary#masked-language-modeling) for more information.

    Example:

    ```
    >>> from transformers import pipeline

    >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
    >>> fill_masker("This is a simple [MASK].")
    [{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This mask filling pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"fill-mask"`.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library. See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=fill-mask).

    <Tip>

    This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
    masks. The returned values are raw model output, and correspond to disjoint probabilities where one might expect
    joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)).

    </Tip>

    <Tip>

    This pipeline now supports tokenizer_kwargs. For example try:

    ```
    >>> from transformers import pipeline

    >>> fill_masker = pipeline(model="google-bert/bert-base-uncased", tokenizer_kwargs={"do_lower_case": False})
    >>> fill_masker("This is a simple [MASK].")
    ```

    This will make the tokenizer treat "This" and "this" as distinct words.

    You can likewise pass `tokenizer_kwargs={"truncation": True}` to truncate long inputs:

    ```
    >>> from transformers import pipeline

    >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
    >>> tokenizer_kwargs = {"truncation": True}
    >>> fill_masker(
    ...     "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100,
    ...     tokenizer_kwargs=tokenizer_kwargs,
    ... )
    ```

    </Tip>
    """

    # Return the indices of the mask token(s) in the input ids, using framework-appropriate ops
    def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
        if self.framework == "tf":
            masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
        elif self.framework == "pt":
            masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
        else:
            raise ValueError("Unsupported framework")
        return masked_index

    # Raise if the input contains no mask token at all
    def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray:
        masked_index = self.get_masked_index(input_ids)
        numel = np.prod(masked_index.shape)
        if numel < 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"No mask_token ({self.tokenizer.mask_token}) found on the input",
            )

    # Validate every example in the batch, whether model_inputs is a list or a single encoding
    def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
        if isinstance(model_inputs, list):
            for model_input in model_inputs:
                self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
        else:
            for input_ids in model_inputs["input_ids"]:
                self._ensure_exactly_one_mask_token(input_ids)

    # Tokenize the raw inputs and validate that each example contains a mask token
    def preprocess(
        self, inputs, return_tensors=None, tokenizer_kwargs=None, **preprocess_parameters
    ) -> Dict[str, GenericTensor]:
        if return_tensors is None:
            return_tensors = self.framework
        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}

        model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
        self.ensure_exactly_one_mask_token(model_inputs)
        return model_inputs

    # Forward pass; keep input_ids on the outputs so postprocess can rebuild the sequences
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        model_outputs["input_ids"] = model_inputs["input_ids"]
        return model_outputs

    # Postprocess the model outputs into top_k scored token proposals per mask
    def postprocess(self, model_outputs, top_k=5, target_ids=None):
        # Never return more candidates than there are target ids
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]

        if self.framework == "tf":
            # Locate the mask positions in the input ids
            masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()[:, 0]

            outputs = outputs.numpy()

            # Take the logits at the mask positions and softmax them
            logits = outputs[0, masked_index, :]
            probs = stable_softmax(logits, axis=-1)
            # If targets were given, keep only the probabilities of the target ids
            if target_ids is not None:
                probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1))
                probs = tf.expand_dims(probs, 0)

            # Top-k values and their indices
            topk = tf.math.top_k(probs, k=top_k)
            values, predictions = topk.values.numpy(), topk.indices.numpy()
        else:
            # PyTorch: locate the mask positions in the input ids
            masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
            # Fill mask pipeline supports only one ${mask_token} per sample

            # Take the logits at the mask positions and softmax them
            logits = outputs[0, masked_index, :]
            probs = logits.softmax(dim=-1)
            # If targets were given, keep only the probabilities of the target ids
            if target_ids is not None:
                probs = probs[..., target_ids]

            # Top-k values and their indices
            values, predictions = probs.topk(top_k)

        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy the input ids, since the array is modified below
                tokens = input_ids.numpy().copy()
                # Map the index back into the full vocabulary when targets restricted the scores
                if target_ids is not None:
                    p = target_ids[p].tolist()

                # Substitute the predicted token at this mask position
                tokens[masked_index[i]] = p
                # Drop padding tokens
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Decode the full sequence; skip special tokens only in the single-mask case
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        # For a single mask, unwrap the outer list
        if single_mask:
            return result[0]
        return result
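
A short sketch of the `targets` option, which is resolved into vocabulary ids by `get_target_ids` below (reusing the `fill_masker` from the class docstring; the candidate words are made up):

```
fill_masker("The capital of France is [MASK].", targets=["paris", "london"])
# scores are restricted to the two candidate tokens instead of the whole vocabulary
```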
    # Resolve target strings to vocabulary ids
    def get_target_ids(self, targets, top_k=None):
        if isinstance(targets, str):
            targets = [targets]
        try:
            vocab = self.tokenizer.get_vocab()
        except Exception:
            vocab = {}
        target_ids = []
        for target in targets:
            # Look the target up in the vocabulary directly
            id_ = vocab.get(target, None)
            if id_ is None:
                # Fall back to tokenizing the target and taking its first token
                input_ids = self.tokenizer(
                    target,
                    add_special_tokens=False,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    max_length=1,
                    truncation=True,
                )["input_ids"]
                if len(input_ids) == 0:
                    # The target cannot be represented at all; warn and skip it
                    logger.warning(
                        f"The specified target token `{target}` does not exist in the model vocabulary. "
                        "We cannot replace it with anything meaningful, ignoring it"
                    )
                    continue
                id_ = input_ids[0]
                # Warn that a substitute token is being used for the missing target
                logger.warning(
                    f"The specified target token `{target}` does not exist in the model vocabulary. "
                    f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`."
                )
            target_ids.append(id_)
        # Deduplicate the resolved ids
        target_ids = list(set(target_ids))
        if len(target_ids) == 0:
            raise ValueError("At least one target must be provided when passed.")
        target_ids = np.array(target_ids)
        return target_ids

    # Sort keyword arguments into preprocess and postprocess parameters
    def _sanitize_parameters(self, top_k=None, targets=None, tokenizer_kwargs=None):
        preprocess_params = {}

        if tokenizer_kwargs is not None:
            preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs

        postprocess_params = {}

        # Resolve targets to vocabulary ids up front
        if targets is not None:
            target_ids = self.get_target_ids(targets, top_k)
            postprocess_params["target_ids"] = target_ids

        if top_k is not None:
            postprocess_params["top_k"] = top_k

        # This pipeline is meaningless without a mask token
        if self.tokenizer.mask_token_id is None:
            raise PipelineException(
                "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
            )

        return preprocess_params, {}, postprocess_params
    # Override the parent __call__ so a single-element list input returns a single result
    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.
        """
        outputs = super().__call__(inputs, **kwargs)
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs
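
Per the tip in the class docstring, multiple masks yield one (disjoint) top-k ranking per mask; a quick sketch:

```
preds = fill_masker("Paris is the [MASK] of [MASK].")
print(len(preds))   # 2: one list of proposals per mask token
print(preds[0][0])  # best proposal for the first mask
```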

.\pipelines\image_classification.py

from typing import List, Union

import numpy as np

from ..utils import (
    ExplicitEnum,
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args

# When vision support is available, import PIL and the image loading helper
if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

# Framework-specific image classification model mappings
if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES

logger = logging.get_logger(__name__)

# Element-wise sigmoid activation
def sigmoid(_outputs):
    return 1.0 / (1.0 + np.exp(-_outputs))

# Numerically stable softmax over the last axis
def softmax(_outputs):
    maxes = np.max(_outputs, axis=-1, keepdims=True)
    shifted_exp = np.exp(_outputs - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
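
A tiny check that subtracting the row max keeps `softmax` numerically stable even for large logits:

```
print(softmax(np.array([[1000.0, 1001.0]])))
# -> [[0.26894142 0.73105858]]; a naive np.exp(1001.0) would overflow to inf
```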

# The post-hoc activation applied to the model logits: sigmoid, softmax, or none
class ClassificationFunction(ExplicitEnum):
    SIGMOID = "sigmoid"
    SOFTMAX = "softmax"
    NONE = "none"

# Document the extra function_to_apply argument alongside the shared init args
@add_end_docstrings(
    build_pipeline_init_args(has_image_processor=True),
    r"""
        function_to_apply (`str`, *optional*, defaults to `"default"`):
            The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:

            - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
              has several labels, will apply the softmax function on the output.
            - `"sigmoid"`: Applies the sigmoid function on the output.
            - `"softmax"`: Applies the softmax function on the output.
            - `"none"`: Does not apply any function on the output.""",
)
# ImageClassificationPipeline inherits from Pipeline
class ImageClassificationPipeline(Pipeline):
    """
    Image classification pipeline using any `AutoModelForImageClassification`. This pipeline predicts the class of an
    image.

    Example:

    ```
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="microsoft/beit-base-patch16-224-pt22k-ft22k")
    >>> classifier("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'score': 0.442, 'label': 'macaw'}, {'score': 0.088, 'label': 'popinjay'}, {'score': 0.075, 'label': 'parrot'}, {'score': 0.073, 'label': 'parodist, lampooner'}, {'score': 0.046, 'label': 'poll, poll_parrot'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"image-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=image-classification).
    """
    # Default post-hoc activation; resolved per call in postprocess()
    function_to_apply: ClassificationFunction = ClassificationFunction.NONE

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The vision backend (PIL) is required
        requires_backends(self, "vision")
        # Check the model against the framework-appropriate image classification mapping
        self.check_model_type(
            TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
        )

    # Sort keyword arguments into preprocess and postprocess parameters; a function_to_apply
    # given as a string is converted to the ClassificationFunction enum
    def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None):
        preprocess_params = {}
        if timeout is not None:
            preprocess_params["timeout"] = timeout
        postprocess_params = {}
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        if isinstance(function_to_apply, str):
            function_to_apply = ClassificationFunction(function_to_apply.lower())
        if function_to_apply is not None:
            postprocess_params["function_to_apply"] = function_to_apply
        return preprocess_params, {}, postprocess_params
    # Entry point: assign labels to the image(s) passed as inputs
    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
        """
        Assign labels to the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                images.
            function_to_apply (`str`, *optional*, defaults to `"default"`):
                The function to apply to the model outputs in order to retrieve the scores. Accepts four different
                values:

                If this argument is not specified, then it will apply the following functions according to the number
                of labels:

                - If the model has a single label, will apply the sigmoid function on the output.
                - If the model has several labels, will apply the softmax function on the output.

                Possible values are:

                - `"sigmoid"`: Applies the sigmoid function on the output.
                - `"softmax"`: Applies the softmax function on the output.
                - `"none"`: Does not apply any function on the output.
            top_k (`int`, *optional*, defaults to 5):
                The number of top labels that will be returned by the pipeline. If the provided number is higher than
                the number of labels available in the model configuration, it will default to the number of labels.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
            dictionary; if the input is a list of several images, will return a list of dictionaries corresponding to
            the images.

            The dictionaries contain the following keys:

            - **label** (`str`) -- The label identified by the model.
            - **score** (`float`) -- The score attributed by the model for that label.
        """
        # Delegate to the parent __call__, which drives preprocess/_forward/postprocess
        return super().__call__(images, **kwargs)

    # Load the image and turn it into framework-specific model inputs
    def preprocess(self, image, timeout=None):
        image = load_image(image, timeout=timeout)
        model_inputs = self.image_processor(images=image, return_tensors=self.framework)
        return model_inputs

    # Forward pass through the classification model
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs

    # Apply the requested (or default) activation to the logits and build the label/score list
    def postprocess(self, model_outputs, function_to_apply=None, top_k=5):
        # Pick a default activation from the model config when none was requested
        if function_to_apply is None:
            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
                function_to_apply = ClassificationFunction.SIGMOID
            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
                function_to_apply = ClassificationFunction.SOFTMAX
            elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
                function_to_apply = self.model.config.function_to_apply
            else:
                function_to_apply = ClassificationFunction.NONE

        # Cap top_k at the number of labels the model knows about
        if top_k > self.model.config.num_labels:
            top_k = self.model.config.num_labels

        # Pull the logits out of the model outputs as a NumPy array
        outputs = model_outputs["logits"][0]
        outputs = outputs.numpy()

        # Apply the selected activation
        if function_to_apply == ClassificationFunction.SIGMOID:
            scores = sigmoid(outputs)
        elif function_to_apply == ClassificationFunction.SOFTMAX:
            scores = softmax(outputs)
        elif function_to_apply == ClassificationFunction.NONE:
            scores = outputs
        else:
            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

        # Pair every score with its label and sort by descending score
        dict_scores = [
            {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
        ]
        dict_scores.sort(key=lambda x: x["score"], reverse=True)

        # Keep only the top_k entries
        if top_k is not None:
            dict_scores = dict_scores[:top_k]

        return dict_scores
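
A sketch of overriding the default activation, reusing the `classifier` from the class docstring:

```
classifier(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    function_to_apply="sigmoid",  # read scores as independent per-label probabilities
    top_k=3,
)
```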

.\pipelines\image_feature_extraction.py

from typing import Dict

from ..utils import add_end_docstrings, is_vision_available
from .base import GenericTensor, Pipeline, build_pipeline_init_args

# When vision support is available, import the image loading helper
if is_vision_available():
    from ..image_utils import load_image

# Append the shared init-args documentation plus pipeline-specific parameter docs
@add_end_docstrings(
    build_pipeline_init_args(has_image_processor=True),
    """
        image_processor_kwargs (`dict`, *optional*):
            Additional dictionary of keyword arguments passed along to the image processor e.g.
            {"size": {"height": 100, "width": 100}}
        pool (`bool`, *optional*, defaults to `False`):
            Whether or not to return the pooled output. If `False`, the model will return the raw hidden states.
    """,
)
class ImageFeatureExtractionPipeline(Pipeline):
    """
    Image feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
    transformer, which can be used as features in downstream tasks.

    Example:

    ```
    >>> from transformers import pipeline

    >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
    >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
    >>> result.shape  # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
    torch.Size([1, 197, 768])
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
    `"image-feature-extraction"`.

    All vision models may be used for this pipeline. See a list of all models, including community-contributed models on
    [huggingface.co/models](https://huggingface.co/models).
    """

    # Sort keyword arguments into image-processor (preprocess) and postprocess parameters
    def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, pool=None, **kwargs):
        preprocess_params = {} if image_processor_kwargs is None else image_processor_kwargs

        postprocess_params = {}
        if pool is not None:
            postprocess_params["pool"] = pool
        if return_tensors is not None:
            postprocess_params["return_tensors"] = return_tensors

        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]

        return preprocess_params, {}, postprocess_params

    # Load the image and turn it into framework-specific model inputs
    def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]:
        image = load_image(image, timeout=timeout)
        model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs)
        return model_inputs

    # Forward pass through the base vision model
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs

    # Return the pooled output or the raw hidden states, as tensors or nested lists
    def postprocess(self, model_outputs, pool=None, return_tensors=False):
        pool = pool if pool is not None else False

        if pool:
            # Pooling was requested but the model has no pooler layer
            if "pooler_output" not in model_outputs:
                raise ValueError(
                    "No pooled output was returned. Make sure the model has a `pooler` layer when using the `pool` option."
                )
            outputs = model_outputs["pooler_output"]
        else:
            # [0] is the first available tensor, logits or last_hidden_state
            outputs = model_outputs[0]

        if return_tensors:
            return outputs
        # Otherwise convert to nested Python lists, per framework
        if self.framework == "pt":
            return outputs.tolist()
        elif self.framework == "tf":
            return outputs.numpy().tolist()

    # Entry point: extract the features of the input image(s)
    def __call__(self, *args, **kwargs):
        """
        Extract the features of the input(s).

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                images.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
                the call may block forever.
        Return:
            A nested list of `float`: The features computed by the model.
        """
        # Delegate to the parent __call__, which drives preprocess/_forward/postprocess
        return super().__call__(*args, **kwargs)
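
A hedged sketch of the `pool` option; it assumes the checkpoint's base model exposes a `pooler` layer, as the ViT checkpoint in the class docstring does:

```
pooled = extractor(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    pool=True,
    return_tensors=True,
)
print(pooled.shape)  # one vector per image, e.g. torch.Size([1, 768]) for a ViT-base encoder
```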

.\pipelines\image_segmentation.py

from typing import Any, Dict, List, Union

import numpy as np

from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import Pipeline, build_pipeline_init_args

# When vision support is available, import PIL and the image loading helper
if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

# When PyTorch is available, import the segmentation model mappings
if is_torch_available():
    from ..models.auto.modeling_auto import (
        MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES,
        MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
        MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES,
    )

logger = logging.get_logger(__name__)

# A single prediction is a dict (label, mask, score); a result is a list of them
Prediction = Dict[str, Any]
Predictions = List[Prediction]

@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ImageSegmentationPipeline(Pipeline):
    """
    Image segmentation pipeline using any `AutoModelForXXXSegmentation`. This pipeline predicts masks of objects and
    their classes.

    Example:

    ```
    >>> from transformers import pipeline

    >>> segmenter = pipeline(model="facebook/detr-resnet-50-panoptic")
    >>> segments = segmenter("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    >>> len(segments)
    2

    >>> segments[0]["label"]
    'bird'

    >>> segments[1]["label"]
    'bird'

    >>> type(segments[0]["mask"])  # This is a black and white mask showing where is the bird on the original image.
    <class 'PIL.Image.Image'>

    >>> segments[0]["mask"].size
    (768, 512)
    ```

    This image segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"image-segmentation"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=image-segmentation).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # This pipeline is PyTorch-only
        if self.framework == "tf":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

        # The vision backend (PIL) is required
        requires_backends(self, "vision")

        # Merge all segmentation model mappings and validate the model against them
        mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES.copy()
        mapping.update(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES)
        mapping.update(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES)
        mapping.update(MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES)
        self.check_model_type(mapping)

    # Sort keyword arguments into preprocess and postprocess parameters
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        postprocess_kwargs = {}

        # subtask is needed both at preprocessing (OneFormer task inputs) and postprocessing time
        if "subtask" in kwargs:
            postprocess_kwargs["subtask"] = kwargs["subtask"]
            preprocess_kwargs["subtask"] = kwargs["subtask"]

        if "threshold" in kwargs:
            postprocess_kwargs["threshold"] = kwargs["threshold"]
        if "mask_threshold" in kwargs:
            postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
        if "overlap_mask_area_threshold" in kwargs:
            postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
        if "timeout" in kwargs:
            preprocess_kwargs["timeout"] = kwargs["timeout"]

        return preprocess_kwargs, {}, postprocess_kwargs

    def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
        """
        Perform segmentation (detect masks & classes) in the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(S) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                same format: all as HTTP(S) links, all as local paths, or all as PIL images.
            subtask (`str`, *optional*):
                Segmentation task to be performed, choose [`semantic`, `instance` and `panoptic`] depending on model
                capabilities. If not set, the pipeline will attempt to resolve in the following order:
                  `panoptic`, `instance`, `semantic`.
            threshold (`float`, *optional*, defaults to 0.9):
                Probability threshold to filter out predicted masks.
            mask_threshold (`float`, *optional*, defaults to 0.5):
                Threshold to use when turning the predicted masks into binary values.
            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
                Mask overlap threshold to eliminate small, disconnected segments.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
            list of dictionaries; if the input is a list of several images, will return a list of lists of dictionaries
            corresponding to each image.

            The dictionaries contain the mask, label and score (where applicable) of each detected object and contain
            the following keys:

            - **label** (`str`) -- The class label identified by the model.
            - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape (width, height) of
              the original image. Returns a mask filled with zeros if no object is found.
            - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the
              "object" described by the label and the mask.
        """
        return super().__call__(images, **kwargs)
    # Preprocess: load the image and build the model inputs (with task inputs for OneFormer)
    def preprocess(self, image, subtask=None, timeout=None):
        image = load_image(image, timeout=timeout)

        # Target size as a list of (height, width) tuples, used at postprocessing time
        target_size = [(image.height, image.width)]

        # OneFormer additionally takes tokenized task inputs describing the subtask
        if self.model.config.__class__.__name__ == "OneFormerConfig":
            if subtask is None:
                kwargs = {}
            else:
                kwargs = {"task_inputs": [subtask]}
            inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs)
            inputs["task_inputs"] = self.tokenizer(
                inputs["task_inputs"],
                padding="max_length",
                max_length=self.model.config.task_seq_len,
                return_tensors=self.framework,
            )["input_ids"]
        else:
            inputs = self.image_processor(images=[image], return_tensors="pt")

        inputs["target_size"] = target_size
        return inputs

    def _forward(self, model_inputs):
        # Pop the target size so it is not passed to the model, then re-attach it to the outputs
        target_size = model_inputs.pop("target_size")
        model_outputs = self.model(**model_inputs)
        model_outputs["target_size"] = target_size
        return model_outputs

def postprocess(
    self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5
):
    # Post-process the model outputs; pick the post-processing function that matches the subtask
    fn = None
    # If the subtask is 'panoptic' or None and the image processor supports panoptic post-processing
    if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"):
        fn = self.image_processor.post_process_panoptic_segmentation
    # Otherwise, if the subtask is 'instance' or None and instance post-processing is supported
    elif subtask in {"instance", None} and hasattr(self.image_processor, "post_process_instance_segmentation"):
        fn = self.image_processor.post_process_instance_segmentation

    # Panoptic/instance path
    if fn is not None:
        # Run the selected post-processing function with the given thresholds
        outputs = fn(
            model_outputs,
            threshold=threshold,
            mask_threshold=mask_threshold,
            overlap_mask_area_threshold=overlap_mask_area_threshold,
            target_sizes=model_outputs["target_size"],
        )[0]

        # Collect one annotation per segment
        annotation = []
        segmentation = outputs["segmentation"]

        for segment in outputs["segments_info"]:
            # Build a binary mask from the segment ID
            mask = (segmentation == segment["id"]) * 255
            mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
            # Look up the segment's label and score
            label = self.model.config.id2label[segment["label_id"]]
            score = segment["score"]
            annotation.append({"score": score, "label": label, "mask": mask})

    # Semantic path: subtask is 'semantic' or None and semantic post-processing is supported
    elif subtask in {"semantic", None} and hasattr(self.image_processor, "post_process_semantic_segmentation"):
        outputs = self.image_processor.post_process_semantic_segmentation(
            model_outputs, target_sizes=model_outputs["target_size"]
        )[0]

        # Collect one annotation per label present in the segmentation map
        annotation = []
        segmentation = outputs.numpy()
        labels = np.unique(segmentation)

        for label in labels:
            # Build a binary mask for this label
            mask = (segmentation == label) * 255
            mask = Image.fromarray(mask.astype(np.uint8), mode="L")
            # Look up the label name; semantic segmentation carries no per-mask score
            label = self.model.config.id2label[label]
            annotation.append({"score": None, "label": label, "mask": mask})
    else:
        # No post-processing function matches the requested subtask
        raise ValueError(f"Subtask {subtask} is not supported for model {type(self.model)}")
    # Return the list of annotations
    return annotation
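
The semantic branch above reduces to a small, self-contained pattern: turn an HxW label map into one binary PIL mask per label. A minimal sketch, with `id2label` standing in for the model config:

```
import numpy as np
from PIL import Image

def label_map_to_masks(segmentation, id2label):
    # One {score, label, mask} entry per distinct label in the map
    annotation = []
    for label_id in np.unique(segmentation):
        mask = Image.fromarray(((segmentation == label_id) * 255).astype(np.uint8), mode="L")
        annotation.append({"score": None, "label": id2label[int(label_id)], "mask": mask})
    return annotation

seg = np.array([[0, 0, 1], [1, 1, 2]])
print([a["label"] for a in label_map_to_masks(seg, {0: "sky", 1: "cat", 2: "grass"})])
# ['sky', 'cat', 'grass']
```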

.\pipelines\image_to_image.py

# Import required modules and functions
from typing import List, Union

import numpy as np

# Import shared utility functions and classes
from ..utils import (
    add_end_docstrings,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)

# Import the base Pipeline class and the init-args builder
from .base import Pipeline, build_pipeline_init_args

# If vision support is available, import PIL and the image-loading helper
if is_vision_available():
    from PIL import Image
    from ..image_utils import load_image

# If PyTorch is available, import the image-to-image model mapping names
if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES

# Get this module's logger
logger = logging.get_logger(__name__)

# Decorate ImageToImagePipeline with the standard init docstring, has_image_processor=True
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ImageToImagePipeline(Pipeline):
    """
    Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous
    image input.

    Example:

    ```
    >>> from PIL import Image
    >>> import requests

    >>> from transformers import pipeline

    >>> upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64")
    >>> img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
    >>> img = img.resize((64, 64))
    >>> upscaled_img = upscaler(img)
    >>> img.size
    (64, 64)

    >>> upscaled_img.size
    (144, 144)
    ```

    This image to image pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"image-to-image"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-to-image).
    """

    # Initialize the ImageToImagePipeline instance
    def __init__(self, *args, **kwargs):
        # Call the parent Pipeline's initializer
        super().__init__(*args, **kwargs)
        # Make sure the required backend (vision) is available
        requires_backends(self, "vision")
        # Check that the model is an image-to-image model
        self.check_model_type(MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)

    # Split the incoming kwargs into preprocess, forward and postprocess parameter dicts
    def _sanitize_parameters(self, **kwargs):
        preprocess_params = {}
        postprocess_params = {}
        forward_params = {}

        # A timeout, if given, is a preprocessing parameter
        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]
        # A head mask, if given, is a forward-pass parameter
        if "head_mask" in kwargs:
            forward_params["head_mask"] = kwargs["head_mask"]

        return preprocess_params, forward_params, postprocess_params
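
A short usage sketch of this routing, using the checkpoint from the class docstring: `timeout` is consumed during preprocessing, while `head_mask` would be forwarded to the model call (only meaningful if the underlying model accepts it).

```
from transformers import pipeline

# Checkpoint taken from the class docstring above
upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64")

# `timeout` is routed to preprocess (image fetching)
img = upscaler(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    timeout=10.0,
)
print(img.size)
```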

    # Entry point: accept a single image or a list of images as input
    def __call__(
        self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs
    ) -> Union["Image.Image", List["Image.Image"]]:
        """
        Transform the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                images.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
                the call may block forever.

        Return:
            An image (Image.Image) or a list of images (List["Image.Image"]) containing result(s). If the input is a
            single image, the return will be also a single image, if the input is a list of several images, it will
            return a list of transformed images.
        """
        # Delegate to the parent class's __call__ to process the input image(s)
        return super().__call__(images, **kwargs)

    def _forward(self, model_inputs):
        # Run the model forward pass and return its outputs
        model_outputs = self.model(**model_inputs)
        return model_outputs

    def preprocess(self, image, timeout=None):
        # Load the image (optionally with a timeout)
        image = load_image(image, timeout=timeout)
        # Run the image processor to build the model inputs
        inputs = self.image_processor(images=[image], return_tensors="pt")
        return inputs

    def postprocess(self, model_outputs):
        # Collect the post-processed images
        images = []
        # If the model outputs contain a "reconstruction" key (assumed present), use it
        if "reconstruction" in model_outputs.keys():
            outputs = model_outputs.reconstruction
        # Convert each output tensor into a PIL image
        for output in outputs:
            # Move the tensor to CPU, cast to float and clamp to [0, 1]
            output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
            # Move the channel axis to the last dimension (CHW -> HWC)
            output = np.moveaxis(output, source=0, destination=-1)
            # Convert float values to uint8 pixel values
            output = (output * 255.0).round().astype(np.uint8)  # float32 to uint8
            # Build a PIL image and append it to the list
            images.append(Image.fromarray(output))

        # Return a list if there are several images, otherwise the single image
        return images if len(images) > 1 else images[0]
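
The tensor-to-image conversion in `postprocess` is worth isolating. A minimal sketch, assuming a CHW float tensor with values roughly in [0, 1] (such as a model's `reconstruction` output):

```
import numpy as np
import torch
from PIL import Image

def tensor_to_pil(output):
    arr = output.squeeze().float().cpu().clamp_(0, 1).numpy()  # CHW float32 in [0, 1]
    arr = np.moveaxis(arr, source=0, destination=-1)           # CHW -> HWC
    arr = (arr * 255.0).round().astype(np.uint8)               # float32 -> uint8
    return Image.fromarray(arr)

print(tensor_to_pil(torch.rand(3, 8, 8)).size)  # (8, 8)
```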

.\pipelines\image_to_text.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Load the required modules and libraries
from typing import List, Union

from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
# Import the Pipeline base class and the init-args builder
from .base import Pipeline, build_pipeline_init_args

# If vision support is available, import PIL and the image-loading helper
if is_vision_available():
    from PIL import Image
    from ..image_utils import load_image

# If TensorFlow is available, import the TF vision-to-sequence model mapping names
if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

# If PyTorch is available, import torch and the vision-to-sequence model mapping names
if is_torch_available():
    import torch
    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

# Get this module's logger
logger = logging.get_logger(__name__)

# Decorate the class with the standard init docstring (tokenizer + image processor)
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
# ImageToTextPipeline, derived from Pipeline
class ImageToTextPipeline(Pipeline):
    """
    图像到文本的 Pipeline,使用 AutoModelForVision2Seq 模型。该 Pipeline 预测给定图像的标题。

    示例:

    ```
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    在[pipeline 教程](../pipeline_tutorial)中了解有关使用 Pipeline 的基础知识

    当前的图像到文本 Pipeline 可以使用下面的任务标识符加载:
    "image-to-text"。

    查看 [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text)
    上可用模型的列表。
    """

    def __init__(self, *args, **kwargs):
        # Call the parent constructor
        super().__init__(*args, **kwargs)
        # Make sure the vision backend is available
        requires_backends(self, "vision")
        # Check the model type against the TF or Torch mapping, depending on the framework
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    # Sanitize the incoming parameters into preprocess and forward parameter dicts
    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        # Two dicts: one for preprocessing parameters, one for forward parameters
        forward_params = {}
        preprocess_params = {}

        # A prompt, if given, is a preprocessing parameter
        if prompt is not None:
            preprocess_params["prompt"] = prompt
        # A timeout, if given, is a preprocessing parameter
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        # max_new_tokens, if given, is a forward parameter
        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        # generate_kwargs are merged into the forward parameters, unless max_new_tokens is defined twice
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                # max_new_tokens may only be defined once, either as an argument or inside generate_kwargs
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        # Return the preprocess params, the forward params, and an empty postprocess dict
        return preprocess_params, forward_params, {}
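
A quick sketch of that rule, using the checkpoint from the class docstring: setting the token limit both ways raises a `ValueError`.

```
from transformers import pipeline

# Checkpoint taken from the class docstring above
captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"

print(captioner(url, max_new_tokens=10))                 # fine: routed to generate()
print(captioner(url, generate_kwargs={"num_beams": 3}))  # fine: merged into generate()

# ValueError: `max_new_tokens` defined both as an argument and inside generate_kwargs
# captioner(url, max_new_tokens=10, generate_kwargs={"max_new_tokens": 10})
```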

    # Entry point: caption the image(s) passed as inputs
    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
        """
        Assign labels to the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(S) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, the `generate` default is used.

            generate_kwargs (`Dict`, *optional*):
                Arguments passed directly to `generate`, allowing full control of this function.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A `list` or a `list` of `list`s: each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
        """
        # Delegate to the parent class's __call__ with the images and any extra kwargs
        return super().__call__(images, **kwargs)
    # Preprocess the input image (and optional prompt) into model inputs
    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        # If a prompt is given, it must be a single string
        if prompt is not None:
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            # Get the model type
            model_type = self.model.config.model_type

            # Build the inputs differently depending on the model type
            if model_type == "git":
                # Run the image processor to get pixel tensors
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                # Tokenize the prompt into input IDs (no special tokens)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                # Prepend the CLS token ID to the input sequence
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})

            elif model_type == "pix2struct":
                # Run the image processor, passing the prompt as header text
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)

            elif model_type != "vision-encoder-decoder":
                # For other model types, process the image and tokenize the prompt separately
                # (vision-encoder-decoder itself does not support conditional generation)
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)

            else:
                # vision-encoder-decoder models cannot take a prompt
                raise ValueError(f"Model type {model_type} does not support conditional text generation")

        else:
            # Without a prompt, only the image processor is needed
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)

        # For "git" models without a prompt, set the input IDs to None
        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        # Return the preprocessed model inputs
        return model_inputs

    # Run generation on the preprocessed inputs
    def _forward(self, model_inputs, **generate_kwargs):
        # In a batch, a list of all-None input token IDs is collapsed to None so generation does not fail
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`
        # parse inputs. In the TensorFlow version, `generate` raises an error if we don't use `input_ids`, whereas
        # the PyTorch version matches it against `self.model.main_input_name` or
        # `self.model.encoder.main_input_name`.
        inputs = model_inputs.pop(self.model.main_input_name)
        # Call the model's generate method to produce the outputs
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        # Return the generated outputs
        return model_outputs

    # Post-process the generated token IDs into text records
    def postprocess(self, model_outputs):
        # Collect one record per generated sequence
        records = []
        # Iterate over each list of output IDs in the model outputs
        for output_ids in model_outputs:
            # Decode the output IDs into text, skipping special tokens
            record = {
                "generated_text": self.tokenizer.decode(
                    output_ids,
                    skip_special_tokens=True,
                )
            }
            # Append the record to the list
            records.append(record)
        # Return all records
        return records
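
A hedged sketch of conditional captioning through the `prompt` path above. The checkpoint is an assumption: any model whose `model_type` is `"git"` takes that branch of `preprocess`.

```
from transformers import pipeline

# Assumed checkpoint with model_type "git", which accepts a text prompt
captioner = pipeline("image-to-text", model="microsoft/git-base-coco")
out = captioner(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    prompt="a photo of",   # tokenized and prepended with the CLS token (see preprocess)
    max_new_tokens=20,
)
print(out[0]["generated_text"])
```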

.\pipelines\mask_generation.py

from collections import defaultdict  # defaultdict: dict with a default factory (lists here)
from typing import Optional  # Optional: marks parameters that may be None

from ..image_utils import load_image  # Image-loading helper
from ..utils import (  # Shared utility functions and classes
    add_end_docstrings,  # Decorator that appends end docstrings
    is_torch_available,  # Check whether Torch is available
    logging,  # Logging helpers
    requires_backends,  # Check for required backends
)
from .base import ChunkPipeline, build_pipeline_init_args  # ChunkPipeline base class and init-args builder

if is_torch_available():  # If Torch is available
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_MASK_GENERATION_MAPPING_NAMES  # Mask-generation model mapping

logger = logging.get_logger(__name__)  # This module's logger


@add_end_docstrings(  # Append the extra parameter docs below to the init docstring
    build_pipeline_init_args(has_image_processor=True),  # Standard init args, with an image processor
    r"""
        points_per_batch (*optional*, int, default to 64):
            Sets the number of points run simultaneously by the model. Higher numbers may be faster but use more GPU
            memory.
        output_bboxes_mask (`bool`, *optional*, default to `False`):
            Whether or not to output the bounding box predictions.
        output_rle_masks (`bool`, *optional*, default to `False`):
            Whether or not to output the masks in `RLE` format""",
)
class MaskGenerationPipeline(ChunkPipeline):  # Mask-generation pipeline, derived from ChunkPipeline
    """
    Automatic mask generation for images using a `SamForMaskGeneration` model. This pipeline predicts binary masks
    for an image. It is a `ChunkPipeline` because the points can be processed in small batches to avoid
    out-of-memory issues. Use the `points_per_batch` argument to control the number of points processed at the same
    time; the default is `64`.

    The pipeline works in three steps:
        1. `preprocess`: generates a uniform grid of 1024 points, together with bounding boxes and point labels.
            For details on how the points and bounding boxes are created, check the `_generate_crop_boxes` function.
            The image is also preprocessed with the `image_processor`. This function yields minibatches of
            `points_per_batch` points.

        2. `forward`: feeds the outputs of `preprocess` to the model. The image embedding is computed only once.
            It calls `self.model.get_image_embeddings`, making sure that no gradients are computed and that the
            tensors and the model are on the same device.

        3. `postprocess`: the most important part of automatic mask generation happens here. Three steps:
            - image_processor.postprocess_masks (run on each minibatch loop): takes the raw output masks, resizes
              them according to the image size, and turns them into binary masks.
            - image_processor.filter_masks (run on each minibatch loop): uses `pred_iou_thresh` and
              `stability_scores`, as well as various filters based on non-maximum suppression, to remove bad masks.
            - image_processor.postprocess_masks_for_amg: applies NMS on the masks, keeping only the relevant ones.

    Example:

    ```
    >>> from transformers import pipeline

    >>> generator = pipeline(model="facebook/sam-vit-base", task="mask-generation")
    >>> outputs = generator(
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ... )

    >>> outputs = generator(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", points_per_batch=128
    ... )
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"mask-generation"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=mask-generation).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        requires_backends(self, "vision")
        requires_backends(self, "torch")

        if self.framework != "pt":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

        self.check_model_type(MODEL_FOR_MASK_GENERATION_MAPPING_NAMES)
    # Constructor: basic setup and checks; the vision and torch backends are required and the framework must be PyTorch
    
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        postprocess_kwargs = {}
        forward_params = {}

        # Preprocessing parameters
        if "points_per_batch" in kwargs:
            preprocess_kwargs["points_per_batch"] = kwargs["points_per_batch"]
        if "points_per_crop" in kwargs:
            preprocess_kwargs["points_per_crop"] = kwargs["points_per_crop"]
        if "crops_n_layers" in kwargs:
            preprocess_kwargs["crops_n_layers"] = kwargs["crops_n_layers"]
        if "crop_overlap_ratio" in kwargs:
            preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"]
        if "crop_n_points_downscale_factor" in kwargs:
            preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"]
        if "timeout" in kwargs:
            preprocess_kwargs["timeout"] = kwargs["timeout"]

        # Forward parameters
        if "pred_iou_thresh" in kwargs:
            forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"]
        if "stability_score_offset" in kwargs:
            forward_params["stability_score_offset"] = kwargs["stability_score_offset"]
        if "mask_threshold" in kwargs:
            forward_params["mask_threshold"] = kwargs["mask_threshold"]
        if "stability_score_thresh" in kwargs:
            forward_params["stability_score_thresh"] = kwargs["stability_score_thresh"]

        # Postprocessing parameters
        if "crops_nms_thresh" in kwargs:
            postprocess_kwargs["crops_nms_thresh"] = kwargs["crops_nms_thresh"]
        if "output_rle_mask" in kwargs:
            postprocess_kwargs["output_rle_mask"] = kwargs["output_rle_mask"]
        if "output_bboxes_mask" in kwargs:
            postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"]

        return preprocess_kwargs, forward_params, postprocess_kwargs
    # Sanitize the incoming kwargs, routing them into preprocess, forward and postprocess parameter dicts
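
A usage sketch showing how those kwargs are routed, with the checkpoint from the docstring example; the parameter values are illustrative only.

```
from transformers import pipeline

# Checkpoint taken from the class docstring above; values are illustrative
generator = pipeline("mask-generation", model="facebook/sam-vit-base")
outputs = generator(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    points_per_batch=128,   # routed to preprocess
    pred_iou_thresh=0.9,    # routed to _forward
    crops_nms_thresh=0.7,   # routed to postprocess
    output_rle_mask=True,   # routed to postprocess
)
print(len(outputs["masks"]), len(outputs["rle_mask"]))
```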
    def __call__(self, image, *args, num_workers=None, batch_size=None, **kwargs):
        """
        Generates binary segmentation masks.

        Args:
            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
                Image or list of images.
            mask_threshold (`float`, *optional*, defaults to 0.0):
                Threshold to use when turning the predicted masks into binary values.
            pred_iou_thresh (`float`, *optional*, defaults to 0.88):
                A filtering threshold in `[0,1]` applied on the model's predicted mask quality.
            stability_score_thresh (`float`, *optional*, defaults to 0.95):
                A filtering threshold in `[0,1]` applied on the stability of the model's mask predictions.
            stability_score_offset (`int`, *optional*, defaults to 1):
                The amount to shift the cutoff when calculating the stability score.
            crops_nms_thresh (`float`, *optional*, defaults to 0.7):
                The box IoU cutoff used by non-maximum suppression to filter duplicate masks.
            crops_n_layers (`int`, *optional*, defaults to 0):
                If `crops_n_layers>0`, mask prediction will be run again on crops of the image. Sets the number of
                layers to run, where each layer has 2**i_layer number of image crops.
            crop_overlap_ratio (`float`, *optional*, defaults to `512 / 1500`):
                Sets the degree to which crops overlap. In the first crop layer, crops will overlap by this fraction
                of the image length. Later layers with more crops scale down this overlap.
            crop_n_points_downscale_factor (`int`, *optional*, defaults to `1`):
                The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            `Dict`: A dictionary with the following keys:
                - **mask** (`PIL.Image`) -- A binary mask of the detected object, as a PIL image of the original
                  image's `(width, height)`. Returns a mask filled with zeros if no object is found.
                - **score** (*optional* `float`) -- Optionally included when the model is able to estimate a
                  confidence for the "object" described by the label and the mask.

        """
        return super().__call__(image, *args, num_workers=num_workers, batch_size=batch_size, **kwargs)
    def preprocess(
        self,
        image,
        points_per_batch=64,  # number of points processed per batch, default 64
        crops_n_layers: int = 0,  # number of crop layers, default 0
        crop_overlap_ratio: float = 512 / 1500,  # crop overlap ratio, default 512/1500
        points_per_crop: Optional[int] = 32,  # points per crop, default 32
        crop_n_points_downscale_factor: Optional[int] = 1,  # per-layer point downscale factor, default 1
        timeout: Optional[float] = None,  # fetch timeout in seconds, default None
    ):
        image = load_image(image, timeout=timeout)  # Load the image, optionally with a timeout
        target_size = self.image_processor.size["longest_edge"]  # Use the processor's longest edge as the target size
        crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes(
            image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
        )  # Generate crop boxes, grid points, cropped images and input labels

        model_inputs = self.image_processor(images=cropped_images, return_tensors="pt")  # Build PyTorch model inputs

        with self.device_placement():  # Use the device-placement context manager
            if self.framework == "pt":
                inference_context = self.get_inference_context()  # e.g. a no-grad context
                with inference_context():
                    model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)  # Move tensors to the device
                    image_embeddings = self.model.get_image_embeddings(model_inputs.pop("pixel_values"))  # Compute the image embeddings once
                    model_inputs["image_embeddings"] = image_embeddings  # Attach them to the model inputs

        n_points = grid_points.shape[1]  # Total number of grid points
        points_per_batch = points_per_batch if points_per_batch is not None else n_points  # Fall back to all points

        if points_per_batch <= 0:  # points_per_batch must be at least 1
            raise ValueError(
                "Cannot have points_per_batch<=0. Must be >=1 to returned batched outputs. "
                "To return all points at once, set points_per_batch to None"
            )

        for i in range(0, n_points, points_per_batch):  # Iterate over the grid points, points_per_batch at a time
            batched_points = grid_points[:, i : i + points_per_batch, :, :]  # Slice a batch of grid points
            labels = input_labels[:, i : i + points_per_batch]  # Slice the matching input labels
            is_last = i == n_points - points_per_batch  # Flag the last batch (exact when n_points is a multiple of points_per_batch)

            yield {
                "input_points": batched_points,  # the batch of input points
                "input_labels": labels,  # the matching input labels
                "input_boxes": crop_boxes,  # the crop boxes
                "is_last": is_last,  # whether this is the last batch
                **model_inputs,  # the remaining model inputs (including image embeddings)
            }
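
The chunking pattern in the loop above can be shown standalone. A minimal sketch; note that the original `is_last` test (`i == n_points - points_per_batch`) only fires when `n_points` is a multiple of `points_per_batch`, so the sketch uses an equivalent check that also covers a ragged final chunk.

```
import numpy as np

def iter_point_batches(grid_points, points_per_batch):
    # grid_points has shape (1, n_points, 1, 2), as produced by generate_crop_boxes
    n_points = grid_points.shape[1]
    for i in range(0, n_points, points_per_batch):
        yield {
            "input_points": grid_points[:, i : i + points_per_batch],
            "is_last": i + points_per_batch >= n_points,
        }

grid = np.zeros((1, 100, 1, 2))
chunks = list(iter_point_batches(grid, 64))
print(len(chunks), chunks[-1]["is_last"])  # 2 True (chunks of 64 and 36)
```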

    def _forward(
        self,
        model_inputs,
        pred_iou_thresh=0.88,  # predicted-IoU filtering threshold, default 0.88
        stability_score_thresh=0.95,  # stability-score filtering threshold, default 0.95
        mask_threshold=0,  # mask binarization threshold, default 0
        stability_score_offset=1,  # stability-score offset, default 1
    ):
        # Pop "input_boxes" from the model inputs
        input_boxes = model_inputs.pop("input_boxes")
        # Pop the "is_last" flag
        is_last = model_inputs.pop("is_last")
        # Pop "original_sizes" and convert it to a list
        original_sizes = model_inputs.pop("original_sizes").tolist()
        # Pop "reshaped_input_sizes" and convert it to a list
        reshaped_input_sizes = model_inputs.pop("reshaped_input_sizes").tolist()

        # Run the model on the remaining inputs
        model_outputs = self.model(**model_inputs)

        # Post-process here to avoid copying all the masks between CPU and GPU
        # Take the low-resolution predicted masks from the outputs
        low_resolution_masks = model_outputs["pred_masks"]
        # Upscale the masks to the original image size (without binarizing yet)
        masks = self.image_processor.post_process_masks(
            low_resolution_masks, original_sizes, reshaped_input_sizes, mask_threshold, binarize=False
        )
        # Take the IoU scores from the outputs
        iou_scores = model_outputs["iou_scores"]
        # Filter the masks, keeping the final masks, IoU scores and boxes
        masks, iou_scores, boxes = self.image_processor.filter_masks(
            masks[0],
            iou_scores[0],
            original_sizes[0],
            input_boxes[0],
            pred_iou_thresh,
            stability_score_thresh,
            mask_threshold,
            stability_score_offset,
        )
        # Return the filtered masks, the is_last flag, the boxes and the IoU scores
        return {
            "masks": masks,
            "is_last": is_last,
            "boxes": boxes,
            "iou_scores": iou_scores,
        }

    # Post-process: merge the per-chunk outputs and produce the final masks and scores
    def postprocess(
        self,
        model_outputs,
        output_rle_mask=False,
        output_bboxes_mask=False,
        crops_nms_thresh=0.7,
    ):
        # Gather the IoU scores, masks and boxes from every chunk
        all_scores = []
        all_masks = []
        all_boxes = []
        for model_output in model_outputs:
            # Pop the chunk's IoU scores and collect them
            all_scores.append(model_output.pop("iou_scores"))
            # Extend with the chunk's masks
            all_masks.extend(model_output.pop("masks"))
            # Pop the chunk's boxes and collect them
            all_boxes.append(model_output.pop("boxes"))

        # Concatenate all IoU scores and boxes
        all_scores = torch.cat(all_scores)
        all_boxes = torch.cat(all_boxes)
        # Run the mask-generation post-processing: output masks, IoU scores, RLE masks and bounding boxes
        output_masks, iou_scores, rle_mask, bounding_boxes = self.image_processor.post_process_for_mask_generation(
            all_masks, all_scores, all_boxes, crops_nms_thresh
        )

        # Collect any leftover per-chunk keys into lists
        extra = defaultdict(list)
        for output in model_outputs:
            for k, v in output.items():
                extra[k].append(v)

        # Optionally add the RLE masks and/or bounding boxes to the result
        optional = {}
        if output_rle_mask:
            optional["rle_mask"] = rle_mask

        if output_bboxes_mask:
            optional["bounding_boxes"] = bounding_boxes

        # Return the final masks and scores, plus any optional and extra outputs
        return {"masks": output_masks, "scores": iou_scores, **optional, **extra}
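
The leftover-key aggregation above is a small reusable pattern: merge the remaining keys of a list of per-chunk dicts into lists. A standalone sketch with illustrative values:

```
from collections import defaultdict

# Per-chunk dicts after the main keys have been popped (illustrative values)
chunks = [{"is_last": False, "crop": 0}, {"is_last": True, "crop": 1}]

extra = defaultdict(list)
for output in chunks:
    for k, v in output.items():
        extra[k].append(v)

print(dict(extra))  # {'is_last': [False, True], 'crop': [0, 1]}
```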

.\pipelines\object_detection.py

# Import Any, Dict, List, Union from the typing module
from typing import Any, Dict, List, Union

# Import the required helpers from ..utils
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
# Import the Pipeline class and the init-args builder from .base
from .base import Pipeline, build_pipeline_init_args

# If vision support is available, import the image-loading helper
if is_vision_available():
    from ..image_utils import load_image

# If torch is available, import it together with the required model mappings
if is_torch_available():
    import torch
    from ..models.auto.modeling_auto import (
        MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
    )

# Get this module's logger
logger = logging.get_logger(__name__)

# Type aliases for prediction results
Prediction = Dict[str, Any]
Predictions = List[Prediction]

# Decorate ObjectDetectionPipeline with the standard init docstring
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ObjectDetectionPipeline(Pipeline):
    """
    Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of objects
    and their classes.

    Example:

    ```
    >>> from transformers import pipeline

    >>> detector = pipeline(model="facebook/detr-resnet-50")
    >>> detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}, {'score': 0.999, 'label': 'bird', 'box': {'xmin': 398, 'ymin': 105, 'xmax': 767, 'ymax': 507}}]

    >>> # x, y  are expressed relative to the top left hand corner.
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"object-detection"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=object-detection).
    """

    # Initializer, extending Pipeline's
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Raise if the framework is TensorFlow; this pipeline is PyTorch-only
        if self.framework == "tf":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

        # Make sure the required backend (vision) is available
        requires_backends(self, "vision")

        # Copy the object-detection model mapping and extend it with the token-classification
        # mapping (the latter covers the LayoutLM-style path handled in postprocess below)
        mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES.copy()
        mapping.update(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES)

        # Check that the model type is in the combined mapping
        self.check_model_type(mapping)

    # Split the incoming kwargs into preprocess and postprocess parameter dicts
    def _sanitize_parameters(self, **kwargs):
        preprocess_params = {}
        # A timeout, if given, is a preprocessing parameter
        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]
        postprocess_kwargs = {}
        # A threshold, if given, is a postprocessing parameter
        if "threshold" in kwargs:
            postprocess_kwargs["threshold"] = kwargs["threshold"]
        return preprocess_params, {}, postprocess_kwargs
    # Entry point: detect objects (bounding boxes and classes) in the input image(s)

    def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
        """
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(S) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                same format: all as HTTP(S) links, all as local paths, or all as PIL images.
            threshold (`float`, *optional*, defaults to 0.9):
                The probability necessary to make a prediction.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single
            image, will return a list of dictionaries, if the input is a list of several images, will return a list of
            list of dictionaries corresponding to each image.

            The dictionaries contain the following keys:

            - **label** (`str`) -- The class label identified by the model.
            - **score** (`float`) -- The score attributed by the model for that label.
            - **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size.
        """

        return super().__call__(*args, **kwargs)

    # Preprocess the input image into the format the model expects
    def preprocess(self, image, timeout=None):
        # Load the image, optionally with a timeout
        image = load_image(image, timeout=timeout)
        # Store the image height and width as a tensor
        target_size = torch.IntTensor([[image.height, image.width]])
        # Run the image processor and return PyTorch inputs
        inputs = self.image_processor(images=[image], return_tensors="pt")
        # If a tokenizer is present (the LayoutLM path), tokenize the words and boxes instead
        if self.tokenizer is not None:
            inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
        # Attach the target size to the inputs
        inputs["target_size"] = target_size
        return inputs

    # Internal forward pass: run the model and carry the target size through
    def _forward(self, model_inputs):
        # Pop the target size so it is not passed to the model
        target_size = model_inputs.pop("target_size")
        # Run the model forward pass
        outputs = self.model(**model_inputs)
        # Rebuild the output object with the target size included
        model_outputs = outputs.__class__({"target_size": target_size, **outputs})
        # If a tokenizer is present, keep the bounding boxes in the outputs
        if self.tokenizer is not None:
            model_outputs["bbox"] = model_inputs["bbox"]
        return model_outputs
    def postprocess(self, model_outputs, threshold=0.9):
        # Take the target size from the model outputs
        target_size = model_outputs["target_size"]
        if self.tokenizer is not None:
            # This is a LayoutLMForTokenClassification variant.
            # The OCR got the boxes and the model classified the words.
            # Read the height and width from the target size
            height, width = target_size[0].tolist()

            def unnormalize(bbox):
                # Convert normalized (0-1000) box coordinates back to pixel coordinates
                return self._get_bounding_box(
                    torch.Tensor(
                        [
                            (width * bbox[0] / 1000),
                            (height * bbox[1] / 1000),
                            (width * bbox[2] / 1000),
                            (height * bbox[3] / 1000),
                        ]
                    )
                )

            # Compute the per-token scores and predicted classes
            scores, classes = model_outputs["logits"].squeeze(0).softmax(dim=-1).max(dim=-1)
            # Map the predicted class indices to labels
            labels = [self.model.config.id2label[prediction] for prediction in classes.tolist()]
            # Un-normalize the predicted boxes
            boxes = [unnormalize(bbox) for bbox in model_outputs["bbox"].squeeze(0)]
            keys = ["score", "label", "box"]
            # Build the {score, label, box} annotations, keeping only scores above the threshold
            annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
        else:
            # This is a regular ForObjectDetection model
            # Run object-detection post-processing to get the raw annotations
            raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
            raw_annotation = raw_annotations[0]
            # Read the scores, labels and boxes from the raw annotation
            scores = raw_annotation["scores"]
            labels = raw_annotation["labels"]
            boxes = raw_annotation["boxes"]

            # Convert the scores, labels and boxes into plain lists
            raw_annotation["scores"] = scores.tolist()
            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels]
            raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes]

            # Build the {score, label, box} annotations
            keys = ["score", "label", "box"]
            annotation = [
                dict(zip(keys, vals))
                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"])
            ]

        return annotation
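
A usage sketch for the `threshold` parameter handled above, with the checkpoint from the class docstring:

```
from transformers import pipeline

# Checkpoint taken from the class docstring above
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
preds = detector(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    threshold=0.5,  # keep lower-confidence boxes than the 0.9 default
)
for p in preds:
    print(p["label"], round(p["score"], 3), p["box"])
```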

    def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
        """
        Turns a list [xmin, ymin, xmax, ymax] into a dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
        """
        if self.framework != "pt":
            # Raise if the framework is not PyTorch
            raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
        # Cast the box tensor to ints and name the corners
        xmin, ymin, xmax, ymax = box.int().tolist()
        bbox = {
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
        }
        return bbox
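
The conversion is easy to check outside the pipeline; a minimal sketch with an illustrative box:

```
import torch

# Illustrative corner-format box; .int() truncates toward zero
box = torch.tensor([69.4, 171.2, 396.8, 507.1])
xmin, ymin, xmax, ymax = box.int().tolist()
print({"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax})
# {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}
```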