Transformers Source Code Analysis (85)

.\models\owlv2\image_processing_owlv2.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for OWLv2."""

import warnings
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import (
    center_to_corners_format,
    pad,
    to_channel_dimension_format,
)
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import (
    TensorType,
    is_scipy_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)

# Set up logging
logger = logging.get_logger(__name__)


if is_torch_available():
    import torch


if is_vision_available():
    import PIL

if is_scipy_available():
    from scipy import ndimage as ndi


# Copied from transformers.models.owlvit.image_processing_owlvit._upcast
def _upcast(t):
    """
    Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.

    Args:
        t (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Upcasted tensor to float32 or float64 for floating point types,
                      or to int32 or int64 for integer types.
    """
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


# Copied from transformers.models.owlvit.image_processing_owlvit.box_area
def box_area(boxes):
    """
    Computes the area of a set of bounding boxes.

    Args:
        boxes (torch.FloatTensor): Bounding boxes in (x1, y1, x2, y2) format.

    Returns:
        torch.FloatTensor: Tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


# Copied from transformers.models.owlvit.image_processing_owlvit.box_iou
def box_iou(boxes1, boxes2):
    """
    Computes the intersection over union (IoU) of two sets of bounding boxes.

    Args:
        boxes1 (torch.FloatTensor): Bounding boxes set 1 in (x1, y1, x2, y2) format.
        boxes2 (torch.FloatTensor): Bounding boxes set 2 in (x1, y1, x2, y2) format.

    Returns:
        torch.FloatTensor: IoU for each pair of boxes from boxes1 and boxes2.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
    # Compute the width and height of each intersection box; clamp at 0 so empty overlaps contribute nothing
    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    # Intersection area: element-wise product of width and height
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    # Union area: area1 and area2 are broadcast against each other, minus the intersection
    union = area1[:, None] + area2 - inter

    # IoU (Intersection over Union): intersection area divided by union area
    iou = inter / union
    # Return both the IoU matrix and the union areas
    return iou, union
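
For intuition, here is a minimal usage sketch (not part of the original file) showing how the `box_iou` above broadcasts N boxes against M boxes into an [N, M] IoU matrix; the box values are made up.

```python
import torch

boxes_a = torch.tensor([[0.0, 0.0, 2.0, 2.0],
                        [1.0, 1.0, 3.0, 3.0]])  # N = 2 boxes in (x1, y1, x2, y2) format
boxes_b = torch.tensor([[0.0, 0.0, 2.0, 2.0],
                        [2.0, 2.0, 4.0, 4.0]])  # M = 2 boxes

iou, union = box_iou(boxes_a, boxes_b)  # uses the function defined above
print(iou.shape)  # torch.Size([2, 2])
print(iou[0, 0])  # tensor(1.)     -- identical boxes
print(iou[1, 1])  # tensor(0.1429) -- intersection 1 / union (4 + 4 - 1)
```
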
def _preprocess_resize_output_shape(image, output_shape):
    """Validate resize output shape according to input image.

    Args:
        image (`np.ndarray`):
         Image to be resized.
        output_shape (`iterable`):
            Size of the generated output image `(rows, cols[, ...][, dim])`. If `dim` is not provided, the number of
            channels is preserved.

    Returns
    -------
    image (`np.ndarray`):
        The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
        input.ndim`.
    output_shape (`Tuple`):
        The output shape converted to tuple.

    Raises
    ------
    ValueError:
        If output_shape length is smaller than the image number of dimensions.

    Notes
    -----
    The input image is reshaped if its number of dimensions is not equal to output_shape_length.
    """
    output_shape = tuple(output_shape)  # Convert output_shape to a tuple
    output_ndim = len(output_shape)  # Get the number of dimensions of output_shape
    input_shape = image.shape  # Get the shape of the input image
    if output_ndim > image.ndim:
        # If output_ndim is greater than the number of dimensions of the input image,
        # append singleton dimensions to input_shape
        input_shape += (1,) * (output_ndim - image.ndim)
        image = np.reshape(image, input_shape)  # Reshape the input image
    elif output_ndim == image.ndim - 1:
        # If output_ndim is equal to the number of dimensions of the input image minus one,
        # it's a multichannel case; append shape of last axis to output_shape
        output_shape = output_shape + (image.shape[-1],)
    elif output_ndim < image.ndim:
        # If output_ndim is less than the number of dimensions of the input image, raise an error
        raise ValueError("output_shape length cannot be smaller than the image number of dimensions")

    return image, output_shape
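
A quick sketch (hypothetical shapes) of the multichannel branch above: when the requested output shape has one dimension fewer than the image, the channel axis is carried over automatically.

```python
import numpy as np

image = np.zeros((768, 1024, 3))  # (rows, cols, channels), channels-last
image, output_shape = _preprocess_resize_output_shape(image, (960, 960))
print(output_shape)  # (960, 960, 3) -- the channel axis was appended
print(image.shape)   # (768, 1024, 3) -- unchanged, no singleton dimensions were needed
```
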


def _clip_warp_output(input_image, output_image):
    """Clip output image to range of values of input image.

    Note that this function modifies the values of *output_image* in-place.

    Taken from:
    https://github.com/scikit-image/scikit-image/blob/b4b521d6f0a105aabeaa31699949f78453ca3511/skimage/transform/_warps.py#L640.

    Args:
    ----
    input_image : ndarray
        Input image.
    output_image : ndarray
        Output image, which is modified in-place.
    """
    min_val = np.min(input_image)  # Get the minimum value of the input image
    if np.isnan(min_val):
        # If NaNs are detected in the input image, use NaN-safe min/max functions
        min_func = np.nanmin
        max_func = np.nanmax
        min_val = min_func(input_image)
    else:
        min_func = np.min
        max_func = np.max
    max_val = max_func(input_image)  # Get the maximum value of the input image

    output_image = np.clip(output_image, min_val, max_val)  # Clip output_image to the range [min_val, max_val]

    return output_image
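
A small sketch (made-up values) of what `_clip_warp_output` does: interpolation overshoot outside the input's value range is clipped back.

```python
import numpy as np

input_image = np.array([0.0, 0.25, 0.75, 1.0])
output_image = np.array([-0.1, 0.3, 0.8, 1.2])  # pretend these values came out of a resize
print(_clip_warp_output(input_image, output_image))  # [0.  0.3 0.8 1. ]
```
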


class Owlv2ImageProcessor(BaseImageProcessor):
    r"""
    Constructs an OWLv2 image processor.
    """
    pass  # Placeholder, no additional functionality added here
    # 参数定义部分,用于配置图像预处理的选项
    Args:
        do_rescale (`bool`, *optional*, defaults to `True`):
            是否对图像进行重新缩放,缩放因子由 `rescale_factor` 指定。可以在 `preprocess` 方法中通过 `do_rescale` 参数进行覆盖。
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            如果需要重新缩放图像,使用的缩放因子。可以在 `preprocess` 方法中通过 `rescale_factor` 参数进行覆盖。
        do_pad (`bool`, *optional*, defaults to `True`):
            是否对图像进行填充,使其变成正方形,并在右下角用灰色像素填充。可以在 `preprocess` 方法中通过 `do_pad` 参数进行覆盖。
        do_resize (`bool`, *optional*, defaults to `True`):
            控制是否将图像的(高度,宽度)尺寸调整为指定的 `size`。可以在 `preprocess` 方法中通过 `do_resize` 参数进行覆盖。
        size (`Dict[str, int]` *optional*, defaults to `{"height": 960, "width": 960}`):
            要调整的图像大小。可以在 `preprocess` 方法中通过 `size` 参数进行覆盖。
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            如果调整图像大小,要使用的重采样方法。可以在 `preprocess` 方法中通过 `resample` 参数进行覆盖。
        do_normalize (`bool`, *optional*, defaults to `True`):
            是否对图像进行标准化。可以在 `preprocess` 方法中通过 `do_normalize` 参数进行覆盖。
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            如果进行图像标准化,要使用的均值。这是一个浮点数或与图像通道数相同长度的浮点数列表。可以在 `preprocess` 方法中通过 `image_mean` 参数进行覆盖。
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            如果进行图像标准化,要使用的标准差。这是一个浮点数或与图像通道数相同长度的浮点数列表。可以在 `preprocess` 方法中通过 `image_std` 参数进行覆盖。
    ```
    ) -> None:
        super().__init__(**kwargs)  # 调用父类的构造方法,传递所有的关键字参数给父类

        self.do_rescale = do_rescale  # whether to rescale pixel values
        self.rescale_factor = rescale_factor  # factor used for rescaling
        self.do_pad = do_pad  # whether to pad the image to a square
        self.do_resize = do_resize  # whether to resize the image
        self.size = size if size is not None else {"height": 960, "width": 960}  # target size, defaults to 960x960
        self.resample = resample  # resampling method used when resizing
        self.do_normalize = do_normalize  # whether to normalize the image
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN  # normalization mean
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD  # normalization standard deviation
        self._valid_processor_keys = [  # keyword arguments accepted by `preprocess`, used for validation
            "images",
            "do_pad",
            "do_resize",
            "size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    def pad(
        self,
        image: np.array,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pad an image to a square with gray pixels on the bottom and the right, as per the original OWLv2
        implementation.

        Args:
            image (`np.ndarray`):
                Image to pad.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred from the input
                image.
        """
        height, width = get_image_size(image)  # get the height and width of the image
        size = max(height, width)  # side length of the square to pad to
        image = pad(
            image=image,
            padding=((0, size - height), (0, size - width)),  # pad on the bottom and on the right
            constant_values=0.5,  # gray padding value (pixel values are expected to be in [0, 1])
            data_format=data_format,  # channel dimension format of the output image
            input_data_format=input_data_format,  # channel dimension format of the input image
        )

        return image
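
A usage sketch of the `pad` method (the array values and shape are illustrative): a channels-first 480x640 image is padded on the bottom and right up to a 640x640 square, with padded pixels set to 0.5 (gray, since pixel values are expected to be rescaled to [0, 1] before padding).

```python
import numpy as np
from transformers import Owlv2ImageProcessor

processor = Owlv2ImageProcessor()
image = np.random.rand(3, 480, 640).astype(np.float32)  # (channels, height, width) in [0, 1]
padded = processor.pad(image)
print(padded.shape)       # (3, 640, 640)
print(padded[0, -1, -1])  # 0.5 -- gray padding in the bottom-right corner
```
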

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        anti_aliasing: bool = True,
        anti_aliasing_sigma=None,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image using scipy functions with optional anti-aliasing.
    
        Args:
            image (`np.ndarray`):
                The input image to resize.
            size (`Dict[str, int]`):
                Dictionary containing the target height and width of the resized image.
            anti_aliasing (`bool`, *optional*, defaults to `True`):
                Whether to apply anti-aliasing when downsampling the image.
            anti_aliasing_sigma (`float`, *optional*, defaults to `None`):
                Standard deviation for the Gaussian kernel used in anti-aliasing. Automatically calculated if `None`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The desired channel dimension format of the output image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. Inferred from input image if not provided.
    
        Raises:
            ValueError: If anti_aliasing_sigma is negative or if it's greater than zero without downsampling along all axes.
            UserWarning: If anti_aliasing_sigma is greater than zero but not downsampling along all axes.
    
        Returns:
            np.ndarray:
                Resized image with dimensions specified by `size`.
        """
        # Check and ensure scipy is available
        requires_backends(self, "scipy")
    
        # Calculate the desired output shape based on `size`
        output_shape = (size["height"], size["width"])
    
        # Convert image to the last channel dimension format
        image = to_channel_dimension_format(image, ChannelDimension.LAST)
    
        # Preprocess: Adjust image and output shape based on resize operation
        image, output_shape = _preprocess_resize_output_shape(image, output_shape)
    
        # Determine the input shape of the image
        input_shape = image.shape
    
        # Compute scaling factors based on input and output shapes
        factors = np.divide(input_shape, output_shape)
    
        # Set parameters for np.pad translation to scipy.ndimage modes
        ndi_mode = "mirror"
        cval = 0
        order = 1
    
        # Apply anti-aliasing if specified
        if anti_aliasing:
            if anti_aliasing_sigma is None:
                anti_aliasing_sigma = np.maximum(0, (factors - 1) / 2)
            else:
                anti_aliasing_sigma = np.atleast_1d(anti_aliasing_sigma) * np.ones_like(factors)
                if np.any(anti_aliasing_sigma < 0):
                    raise ValueError("Anti-aliasing standard deviation must be greater than or equal to zero")
                elif np.any((anti_aliasing_sigma > 0) & (factors <= 1)):
                    warnings.warn(
                        "Anti-aliasing standard deviation greater than zero but not down-sampling along all axes"
                    )
            # Apply Gaussian filter for anti-aliasing
            filtered = ndi.gaussian_filter(image, anti_aliasing_sigma, cval=cval, mode=ndi_mode)
        else:
            # No anti-aliasing: use original image
            filtered = image
    
        # Compute zoom factors based on scaling factors
        zoom_factors = [1 / f for f in factors]
    
        # Perform zooming operation on the filtered image
        out = ndi.zoom(filtered, zoom_factors, order=order, mode=ndi_mode, cval=cval, grid_mode=True)
    
        # Clip and warp output image based on input and output shapes
        image = _clip_warp_output(image, out)
    
        # Convert back to the input data format if specified
        image = to_channel_dimension_format(image, input_data_format, ChannelDimension.LAST)
    
        # Convert to the desired data format if specified
        image = (
            to_channel_dimension_format(image, data_format, input_data_format)
            if data_format is not None
            else image
        )
    
        # Return the resized image
        return image
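
The anti-aliasing rule used above can be checked in isolation. A small sketch with assumed shapes: the Gaussian sigma per axis is max(0, (downscale_factor - 1) / 2), so axes that are not downsampled (factor <= 1) get sigma 0 and are left untouched by the filter.

```python
import numpy as np

input_shape = np.array([1024.0, 1280.0, 3.0])  # (rows, cols, channels), channels-last
output_shape = np.array([960.0, 960.0, 3.0])
factors = input_shape / output_shape           # [1.067, 1.333, 1.0]
sigma = np.maximum(0, (factors - 1) / 2)
print(sigma)                                   # [0.033..., 0.166..., 0.0]
```
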
    def preprocess(
        self,
        images: ImageInput,  # images to preprocess
        do_pad: bool = None,  # whether to pad; falls back to the instance setting when None
        do_resize: bool = None,  # whether to resize
        size: Dict[str, int] = None,  # target size as a {"height": ..., "width": ...} dict
        do_rescale: bool = None,  # whether to rescale pixel values
        rescale_factor: float = None,  # rescale factor
        do_normalize: bool = None,  # whether to normalize
        image_mean: Optional[Union[float, List[float]]] = None,  # normalization mean (float or list of floats)
        image_std: Optional[Union[float, List[float]]] = None,  # normalization standard deviation
        return_tensors: Optional[Union[str, TensorType]] = None,  # type of tensors to return
        data_format: ChannelDimension = ChannelDimension.FIRST,  # output channel dimension format
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # input channel dimension format
        **kwargs,  # additional keyword arguments
    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection
    def post_process_object_detection(
        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
    ):
        """
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        # TODO: (amy) add support for other frameworks

        # Extract logits and boxes from the model outputs
        logits, boxes = outputs.logits, outputs.pred_boxes

        # Validate target sizes if provided
        if target_sizes is not None:
            if len(logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )

        # Calculate probabilities, scores, and labels
        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # Convert bounding boxes to [x0, y0, x1, y1] format
        boxes = center_to_corners_format(boxes)

        # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is provided
        if target_sizes is not None:
            if isinstance(target_sizes, List):
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)

            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]

        # Prepare results as a list of dictionaries containing scores, labels, and boxes
        results = []
        for s, l, b in zip(scores, labels, boxes):
            score = s[s > threshold]
            label = l[s > threshold]
            box = b[s > threshold]
            results.append({"scores": score, "labels": label, "boxes": box})

        return results
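
For context, a hedged end-to-end sketch of how this method is typically called (the checkpoint name, image file and threshold are illustrative; the exact box coordinates also depend on the square padding applied during preprocessing):

```python
import torch
from PIL import Image
from transformers import Owlv2Processor, Owlv2ForObjectDetection

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

image = Image.open("cats.png")  # any RGB image (hypothetical file name)
inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

target_sizes = torch.tensor([image.size[::-1]])  # (height, width) per image
results = processor.image_processor.post_process_object_detection(
    outputs, threshold=0.1, target_sizes=target_sizes
)
print(results[0]["scores"].shape, results[0]["boxes"].shape)
```
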

    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_image_guided_detection

.\models\owlv2\modeling_owlv2.py

# coding=utf-8
# Copyright notice: this file is copyrighted by Google AI and The HuggingFace Team
#
# Licensed under the Apache License, Version 2.0; this file may not be used except in
# compliance with the License. A copy of the License is available at:
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, the software is distributed on an
# "AS IS" basis, without warranties or conditions of any kind, either express or implied.
# See the License for details.

""" PyTorch OWLv2 model."""

# Import the warnings module
import warnings
# Dataclass support
from dataclasses import dataclass
# Typing helpers
from typing import Any, Dict, Optional, Tuple, Union

# NumPy
import numpy as np
# PyTorch
import torch
# PyTorch checkpointing utilities
import torch.utils.checkpoint
# Tensor type and neural-network modules
from torch import Tensor, nn

# Activation function mapping
from ...activations import ACT2FN
# Attention-mask helper functions
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
# Model output classes
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
# Base class for pretrained models
from ...modeling_utils import PreTrainedModel
# Utility helpers
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_vision_available,
    logging,
    replace_return_docstrings,
)
# OWLv2 configuration classes
from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig

# Import the box-format conversion helper only when vision support is available
if is_vision_available():
    from transformers.image_transforms import center_to_corners_format

# Module-level logger
logger = logging.get_logger(__name__)

# Checkpoint referenced in the documentation
_CHECKPOINT_FOR_DOC = "google/owlv2-base-patch16-ensemble"

# Archive list of pretrained OWLv2 models
OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/owlv2-base-patch16-ensemble",
    # See all OWLv2 models at https://huggingface.co/models?filter=owlv2
]

# Contrastive loss, copied from transformers.models.clip.modeling_clip.contrastive_loss with clip->owlv2
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))

# OWLv2 loss, copied from transformers.models.clip.modeling_clip.clip_loss
def owlv2_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
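
A tiny sketch (random similarity matrix) of the symmetric contrastive loss above: it averages a text-to-image and an image-to-text cross-entropy, both of which treat the diagonal entries as the matching pairs.

```python
import torch

logits_per_text = torch.randn(4, 4)  # similarity of 4 captions against 4 images
loss = owlv2_loss(logits_per_text)   # uses contrastive_loss defined above
print(loss)                          # scalar tensor; low when the diagonal dominates each row/column
```
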

# Output dataclass for the OWLv2 model, inherits from ModelOutput
@dataclass
class Owlv2Output(ModelOutput):
    """
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`Owlv2VisionModel`].
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`Owlv2TextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Owlv2VisionModel`].
    """

    # Optional attribute: contrastive loss for image-text similarity
    loss: Optional[torch.FloatTensor] = None
    # Tensor attribute: scores of image-text similarity
    logits_per_image: torch.FloatTensor = None
    # Tensor attribute: scores of text-image similarity
    logits_per_text: torch.FloatTensor = None
    # Tensor attribute: embeddings of text data
    text_embeds: torch.FloatTensor = None
    # Tensor attribute: embeddings of image data
    image_embeds: torch.FloatTensor = None
    # Tuple attribute: output from text model with pooling
    text_model_output: BaseModelOutputWithPooling = None
    # Object attribute: output from vision model with pooling
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        # Convert all attributes to a tuple, handling special cases for complex objects
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        # For integer tensors, upcast to a wider integer type when needed
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
            `0 <= x1 < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    # Upcast the boxes to avoid numerical precision and overflow issues
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU as defined at https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # Degenerate boxes give inf / nan results, so do an early check
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")

    # Compute the IoU and the union area of the boxes
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
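
A short sketch (made-up boxes) of why generalized IoU is used as a regression target: for disjoint boxes the plain IoU is exactly zero, while GIoU is negative and still carries a signal toward overlap.

```python
import torch

boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
boxes2 = torch.tensor([[2.0, 2.0, 3.0, 3.0]])

print(box_iou(boxes1, boxes2)[0])           # tensor([[0.]])
print(generalized_box_iou(boxes1, boxes2))  # tensor([[-0.7778]]) = 0 - (9 - 2) / 9
```
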


@dataclass
class Owlv2ObjectDetectionOutput(ModelOutput):
    """
    [`Owlv2ForObjectDetection`] 的输出类型。
    """
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
            The objectness logits of all image patches. OWL-ViT represents images as a set of image patches where the
            total number of patches is (image_size / patch_size)**2.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
            embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`Owlv2TextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Owlv2VisionModel`].
    """

    # Optional: Total loss combining cross-entropy and bounding box loss
    loss: Optional[torch.FloatTensor] = None
    # Optional: Dictionary containing individual losses
    loss_dict: Optional[Dict] = None
    # Optional: Classification logits (including no-object) for all queries
    logits: torch.FloatTensor = None
    # Optional: Objectness logits for all image patches
    objectness_logits: torch.FloatTensor = None
    # Optional: Normalized bounding box coordinates for all queries
    pred_boxes: torch.FloatTensor = None
    # Optional: Text embeddings obtained from Owlv2TextModel
    text_embeds: torch.FloatTensor = None
    # Optional: Image embeddings obtained from Owlv2VisionModel
    image_embeds: torch.FloatTensor = None
    # Optional: Class embeddings of all image patches
    class_embeds: torch.FloatTensor = None
    # Optional: Output of Owlv2TextModel, including pooling
    text_model_output: BaseModelOutputWithPooling = None
    # Optional: Output of Owlv2VisionModel, including pooling
    vision_model_output: BaseModelOutputWithPooling = None
    # Convert the output object to a tuple
    def to_tuple(self) -> Tuple[Any]:
        # Build the tuple with a generator expression over all keys
        return tuple(
            # For regular keys, return the stored value directly;
            # for the nested model outputs, recurse into their own to_tuple()
            self[k] if k not in ["text_model_output", "vision_model_output"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


# Dataclass holding the image-guided detection outputs
@dataclass
class Owlv2ImageGuidedObjectDetectionOutput(ModelOutput):
    """
    [`Owlv2ForObjectDetection.image_guided_detection`] 的输出类型。

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            所有查询的分类 logits(包括无对象)。
        target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            所有查询的标准化框坐标,表示为 (center_x, center_y, width, height)。这些值在 [0, 1] 范围内,
            相对于批次中每个目标图像的大小(忽略可能的填充)。您可以使用 [`~Owlv2ImageProcessor.post_process_object_detection`]
            来获取未标准化的边界框。
        query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            所有查询的标准化框坐标,表示为 (center_x, center_y, width, height)。这些值在 [0, 1] 范围内,
            相对于批次中每个查询图像的大小(忽略可能的填充)。您可以使用 [`~Owlv2ImageProcessor.post_process_object_detection`]
            来获取未标准化的边界框。
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            [`Owlv2VisionModel`] 的汇聚输出。OWLv2 将图像表示为一组图像补丁,并为每个补丁计算图像嵌入。
        query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            [`Owlv2VisionModel`] 的汇聚输出。OWLv2 将图像表示为一组图像补丁,并为每个补丁计算图像嵌入。
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            所有图像补丁的类嵌入。OWLv2 将图像表示为一组图像补丁,其中补丁总数为 (image_size / patch_size)**2。
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            [`Owlv2TextModel`] 的输出。
        vision_model_output (`BaseModelOutputWithPooling`):
            [`Owlv2VisionModel`] 的输出。
    """

    logits: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    query_image_embeds: torch.FloatTensor = None
    target_pred_boxes: torch.FloatTensor = None
    query_pred_boxes: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None
    # Convert the output object to a tuple
    def to_tuple(self) -> Tuple[Any]:
        # Build the tuple with a generator expression over all keys
        return tuple(
            # For regular keys, return the stored value directly;
            # for the nested model outputs, recurse into their own to_tuple()
            self[k] if k not in ["text_model_output", "vision_model_output"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionEmbeddings with OwlViT->Owlv2
class Owlv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        # Define patch embedding layer for image patches
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        # Calculate total number of patches
        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        # Positional embeddings for patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]

        # Extract patch embeddings from input image
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, num_channels, height, width]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # flatten patches and transpose for attention

        # Expand class embeddings to match batch size
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)

        # Concatenate class embeddings with patch embeddings
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # Add positional embeddings to the combined embeddings
        embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings
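
A shape walkthrough as a sketch with assumed config values, instantiating the embedding module directly (the base OWLv2 checkpoint uses image_size=960, patch_size=16, hidden_size=768):

```python
import torch
from transformers import Owlv2VisionConfig
from transformers.models.owlv2.modeling_owlv2 import Owlv2VisionEmbeddings

config = Owlv2VisionConfig(image_size=960, patch_size=16, hidden_size=768, num_channels=3)
embeddings = Owlv2VisionEmbeddings(config)

pixel_values = torch.randn(2, 3, 960, 960)
out = embeddings(pixel_values)
print(out.shape)  # torch.Size([2, 3601, 768]) -- 1 class token + (960 / 16) ** 2 = 3600 patches
```
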


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextEmbeddings with OwlViT->Owlv2
class Owlv2TextEmbeddings(nn.Module):
    def __init__(self, config: Owlv2TextConfig):
        super().__init__()

        # Token embeddings based on vocabulary size and hidden size
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)

        # Positional embeddings based on maximum position embeddings and hidden size
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        # If position_ids is not provided, use default position_ids
        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # If inputs_embeds is not provided, compute token embeddings from input_ids
        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        # Get positional embeddings based on position_ids
        position_embeddings = self.position_embedding(position_ids)

        # Combine token embeddings with positional embeddings
        embeddings = inputs_embeds + position_embeddings

        return embeddings


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTAttention with OwlViT->Owlv2
class Owlv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    # Initialization takes a config object
    def __init__(self, config):
        super().__init__()
        self.config = config
        # Embedding dimension taken from the hidden size in the config
        self.embed_dim = config.hidden_size
        # Number of attention heads
        self.num_heads = config.num_attention_heads
        # Dimension of each attention head
        self.head_dim = self.embed_dim // self.num_heads
        # The embedding dimension must be evenly divisible by the number of heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # Scaling factor applied to the attention scores
        self.scale = self.head_dim**-0.5
        # Dropout rate for the attention weights
        self.dropout = config.attention_dropout

        # Linear projections for keys, values and queries
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        # Output projection applied after attention
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    # Reshape a tensor to (batch, num_heads, seq_len, head_dim)
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # Forward pass taking hidden states and optional masks
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Owlv2
class Owlv2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config  # store the config object
        self.activation_fn = ACT2FN[config.hidden_act]  # select the activation function
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # first linear layer
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # second linear layer

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)  # apply the first linear layer
        hidden_states = self.activation_fn(hidden_states)  # apply the activation function
        hidden_states = self.fc2(hidden_states)  # apply the second linear layer
        return hidden_states


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Owlv2
class Owlv2EncoderLayer(nn.Module):
    def __init__(self, config: Owlv2Config):
        super().__init__()
        self.embed_dim = config.hidden_size  # embedding dimension
        self.self_attn = Owlv2Attention(config)  # self-attention layer
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # first LayerNorm
        self.mlp = Owlv2MLP(config)  # MLP block
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # second LayerNorm

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states  # keep the residual connection

        hidden_states = self.layer_norm1(hidden_states)  # pre-norm before self-attention
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )  # apply self-attention
        hidden_states = residual + hidden_states  # add the residual connection

        residual = hidden_states  # new residual for the MLP block

        hidden_states = self.layer_norm2(hidden_states)  # pre-norm before the MLP
        hidden_states = self.mlp(hidden_states)  # apply the MLP
        hidden_states = residual + hidden_states  # add the residual connection

        outputs = (hidden_states,)  # assemble the outputs

        if output_attentions:
            outputs += (attn_weights,)  # optionally include the attention weights

        return outputs


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTPreTrainedModel with OwlViT->Owlv2,owlvit->owlv2
class Owlv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    # Use Owlv2Config as the configuration class
    config_class = Owlv2Config
    # Prefix used for the base model
    base_model_prefix = "owlv2"
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True
    # Modules that must not be split across devices
    _no_split_modules = ["Owlv2EncoderLayer"]

    # Weight initialization
    def _init_weights(self, module):
        """Initialize the weights"""
        # Global initializer factor from the config
        factor = self.config.initializer_factor

        # Text embeddings
        if isinstance(module, Owlv2TextEmbeddings):
            # Initialize the token and position embedding weights
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)

        # Vision embeddings
        elif isinstance(module, Owlv2VisionEmbeddings):
            # Initialize the class, patch and position embedding weights
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)

        # Attention layers
        elif isinstance(module, Owlv2Attention):
            # Initialize the projection weights of the attention module
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)

        # MLP blocks
        elif isinstance(module, Owlv2MLP):
            # Initialize the MLP weights
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)

        # Top-level Owlv2Model
        elif isinstance(module, Owlv2Model):
            # Initialize the text and visual projection weights
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        # LayerNorm layers
        if isinstance(module, nn.LayerNorm):
            # Zero the bias and set the weight to one
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Linear layers with a bias
        if isinstance(module, nn.Linear) and module.bias is not None:
            # Zero-initialize the bias of linear layers
            module.bias.data.zero_()

# OWLV2_START_DOCSTRING opens the model docstring: the model inherits from PreTrainedModel, which provides the
# generic methods (downloading or saving weights, resizing input embeddings, pruning heads, etc.).
OWLV2_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Owlv2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# OWLV2_TEXT_INPUTS_DOCSTRING documents the text inputs and their usage.
OWLV2_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# OWLV2_VISION_INPUTS_DOCSTRING documents the vision inputs and their usage.
OWLV2_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# OWLV2_INPUTS_DOCSTRING documents the combined text and vision inputs.
OWLV2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_base_image_embeds (`bool`, *optional*):
            Whether or not to return the base image embeddings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING documents the object-detection inputs.
OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING documents the image-guided detection inputs.
OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Transformer encoder used by both the text and the vision towers of OWLv2.
class Owlv2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is an
    [`Owlv2EncoderLayer`].

    Args:
        config: Owlv2Config
    """

    def __init__(self, config: Owlv2Config):
        super().__init__()
        # Stack of `config.num_hidden_layers` Owlv2EncoderLayer modules
        self.layers = nn.ModuleList([Owlv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False

    # Forward pass of the encoder
    def forward(
        self,
        inputs_embeds,  # input embeddings
        attention_mask: Optional[torch.Tensor] = None,  # padding attention mask
        causal_attention_mask: Optional[torch.Tensor] = None,  # causal mask used by the text tower
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
        return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextTransformer, with OWLVIT replaced by OWLV2 and OwlViT by Owlv2
class Owlv2TextTransformer(nn.Module):
    # Initialization takes an Owlv2TextConfig
    def __init__(self, config: Owlv2TextConfig):
        super().__init__()
        self.config = config
        # Embedding dimension comes from the configured hidden size
        embed_dim = config.hidden_size
        # Token + position embeddings for the input sequence
        self.embeddings = Owlv2TextEmbeddings(config)
        # Transformer encoder over the embedded sequence
        self.encoder = Owlv2Encoder(config)
        # Final LayerNorm applied to the encoder output
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    # Forward pass; in the original source it is decorated with OWLV2_TEXT_INPUTS_DOCSTRING and
    # replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2TextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        # Fall back to the config defaults when the flags are not given explicitly
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Flatten the batch/query dimensions so each text query becomes one row
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        # Embed the tokens together with their positions
        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # num_samples, seq_len = input_shape  where num_samples = batch_size * num_max_text_queries
        # OWLV2's text model uses a causal attention mask; prepare it here
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # Expand the padding attention mask when one is provided
        if attention_mask is not None:
            # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        # Run the encoder over the embedded sequence
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Final LayerNorm on the last hidden state
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # Take features from the end-of-text embedding (the position of the highest token id in each sequence).
        # input_ids is cast to int before the argmax to keep the operation ONNX-compatible.
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
        ]

        # Without return_dict, return a plain tuple of outputs
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # Otherwise wrap the results in a BaseModelOutputWithPooling for a more structured output
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
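The pooling step above selects, for every sequence, the hidden state at the position of the end-of-sequence token, which in the CLIP vocabulary carries the highest token id (49407). A minimal sketch with made-up tensors (not part of the source file) illustrating that indexing:

```python
import torch

# <bos> a photo <eos> <pad> <pad>; the EOS id (49407) is the largest token id
input_ids = torch.tensor([[49406, 320, 1125, 49407, 0, 0]])
last_hidden_state = torch.randn(1, 6, 512)  # stand-in for the normalized encoder output

eos_positions = input_ids.to(torch.int).argmax(dim=-1)  # tensor([3])
pooled_output = last_hidden_state[torch.arange(input_ids.shape[0]), eos_positions]
print(pooled_output.shape)  # torch.Size([1, 512])
```
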
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextModel with OWLVIT->OWLV2, OwlViT->Owlv2
class Owlv2TextModel(Owlv2PreTrainedModel):
    config_class = Owlv2TextConfig

    def __init__(self, config: Owlv2TextConfig):
        super().__init__(config)
        # The actual text transformer
        self.text_model = Owlv2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # Return the token embedding layer of the text model
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        # Replace the token embedding layer of the text model
        self.text_model.embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2TextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        前向传播方法
        
        Returns:
            返回一个元组或者 BaseModelOutputWithPooling 类型的对象
        """
        # 调用文本模型的前向传播方法,传入输入参数
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionTransformer with OWLVIT->OWLV2, OwlViT->Owlv2
class Owlv2VisionTransformer(nn.Module):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__()
        self.config = config

        # Patch/position embeddings followed by a pre-encoder LayerNorm
        self.embeddings = Owlv2VisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Transformer encoder and post-encoder LayerNorm
        self.encoder = Owlv2Encoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2VisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        神经网络模型的前向传播函数,接收输入像素值和一些可选参数,并返回模型输出。

        Args:
            pixel_values (torch.FloatTensor): 输入的像素值张量。
            output_attentions (Optional[bool]): 是否输出注意力权重,默认为None。
            output_hidden_states (Optional[bool]): 是否输出隐藏状态,默认为None。
            return_dict (Optional[bool]): 是否以字典形式返回输出,默认为None。

        Returns:
            Union[Tuple, BaseModelOutputWithPooling]: 根据 `return_dict` 参数返回不同形式的模型输出。

        """
        # Fall back to the config values when the flags are not explicitly provided
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Cast pixel values to the dtype expected by the patch embedding weights
        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
        pixel_values = pixel_values.to(expected_input_dtype)

        # Embed the pixel values and apply the pre-encoder LayerNorm
        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layernorm(hidden_states)

        # Run the Transformer encoder on the embedded patches
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pool by taking the class (first) token of the last hidden state
        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]

        # Apply the post-encoder LayerNorm to the pooled output
        pooled_output = self.post_layernorm(pooled_output)

        # Without return_dict, return a plain tuple of outputs
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # Otherwise wrap the results in a BaseModelOutputWithPooling
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionModel with OWLVIT->OWLV2, OwlViT->Owlv2, google/owlvit-base-patch32->google/owlv2-base-patch16
class Owlv2VisionModel(Owlv2PreTrainedModel):
    config_class = Owlv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Owlv2VisionConfig):
        super().__init__(config)
        # The actual vision transformer
        self.vision_model = Owlv2VisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # Return the patch embedding layer of the vision model
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2VisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        返回视觉模型的前向传播结果。

        参数:
        - pixel_values: 可选的 torch.FloatTensor,像素值
        - output_attentions: 可选的 bool,是否输出注意力权重
        - output_hidden_states: 可选的 bool,是否输出隐藏状态
        - return_dict: 可选的 bool,是否返回字典形式的输出

        返回:
        - BaseModelOutputWithPooling 或 Tuple,模型输出的汇总结果

        示例:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Owlv2VisionModel

        >>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # 池化后的 CLS 状态
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(OWLV2_START_DOCSTRING)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTModel with google/owlvit-base-patch32->google/owlv2-base-patch16-ensemble, OWLVIT->OWLV2, OwlViT->Owlv2, owlvit->owlv2, OWL-ViT->OWLv2
class Owlv2Model(Owlv2PreTrainedModel):
    config_class = Owlv2Config

    def __init__(self, config: Owlv2Config):
        super().__init__(config)

        # Validate the nested configs
        if not isinstance(config.text_config, Owlv2TextConfig):
            raise ValueError(
                "config.text_config is expected to be of type Owlv2TextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, Owlv2VisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type Owlv2VisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # Projection dimension plus the text and vision embedding dimensions
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # Text and vision towers
        self.text_model = Owlv2TextTransformer(text_config)
        self.vision_model = Owlv2VisionTransformer(vision_config)

        # Bias-free projection layers into the shared embedding space
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # Learnable logit scale (temperature) parameter
        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Owlv2TextModel`].

        Examples:
        ```
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astronaut"]], return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Fall back to the config value when return_dict is not explicitly provided
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Embed all text queries of all batch samples, then project the pooled output
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict)
        pooled_output = text_output[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Owlv2VisionModel`].

        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""
        
        # Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the vision tower
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project the pooled vision output to obtain the image features
        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(OWLV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Owlv2Output, config_class=Owlv2Config)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_base_image_embeds: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Joint forward pass over text and image inputs (contrastive logits, embeddings, etc.);
        # the full body is omitted in this walkthrough, only the signature is shown.
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTBoxPredictionHead with OwlViT->Owlv2
class Owlv2BoxPredictionHead(nn.Module):
    def __init__(self, config: Owlv2Config, out_dim: int = 4):
        super().__init__()

        # Extract hidden size from configuration
        width = config.vision_config.hidden_size
        # Define fully connected layers for box prediction head
        self.dense0 = nn.Linear(width, width)
        self.dense1 = nn.Linear(width, width)
        self.gelu = nn.GELU()
        self.dense2 = nn.Linear(width, out_dim)

    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
        # Pass through the first linear layer followed by GELU activation
        output = self.dense0(image_features)
        output = self.gelu(output)
        # Pass through the second linear layer followed by GELU activation
        output = self.dense1(output)
        output = self.gelu(output)
        # Final prediction through the third linear layer
        output = self.dense2(output)
        return output


# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTClassPredictionHead with OwlViT->Owlv2
class Owlv2ClassPredictionHead(nn.Module):
    def __init__(self, config: Owlv2Config):
        super().__init__()

        # Extract hidden sizes from configuration
        out_dim = config.text_config.hidden_size
        self.query_dim = config.vision_config.hidden_size

        # Define fully connected layers and activation functions for class prediction head
        self.dense0 = nn.Linear(self.query_dim, out_dim)
        self.logit_shift = nn.Linear(self.query_dim, 1)
        self.logit_scale = nn.Linear(self.query_dim, 1)
        self.elu = nn.ELU()

    def forward(
        self,
        image_embeds: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor],
        query_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.FloatTensor]:
        # Compute image class embeddings
        image_class_embeds = self.dense0(image_embeds)

        # Handle case when query embeddings are not provided
        if query_embeds is None:
            device = image_class_embeds.device
            batch_size, num_patches = image_class_embeds.shape[:2]
            # Initialize prediction logits with zeros
            pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
            return (pred_logits, image_class_embeds)

        # Normalize image and query embeddings
        image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
        query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)

        # Calculate class predictions using matrix multiplication
        pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)

        # Apply a learnable shift and scale to logits
        logit_shift = self.logit_shift(image_embeds)
        logit_scale = self.logit_scale(image_embeds)
        logit_scale = self.elu(logit_scale) + 1
        pred_logits = (pred_logits + logit_shift) * logit_scale

        # Apply mask to logits if provided
        if query_mask is not None:
            if query_mask.ndim > 1:
                query_mask = torch.unsqueeze(query_mask, dim=-2)

            pred_logits = pred_logits.to(torch.float64)
            pred_logits = torch.where(query_mask == 0, -1e6, pred_logits)
            pred_logits = pred_logits.to(torch.float32)

        return (pred_logits, image_class_embeds)
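The head above first L2-normalizes both the per-patch image embeddings and the text query embeddings, so the einsum computes a cosine similarity for every (patch, query) pair, which is then affinely adjusted by the learned per-patch shift and scale. A small shape sketch with random tensors (illustrative only, not part of the file):

```python
import torch

batch_size, num_patches, num_queries, dim = 2, 4, 3, 8
image_class_embeds = torch.randn(batch_size, num_patches, dim)
query_embeds = torch.randn(batch_size, num_queries, dim)

# Normalize so the dot products below are cosine similarities
image_class_embeds = image_class_embeds / (image_class_embeds.norm(dim=-1, keepdim=True) + 1e-6)
query_embeds = query_embeds / (query_embeds.norm(dim=-1, keepdim=True) + 1e-6)

pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)
print(pred_logits.shape)  # torch.Size([2, 4, 3]): one logit per (patch, query) pair
```
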


class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
    config_class = Owlv2Config
    def __init__(self, config: Owlv2Config):
        super().__init__(config)

        # Base OWLv2 model plus the detection-specific heads
        self.owlv2 = Owlv2Model(config)
        self.class_head = Owlv2ClassPredictionHead(config)
        self.box_head = Owlv2BoxPredictionHead(config)
        # The objectness head reuses the box-prediction MLP with a single output channel
        self.objectness_head = Owlv2BoxPredictionHead(config, out_dim=1)

        # LayerNorm over the merged image embeddings and a sigmoid for the box coordinates
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
        self.sigmoid = nn.Sigmoid()

        # Number of patches per image side: image size divided by patch size
        self.sqrt_num_patches = config.vision_config.image_size // config.vision_config.patch_size

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.normalize_grid_corner_coordinates
    def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):
        # Computes normalized xy corner coordinates for every patch in the feature map

        if not feature_map.ndim == 4:
            raise ValueError("Expected input shape is [batch_size, num_patches, num_patches, hidden_dim]")

        device = feature_map.device
        num_patches = feature_map.shape[1]

        # Build a 2D grid of patch corner coordinates with numpy, normalized to (0, 1]
        box_coordinates = np.stack(
            np.meshgrid(np.arange(1, num_patches + 1), np.arange(1, num_patches + 1)), axis=-1
        ).astype(np.float32)
        box_coordinates /= np.array([num_patches, num_patches], np.float32)

        # Flatten (h, w, 2) -> (h*w, 2)
        box_coordinates = box_coordinates.reshape(
            box_coordinates.shape[0] * box_coordinates.shape[1], box_coordinates.shape[2]
        )
        # Move the coordinates to the same device as the feature map
        box_coordinates = torch.from_numpy(box_coordinates).to(device)

        return box_coordinates
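For illustration, this is what the helper above computes for a hypothetical 3x3 patch grid (a sketch, not part of the file): each row is the normalized (x, y) corner coordinate of one patch, flattened to shape (9, 2).

```python
import numpy as np

num_patches = 3
coords = np.stack(
    np.meshgrid(np.arange(1, num_patches + 1), np.arange(1, num_patches + 1)), axis=-1
).astype(np.float32)
coords /= np.array([num_patches, num_patches], np.float32)
print(coords.reshape(-1, 2))
# rows ~ [0.33, 0.33], [0.67, 0.33], [1.0, 0.33], [0.33, 0.67], ..., [1.0, 1.0]
```
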

    def objectness_predictor(self, image_features: torch.FloatTensor) -> torch.FloatTensor:
        """Predicts the probability that each image feature token is an object.

        Args:
            image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`)):
                Features extracted from the image.
        Returns:
            Objectness scores.
        """
        # Detach so the objectness head does not backpropagate into the image features
        image_features = image_features.detach()
        # The objectness head outputs a single logit per token; keep only that channel
        objectness_logits = self.objectness_head(image_features)
        objectness_logits = objectness_logits[..., 0]
        return objectness_logits

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.compute_box_bias
    # Computes the bias that anchors each predicted box center to its position on the feature grid
    def compute_box_bias(self, feature_map: torch.FloatTensor) -> torch.FloatTensor:
        # Normalized grid corner coordinates, clipped to [0.0, 1.0]
        box_coordinates = self.normalize_grid_corner_coordinates(feature_map)
        box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)

        # Unnormalize xy: map the coordinates through a numerically stabilized logit function
        box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)

        # Box size bias relative to the patch size, in the same logit space
        box_size = torch.full_like(box_coord_bias, 1.0 / feature_map.shape[-2])
        box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)

        # Concatenate the coordinate and size biases
        box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
        return box_bias
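The expression log(x + 1e-4) - log1p(-x + 1e-4) is a numerically stabilized logit (inverse sigmoid), so once box_predictor adds this bias and applies a sigmoid, a head output of zero already lands on the corresponding grid position with a patch-sized box; the head only has to predict offsets. A quick numeric check with a made-up value (not part of the file):

```python
import torch

x = torch.tensor([0.25])  # a normalized grid coordinate
bias = torch.log(x + 1e-4) - torch.log1p(-x + 1e-4)
print(torch.sigmoid(bias))  # tensor([~0.2500]): the bias alone recovers the grid coordinate
```
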

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.box_predictor
    def box_predictor(
        self,
        image_feats: torch.FloatTensor,
        feature_map: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image features, also returned by the `image_text_embedder` method.

        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        """
        # Bounding box detection head [batch_size, num_boxes, 4].
        pred_boxes = self.box_head(image_feats)

        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
        pred_boxes += self.compute_box_bias(feature_map)
        pred_boxes = self.sigmoid(pred_boxes)
        return pred_boxes

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.class_predictor
    def class_predictor(
        self,
        image_feats: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor] = None,
        query_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            image_feats:
                Features extracted by the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.

        Returns:
            (pred_logits, image_class_embeds):
                The predicted logits and the image class embeddings.
        """
        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)

        return (pred_logits, image_class_embeds)

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.image_text_embedder with owlvit->owlv2
    def image_text_embedder(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            input_ids:
                Input token ids.
            pixel_values:
                Pixel values of the image.
            attention_mask:
                Attention mask for the text inputs.
            output_attentions:
                Whether to return attention weights.
            output_hidden_states:
                Whether to return hidden states.

        Returns:
            (text_embeds, image_embeds, outputs):
                The text embeddings, the spatially re-arranged image embeddings and the full model outputs.
        """
        # Encode text and image

        outputs = self.owlv2(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # Get image embeddings: the last hidden state of the vision tower, post-LayerNormed
        last_hidden_state = outputs.vision_model_output[0]
        image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)

        # Resize class token so it can be broadcast against the patch tokens
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens, then apply LayerNorm
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)
        # The text embeddings are the fourth-to-last entry of the model outputs
        text_embeds = outputs[-4]

        return (text_embeds, image_embeds, outputs)

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.image_embedder with owlvit->owlv2, OwlViTModel->Owlv2Model
    def image_embedder(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        # Get Owlv2Model vision embeddings (same as CLIP)
        vision_outputs = self.owlv2.vision_model(pixel_values=pixel_values, return_dict=True)

        # Apply post_layernorm to the last hidden state, return the non-projected output
        last_hidden_state = vision_outputs[0]
        image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)

        # Resize class token so it can be broadcast against the patch tokens
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens, then apply LayerNorm
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)

        return (image_embeds, vision_outputs)

    # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.embed_image_query
    def embed_image_query(
        self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
    ) -> torch.FloatTensor:
        # Class embeddings of the query image (the class logits themselves are not needed here)
        _, class_embeds = self.class_predictor(query_image_features)
        # Predict boxes for the query image and convert them to corner format
        pred_boxes = self.box_predictor(query_image_features, query_feature_map)
        pred_boxes_as_corners = center_to_corners_format(pred_boxes)

        # Loop over query images
        best_class_embeds = []
        best_box_indices = []
        pred_boxes_device = pred_boxes_as_corners.device

        for i in range(query_image_features.shape[0]):
            # The query "box" is the whole image, i.e. [0, 0, 1, 1]
            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
            each_query_pred_boxes = pred_boxes_as_corners[i]
            # IoU between the whole-image box and every predicted box
            ious, _ = box_iou(each_query_box, each_query_pred_boxes)

            # If there are no overlapping boxes, fall back to generalized IoU
            if torch.all(ious[0] == 0.0):
                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)

            # Use an adaptive threshold: keep every box within 80% of the best IoU
            iou_threshold = torch.max(ious) * 0.8

            selected_inds = (ious[0] >= iou_threshold).nonzero()
            if selected_inds.numel():
                # Among the selected boxes, pick the embedding least similar to the mean embedding
                selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
                mean_embeds = torch.mean(class_embeds[i], axis=0)
                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
                best_box_ind = selected_inds[torch.argmin(mean_sim)]
                best_class_embeds.append(class_embeds[i][best_box_ind])
                best_box_indices.append(best_box_ind)

        # Stack the per-image selections (None when nothing was selected)
        if best_class_embeds:
            query_embeds = torch.stack(best_class_embeds)
            box_indices = torch.stack(best_box_indices)
        else:
            query_embeds, box_indices = None, None

        # Return the query embeddings, the indices of the chosen boxes and all predicted boxes
        return query_embeds, box_indices, pred_boxes

    @add_start_docstrings_to_model_forward(OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Owlv2ImageGuidedObjectDetectionOutput, config_class=Owlv2Config)
    def image_guided_detection(
        self,
        pixel_values: torch.FloatTensor,
        query_pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Image-guided (one-shot) object detection; the full body is omitted in this walkthrough.

    @add_start_docstrings_to_model_forward(OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Owlv2ObjectDetectionOutput, config_class=Owlv2Config)
    def forward(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Text-conditioned object detection forward pass; the full body is omitted in this walkthrough.
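Since the forward bodies are omitted above, here is a hedged end-to-end usage sketch of text-conditioned detection with Owlv2ForObjectDetection; the checkpoint name and threshold are illustrative and follow the model card conventions rather than anything in this file.

```python
import requests
import torch
from PIL import Image

from transformers import Owlv2ForObjectDetection, Owlv2Processor

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Rescale the normalized boxes back to the original image size
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=target_sizes)
print(results[0]["scores"].shape, results[0]["boxes"].shape)
```
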

.\models\owlv2\processing_owlv2.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
"""
Image/Text processor class for OWLv2
"""

from typing import List

import numpy as np

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...utils import is_flax_available, is_tf_available, is_torch_available


class Owlv2Processor(ProcessorMixin):
    r"""
    构建 OWLv2 处理器,将 [`Owlv2ImageProcessor`] 和 [`CLIPTokenizer`] / [`CLIPTokenizerFast`] 包装成一个处理器,
    继承了图像处理器和分词器的功能。详细信息请参阅 [`~OwlViTProcessor.__call__`] 和 [`~OwlViTProcessor.decode`]。

    Args:
        image_processor ([`Owlv2ImageProcessor`]):
            必需的图像处理器输入。
        tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
            必需的分词器输入。
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Owlv2ImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor, tokenizer, **kwargs):
        super().__init__(image_processor, tokenizer)

    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
    # (the __call__ implementation itself is not reproduced in this walkthrough)

    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2
    def post_process_object_detection(self, *args, **kwargs):
        """
        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
        to the docstring of this method for more information.
        """
        return self.image_processor.post_process_object_detection(*args, **kwargs)

    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2
    def post_process_image_guided_detection(self, *args, **kwargs):
        """
        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
        Please refer to the docstring of this method for more information.
        """
        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)

    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.decode
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
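A short sketch (illustrative checkpoint name, dummy image) of what the wrapper produces: the tokenizer contributes input_ids/attention_mask and the image processor contributes pixel_values.

```python
import numpy as np
from PIL import Image

from transformers import Owlv2Processor

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")
print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'pixel_values']
```
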

.\models\owlv2\__init__.py

# Copyright The HuggingFace Team; licensed under the Apache License, Version 2.0
# (http://www.apache.org/licenses/LICENSE-2.0). The software is distributed on an
# "AS IS" basis, without warranties or conditions of any kind.
from typing import TYPE_CHECKING

# Utilities for optional dependencies and lazy module loading
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
    is_vision_available,
)

# Import structure describing which submodules and names can be imported lazily
_import_structure = {
    "configuration_owlv2": [
        "OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Owlv2Config",
        "Owlv2TextConfig",
        "Owlv2VisionConfig",
    ],
    "processing_owlv2": ["Owlv2Processor"],
}

# The image processor requires the vision backend; register it only when available
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["image_processing_owlv2"] = ["Owlv2ImageProcessor"]

# The modeling code requires torch; register it only when available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_owlv2"] = [
        "OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Owlv2Model",
        "Owlv2PreTrainedModel",
        "Owlv2TextModel",
        "Owlv2VisionModel",
        "Owlv2ForObjectDetection",
    ]

# During type checking, perform the real imports so type checkers can see the symbols
if TYPE_CHECKING:
    from .configuration_owlv2 import (
        OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Owlv2Config,
        Owlv2TextConfig,
        Owlv2VisionConfig,
    )
    from .processing_owlv2 import Owlv2Processor

    # Again guard the vision-only and torch-only imports
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_owlv2 import Owlv2ImageProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_owlv2 import (
            OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST,
            Owlv2ForObjectDetection,
            Owlv2Model,
            Owlv2PreTrainedModel,
            Owlv2TextModel,
            Owlv2VisionModel,
        )

# At runtime, register the module as a lazy module so submodules are loaded on first access
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
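With the lazy module in place, importing light-weight names does not pull in the torch-heavy submodules; those are only resolved when first accessed. A small illustration (not part of the file):

```python
# Cheap: only the configuration/processing modules are materialized
from transformers import Owlv2Config, Owlv2Processor

# This access resolves modeling_owlv2 lazily and therefore requires torch
from transformers import Owlv2ForObjectDetection

print(Owlv2Config().model_type)  # "owlv2"
```
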

.\models\owlvit\configuration_owlvit.py

# coding=utf-8

# Copyright The HuggingFace Inc. team; licensed under the Apache License, Version 2.0.
# Use of this file must comply with that license (http://www.apache.org/licenses/LICENSE-2.0).

import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union

# Only needed for type hints
if TYPE_CHECKING:
    from ...processing_utils import ProcessorMixin
    from ...utils import TensorType

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Maps pretrained model names to the URLs of their configuration files
OWLVIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/owlvit-base-patch32": "https://huggingface.co/google/owlvit-base-patch32/resolve/main/config.json",
    "google/owlvit-base-patch16": "https://huggingface.co/google/owlvit-base-patch16/resolve/main/config.json",
    "google/owlvit-large-patch14": "https://huggingface.co/google/owlvit-large-patch14/resolve/main/config.json",
}


class OwlViTTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`OwlViTTextModel`]. It is used to instantiate an
    OwlViT text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the OwlViT
    [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the OWL-ViT text model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`OwlViTTextModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 16):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token in the input sequences.
        bos_token_id (`int`, *optional*, defaults to 49406):
            The id of the beginning-of-sequence token in the input sequences.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the input sequences.

    Example:

    ```
    >>> from transformers import OwlViTTextConfig, OwlViTTextModel

    >>> # Initializing a OwlViTTextModel with google/owlvit-base-patch32 style configuration
    >>> configuration = OwlViTTextConfig()

    >>> # Initializing a OwlViTTextConfig from the google/owlvit-base-patch32 style configuration
    >>> model = OwlViTTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Model type identifier used in config files
    model_type = "owlvit_text_model"
    # Constructor setting all hyper-parameters of the text encoder
    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=16,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=0,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        # Forward the special token ids and any extra kwargs to the base class
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        # Store the hyper-parameters on the config object
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # Propagate token-related kwargs
        cls._set_token_in_kwargs(kwargs)

        # Fetch the configuration dictionary of the pretrained model
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # If the loaded config is a full "owlvit" config, use its nested text_config
        if config_dict.get("model_type") == "owlvit":
            config_dict = config_dict["text_config"]

        # Warn when instantiating a config of a different model_type than the checkpoint's
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # Build the config object from the dictionary
        return cls.from_dict(config_dict, **kwargs)
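A hedged example of the branch above: loading the text config directly from a full OWL-ViT checkpoint extracts the nested "text_config" entry (fetching the config requires network access).

```python
from transformers import OwlViTTextConfig

text_config = OwlViTTextConfig.from_pretrained("google/owlvit-base-patch32")
print(text_config.model_type, text_config.hidden_size)  # owlvit_text_model 512
```
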
# Configuration class for the OWL-ViT vision encoder, inheriting from PretrainedConfig
class OwlViTVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`OwlViTVisionModel`]. It is used to instantiate
    an OWL-ViT image encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the OWL-ViT
    [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 768):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```
    >>> from transformers import OwlViTVisionConfig, OwlViTVisionModel

    >>> # Initializing a OwlViTVisionModel with google/owlvit-base-patch32 style configuration
    >>> configuration = OwlViTVisionConfig()

    >>> # Initializing a OwlViTVisionModel model from the google/owlvit-base-patch32 style configuration
    >>> model = OwlViTVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    # Model type identifier used in config files
    model_type = "owlvit_vision_model"

    # Constructor setting all hyper-parameters of the vision encoder
    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=768,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # Forward extra kwargs to the base class
        super().__init__(**kwargs)

        # Store the hyper-parameters on the config object
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
    
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # Propagate token-related kwargs
        cls._set_token_in_kwargs(kwargs)

        # Fetch the configuration dictionary of the pretrained model
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # If the loaded config is a full "owlvit" config, use its nested vision_config
        if config_dict.get("model_type") == "owlvit":
            config_dict = config_dict["vision_config"]

        # Warn when instantiating a config of a different model_type than the checkpoint's
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # Build the config object from the dictionary
        return cls.from_dict(config_dict, **kwargs)
class OwlViTConfig(PretrainedConfig):
    r"""
    [`OwlViTConfig`] is the configuration class to store the configuration of an [`OwlViTModel`]. It is used to
    instantiate an OWL-ViT model according to the specified arguments, defining the text model and vision model
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    OWL-ViT [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`OwlViTTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`OwlViTVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of the text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. The default is used as per the original OWL-ViT
            implementation.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return a dictionary. If `False`, returns a tuple.
        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

    model_type = "owlvit"

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        return_dict=True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Fall back to default sub-configs (and log it) when none are provided
        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the OwlViTTextConfig with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the OwlViTVisionConfig with default values.")

        # Build the nested text and vision configs
        self.text_config = OwlViTTextConfig(**text_config)
        self.vision_config = OwlViTVisionConfig(**vision_config)

        # Store the remaining hyper-parameters
        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.return_dict = return_dict
        self.initializer_factor = 1.0

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # Propagate token-related kwargs
        cls._set_token_in_kwargs(kwargs)

        # Fetch the configuration dictionary of the pretrained model
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Warn when instantiating a config of a different model_type than the checkpoint's
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # Build the config object from the dictionary
        return cls.from_dict(config_dict, **kwargs)

    @classmethod
    def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
        r"""
        Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision
        model configuration.

        Returns:
            [`OwlViTConfig`]: An instance of a configuration object
        """
        # Nest the two sub-config dictionaries and build the full config from them
        config_dict = {}
        config_dict["text_config"] = text_config
        config_dict["vision_config"] = vision_config

        return cls.from_dict(config_dict, **kwargs)
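A small example of composing a full config from the two sub-configs; the hyper-parameter values here are just the documented defaults.

```python
from transformers import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig

text_config = OwlViTTextConfig()
vision_config = OwlViTVisionConfig()
config = OwlViTConfig.from_text_vision_configs(text_config.to_dict(), vision_config.to_dict())
print(config.projection_dim, config.text_config.hidden_size)  # 512 512
```
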
# ONNX export configuration for the OWL-ViT model, inheriting from OnnxConfig
class OwlViTOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # Ordered mapping from input names to their dynamic axes
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        # Ordered mapping from output names to their dynamic axes
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        # Absolute tolerance used when validating the exported model
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        # Build dummy text inputs with the tokenizer and dummy image inputs with the image processor
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        # Default ONNX opset version used for the export
        return 14
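A hedged sketch of driving the ONNX config above: it assumes a processor is available to build the dummy text and image inputs, and that the export utilities behave as in recent transformers releases.

```python
from transformers import OwlViTConfig, OwlViTProcessor
from transformers.models.owlvit.configuration_owlvit import OwlViTOnnxConfig
from transformers.utils import TensorType

onnx_config = OwlViTOnnxConfig(OwlViTConfig())
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")

dummy_inputs = onnx_config.generate_dummy_inputs(processor, framework=TensorType.PYTORCH)
print(sorted(dummy_inputs.keys()))     # ['attention_mask', 'input_ids', 'pixel_values']
print(onnx_config.default_onnx_opset)  # 14
```
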

.\models\owlvit\convert_owlvit_original_flax_to_hf.py

# coding: utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); this file may only be
# used in compliance with the License, a copy of which is available at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, the software is distributed
# on an "AS IS" basis, without warranties or conditions of any kind.

"""Convert OWL-ViT checkpoints from the original repository. URL:
https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit"""

import argparse
import collections

# JAX/Flax are needed to read the original checkpoints, PyTorch/CLIP for the converted model
import jax
import jax.numpy as jnp
import torch
import torch.nn as nn
from clip.model import CLIP
from flax.training import checkpoints
from huggingface_hub import Repository

from transformers import (
    CLIPTokenizer,
    OwlViTConfig,
    OwlViTForObjectDetection,
    OwlViTImageProcessor,
    OwlViTModel,
    OwlViTProcessor,
)

# Hyper-parameters of the original CLIP backbones for each OWL-ViT variant
CONFIGS = {
    "vit_b32": {
        "embed_dim": 512,
        "image_resolution": 768,
        "context_length": 16,
        "vocab_size": 49408,
        "vision_layers": 12,
        "vision_width": 768,
        "vision_patch_size": 32,
        "transformer_width": 512,
        "transformer_heads": 8,
        "transformer_layers": 12,
    },
    "vit_b16": {
        "embed_dim": 512,
        "image_resolution": 768,
        "context_length": 16,
        "vocab_size": 49408,
        "vision_layers": 12,
        "vision_width": 768,
        "vision_patch_size": 16,
        "transformer_width": 512,
        "transformer_heads": 8,
        "transformer_layers": 12,
    },
    "vit_l14": {
        "embed_dim": 768,
        "image_resolution": 840,
        "context_length": 16,
        "vocab_size": 49408,
        "vision_layers": 24,
        "vision_width": 1024,
        "vision_patch_size": 14,
        "transformer_width": 768,
        "transformer_heads": 12,
        "transformer_layers": 12,
    },
}


def flatten_nested_dict(params, parent_key="", sep="/"):
    """将嵌套字典展开为扁平化字典

    Args:
        params (dict): 要展开的嵌套字典
        parent_key (str, optional): 父键名. Defaults to "".
        sep (str, optional): 键之间的分隔符. Defaults to "/".

    Returns:
        dict: 扁平化后的字典
    """
    items = []

    for k, v in params.items():
        new_key = parent_key + sep + k if parent_key else k

        if isinstance(v, collections.MutableMapping):
            items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
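
# 补充示意:一个最小例子,展示 flatten_nested_dict 如何把嵌套参数字典展开为以 "/" 连接的扁平键(数据为假设值,非源码):
example_params = {"backbone": {"clip": {"visual": {"proj": 1}}, "merged_class_token": {"scale": 2}}}
flat = flatten_nested_dict(example_params)
# flat == {"backbone/clip/visual/proj": 1, "backbone/merged_class_token/scale": 2}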


def to_f32(params):
    """将参数中的bfloat16类型转换为float32类型

    Args:
        params (any): 待转换的参数

    Returns:
        any: 转换后的参数
    """
    return jax.tree_util.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, params)
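
# 补充示意:to_f32 借助 jax.tree_util.tree_map 递归遍历参数树,只把 bfloat16 的叶子转换为 float32(示例数据为假设值,非源码):
example_tree = {"w": jnp.zeros((2, 2), dtype=jnp.bfloat16), "b": jnp.zeros((2,), dtype=jnp.float32)}
example_tree = to_f32(example_tree)
print({k: v.dtype for k, v in example_tree.items()})   # 两个叶子的 dtype 均变为 float32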


def copy_attn_layer(hf_attn_layer, pt_attn_layer):
    """复制注意力层参数

    Args:
        hf_attn_layer (torch.nn.Module): Hugging Face模型中的注意力层
        pt_attn_layer (torch.nn.Module): PyTorch模型中的注意力层
    """
    q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
    q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)

    out_proj_weights = pt_attn_layer.out_proj.weight
    out_proj_bias = pt_attn_layer.out_proj.bias
    # 设置自注意力层的查询投影权重数据为给定的张量 q_proj
    hf_attn_layer.q_proj.weight.data = q_proj
    # 设置自注意力层的查询投影偏置数据为给定的张量 q_proj_bias
    hf_attn_layer.q_proj.bias.data = q_proj_bias
    
    # 设置自注意力层的键投影权重数据为给定的张量 k_proj
    hf_attn_layer.k_proj.weight.data = k_proj
    # 设置自注意力层的键投影偏置数据为给定的张量 k_proj_bias
    hf_attn_layer.k_proj.bias.data = k_proj_bias
    
    # 设置自注意力层的值投影权重数据为给定的张量 v_proj
    hf_attn_layer.v_proj.weight.data = v_proj
    # 设置自注意力层的值投影偏置数据为给定的张量 v_proj_bias
    hf_attn_layer.v_proj.bias.data = v_proj_bias
    
    # 设置自注意力层的输出投影权重数据为给定的张量 out_proj_weights
    hf_attn_layer.out_proj.weight = out_proj_weights
    # 设置自注意力层的输出投影偏置数据为给定的张量 out_proj_bias
    hf_attn_layer.out_proj.bias = out_proj_bias
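
# 补充示意:上面的 chunk(3, dim=0) 依赖 torch.nn.MultiheadAttention 把 q/k/v 三个投影打包在
# 形状为 (3 * embed_dim, embed_dim) 的 in_proj_weight(以及 (3 * embed_dim,) 的 in_proj_bias)中。
# 下面是一个形状示意(embed_dim=512、num_heads=8 为假设值,非源码):
mha = nn.MultiheadAttention(embed_dim=512, num_heads=8)
q_w, k_w, v_w = mha.in_proj_weight.chunk(3, dim=0)   # 每个形状为 (512, 512)
q_b, k_b, v_b = mha.in_proj_bias.chunk(3, dim=0)     # 每个形状为 (512,)
print(q_w.shape, mha.out_proj.weight.shape)          # torch.Size([512, 512]) torch.Size([512, 512])
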
def copy_mlp(hf_mlp, pt_mlp):
    # 复制多层感知机的线性层
    copy_linear(hf_mlp.fc1, pt_mlp.c_fc)
    copy_linear(hf_mlp.fc2, pt_mlp.c_proj)


def copy_linear(hf_linear, pt_linear):
    # 复制线性层的权重和偏置
    hf_linear.weight = pt_linear.weight
    hf_linear.bias = pt_linear.bias


def copy_layer(hf_layer, pt_layer):
    # 复制层的归一化层
    copy_linear(hf_layer.layer_norm1, pt_layer.ln_1)
    copy_linear(hf_layer.layer_norm2, pt_layer.ln_2)

    # 复制多层感知机
    copy_mlp(hf_layer.mlp, pt_layer.mlp)

    # 复制注意力层
    copy_attn_layer(hf_layer.self_attn, pt_layer.attn)


def copy_layers(hf_layers, pt_layers):
    # 遍历并复制每一层
    for hf_layer, pt_layer in zip(hf_layers, pt_layers):
        copy_layer(hf_layer, pt_layer)


def copy_encoder(hf_encoder, pt_model):
    # 复制编码器的嵌入层
    hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight
    hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding

    # 复制最终层归一化
    copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)

    # 复制隐藏层
    copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)


def copy_text_model_and_projection(hf_model, pt_model):
    # 复制文本投影
    hf_model.text_projection.weight.data = pt_model.text_projection.data.T

    # 复制文本编码器
    copy_encoder(hf_model.text_model, pt_model)


def copy_vision_model_and_projection(hf_model, pt_model):
    # 复制视觉投影
    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T

    # 复制视觉模型的归一化层
    copy_linear(hf_model.vision_model.pre_layernorm, pt_model.visual.ln_pre)
    copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)

    # 复制视觉模型的嵌入层
    hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
    hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding
    hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data

    # 复制视觉模型的编码器
    copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)


def copy_class_merge_token(hf_model, flax_params):
    # 扁平化嵌套字典的类合并标记参数
    flax_class_token_params = flatten_nested_dict(flax_params["backbone"]["merged_class_token"])

    # 将参数转换为PyTorch张量并复制到层归一化的权重和偏置
    weight = torch.from_numpy(flax_class_token_params["scale"])
    bias = torch.from_numpy(flax_class_token_params["bias"])
    hf_model.layer_norm.weight = nn.Parameter(weight)
    hf_model.layer_norm.bias = nn.Parameter(bias)


def copy_class_box_heads(hf_model, flax_params):
    pt_params = hf_model.state_dict()
    new_params = {}

    # 将Flax类预测头参数重命名为PyTorch HF
    flax_class_params = flatten_nested_dict(flax_params["class_head"])
    # 遍历flax_class_params字典,其中包含Flax模型的类别头参数
    for flax_key, v in flax_class_params.items():
        # 将flax_key中的斜杠替换为点号,以匹配PyTorch参数命名风格
        torch_key = flax_key.replace("/", ".")
        # 替换".kernel"为".weight",调整命名以匹配PyTorch的权重命名
        torch_key = torch_key.replace(".kernel", ".weight")
        # 将"Dense_0"替换为"dense0",调整命名以匹配PyTorch的命名约定
        torch_key = torch_key.replace("Dense_0", "dense0")
        # 将调整后的参数名加上"class_head."前缀,以表示这些参数属于分类头部
        torch_key = "class_head." + torch_key

        # 如果参数名中包含"weight"且v的维度为2,则将v转置
        if "weight" in torch_key and v.ndim == 2:
            v = v.T

        # 使用torch.from_numpy(v)创建一个PyTorch的参数对象,并保存到new_params字典中
        new_params[torch_key] = nn.Parameter(torch.from_numpy(v))

    # 重命名盒预测的Flax参数到PyTorch HF
    # 将obj_box_head中的Flax参数展平为一个字典
    flax_box_params = flatten_nested_dict(flax_params["obj_box_head"])

    # 遍历flax_box_params字典,其中包含盒预测头部的Flax参数
    for flax_key, v in flax_box_params.items():
        # 将flax_key中的斜杠替换为点号,以匹配PyTorch参数命名风格
        torch_key = flax_key.replace("/", ".")
        # 替换".kernel"为".weight",调整命名以匹配PyTorch的权重命名
        torch_key = torch_key.replace(".kernel", ".weight")
        # 替换下划线为空字符串,将所有字符转为小写,调整命名以匹配PyTorch的命名约定
        torch_key = torch_key.replace("_", "").lower()
        # 将调整后的参数名加上"box_head."前缀,以表示这些参数属于盒预测头部
        torch_key = "box_head." + torch_key

        # 如果参数名中包含"weight"且v的维度为2,则将v转置
        if "weight" in torch_key and v.ndim == 2:
            v = v.T

        # 使用torch.from_numpy(v)创建一个PyTorch的参数对象,并保存到new_params字典中
        new_params[torch_key] = nn.Parameter(torch.from_numpy(v))

    # 将调整后的参数复制到PyTorch的模型参数中
    for name, param in new_params.items():
        # 如果new_params中的参数名在pt_params中存在,则将其复制到pt_params中
        if name in pt_params.keys():
            pt_params[name].copy_(param)
# 将 Flax CLIP 模型的注意力参数复制到 Hugging Face PyTorch 模型的对应位置
def copy_flax_attn_params(hf_backbone, flax_attn_params):
    # 遍历 Flax 模型的注意力参数字典
    for k, v in flax_attn_params.items():
        # 如果键名以 "transformer" 开头,则替换为对应的 PyTorch 键名
        if k.startswith("transformer"):
            torch_key = k.replace("transformer.resblocks", "text_model.encoder.layers")
        else:
            torch_key = k.replace("visual.transformer.resblocks", "vision_model.encoder.layers")

        # 将键名中的 "attn" 替换为 "self_attn"
        torch_key = torch_key.replace("attn", "self_attn")
        # 将键名中的 "key" 替换为 "k_proj"
        torch_key = torch_key.replace("key", "k_proj")
        # 将键名中的 "value" 替换为 "v_proj"
        torch_key = torch_key.replace("value", "v_proj")
        # 将键名中的 "query" 替换为 "q_proj"
        torch_key = torch_key.replace("query", "q_proj")
        # 将键名中的 "out" 替换为 "out_proj"
        torch_key = torch_key.replace("out", "out_proj")

        # 如果键名包含 "bias" 并且值的维度为 2,则将值进行形状变换
        if "bias" in torch_key and v.ndim == 2:
            shape = v.shape[0] * v.shape[1]
            v = v.reshape(shape)

        # 如果键名包含 "weight" 并且包含 "out",则将值进行形状变换和转置
        if "weight" in torch_key and "out" in torch_key:
            shape = (v.shape[0] * v.shape[1], v.shape[2])
            v = v.reshape(shape).T

        # 如果键名包含 "weight" 但不包含 "out",则将值进行形状变换和转置
        if "weight" in torch_key and "out" not in torch_key:
            shape = (v.shape[0], v.shape[1] * v.shape[2])
            v = v.reshape(shape).T

        # 将 NumPy 数组转换为 PyTorch 张量,并复制到 Hugging Face PyTorch 模型的对应位置
        v = torch.from_numpy(v)
        hf_backbone.state_dict()[torch_key].copy_(v)
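
# 补充示意:这些 reshape 的来由——Flax 注意力层通常把 query/key/value 的 kernel 存成 (embed_dim, num_heads, head_dim)、
# bias 存成 (num_heads, head_dim),out 投影的 kernel 存成 (num_heads, head_dim, embed_dim),
# 而 PyTorch 的 nn.Linear 期望二维的 (out_features, in_features)。下面用 NumPy 做形状示意(维度均为假设值,非源码):
import numpy as np

dim, heads, head_dim = 512, 8, 64
qkv_kernel = np.zeros((dim, heads, head_dim))   # query/key/value 的 Flax kernel
out_kernel = np.zeros((heads, head_dim, dim))   # out 投影的 Flax kernel
attn_bias = np.zeros((heads, head_dim))         # q/k/v 的 Flax bias

print(qkv_kernel.reshape(dim, heads * head_dim).T.shape)   # (512, 512),对应 "weight 且不含 out" 分支
print(out_kernel.reshape(heads * head_dim, dim).T.shape)   # (512, 512),对应 "weight 且含 out" 分支
print(attn_bias.reshape(heads * head_dim).shape)           # (512,),对应 "bias 且二维" 分支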


# 将 Flax CLIP 模型的注意力层参数转换为适合 Hugging Face PyTorch 模型的参数格式
def _convert_attn_layers(params):
    new_params = {}
    processed_attn_layers = []

    # 遍历参数字典
    for k, v in params.items():
        # 如果键名中包含 "attn."
        if "attn." in k:
            # 提取基础键名
            base = k[: k.rindex("attn.") + 5]
            # 如果基础键名已经处理过,则跳过
            if base in processed_attn_layers:
                continue

            # 将基础键名加入已处理列表
            processed_attn_layers.append(base)
            # 获取维度信息
            dim = params[base + "out.weight"].shape[-1]
            # 转换权重参数并进行转置,存入新参数字典
            new_params[base + "out_proj.weight"] = params[base + "out.weight"].reshape(dim, dim).T
            # 复制偏置参数到新参数字典
            new_params[base + "out_proj.bias"] = params[base + "out.bias"]
        else:
            # 直接复制非注意力层参数到新参数字典
            new_params[k] = v
    return new_params


# 将 Flax CLIP 模型的参数转换为适合 Hugging Face PyTorch CLIP 模型的参数格式
def convert_clip_backbone(flax_params, torch_config):
    # 使用给定的 PyTorch 配置创建 CLIP 模型
    torch_model = CLIP(**torch_config)
    # 将模型设为评估模式
    torch_model.eval()
    # 获取 PyTorch CLIP 模型的状态字典
    torch_clip_params = torch_model.state_dict()

    # 将嵌套字典展平为一级键值对
    flax_clip_params = flatten_nested_dict(flax_params["backbone"]["clip"])
    # 初始化新的 PyTorch 参数字典
    new_torch_params = {}
    # 遍历 flax_clip_params 字典的键值对
    for flax_key, v in flax_clip_params.items():
        # 将 flax 的键名替换为符合 PyTorch 命名规范的格式
        torch_key = flax_key.replace("/", ".")
        # 进一步替换特定的文本处理层的命名格式
        torch_key = torch_key.replace("text.token_embedding.embedding", "token_embedding.kernel")

        # 如果 torch_key 以指定的文本处理模块开头,则删除开头的部分
        if (
            torch_key.startswith("text.transformer")
            or torch_key.startswith("text.text_projection")
            or torch_key.startswith("text.ln_final")
            or torch_key.startswith("text.positional_embedding")
        ):
            torch_key = torch_key[5:]

        # 进一步替换其他特定的模块命名格式
        torch_key = torch_key.replace("text_projection.kernel", "text_projection")
        torch_key = torch_key.replace("visual.proj.kernel", "visual.proj")
        torch_key = torch_key.replace(".scale", ".weight")
        torch_key = torch_key.replace(".kernel", ".weight")

        # 如果 torch_key 包含 "conv" 或者 "downsample.0.weight",进行张量维度转置
        if "conv" in torch_key or "downsample.0.weight" in torch_key:
            v = v.transpose(3, 2, 0, 1)

        # 如果 torch_key 包含 "weight",且张量维度为二维,并且不是嵌入层,进行转置
        elif "weight" in torch_key and v.ndim == 2 and "embedding" not in torch_key:
            # 全连接层进行转置,嵌入层不转置
            v = v.T

        # 将处理后的键值对存入 new_torch_params 字典
        new_torch_params[torch_key] = v

    # 调用 _convert_attn_layers 函数将注意力层参数进行转换
    attn_params = _convert_attn_layers(new_torch_params)
    # 将转换后的注意力层参数更新到 new_torch_params 字典中
    new_torch_params.update(attn_params)
    # 清空 attn_params 字典
    attn_params = {}

    # 将 flax CLIP 骨干网络参数复制到 PyTorch 参数中
    for name, param in new_torch_params.items():
        # 如果参数名在 torch_clip_params 的键中存在
        if name in torch_clip_params.keys():
            # 将 new_torch_params 中的 NumPy 数组转换为 PyTorch 张量,并复制给 torch_clip_params
            new_param = torch.from_numpy(new_torch_params[name])
            torch_clip_params[name].copy_(new_param)
        else:
            # 将未复制的参数存入 attn_params 字典中
            attn_params[name] = param

    # 返回更新后的 PyTorch 参数、模型及注意力参数
    return torch_clip_params, torch_model, attn_params
@torch.no_grad()
def convert_owlvit_checkpoint(pt_backbone, flax_params, attn_params, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # 创建一个本地仓库对象,克隆或加载现有的PyTorch模型存储路径
    repo = Repository(pytorch_dump_folder_path, clone_from=f"google/{pytorch_dump_folder_path}")
    # 执行git pull操作,更新本地仓库内容
    repo.git_pull()

    # 如果提供了配置文件路径,则从预训练模型配置中加载OwlViTConfig
    if config_path is not None:
        config = OwlViTConfig.from_pretrained(config_path)
    else:
        # 否则创建一个空的OwlViTConfig对象
        config = OwlViTConfig()

    # 初始化一个评估模式的OwlViTModel和OwlViTForObjectDetection模型
    hf_backbone = OwlViTModel(config).eval()
    hf_model = OwlViTForObjectDetection(config).eval()

    # 复制文本模型和投影层到hf_backbone
    copy_text_model_and_projection(hf_backbone, pt_backbone)
    # 复制视觉模型和投影层到hf_backbone
    copy_vision_model_and_projection(hf_backbone, pt_backbone)
    # 将pt_backbone的logit_scale属性复制到hf_backbone的logit_scale属性
    hf_backbone.logit_scale = pt_backbone.logit_scale
    # 复制Flax的注意力参数到hf_backbone
    copy_flax_attn_params(hf_backbone, attn_params)

    # 将hf_backbone设置为hf_model的OwlViT模块
    hf_model.owlvit = hf_backbone
    # 复制flax_params中的类合并令牌到hf_model
    copy_class_merge_token(hf_model, flax_params)
    # 复制flax_params中的类盒头到hf_model
    copy_class_box_heads(hf_model, flax_params)

    # 保存转换后的HF模型到本地仓库目录
    hf_model.save_pretrained(repo.local_dir)

    # 初始化图像处理器,使用指定的图像大小和裁剪大小
    image_processor = OwlViTImageProcessor(
        size=config.vision_config.image_size, crop_size=config.vision_config.image_size
    )
    # 初始化CLIPTokenizer,从预训练模型"openai/clip-vit-base-patch32"加载,并设置pad_token和model_max_length
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", pad_token="!", model_max_length=16)

    # 初始化OwlViTProcessor,传入image_processor和tokenizer作为参数
    processor = OwlViTProcessor(image_processor=image_processor, tokenizer=tokenizer)
    # 将image_processor保存到本地仓库目录
    image_processor.save_pretrained(repo.local_dir)
    # 将processor保存到本地仓库目录
    processor.save_pretrained(repo.local_dir)

    # 向git仓库添加修改
    repo.git_add()
    # 提交修改,并添加描述信息"Upload model and processor"
    repo.git_commit("Upload model and processor")
    # 推送本地仓库内容到远程仓库
    repo.git_push()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # 必选参数
    parser.add_argument(
        "--owlvit_version",
        default=None,
        type=str,
        required=True,
        help="OWL-ViT model name [clip_b16, clip_b32, clip_l14].",
    )
    parser.add_argument(
        "--owlvit_checkpoint", default=None, type=str, required=True, help="Path to flax model checkpoint."
    )
    parser.add_argument("--hf_config", default=None, type=str, required=True, help="Path to HF model config.")
    parser.add_argument(
        "--pytorch_dump_folder_path", default="hf_model", type=str, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()

    # 初始化PyTorch CLIP模型
    model_name = args.owlvit_version
    if model_name == "clip_b16":
        torch_config = CONFIGS["vit_b16"]
    elif model_name == "clip_b32":
        torch_config = CONFIGS["vit_b32"]
    elif model_name == "clip_l14":
        torch_config = CONFIGS["vit_l14"]

    # 从检查点中加载变量,并将参数转换为float-32
    variables = checkpoints.restore_checkpoint(args.owlvit_checkpoint, target=None)["optimizer"]["target"]
    flax_params = jax.tree_util.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, variables)
    del variables

    # 转换CLIP的backbone
    # 调用函数 convert_clip_backbone 将 flax_params 转换为 pt_backbone_params、clip_pt 和 attn_params
    pt_backbone_params, clip_pt, attn_params = convert_clip_backbone(flax_params, torch_config)

    # 调用函数 convert_owlvit_checkpoint,将 clip_pt、flax_params 和 attn_params 转换为 PyTorch 模型的检查点
    # 将结果保存到指定的路径 args.pytorch_dump_folder_path,并传入额外的配置参数 args.hf_config
    convert_owlvit_checkpoint(clip_pt, flax_params, attn_params, args.pytorch_dump_folder_path, args.hf_config)
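
# 补充示意:结合上面 argparse 定义的参数,一次典型的转换调用大致如下(各路径均为占位示例,非源码):
# python convert_owlvit_original_flax_to_hf.py \
#     --owlvit_version clip_b32 \
#     --owlvit_checkpoint /path/to/scenic_owlvit_checkpoint \
#     --hf_config /path/to/owlvit_config.json \
#     --pytorch_dump_folder_path owlvit-base-patch32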

.\models\owlvit\feature_extraction_owlvit.py

# 设置编码为 UTF-8,确保脚本能够正确处理 Unicode 字符串
# 版权声明,标明 HuggingFace Inc. 团队保留所有权利
#
# 根据 Apache 许可证版本 2.0 进行许可,除非符合许可证条款,否则不得使用此文件
# 可以在以下网址获取许可证副本:http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则不得根据此许可证分发本软件
# 本软件基于"按原样"基础提供,不提供任何明示或暗示的担保或条件
# 有关特定语言的权限,请参阅许可证
"""OwlViT 的特征提取器类。"""

# 导入警告模块,用于标记类已经被弃用
import warnings

# 从 utils 模块中导入 logging 功能
from ...utils import logging

# 从 image_processing_owlvit 模块中导入 OwlViTImageProcessor 类
from .image_processing_owlvit import OwlViTImageProcessor

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# OwlViTFeatureExtractor 类继承自 OwlViTImageProcessor 类
class OwlViTFeatureExtractor(OwlViTImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # 发出警告,表明 OwlViTFeatureExtractor 类已被弃用,并将在 Transformers 的版本 5 中移除
        warnings.warn(
            "The class OwlViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use OwlViTImageProcessor instead.",
            FutureWarning,
        )
        # 调用父类 OwlViTImageProcessor 的初始化方法
        super().__init__(*args, **kwargs)

.\models\owlvit\image_processing_owlvit.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for OwlViT"""

import warnings
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    center_crop,
    center_to_corners_format,
    rescale,
    resize,
    to_channel_dimension_format,
)
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_torch_available, logging

if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


def _upcast(t):
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


def box_area(boxes):
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.
    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    # Calculate the area of each box using the formula: (width) * (height)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(boxes1, boxes2):
    """
    Computes the Intersection over Union (IoU) between two sets of bounding boxes.

    Args:
        boxes1 (`torch.FloatTensor` of shape `(N, 4)`): Bounding boxes in format (x1, y1, x2, y2).
        boxes2 (`torch.FloatTensor` of shape `(M, 4)`): Bounding boxes in format (x1, y1, x2, y2).

    Returns:
        `torch.FloatTensor`: IoU values of shape `(N, M)`.
        `torch.FloatTensor`: Union area of shape `(N, M)`.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Calculate the coordinates of the intersection boxes
    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Calculate width and height of intersection area, clamping at zero
    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]

    # Calculate intersection area
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    # Calculate union area
    union = area1[:, None] + area2 - inter

    # Calculate IoU
    iou = inter / union
    return iou, union
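
# 补充示意:一个最小的调用示例(框坐标为随意假设的像素值,非源码):
boxes1 = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
boxes2 = torch.tensor([[5.0, 5.0, 15.0, 15.0], [20.0, 20.0, 30.0, 30.0]])
iou, union = box_iou(boxes1, boxes2)
print(iou)     # 约 tensor([[0.1429, 0.0000]]):交集 25 / 并集 175;第二个框与 boxes1 不相交
print(union)   # tensor([[175., 200.]])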


class OwlViTImageProcessor(BaseImageProcessor):
    r"""
    Constructs an OWL-ViT image processor.

    This image processor inherits from `ImageProcessingMixin` which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the shorter edge of the input to a certain `size`.
        size (`Dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
            The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a
            sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized
            to (size, size).
        resample (`int`, *optional*, defaults to `Resampling.BICUBIC`):
            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
            to `True`.
        do_center_crop (`bool`, *optional*, defaults to `False`):
            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
            image is padded with 0's and then center cropped.
        crop_size (`int`, *optional*, defaults to {"height": 768, "width": 768}):
            The size to use for center cropping the image. Only has an effect if `do_center_crop` is set to `True`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the input by a certain factor.
        rescale_factor (`float`, *optional*, defaults to `1/255`):
            The factor to use for rescaling the image. Only has an effect if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with `image_mean` and `image_std`. Desired output size when applying
            center-cropping. Only has an effect if `do_center_crop` is set to `True`.
        image_mean (`List[int]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            The sequence of means for each channel, to be used when normalizing images.
        image_std (`List[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            The sequence of standard deviations for each channel, to be used when normalizing images.
    """
        # 构造函数:各参数的含义与默认值见上方类注释
        def __init__(
            self,
            do_resize=True,
            size=None,
            resample=PILImageResampling.BICUBIC,
            do_center_crop=False,
            crop_size=None,
            do_rescale=True,
            rescale_factor=1 / 255,
            do_normalize=True,
            image_mean=None,
            image_std=None,
            **kwargs,
        ):
            # 如果未指定尺寸,则默认为 768x768,并确保尺寸是一个字典格式
            size = size if size is not None else {"height": 768, "width": 768}
            # 使用函数将尺寸规范化为标准字典格式,确保宽高一致
            size = get_size_dict(size, default_to_square=True)

            # 如果未指定裁剪尺寸,则默认为 768x768,并确保尺寸是一个字典格式
            crop_size = crop_size if crop_size is not None else {"height": 768, "width": 768}
            # 使用函数将裁剪尺寸规范化为标准字典格式,确保宽高一致
            crop_size = get_size_dict(crop_size, default_to_square=True)

            # 在 OWL-ViT hub 上早期的配置中,使用了 "rescale" 作为标志位。
            # 这与视觉图像处理方法 `rescale` 冲突,因为它将在 super().__init__ 调用期间设置为属性。
            # 为了向后兼容,这里将其处理为 `do_rescale` 的键值对参数。
            if "rescale" in kwargs:
                rescale_val = kwargs.pop("rescale")
                kwargs["do_rescale"] = rescale_val

            # 调用父类的初始化方法,传递所有参数
            super().__init__(**kwargs)
            # 设置对象的各个属性
            self.do_resize = do_resize
            self.size = size
            self.resample = resample
            self.do_center_crop = do_center_crop
            self.crop_size = crop_size
            self.do_rescale = do_rescale
            self.rescale_factor = rescale_factor
            self.do_normalize = do_normalize
            # 如果未指定图像均值,则使用默认的 OpenAI Clip 均值
            self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
            # 如果未指定图像标准差,则使用默认的 OpenAI Clip 标准差
            self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
            # 初始化有效的处理器关键字列表,用于验证参数的完整性
            self._valid_processor_keys = [
                "images",
                "do_resize",
                "size",
                "resample",
                "do_center_crop",
                "crop_size",
                "do_rescale",
                "rescale_factor",
                "do_normalize",
                "image_mean",
                "image_std",
                "return_tensors",
                "data_format",
                "input_data_format",
            ]

        def resize(
            self,
            image: np.ndarray,
            size: Dict[str, int],
            resample: PILImageResampling.BICUBIC,
            data_format: Optional[Union[str, ChannelDimension]] = None,
            input_data_format: Optional[Union[str, ChannelDimension]] = None,
            **kwargs,
        ) -> np.ndarray:
            # 此处省略具体实现:按 size 缩放图像(使用 resample 指定的插值方式)并返回缩放后的结果
            pass
    def center_crop(
        self,
        image: np.ndarray,
        crop_size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Center crop an image to a certain size.

        Args:
            image (`np.ndarray`):
                Image to center crop.
            crop_size (`Dict[str, int]`):
                The size to center crop the image to. Must contain height and width keys.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # 将crop_size字典转换为包含高度和宽度的标准尺寸字典
        crop_size = get_size_dict(crop_size, default_to_square=True)
        # 检查crop_size字典是否包含必需的高度和宽度键
        if "height" not in crop_size or "width" not in crop_size:
            raise ValueError("crop_size dictionary must contain height and width keys")

        # 调用函数进行中心裁剪,并返回裁剪后的图像
        return center_crop(
            image,
            (crop_size["height"], crop_size["width"]),
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The value to use for rescaling.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
                one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
        # 调用内部的图像重缩放函数,返回重缩放后的图像
        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_center_crop: Optional[bool] = None,
        crop_size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocess images with various transformations like resizing, cropping, rescaling, and normalization.

        Args:
            images (`ImageInput`):
                Input images to preprocess.
            do_resize (`bool`, *optional*):
                Whether to resize the images.
            size (`Dict[str, int]`, *optional*):
                Target size for resizing, as a dictionary with keys 'height' and 'width'.
            resample (`PILImageResampling`, *optional*):
                Resampling method for resizing images.
            do_center_crop (`bool`, *optional*):
                Whether to perform center cropping.
            crop_size (`Dict[str, int]`, *optional*):
                Size of the center crop, as a dictionary with keys 'height' and 'width'.
            do_rescale (`bool`, *optional*):
                Whether to rescale the images.
            rescale_factor (`float`, *optional*):
                Factor to use for rescaling the images.
            do_normalize (`bool`, *optional*):
                Whether to normalize the images.
            image_mean (`float` or `List[float]`, *optional*):
                Mean values for image normalization.
            image_std (`float` or `List[float]`, *optional*):
                Standard deviation values for image normalization.
            return_tensors (`TensorType` or `str`, *optional*):
                Desired tensor type for output images.
            data_format (`str` or `ChannelDimension`):
                The channel dimension format for the output images.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input images.
            **kwargs:
                Additional keyword arguments for preprocessing.

        Returns:
            Preprocessed images.
        """
        # 省略了具体的预处理步骤,根据参数进行图像预处理并返回预处理后的结果
        pass
    def post_process(self, outputs, target_sizes):
        """
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
                image size (before any data augmentation). For visualization, this should be the image size after data
                augment, but before padding.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        # TODO: (amy) add support for other frameworks
        # 发出警告信息,提醒用户该函数将在 Transformers 版本 v5 中被移除,并建议使用新函数
        warnings.warn(
            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
            FutureWarning,
        )

        # 提取模型输出的分类 logits 和预测框 boxes
        logits, boxes = outputs.logits, outputs.pred_boxes

        # 检查 logits 和 target_sizes 的维度是否匹配
        if len(logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        # 检查 target_sizes 的每个元素是否包含正确的大小 (h, w)
        if target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        # 计算每个预测框的概率 scores、类别 labels
        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # 将预测框转换为 [x0, y0, x1, y1] 格式
        boxes = center_to_corners_format(boxes)

        # 将相对坐标 [0, 1] 转换为绝对坐标 [0, height],其中 height 和 width 分别来自 target_sizes
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

        # 构建输出结果列表,每个元素是一个字典包含 scores、labels 和 boxes
        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

        return results

    def post_process_object_detection(
        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
    ):
        """
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        # TODO: (amy) add support for other frameworks
        # Extract logits and bounding boxes from model outputs
        logits, boxes = outputs.logits, outputs.pred_boxes

        # Check if target_sizes is provided and validate its length
        if target_sizes is not None:
            if len(logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )

        # Calculate probabilities and perform sigmoid activation
        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # Convert bounding boxes from center format to corners format [x0, y0, x1, y1]
        boxes = center_to_corners_format(boxes)

        # Convert relative [0, 1] coordinates to absolute [0, height] coordinates if target_sizes is provided
        if target_sizes is not None:
            if isinstance(target_sizes, List):
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)

            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]

        # Filter predictions based on score threshold and organize results into dictionaries
        results = []
        for s, l, b in zip(scores, labels, boxes):
            score = s[s > threshold]
            label = l[s > threshold]
            box = b[s > threshold]
            results.append({"scores": score, "labels": label, "boxes": box})

        return results

    # TODO: (Amy) Make compatible with other frameworks
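
# 补充示意:下面给出一个调用 post_process_object_detection 的端到端用法(模型名、阈值与图片 URL 均为常见示例取值,仅作示意,非本文件源码):
import requests
import torch
from PIL import Image
from transformers import OwlViTForObjectDetection, OwlViTProcessor

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes 需要 (h, w);PIL 的 size 是 (w, h),因此取反
target_sizes = torch.tensor([image.size[::-1]])
results = processor.image_processor.post_process_object_detection(
    outputs, threshold=0.1, target_sizes=target_sizes
)
print(results[0]["scores"].shape, results[0]["labels"], results[0]["boxes"].shape)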

.\models\owlvit\modeling_owlvit.py

# 设置代码文件的字符编码为UTF-8
# Copyright声明和许可证信息,使用Apache License 2.0
# 详情参见:http://www.apache.org/licenses/LICENSE-2.0
# 本段代码是PyTorch版本的OWL-ViT模型定义

import warnings
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import Tensor, nn

# 导入OWL-ViT模型中的激活函数映射
from ...activations import ACT2FN
# 导入OWL-ViT模型中的注意力掩码相关的函数
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
# 导入OWL-ViT模型的输出类
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
# 导入OWL-ViT模型的基类PreTrainedModel
from ...modeling_utils import PreTrainedModel
# 导入通用的工具函数
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_vision_available,
    logging,
    replace_return_docstrings,
)
# 导入OWL-ViT配置类
from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig

# 如果视觉处理可用,则导入视觉相关的转换函数
if is_vision_available():
    from transformers.image_transforms import center_to_corners_format

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 用于文档的检查点路径
_CHECKPOINT_FOR_DOC = "google/owlvit-base-patch32"

# 可用的预训练模型列表,参见 https://huggingface.co/models?filter=owlvit
OWLVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/owlvit-base-patch32",
    "google/owlvit-base-patch16",
    "google/owlvit-large-patch14",
]

# 定义对比损失函数,用于OWL-ViT模型
# 从transformers.models.clip.modeling_clip.contrastive_loss复制并修改为owlvit
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))

# 定义OWL-ViT模型特定的损失函数
# 从transformers.models.clip.modeling_clip.clip_loss复制并修改为owlvit
def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
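
# 补充示意:owlvit_loss 的输入是一个 (batch, batch) 的图文相似度矩阵,对角线位置视为匹配的正样本(数值为随意假设,非源码):
similarity = torch.tensor([[5.0, 0.1, 0.2],
                           [0.0, 4.0, 0.3],
                           [0.1, 0.2, 6.0]])
loss = owlvit_loss(similarity)   # 行方向(caption_loss)与列方向(image_loss)的交叉熵取平均
print(loss)                      # 标量;对角线越突出,损失越小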

# 定义OWL-ViT模型的输出类,继承自ModelOutput
@dataclass
class OwlViTOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`OwlViTVisionModel`].
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    # Optional attribute: Contrastive loss for image-text similarity
    loss: Optional[torch.FloatTensor] = None
    # Tensor: Scores of image-text similarity, shape (image_batch_size, text_batch_size)
    logits_per_image: torch.FloatTensor = None
    # Tensor: Scores of text-image similarity, shape (text_batch_size, image_batch_size)
    logits_per_text: torch.FloatTensor = None
    # Tensor: Embeddings of text, shape (batch_size * num_max_text_queries, output_dim)
    text_embeds: torch.FloatTensor = None
    # Tensor: Embeddings of images, shape (batch_size, output_dim)
    image_embeds: torch.FloatTensor = None
    # Tuple[`BaseModelOutputWithPooling`]: Output of text model
    text_model_output: BaseModelOutputWithPooling = None
    # `BaseModelOutputWithPooling`: Output of vision model
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        # Convert object attributes to a tuple, handling special cases for complex attributes
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
# Copied from transformers.models.detr.modeling_detr._upcast
# 将输入的张量升级到更高的数据类型,以防止在乘法操作中出现数值溢出
def _upcast(t: Tensor) -> Tensor:
    if t.is_floating_point():
        # 如果输入张量已经是浮点类型,则直接返回
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        # 如果输入张量是整型,则将其升级为对应的整型类型
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


# Copied from transformers.models.detr.modeling_detr.box_area
# 计算一组边界框的面积,边界框通过其 (x1, y1, x2, y2) 坐标来指定
def box_area(boxes: Tensor) -> Tensor:
    """
    计算一组边界框的面积,边界框的格式为 (x1, y1, x2, y2),其中 `0 <= x1 < x2` 且 `0 <= y1 < y2`。

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            待计算面积的边界框。

    Returns:
        `torch.FloatTensor`: 包含每个边界框面积的张量。
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


# Copied from transformers.models.detr.modeling_detr.box_iou
# 计算两组边界框之间的 IoU(Intersection over Union)
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # 计算交集的左上角和右下角坐标
    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # 计算交集区域的宽度和高度
    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    # 计算并集的面积
    union = area1[:, None] + area2 - inter

    # 计算 IoU
    iou = inter / union
    return iou, union


# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
# 计算广义 IoU,支持包括不完全矩形在内的任意形状的边界框
def generalized_box_iou(boxes1, boxes2):
    """
    根据 https://giou.stanford.edu/ 计算广义 IoU。边界框应为 [x0, y0, x1, y1](左上角和右下角)格式。

    Returns:
        `torch.FloatTensor`: 一个 [N, M] 的成对矩阵,其中 N = len(boxes1),M = len(boxes2)
    """
    # 检查是否存在退化的边界框,这些边界框会导致 inf / nan 的结果,因此进行早期检查
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 必须以 [x0, y0, x1, y1](左上角和右下角)格式给出,但得到的是 {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 必须以 [x0, y0, x1, y1](左上角和右下角)格式给出,但得到的是 {boxes2}")
    
    # 计算标准 IoU 和并集面积
    iou, union = box_iou(boxes1, boxes2)

    # 计算最小外接矩形的左上角和右下角坐标
    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    # 计算最小外接矩形的宽度和高度
    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    # 计算广义 IoU
    return iou - (area - union) / area
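
# 补充示意:广义 IoU 在两个框不相交时也能给出负的、可区分远近的数值。一个简单的数值例子(坐标为假设值,非源码):
boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes2 = torch.tensor([[4.0, 0.0, 6.0, 2.0]])
print(box_iou(boxes1, boxes2)[0])           # tensor([[0.]]),没有交集
print(generalized_box_iou(boxes1, boxes2))  # 约 tensor([[-0.3333]]):最小外接框面积 12,并集 8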


@dataclass
class OwlViTObjectDetectionOutput(ModelOutput):
    """
    [`OwlViTForObjectDetection`] 的输出类型。

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    # Optional attributes initialized to `None` by default
    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    # Method to convert the attributes to a tuple, excluding specific complex types
    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
@dataclass
class OwlViTImageGuidedObjectDetectionOutput(ModelOutput):
    """
    Output type of [`OwlViTForObjectDetection.image_guided_detection`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual target image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual query image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    # 定义输出类,用于图像引导物体检测的结果
    logits: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    query_image_embeds: torch.FloatTensor = None
    target_pred_boxes: torch.FloatTensor = None
    query_pred_boxes: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    # 转换为元组的方法,将类的字段转换为元组,除了"text_model_output"和"vision_model_output"外,它们将被转换为元组形式
    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )

class OwlViTVisionEmbeddings(nn.Module):
    # 初始化函数,接受一个 OwlViTVisionConfig 类型的配置对象作为参数
    def __init__(self, config: OwlViTVisionConfig):
        # 调用父类的初始化方法
        super().__init__()
        # 将配置对象保存在实例变量中
        self.config = config
        # 设置嵌入维度为配置对象中的隐藏大小
        self.embed_dim = config.hidden_size
        # 初始化类嵌入,使用随机生成的张量作为参数
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        # 创建图块嵌入层
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,          # 输入通道数为配置对象中的通道数
            out_channels=self.embed_dim,              # 输出通道数为嵌入维度
            kernel_size=config.patch_size,            # 卷积核大小为配置对象中的图块大小
            stride=config.patch_size,                 # 步幅为配置对象中的图块大小
            bias=False,                              # 不使用偏置项
        )

        # 计算图块数目,即图像尺寸除以图块大小的平方
        self.num_patches = (config.image_size // config.patch_size) ** 2
        # 设置位置嵌入层的位置数目,等于图块数目加一
        self.num_positions = self.num_patches + 1
        # 创建位置嵌入层,使用 Embedding 类,位置数目为 num_positions,嵌入维度为 embed_dim
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        # 注册位置 ID 缓冲区,创建一个张量表示从 0 到 num_positions-1 的序列
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    # 前向传播函数,接受像素值作为输入,返回嵌入向量
    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        # 获取输入张量的批量大小
        batch_size = pixel_values.shape[0]
        # 对输入像素值进行图块嵌入,输出形状为 [batch_size, num_channels, height, width]
        patch_embeds = self.patch_embedding(pixel_values)
        # 将图块嵌入展平并转置,形状为 [batch_size, num_patches, embed_dim]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        # 扩展类嵌入,形状为 [batch_size, 1, embed_dim]
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        # 拼接类嵌入和图块嵌入,形状为 [batch_size, num_patches + 1, embed_dim]
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        # 加上位置嵌入,形状为 [batch_size, num_patches + 1, embed_dim]
        embeddings = embeddings + self.position_embedding(self.position_ids)

        # 返回嵌入向量
        return embeddings
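
# 补充示意:以 OwlViTVisionConfig 的典型取值(image_size=768、patch_size=32、hidden_size=768,对应 base-patch32)为例,
# 上述前向过程的形状变化按公式推算大致如下(仅作示意):
#   pixel_values:          (batch, 3, 768, 768)
#   patch_embedding 输出:   (batch, 768, 24, 24)  -> flatten(2).transpose(1, 2) 后为 (batch, 576, 768)
#   拼接 class token 后:    (batch, 577, 768)
#   加上位置嵌入后输出:      (batch, 577, 768)
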
class OwlViTTextEmbeddings(nn.Module):
    # OwlViTTextEmbeddings 类,用于处理文本嵌入
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        # 初始化 token_embedding 层,用于词嵌入
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        # 初始化 position_embedding 层,用于位置编码
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # 创建 position_ids 张量,并注册为缓冲区,用于处理位置编码
        # 这个张量在序列化时会被导出,位置是内存中的连续存储
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        # 如果未提供 position_ids,则使用预先创建的 position_ids 张量
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # 如果未提供 inputs_embeds,则通过 token_embedding 层获取
        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        # 获取位置编码的嵌入
        position_embeddings = self.position_embedding(position_ids)
        # 计算最终的嵌入表示,包括词嵌入和位置编码的和
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class OwlViTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        # 初始化线性投影层,用于查询(q_proj)、键(k_proj)、值(v_proj)和输出(out_proj)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 重新形状张量,以便进行多头注意力计算
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> torch.Tensor:
        # 省略了具体的前向传播代码,但通常会涉及到自注意力机制和线性投影操作
        pass


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->OwlViT
class OwlViTMLP(nn.Module):
    # OwlViTMLP 类,用于多层感知机(MLP)部分的实现
    def __init__(self, config):
        super().__init__()
        self.config = config
        # 激活函数根据配置选择
        self.activation_fn = ACT2FN[config.hidden_act]
        # 第一个全连接层,将隐藏大小映射到中间大小
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        # 第二个全连接层,将中间大小映射回隐藏大小
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
    # 定义前向传播方法,接收隐藏状态张量并返回处理后的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将隐藏状态张量通过全连接层 fc1
        hidden_states = self.fc1(hidden_states)
        # 应用激活函数到 fc1 的输出上
        hidden_states = self.activation_fn(hidden_states)
        # 将激活后的隐藏状态张量通过全连接层 fc2
        hidden_states = self.fc2(hidden_states)
        # 返回处理后的张量作为前向传播的结果
        return hidden_states
# 从 transformers.models.clip.modeling_clip.CLIPEncoderLayer 复制而来,修改为 OwlViTEncoderLayer
class OwlViTEncoderLayer(nn.Module):
    def __init__(self, config: OwlViTConfig):
        super().__init__()
        self.embed_dim = config.hidden_size  # 设置嵌入维度为配置中的隐藏大小
        self.self_attn = OwlViTAttention(config)  # 初始化自注意力机制
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # 第一个层归一化层
        self.mlp = OwlViTMLP(config)  # 多层感知机模块
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # 第二个层归一化层

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        前向传播函数
        Args:
            hidden_states (`torch.FloatTensor`): 形状为 `(batch, seq_len, embed_dim)` 的输入状态
            attention_mask (`torch.FloatTensor`): 大小为 `(batch, 1, tgt_len, src_len)` 的注意力掩码,
                其中填充元素由非常大的负值表示。
            causal_attention_mask (`torch.FloatTensor`): 大小为 `(config.encoder_attention_heads,)` 的因果注意力掩码。
            output_attentions (`bool`, *optional*):
                是否返回所有注意力层的注意力张量。详细信息请参阅返回张量中的 `attentions`。
        """
        residual = hidden_states  # 保留残差连接

        hidden_states = self.layer_norm1(hidden_states)  # 应用第一个归一化层
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )  # 应用自注意力机制,获取注意力权重
        hidden_states = residual + hidden_states  # 添加残差连接

        residual = hidden_states  # 更新残差连接变量
        hidden_states = self.layer_norm2(hidden_states)  # 应用第二个归一化层
        hidden_states = self.mlp(hidden_states)  # 应用多层感知机模块
        hidden_states = residual + hidden_states  # 添加残差连接

        outputs = (hidden_states,)  # 设置输出为隐藏状态张量

        if output_attentions:
            outputs += (attn_weights,)  # 如果需要输出注意力权重,则添加到输出元组中

        return outputs  # 返回输出元组作为前向传播的结果
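

# --- Illustration: the pre-LayerNorm residual pattern of the encoder layer ---
# A minimal standalone sketch (not part of the OWL-ViT source): each sub-block (attention or MLP) is applied
# to a layer-normalized input and its output is added back to the un-normalized residual, as in
# OwlViTEncoderLayer.forward above. The linear layer here is an assumed stand-in for either sub-block.
def _demo_pre_norm_residual():
    import torch
    from torch import nn

    embed_dim = 16
    layer_norm = nn.LayerNorm(embed_dim)
    sub_block = nn.Linear(embed_dim, embed_dim)  # stand-in for self-attention or the MLP

    hidden_states = torch.randn(2, 4, embed_dim)
    residual = hidden_states
    hidden_states = residual + sub_block(layer_norm(hidden_states))  # normalize first, then add the residual
    return hidden_states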


class OwlViTPreTrainedModel(PreTrainedModel):
    """
    一个处理权重初始化、下载预训练模型及简单接口的抽象类。
    """

    config_class = OwlViTConfig  # 使用 OwlViTConfig 类来配置模型
    base_model_prefix = "owlvit"  # 基础模型前缀
    supports_gradient_checkpointing = True  # 支持梯度检查点
    _no_split_modules = ["OwlViTEncoderLayer"]  # 不进行模块分割的模块列表

    def _init_weights(self, module):
        """Initialize the weights"""
        # Global scaling factor for all initializations
        factor = self.config.initializer_factor

        # Text embeddings: initialize the token and position embedding tables
        if isinstance(module, OwlViTTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)

        # Vision embeddings: initialize the class, patch and position embeddings
        elif isinstance(module, OwlViTVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)

        # Attention layers: initialize q_proj, k_proj, v_proj and out_proj
        elif isinstance(module, OwlViTAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)

        # MLP layers: initialize fc1 and fc2
        elif isinstance(module, OwlViTMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)

        # Full model: initialize the text and visual projection layers
        elif isinstance(module, OwlViTModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        # LayerNorm: zero bias, unit weight
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Linear layers with a bias: zero the bias
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
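

# --- Illustration: the scaled initialization standard deviations ---
# A minimal worked example (not part of the OWL-ViT source) with assumed values hidden_size=768,
# num_hidden_layers=12 and initializer_factor=1.0, showing the std values `_init_weights` would compute
# for the attention and MLP projections.
def _demo_init_stds():
    hidden_size, num_hidden_layers, factor = 768, 12, 1.0

    in_proj_std = (hidden_size**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor  # ~0.00737 for q/k/v and fc2
    out_proj_std = (hidden_size**-0.5) * factor  # ~0.0361 for out_proj
    fc_std = (2 * hidden_size) ** -0.5 * factor  # ~0.0255 for fc1
    return in_proj_std, out_proj_std, fc_std

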
# Docstring prepended to OWL-ViT model classes, describing generic usage and the `config` parameter
OWLVIT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`OwlViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring describing the text inputs of OWL-ViT
OWLVIT_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Docstring describing the vision inputs of OWL-ViT
OWLVIT_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Docstring describing the combined text + image inputs of OWL-ViT
OWLVIT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Docstring describing the inputs of the OWL-ViT object detection head
OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden states. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Docstring describing the inputs of image-guided object detection
OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

class OwlViTEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is an
    [`OwlViTEncoderLayer`].

    Args:
        config: OwlViTConfig
    """

    def __init__(self, config: OwlViTConfig):
        super().__init__()
        self.layers = nn.ModuleList([OwlViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,  # input embeddings of shape (batch, seq_len, embed_dim)
        attention_mask: Optional[torch.Tensor] = None,  # mask marking positions that should be ignored
        causal_attention_mask: Optional[torch.Tensor] = None,  # causal mask preventing information leakage
        output_attentions: Optional[bool] = None,  # whether to return the attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
        return_dict: Optional[bool] = None,  # whether to return a ModelOutput dict instead of a tuple
    ):
        # The loop over `self.layers` (with optional gradient checkpointing and collection of hidden
        # states / attentions) is omitted in this excerpt.
        ...


class OwlViTTextTransformer(nn.Module):
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        # Token + position embeddings for the text inputs
        self.embeddings = OwlViTTextEmbeddings(config)
        # Stack of encoder layers
        self.encoder = OwlViTEncoder(config)
        # Layer norm applied to the final encoder output
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    # Forward pass; the decorators attach the input and return docstrings
    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,                           # input token IDs
        attention_mask: Optional[torch.Tensor] = None,     # attention mask
        position_ids: Optional[torch.Tensor] = None,       # position IDs
        output_attentions: Optional[bool] = None,          # whether to return attention weights
        output_hidden_states: Optional[bool] = None,       # whether to return hidden states
        return_dict: Optional[bool] = None,                # whether to return a ModelOutput dict
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        # 如果没有显式提供 output_attentions 参数,则使用配置中的默认值
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果没有显式提供 output_hidden_states 参数,则使用配置中的默认值
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果没有显式提供 return_dict 参数,则使用配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 获取输入张量的形状
        input_shape = input_ids.size()
        # 将 input_ids 转换为二维张量,去除 batch 维度
        input_ids = input_ids.view(-1, input_shape[-1])
        # 使用 embeddings 层处理 input_ids 和 position_ids
        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # 根据输入形状和隐藏状态数据类型,创建 causal attention mask
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )

        # 如果提供了 attention_mask,则将其扩展到四维张量
        if attention_mask is not None:
            # 将二维 attention_mask 扩展为四维张量
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        # 使用 encoder 处理隐藏状态,传递参数包括输入嵌入、注意力掩码等
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取 encoder 输出的最后隐藏状态并进行 layer normalization
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # 从 tokens 嵌入的末尾获取特征(每个序列中的最大数值处)
        # 为了兼容 ONNX,将 input_ids 转换为整型再执行 argmax 操作
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
        ]

        # 如果不使用 return_dict,则返回一个元组,包括最后隐藏状态、汇总输出和额外的 encoder 输出
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # 使用 BaseModelOutputWithPooling 创建返回字典,包括最后隐藏状态、汇总输出、隐藏状态和注意力矩阵
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
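

# --- Illustration: pooling at the end-of-sequence token ---
# A minimal standalone sketch (not part of the OWL-ViT source). Assuming, as in CLIP-style tokenizers, that
# the EOS token has the largest id in each sequence, `argmax` over the integer input ids recovers its position,
# and advanced indexing picks the corresponding hidden state per sequence, as in OwlViTTextTransformer.forward
# above. The token ids below are assumed toy values.
def _demo_eos_pooling():
    import torch

    # toy ids: 49407 plays the role of the EOS token, 0 is padding
    input_ids = torch.tensor([[101, 7, 9, 49407, 0, 0], [101, 3, 49407, 0, 0, 0]])
    last_hidden_state = torch.randn(2, 6, 4)

    eos_positions = input_ids.to(torch.int).argmax(dim=-1)  # -> tensor([3, 2])
    pooled = last_hidden_state[torch.arange(last_hidden_state.shape[0]), eos_positions]
    assert pooled.shape == (2, 4)
    return pooled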


class OwlViTTextModel(OwlViTPreTrainedModel):
    config_class = OwlViTTextConfig

    def __init__(self, config: OwlViTTextConfig):
        super().__init__(config)
        # The underlying text transformer
        self.text_model = OwlViTTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # The token embedding table of the text model
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    # Forward pass; the decorators attach the input and return docstrings
    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:
        ```
        >>> from transformers import AutoProcessor, OwlViTTextModel

        >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astronaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""

        # Get embeddings for all text queries across all batch samples
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class OwlViTVisionTransformer(nn.Module):
    def __init__(self, config: OwlViTVisionConfig):
        super().__init__()
        self.config = config

        # Patch + position embeddings
        self.embeddings = OwlViTVisionEmbeddings(config)
        # Layer norm applied before the encoder
        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Stack of encoder layers
        self.encoder = OwlViTEncoder(config)
        # Layer norm applied after the encoder
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # Forward pass; the decorators attach the input and return docstrings
    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
            [`BaseModelOutputWithPooling`] or `tuple`.
        """
        # Fall back to the config defaults when the flags are not given explicitly
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Cast the pixel values to the dtype expected by the patch embedding weights
        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
        pixel_values = pixel_values.to(expected_input_dtype)

        # Patch + position embeddings, followed by the pre-encoder layer norm
        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layernorm(hidden_states)

        # Run the encoder over the embedded patches
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pooled output = the CLS token (first position) of the last hidden state
        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]

        # Post-encoder layer norm on the pooled output
        pooled_output = self.post_layernorm(pooled_output)

        # Without return_dict, return a plain tuple
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # Otherwise wrap everything in a BaseModelOutputWithPooling
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class OwlViTVisionModel(OwlViTPreTrainedModel):
    config_class = OwlViTVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: OwlViTVisionConfig):
        super().__init__(config)
        # The underlying vision transformer
        self.vision_model = OwlViTVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # The patch embedding layer of the vision model
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        前向传播函数,接收像素值、是否输出注意力、是否输出隐藏状态和是否返回字典作为参数。

        返回:
        - BaseModelOutputWithPooling 或者 Tuple
            模型的输出,可能包括最后的隐藏状态和汇聚输出(汇聚的CLS状态)。

        示例:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTVisionModel

        >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # 汇聚的CLS状态
        ```
        """
        # 调用视觉模型的前向传播方法,传递所有参数
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


# OWLVIT_START_DOCSTRING is prepended to the class docstring
@add_start_docstrings(OWLVIT_START_DOCSTRING)
class OwlViTModel(OwlViTPreTrainedModel):
    config_class = OwlViTConfig

    def __init__(self, config: OwlViTConfig):
        super().__init__(config)

        # Make sure config.text_config is an OwlViTTextConfig
        if not isinstance(config.text_config, OwlViTTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type OwlViTTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # Make sure config.vision_config is an OwlViTVisionConfig
        if not isinstance(config.vision_config, OwlViTVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type OwlViTVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # Embedding / projection dimensions
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # Text and vision towers
        self.text_model = OwlViTTextTransformer(text_config)
        self.vision_model = OwlViTVisionTransformer(vision_config)

        # Projection layers mapping both towers into the shared embedding space
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # Learnable temperature for the contrastive logits
        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTTextModel`].

        Examples:
        ```
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astronaut"]], return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use the OWL-ViT config default when return_dict is not given explicitly
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the text transformer
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict)
        # Pooled (EOS token) output
        pooled_output = text_output[1]
        # Project the pooled output into the shared embedding space
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTVisionModel`].

        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""

        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass the input pixel values and optional flags to the vision model
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Extract the pooled output from vision model outputs
        pooled_output = vision_outputs[1]
        
        # Apply projection layer to the pooled output to obtain image features
        image_features = self.visual_projection(pooled_output)

        # Return the computed image features
        return image_features

    @add_start_docstrings_to_model_forward(OWLVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTOutput, config_class=OwlViTConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_base_image_embeds: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, OwlViTOutput]:
        r"""
        Returns:
            [`OwlViTOutput`] or `tuple`: the output of the OwlViT model, containing various elements depending on the
            configuration.
        """

        # Set default values for optional parameters if not provided explicitly
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The remainder of the forward pass is omitted in this excerpt. In the full implementation the text and
        # vision towers (`self.text_model` / `self.vision_model`) are run, their pooled outputs are projected with
        # `self.text_projection` / `self.visual_projection` and L2-normalized, and the cosine similarities between
        # them are scaled by `self.logit_scale.exp()` to obtain `logits_per_image` / `logits_per_text`; the results
        # are returned as an `OwlViTOutput` (or a plain tuple), optionally together with the contrastive loss when
        # `return_loss=True`. A standalone sketch of this similarity computation follows below.
        ...
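

# --- Illustration: CLIP-style similarity logits from projected embeddings ---
# A minimal standalone sketch (not part of the OWL-ViT source) of the computation summarized in the comment
# above: L2-normalize the projected text and image embeddings, then scale their cosine similarities by the
# exponentiated learnable temperature (`logit_scale`). All tensors and sizes here are assumed toy values.
def _demo_contrastive_logits():
    import torch

    projection_dim = 512
    text_embeds = torch.randn(3, projection_dim)   # e.g. 3 text queries, already projected
    image_embeds = torch.randn(2, projection_dim)  # e.g. 2 images, already projected
    logit_scale = torch.tensor(2.6592)             # log-temperature; its exp() scales the similarities

    text_embeds = text_embeds / torch.linalg.norm(text_embeds, dim=-1, keepdim=True)
    image_embeds = image_embeds / torch.linalg.norm(image_embeds, dim=-1, keepdim=True)

    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale.exp()  # (3, 2)
    logits_per_image = logits_per_text.t()                                             # (2, 3)
    return logits_per_image, logits_per_text

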
class OwlViTBoxPredictionHead(nn.Module):
    def __init__(self, config: OwlViTConfig, out_dim: int = 4):
        super().__init__()

        # Width of the head = hidden size of the vision tower
        width = config.vision_config.hidden_size
        self.dense0 = nn.Linear(width, width)
        self.dense1 = nn.Linear(width, width)
        self.gelu = nn.GELU()
        # Final projection to the box parameters (4 values per box)
        self.dense2 = nn.Linear(width, out_dim)

    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
        # Two hidden layers with GELU activations, followed by the final box projection
        output = self.dense0(image_features)
        output = self.gelu(output)
        output = self.dense1(output)
        output = self.gelu(output)
        output = self.dense2(output)
        return output


class OwlViTClassPredictionHead(nn.Module):
    def __init__(self, config: OwlViTConfig):
        super().__init__()

        # Output dimension = hidden size of the text tower (the shared query embedding space)
        out_dim = config.text_config.hidden_size
        # Query dimension = hidden size of the vision tower
        self.query_dim = config.vision_config.hidden_size

        # Projects image features from query_dim to out_dim
        self.dense0 = nn.Linear(self.query_dim, out_dim)
        # Per-patch learnable shift and scale applied to the logits
        self.logit_shift = nn.Linear(self.query_dim, 1)
        self.logit_scale = nn.Linear(self.query_dim, 1)
        self.elu = nn.ELU()

    def forward(
        self,
        image_embeds: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor],
        query_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.FloatTensor]:
        # Project the image embeddings into the class-embedding space
        image_class_embeds = self.dense0(image_embeds)

        # Without query embeddings, return all-zero prediction logits
        if query_embeds is None:
            device = image_class_embeds.device
            batch_size, num_patches = image_class_embeds.shape[:2]
            pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
            return (pred_logits, image_class_embeds)

        # L2-normalize the image and query (text) features
        image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
        query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)

        # Class logits = similarity between every patch embedding and every query embedding
        pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)

        # Apply a learnable shift and scale to the logits
        logit_shift = self.logit_shift(image_embeds)
        logit_scale = self.logit_scale(image_embeds)
        logit_scale = self.elu(logit_scale) + 1
        pred_logits = (pred_logits + logit_shift) * logit_scale

        # Mask out invalid queries, if a query mask was provided
        if query_mask is not None:
            if query_mask.ndim > 1:
                query_mask = torch.unsqueeze(query_mask, dim=-2)

            pred_logits = pred_logits.to(torch.float64)
            pred_logits = torch.where(query_mask == 0, -1e6, pred_logits)
            pred_logits = pred_logits.to(torch.float32)

        return (pred_logits, image_class_embeds)
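

# --- Illustration: shifting and scaling the class logits ---
# A minimal standalone sketch (not part of the OWL-ViT source) of the per-patch logit adjustment used in
# OwlViTClassPredictionHead above: similarities between patch and query embeddings are shifted by a learned
# offset and multiplied by elu(scale) + 1, which keeps the multiplier strictly positive. Sizes are assumptions.
def _demo_logit_shift_scale():
    import torch

    batch, num_patches, num_queries = 1, 4, 3
    pred_logits = torch.randn(batch, num_patches, num_queries)   # normalized-embedding similarities
    logit_shift = torch.randn(batch, num_patches, 1)             # output of the logit_shift linear layer
    logit_scale = torch.randn(batch, num_patches, 1)             # output of the logit_scale linear layer

    scale = torch.nn.functional.elu(logit_scale) + 1  # elu(x) + 1 > 0, so the multiplier never flips the sign
    adjusted = (pred_logits + logit_shift) * scale
    return adjusted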


class OwlViTForObjectDetection(OwlViTPreTrainedModel):
    config_class = OwlViTConfig

    def __init__(self, config: OwlViTConfig):
        super().__init__(config)

        # Backbone OWL-ViT model and the two detection heads
        self.owlvit = OwlViTModel(config)
        self.class_head = OwlViTClassPredictionHead(config)
        self.box_head = OwlViTBoxPredictionHead(config)

        # Layer norm applied to the merged patch features, plus a sigmoid for decoding the boxes
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
        self.sigmoid = nn.Sigmoid()

        # Number of patches along each side of the square feature map
        self.sqrt_num_patches = config.vision_config.image_size // config.vision_config.patch_size

    def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):
        # Compute normalized xy corner coordinates from the feature map
        if not feature_map.ndim == 4:
            raise ValueError("Expected input shape is [batch_size, num_patches, num_patches, hidden_dim]")

        device = feature_map.device
        num_patches = feature_map.shape[1]

        # Build the patch grid coordinates with numpy and normalize them to (0, 1]
        box_coordinates = np.stack(
            np.meshgrid(np.arange(1, num_patches + 1), np.arange(1, num_patches + 1)), axis=-1
        ).astype(np.float32)
        box_coordinates /= np.array([num_patches, num_patches], np.float32)

        # Flatten the (h, w) grid into a single patch dimension
        box_coordinates = box_coordinates.reshape(
            box_coordinates.shape[0] * box_coordinates.shape[1], box_coordinates.shape[2]
        )
        # Convert to a torch tensor on the right device
        box_coordinates = torch.from_numpy(box_coordinates).to(device)

        return box_coordinates

    def compute_box_bias(self, feature_map: torch.FloatTensor) -> torch.FloatTensor:
        # Bias of the box centers towards the grid position of each patch (see the sketch after this class)
        box_coordinates = self.normalize_grid_corner_coordinates(feature_map)
        box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)

        # Un-normalize xy with an inverse sigmoid (logit)
        box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)

        # Bias of the box size towards the size of a single patch
        box_size = torch.full_like(box_coord_bias, 1.0 / feature_map.shape[-2])
        box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)

        # Concatenate the center and size biases
        box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
        return box_bias

    def box_predictor(
        self,
        image_feats: torch.FloatTensor,
        feature_map: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        Args:
            image_feats:
                从图像提取的特征,由 `image_text_embedder` 方法返回。
            feature_map:
                图像特征的空间重排列,也由 `image_text_embedder` 方法返回。
        Returns:
            pred_boxes:
                预测框的列表(归一化为 0 到 1 的 cxcywh 格式),嵌套在一个字典中。
        """
        # 边界框检测头 [batch_size, num_boxes, 4].
        pred_boxes = self.box_head(image_feats)

        # 计算每个标记在网格上的位置,并用它来计算边界框预测的偏置
        pred_boxes += self.compute_box_bias(feature_map)
        pred_boxes = self.sigmoid(pred_boxes)
        return pred_boxes

    def class_predictor(
        self,
        image_feats: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor] = None,
        query_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            image_feats:
                从 `image_text_embedder` 提取的图像特征。
            query_embeds:
                文本查询的嵌入向量。
            query_mask:
                必须与查询嵌入一起提供。指示哪些查询嵌入是有效的掩码。
        """
        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)

        return (pred_logits, image_class_embeds)

    def image_text_embedder(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        # Encode the text and the image
        outputs = self.owlvit(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # Get the image embeddings
        last_hidden_state = outputs.vision_model_output[0]
        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)

        # Broadcast the class token to the shape of the patch tokens
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge the patch embeddings with the class token
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Reshape to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)
        text_embeds = outputs[-4]

        return (text_embeds, image_embeds, outputs)

    # Embed images only (no text queries), used e.g. for image-guided detection
    def image_embedder(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        # Run the vision model of OWL-ViT and return its outputs as a dict
        vision_outputs = self.owlvit.vision_model(pixel_values=pixel_values, return_dict=True)

        # Apply post_layernorm to the last hidden state; return the non-projected output
        last_hidden_state = vision_outputs[0]
        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)

        # Broadcast the class token to the shape of the patch tokens
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge the patch embeddings with the class token
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Reshape to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)

        # Return the patch-grid embeddings together with the full vision outputs
        return (image_embeds, vision_outputs)

    # Embed a query image for image-guided detection
    def embed_image_query(
        self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
    ) -> torch.FloatTensor:
        _, class_embeds = self.class_predictor(query_image_features)
        pred_boxes = self.box_predictor(query_image_features, query_feature_map)
        pred_boxes_as_corners = center_to_corners_format(pred_boxes)

        # Loop over query images
        best_class_embeds = []
        best_box_indices = []
        pred_boxes_device = pred_boxes_as_corners.device

        for i in range(query_image_features.shape[0]):
            # Create a single query box for comparison
            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
            each_query_pred_boxes = pred_boxes_as_corners[i]
            # Calculate IoU between the query box and predicted boxes
            ious, _ = box_iou(each_query_box, each_query_pred_boxes)

            # If there are no overlapping boxes, use generalized IoU
            if torch.all(ious[0] == 0.0):
                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)

            # Use an adaptive threshold to select relevant boxes based on IoU
            iou_threshold = torch.max(ious) * 0.8

            # Select indices of predicted boxes with IoU above the threshold
            selected_inds = (ious[0] >= iou_threshold).nonzero()
            if selected_inds.numel():
                # Select corresponding embeddings for selected boxes
                selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
                # Compute mean similarity between mean embedding and selected embeddings
                mean_embeds = torch.mean(class_embeds[i], axis=0)
                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
                # Select the box whose embedding has the lowest similarity to the mean embedding (torch.argmin)
                best_box_ind = selected_inds[torch.argmin(mean_sim)]
                best_class_embeds.append(class_embeds[i][best_box_ind])
                best_box_indices.append(best_box_ind)

        # Stack selected embeddings and box indices if any valid boxes are found
        if best_class_embeds:
            query_embeds = torch.stack(best_class_embeds)
            box_indices = torch.stack(best_box_indices)
        else:
            query_embeds, box_indices = None, None

        return query_embeds, box_indices, pred_boxes

    @add_start_docstrings_to_model_forward(OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTImageGuidedObjectDetectionOutput, config_class=OwlViTConfig)
    def image_guided_detection(
        self,
        pixel_values: torch.FloatTensor,
        query_pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # The implementation of image-guided object detection is omitted in this excerpt
        pass

    @add_start_docstrings_to_model_forward(OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTObjectDetectionOutput, config_class=OwlViTConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # The text-conditioned object-detection forward pass is omitted in this excerpt
        pass
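

# --- Illustration: the grid-based box bias ---
# A minimal standalone sketch (not part of the OWL-ViT source) of the idea behind compute_box_bias above:
# each patch gets a default center at its normalized grid coordinate and a default size of one patch, both
# mapped through an inverse sigmoid (logit). Adding this bias to the box head output before the sigmoid means
# a zero prediction already decodes to a patch-sized box centered on that patch. num_patches is an assumed value.
def _demo_box_bias():
    import numpy as np
    import torch

    num_patches = 4  # assumed grid size (number of patches per side)

    # normalized (x, y) grid coordinates in (0, 1], as in normalize_grid_corner_coordinates
    coords = np.stack(np.meshgrid(np.arange(1, num_patches + 1), np.arange(1, num_patches + 1)), axis=-1)
    coords = torch.from_numpy(coords.astype(np.float32) / num_patches).reshape(-1, 2)

    def inverse_sigmoid(x):
        return torch.log(x + 1e-4) - torch.log1p(-x + 1e-4)

    coord_bias = inverse_sigmoid(coords.clip(0.0, 1.0))
    size_bias = inverse_sigmoid(torch.full_like(coord_bias, 1.0 / num_patches))
    box_bias = torch.cat([coord_bias, size_bias], dim=-1)  # (num_patches**2, 4) in cxcywh "logit space"

    # a zero box-head output decodes to a patch-sized box centered on the corresponding patch
    decoded = torch.sigmoid(torch.zeros_like(box_bias) + box_bias)
    return decoded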