# coding=utf-8
# Copyright: The Fairseq Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0; this file may not be used
# except in compliance with the License. A copy of the License is available at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software is
# distributed on an "AS IS" basis, without warranties or conditions of any
# kind, either express or implied. See the License for details.
"""TensorFlow Hubert model."""

from __future__ import annotations

import warnings
from typing import Any, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_hubert import HubertConfig
# Module-level logger.
logger = logging.get_logger(__name__)
# Configuration class name used in generated documentation.
_CONFIG_FOR_DOC = "HubertConfig"
# Archive list of pretrained checkpoints.
TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/hubert-base-ls960",
    # See all Hubert models at https://huggingface.co/models?filter=hubert
]
# Large negative constant; `_expand_mask` multiplies the inverted mask by it.
LARGE_NEGATIVE = -1e8


# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2._sample_without_replacement
def _sample_without_replacement(distribution, num_samples):
    """
    Categorical sampling without replacement is not implemented in TensorFlow. The Gumbel-max trick is used
    instead - see https://github.com/tensorflow/tensorflow/issues/9260 for more info.

    Args:
        distribution: tensor of per-candidate scores; noise of the same shape is added to it.
        num_samples: number of indices to pick along the last axis.

    Returns:
        The indices of the `num_samples` largest noisy scores, as returned by `tf.nn.top_k`.
    """
    # Random perturbation with the same shape as `distribution`.
    z = -tf.math.log(tf.random.uniform(shape_list(distribution), 0, 1))
    # Taking the top-k of the perturbed scores yields distinct indices.
    _, indices = tf.nn.top_k(distribution + z, num_samples)
    return indices
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2._scatter_values_on_batch_indices
def _scatter_values_on_batch_indices(values, batch_indices, output_shape):
    """
    Scatter function as in PyTorch with indices in format (batch_dim, indices).

    Args:
        values: values to scatter; flattened before scattering.
        batch_indices: per-row target indices; row `i` is paired with batch index `i`.
        output_shape: shape of the tensor produced by `tf.scatter_nd`.

    Returns:
        The tensor produced by `tf.scatter_nd` for the (batch, index) pairs.
    """
    indices_shape = shape_list(batch_indices)
    # Broadcast the batch dimension to `indices_shape`, then flatten to a single row.
    broad_casted_batch_dims = tf.reshape(
        tf.broadcast_to(tf.expand_dims(tf.range(indices_shape[0]), axis=-1), indices_shape), [1, -1]
    )
    # Turn `batch_indices` into (batch, index) pairs.
    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
    # Scatter the flattened values onto the pair indices.
    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), output_shape)
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2._compute_mask_indices
def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    min_masks: int = 0,
) -> tf.Tensor:
    """
    Computes random mask spans for a given shape.

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        mask_prob:
            probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller
        mask_length: size of the mask
        min_masks: minimum number of masked spans

    Returns:
        An integer tensor of shape `(batch_size, sequence_length)` whose entries are non-zero at masked positions.

    Raises:
        ValueError: if `mask_length` is smaller than 1.

    Adapted from fairseq's data_utils.py.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    tf.debugging.assert_less(
        mask_length,
        sequence_length,
        message=(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and"
            f" `sequence_length`: {sequence_length}`"
        ),
    )

    # Number of masked spans per row; the uniform term makes the truncation below round probabilistically.
    num_masked_spans = mask_prob * tf.cast(sequence_length, tf.float32) / mask_length + tf.random.uniform((1,))
    num_masked_spans = tf.maximum(num_masked_spans, min_masks)
    num_masked_spans = tf.cast(num_masked_spans, tf.int32)

    # Make sure the number of masked indices is <= sequence_length.
    num_masked_spans = tf.math.minimum(sequence_length // mask_length, num_masked_spans)
    num_masked_spans = tf.squeeze(num_masked_spans)

    # SpecAugment mask to fill (only its shape is used below).
    spec_aug_mask = tf.zeros((batch_size, sequence_length), dtype=tf.int32)

    # Uniform distribution to sample span starts from; start + offset stays < sequence_length.
    uniform_dist = tf.ones((batch_size, sequence_length - (mask_length - 1)))

    # Random span start indices.
    spec_aug_mask_idxs = _sample_without_replacement(uniform_dist, num_masked_spans)

    # Expand each start index into `mask_length` copies.
    spec_aug_mask_idxs = tf.expand_dims(spec_aug_mask_idxs, -1)
    spec_aug_mask_idxs = tf.tile(spec_aug_mask_idxs, (1, 1, mask_length))
    spec_aug_mask_idxs = tf.reshape(spec_aug_mask_idxs, (batch_size, num_masked_spans * mask_length))

    # Offsets 0..mask_length-1 for every span.
    offsets = tf.range(mask_length)[tf.newaxis, tf.newaxis, :]
    offsets = tf.tile(offsets, (batch_size, num_masked_spans, 1))
    offsets = tf.reshape(offsets, (batch_size, num_masked_spans * mask_length))

    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # Scatter ones onto the selected positions.
    spec_aug_mask = _scatter_values_on_batch_indices(
        tf.ones_like(spec_aug_mask_idxs), spec_aug_mask_idxs, tf.shape(spec_aug_mask)
    )

    return spec_aug_mask
# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    Args:
        mask: attention mask of shape `[bsz, seq_len]`.
        tgt_len: target sequence length; defaults to the source length when `None`.

    Returns:
        A float tensor equal to `(1 - mask) * LARGE_NEGATIVE`, tiled along the target axis.
    """
    src_len = shape_list(mask)[1]
    tgt_len = tgt_len if tgt_len is not None else src_len
    one_cst = tf.constant(1.0)
    # Cast to the dtype of the float constant so the subtraction below is well-typed.
    mask = tf.cast(mask, dtype=one_cst.dtype)
    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - expanded_mask) * LARGE_NEGATIVE
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNorm with Wav2Vec2->Hubert
class TFHubertGroupNorm(keras.layers.Layer):
    """
    From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization
    """

    def __init__(
        self,
        groups: int = 32,
        axis: int = -1,
        epsilon: float = 1e-3,
        center: bool = True,
        scale: bool = True,
        beta_initializer: keras.initializers.Initializer = "zeros",
        gamma_initializer: keras.initializers.Initializer = "ones",
        beta_regularizer: keras.regularizers.Regularizer = None,
        gamma_regularizer: keras.regularizers.Regularizer = None,
        beta_constraint: keras.constraints.Constraint = None,
        gamma_constraint: keras.constraints.Constraint = None,
        **kwargs,
    ):
        """Store the normalization settings and resolve initializers/regularizers/constraints.

        Raises:
            ValueError: if `axis` is 0 (the batch axis), via `_check_axis`.
        """
        super().__init__(**kwargs)
        self.supports_masking = True
        self.groups = groups
        self.axis = axis
        self.epsilon = epsilon
        self.center = center
        self.scale = scale
        self.beta_initializer = keras.initializers.get(beta_initializer)
        self.gamma_initializer = keras.initializers.get(gamma_initializer)
        self.beta_regularizer = keras.regularizers.get(beta_regularizer)
        self.gamma_regularizer = keras.regularizers.get(gamma_regularizer)
        self.beta_constraint = keras.constraints.get(beta_constraint)
        self.gamma_constraint = keras.constraints.get(gamma_constraint)
        self._check_axis()

    def build(self, input_shape):
        """Validate `input_shape`, then create the input spec and the gamma/beta weights."""
        self._check_if_input_shape_is_none(input_shape)
        self._set_number_of_groups_for_instance_norm(input_shape)
        self._check_size_of_dimensions(input_shape)
        self._create_input_spec(input_shape)

        self._add_gamma_weight(input_shape)
        self._add_beta_weight(input_shape)
        self.built = True
        super().build(input_shape)

    def call(self, inputs):
        """Normalize `inputs` per group and return a tensor with the input's shape."""
        # Static shape (may contain None) and dynamic shape tensor.
        input_shape = keras.backend.int_shape(inputs)
        tensor_input_shape = tf.shape(inputs)

        reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape)

        normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)

        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            # The grouped tensor has an extra axis; restore the original shape.
            outputs = tf.reshape(normalized_inputs, tensor_input_shape)
        else:
            outputs = normalized_inputs

        return outputs

    def get_config(self):
        """Return the layer configuration merged with the base layer configuration."""
        config = {
            "groups": self.groups,
            "axis": self.axis,
            "epsilon": self.epsilon,
            "center": self.center,
            "scale": self.scale,
            "beta_initializer": keras.initializers.serialize(self.beta_initializer),
            "gamma_initializer": keras.initializers.serialize(self.gamma_initializer),
            "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer),
            "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer),
            "beta_constraint": keras.constraints.serialize(self.beta_constraint),
            "gamma_constraint": keras.constraints.serialize(self.gamma_constraint),
        }
        base_config = super().get_config()
        return {**base_config, **config}

    def compute_output_shape(self, input_shape):
        """The output shape equals the input shape."""
        return input_shape

    def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape):
        """Split `self.axis` into (groups, channels_per_group) unless each group holds a single channel.

        Returns:
            A pair `(reshaped_inputs, group_shape)`.
        """
        group_shape = [tensor_input_shape[i] for i in range(len(input_shape))]
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            group_shape[self.axis] = input_shape[self.axis] // self.groups
            group_shape.insert(self.axis, self.groups)
            group_shape = tf.stack(group_shape)
            reshaped_inputs = tf.reshape(inputs, group_shape)
            return reshaped_inputs, group_shape
        else:
            return inputs, group_shape

    def _apply_normalization(self, reshaped_inputs, input_shape):
        """Normalize `reshaped_inputs` with moments taken over every axis except batch and the group axis."""
        group_shape = keras.backend.int_shape(reshaped_inputs)
        group_reduction_axes = list(range(1, len(group_shape)))
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        # Pick the axis that must NOT be reduced over and drop it from the reduction list.
        if not is_instance_norm:
            axis = -2 if self.axis == -1 else self.axis - 1
        else:
            axis = -1 if self.axis == -1 else self.axis - 1
        group_reduction_axes.pop(axis)

        mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True)

        gamma, beta = self._get_reshaped_weights(input_shape)
        normalized_inputs = tf.nn.batch_normalization(
            reshaped_inputs,
            mean=mean,
            variance=variance,
            scale=gamma,
            offset=beta,
            variance_epsilon=self.epsilon,
        )
        return normalized_inputs

    def _get_reshaped_weights(self, input_shape):
        """Return `(gamma, beta)` reshaped for broadcasting; each is `None` when disabled."""
        broadcast_shape = self._create_broadcast_shape(input_shape)
        gamma = None
        beta = None
        if self.scale:
            gamma = tf.reshape(self.gamma, broadcast_shape)

        if self.center:
            beta = tf.reshape(self.beta, broadcast_shape)
        return gamma, beta

    def _check_if_input_shape_is_none(self, input_shape):
        """Raise `ValueError` when the normalized axis has no static size."""
        dim = input_shape[self.axis]
        if dim is None:
            raise ValueError(
                "Axis "
                + str(self.axis)
                + " of input tensor should have a defined dimension but the layer received an input with shape "
                + str(input_shape)
                + "."
            )

    def _set_number_of_groups_for_instance_norm(self, input_shape):
        """Treat `groups == -1` as "one group per channel"."""
        dim = input_shape[self.axis]

        if self.groups == -1:
            self.groups = dim

    def _check_size_of_dimensions(self, input_shape):
        """Raise `ValueError` when `groups` exceeds or does not divide the channel count."""
        dim = input_shape[self.axis]
        if dim < self.groups:
            raise ValueError(
                "Number of groups ("
                + str(self.groups)
                + ") cannot be more than the number of channels ("
                + str(dim)
                + ")."
            )

        # NOTE(review): the condition requires channels to be a multiple of groups; the message text
        # states the reverse. Left unchanged to stay in sync with the copied-from source.
        if dim % self.groups != 0:
            raise ValueError(
                "Number of groups ("
                + str(self.groups)
                + ") must be a multiple of the number of channels ("
                + str(dim)
                + ")."
            )

    def _check_axis(self):
        """Reject normalization over the batch axis."""
        if self.axis == 0:
            raise ValueError(
                "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead"
            )

    def _create_input_spec(self, input_shape):
        """Pin the input rank and the size of the normalized axis."""
        dim = input_shape[self.axis]
        self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim})

    def _add_gamma_weight(self, input_shape):
        """Create the per-channel scale weight, or set it to `None` when `scale` is off."""
        dim = input_shape[self.axis]
        shape = (dim,)

        if self.scale:
            self.gamma = self.add_weight(
                shape=shape,
                name="gamma",
                initializer=self.gamma_initializer,
                regularizer=self.gamma_regularizer,
                constraint=self.gamma_constraint,
            )
        else:
            self.gamma = None

    def _add_beta_weight(self, input_shape):
        """Create the per-channel offset weight, or set it to `None` when `center` is off."""
        dim = input_shape[self.axis]
        shape = (dim,)

        if self.center:
            self.beta = self.add_weight(
                shape=shape,
                name="beta",
                initializer=self.beta_initializer,
                regularizer=self.beta_regularizer,
                constraint=self.beta_constraint,
            )
        else:
            self.beta = None

    def _create_broadcast_shape(self, input_shape):
        """Build the shape gamma/beta are reshaped to so they broadcast against the grouped tensor."""
        broadcast_shape = [1] * len(input_shape)
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
            broadcast_shape.insert(self.axis, self.groups)
        else:
            broadcast_shape[self.axis] = self.groups
        return broadcast_shape
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2WeightNormConv1D with Wav2Vec2->Hubert
class TFHubertWeightNormConv1D(keras.layers.Conv1D):
    """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm"""

    def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
        """Configure the underlying `Conv1D` with "valid" padding; padding is applied manually in `call`."""
        super().__init__(
            filters=filters,
            kernel_size=kernel_size,
            groups=groups,
            padding="valid",
            use_bias=True,
            bias_initializer="he_normal",
            **kwargs,
        )
        # Number of timesteps padded on each side of the time axis in `call`.
        self.explicit_padding = explicit_padding
        self.filter_axis = 2
        self.kernel_norm_axes = tf.constant([0, 1])

    def _init_norm(self):
        """Set the norm of the weight vector."""
        kernel_norm = tf.sqrt(tf.reduce_sum(tf.square(self.weight_v), axis=self.kernel_norm_axes))
        self.weight_g.assign(kernel_norm[:, tf.newaxis, tf.newaxis])

    def _normalize_kernel(self):
        """Generate normalized weights."""
        kernel = tf.nn.l2_normalize(self.weight_v, axis=self.kernel_norm_axes) * tf.transpose(self.weight_g)
        self.kernel = tf.transpose(kernel)

    def build(self, input_shape):
        """Build the base convolution, then replace its kernel/bias with weight-norm variables."""
        if not self.built:
            super().build(input_shape)

            # Direction variable: the transposed base kernel, trainable.
            self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
            self.weight_v = self.kernel

            # Magnitude variable, initialised to ones and then set to the kernel norm.
            self.weight_g = self.add_weight(
                name="weight_g",
                shape=(int(self.weight_v.shape[self.filter_axis]), 1, 1),
                initializer="ones",
                dtype=self.weight_v.dtype,
                trainable=True,
            )
            self._init_norm()
            # Re-create the bias, initialised to zeros.
            self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)

    def call(self, inputs):
        """Recompute the normalized kernel, pad the time axis explicitly, and run the convolution."""
        # TODO Matt: Assigning to attributes in call() is incorrect in TensorFlow, as call() should be idempotent.
        #            This whole layer should be replaced by a layer that doesn't inherit from Conv1D, but instead
        #            calls a functional 1D convolution with the normalized weights it generates.
        self._normalize_kernel()

        padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
        output = super().call(padded_inputs)

        return output
# NOTE(review): the `class` statement for the three methods below is absent from the source text. The name
# and base class are reconstructed by analogy with the sibling `TFHubertLayerNormConvLayer` and
# `TFHubertGroupNormConvLayer` classes - confirm against upstream before merging.
class TFHubertNoLayerNormConvLayer(keras.layers.Layer):
    """1D convolution followed by the feature-extractor activation (no normalization)."""

    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
        """Create the convolution for feature-extractor layer `layer_id`."""
        super().__init__(**kwargs)
        # Input channels are read from `config.conv_dim[layer_id]`, or 1 when `layer_id` is 0.
        self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        self.activation = get_tf_activation(config.feat_extract_activation)

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the convolution and then the activation."""
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        """Build the convolution under its own name scope; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2LayerNormConvLayer with Wav2Vec2->Hubert
class TFHubertLayerNormConvLayer(keras.layers.Layer):
    """1D convolution followed by layer normalization and the feature-extractor activation."""

    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
        """Create the convolution and layer norm for feature-extractor layer `layer_id`."""
        super().__init__(**kwargs)
        # Input channels are read from `config.conv_dim[layer_id]`, or 1 when `layer_id` is 0.
        self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps)
        self.activation = get_tf_activation(config.feat_extract_activation)

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply convolution, layer normalization, then the activation."""
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        """Build the sub-layers under their own name scopes; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.out_conv_dim])
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNormConvLayer with Wav2Vec2->Hubert
class TFHubertGroupNormConvLayer(keras.layers.Layer):
    """1D convolution followed by group normalization and the feature-extractor activation."""

    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
        """Create the convolution and group norm for feature-extractor layer `layer_id`."""
        super().__init__(**kwargs)
        # Input channels are read from `config.conv_dim[layer_id]`, or 1 when `layer_id` is 0.
        self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        self.activation = get_tf_activation(config.feat_extract_activation)
        # One group per output channel.
        self.layer_norm = TFHubertGroupNorm(groups=self.out_conv_dim, epsilon=config.layer_norm_eps, name="layer_norm")

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply convolution, group normalization, then the activation."""
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        """Build the sub-layers under their own name scopes; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.out_conv_dim])
class TFHubertPositionalConvEmbedding(keras.layers.Layer):
    """Weight-normalized grouped 1D convolution, followed by a padding layer and an activation."""

    def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
        """Create the convolution, padding layer and activation from `config`."""
        super().__init__(**kwargs)
        self.conv = TFHubertWeightNormConv1D(
            filters=config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            groups=config.num_conv_pos_embedding_groups,
            explicit_padding=config.num_conv_pos_embeddings // 2,
            name="conv",
        )
        self.padding = TFHubertSamePadLayer(config.num_conv_pos_embeddings)
        self.activation = get_tf_activation(config.feat_extract_activation)
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the convolution, the padding layer, then the activation."""
        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        """Build the convolution for inputs whose last dimension is `config.hidden_size`; no-op if built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.config.hidden_size])
# NOTE(review): only the `build` method of this class is present in the source text; the `class` statement,
# `__init__` and `call` are missing and must be restored from upstream. The class name is taken from the
# `TFHubertFeatureExtractor(TFHubertFeatureEncoder)` declaration later in this file; the base class is
# presumed to be `keras.layers.Layer` like the sibling layers - confirm.
class TFHubertFeatureEncoder(keras.layers.Layer):
    def build(self, input_shape=None):
        """Build every layer in `self.conv_layers` under its own name scope; no-op if already built."""
        if self.built:
            return
        self.built = True
        for conv_layer in self.conv_layers:
            with tf.name_scope(conv_layer.name):
                # `None` is passed as the input shape to each layer's `build`.
                conv_layer.build(None)
class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
    """Deprecated alias of `TFHubertFeatureEncoder`; emits a `FutureWarning` on construction."""

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        # Point users at the first base class as the replacement.
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )
class TFHubertFeatureProjection(keras.layers.Layer):
    """Layer normalization, dense projection to `config.hidden_size`, then dropout."""

    def __init__(self, config: HubertConfig, **kwargs):
        """Create the layer norm, projection and dropout sub-layers from `config`."""
        super().__init__(**kwargs)

        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.projection = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="projection",
        )
        self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout)
        self.config = config

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Normalize, project and apply dropout; `training` is forwarded to the dropout layer."""
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        """Build both sub-layers for inputs whose last dimension is `config.conv_dim[-1]`; no-op if built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.conv_dim[-1]])
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, self.config.conv_dim[-1]])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFHubert
class TFHubertAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        """Create the q/k/v/out projections.

        Raises:
            ValueError: if `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")

    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        """Reshape to `(bsz, seq_len, num_heads, head_dim)` and transpose to `(bsz, num_heads, seq_len, head_dim)`."""
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))

    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        """Forward pass of the attention layer.

        NOTE(review): the body of this method is absent from the source text (only the signature is
        present, twice). It must be restored from the copied-from implementation,
        `transformers.models.bart.modeling_tf_bart.TFBartAttention.call`. The encoder layers in this
        file unpack three values from the result.
        """
        raise NotImplementedError(
            "TFHubertAttention.call: the forward-pass body is missing from this source and must be restored "
            "from transformers.models.bart.modeling_tf_bart.TFBartAttention.call."
        )

    def build(self, input_shape=None):
        """Build the four projections for inputs whose last dimension is `embed_dim`; no-op if built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "k_proj", None) is not None:
            with tf.name_scope(self.k_proj.name):
                self.k_proj.build([None, None, self.embed_dim])
        if getattr(self, "q_proj", None) is not None:
            with tf.name_scope(self.q_proj.name):
                self.q_proj.build([None, None, self.embed_dim])
        if getattr(self, "v_proj", None) is not None:
            with tf.name_scope(self.v_proj.name):
                self.v_proj.build([None, None, self.embed_dim])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2FeedForward with Wav2Vec2->Hubert
class TFHubertFeedForward(keras.layers.Layer):
    """Two dense layers with an activation and dropout after the first and dropout after the second."""

    def __init__(self, config: HubertConfig, **kwargs):
        """Create the intermediate and output dense layers and their dropouts from `config`."""
        super().__init__(**kwargs)

        self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout)

        self.intermediate_dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="intermediate_dense",
        )
        self.intermediate_act_fn = get_tf_activation(config.hidden_act)

        self.output_dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="output_dense",
        )
        self.output_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.config = config

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run dense -> activation -> dropout -> dense -> dropout; `training` is forwarded to both dropouts."""
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states, training=training)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        """Build both dense layers under their own name scopes; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "intermediate_dense", None) is not None:
            with tf.name_scope(self.intermediate_dense.name):
                self.intermediate_dense.build([None, None, self.config.hidden_size])
        if getattr(self, "output_dense", None) is not None:
            with tf.name_scope(self.output_dense.name):
                self.output_dense.build([None, None, self.config.intermediate_size])
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayer with Wav2Vec2->Hubert
class TFHubertEncoderLayer(keras.layers.Layer):
    """Encoder layer that applies layer normalization after each residual connection."""

    def __init__(self, config: HubertConfig, **kwargs):
        """Create the attention, feed-forward, dropout and layer-norm sub-layers from `config`."""
        super().__init__(**kwargs)
        self.attention = TFHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            name="attention",
        )
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Run attention and feed-forward, each followed by a residual add and layer norm.

        Returns:
            `(hidden_states,)`, with the attention weights appended when `output_attentions` is truthy.
        """
        attn_residual = hidden_states
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, training=training
        )
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = attn_residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

    def build(self, input_shape=None):
        """Build all sub-layers under their own name scopes; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "feed_forward", None) is not None:
            with tf.name_scope(self.feed_forward.name):
                self.feed_forward.build(None)
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayerStableLayerNorm with Wav2Vec2->Hubert
class TFHubertEncoderLayerStableLayerNorm(keras.layers.Layer):
    """Encoder layer that applies layer normalization before attention and before the feed-forward block."""

    def __init__(self, config: HubertConfig, **kwargs):
        """Create the attention, feed-forward, dropout and layer-norm sub-layers from `config`."""
        super().__init__(**kwargs)
        self.attention = TFHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            name="attention",
        )
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Run normalized attention and normalized feed-forward, each added back to its input.

        Returns:
            `(hidden_states,)`, with the attention weights appended when `output_attentions` is truthy.
        """
        attn_residual = hidden_states
        hidden_states = self.layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, training=training
        )
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = attn_residual + hidden_states
        # The feed-forward block sees the normalized states; the residual uses the un-normalized ones.
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
# 构建层,确保所有子层都被构建def build(self, input_shape=None):
if self.built:
return
self.built = True# 如果注意力层存在,则构建注意力层ifgetattr(self, "attention", None) isnotNone:
with tf.name_scope(self.attention.name):
self.attention.build(None)
# 如果层归一化存在,则根据输入形状构建层归一化ifgetattr(self, "layer_norm", None) isnotNone:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
# 如果前馈网络存在,则构建前馈网络ifgetattr(self, "feed_forward", None) isnotNone:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
# 如果最终的层归一化存在,则根据输入形状构建最终的层归一化ifgetattr(self, "final_layer_norm", None) isnotNone:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2Encoder with Wav2Vec2->Hubert
class TFHubertEncoder(keras.layers.Layer):
    """Stack of `TFHubertEncoderLayer` layers preceded by positional conv embedding, layer norm and dropout."""

    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # One encoder layer per configured hidden layer, named "layers.<index>".
        self.layer = [TFHubertEncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """Run the encoder stack.

        Args:
            hidden_states: Input to the encoder.
            attention_mask: Optional mask; when given, the input is multiplied by it and
                the mask is expanded with `_expand_mask` before being passed to the layers.
            output_attentions: When truthy, collect the attention weights of every executed layer.
            output_hidden_states: When truthy, collect the hidden states before each layer and after the last.
            return_dict: When falsy, return a tuple of the non-None results instead of `TFBaseModelOutput`.
            training: Enables dropout and LayerDrop.

        Returns:
            `TFBaseModelOutput`, or a tuple of its non-None fields when `return_dict` is falsy.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # Multiply the input by the mask, then expand the mask for the attention layers.
            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
            attention_mask = _expand_mask(attention_mask)

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)

        for layer_module in self.layer:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = np.random.uniform(0, 1)
            if training and (dropout_probability < self.config.layerdrop):  # skip the layer
                continue

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # Add the output of the last layer.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def build(self, input_shape=None):
        """Build the embedding, layer norm and every encoder layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "pos_conv_embed", None) is not None:
            with tf.name_scope(self.pos_conv_embed.name):
                self.pos_conv_embed.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                with tf.name_scope(layer.name):
                    layer.build(None)
# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderStableLayerNorm with Wav2Vec2->Hubert
class TFHubertEncoderStableLayerNorm(keras.layers.Layer):
    """Stack of `TFHubertEncoderLayerStableLayerNorm` layers with a single layer norm applied after the stack."""

    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # One encoder layer per configured hidden layer, named "layers.<index>".
        self.layer = [
            TFHubertEncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)
        ]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """Run the encoder stack.

        Args:
            hidden_states: Input to the encoder.
            attention_mask: Optional mask; when given, the input is multiplied by it and
                the mask is expanded with `_expand_mask` before being passed to the layers.
            output_attentions: When truthy, collect the attention weights of every executed layer.
            output_hidden_states: When truthy, collect the hidden states before each layer and
                the normalized output of the stack.
            return_dict: When falsy, return a tuple of the non-None results instead of `TFBaseModelOutput`.
            training: Enables dropout and LayerDrop.

        Returns:
            `TFBaseModelOutput`, or a tuple of its non-None fields when `return_dict` is falsy.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # Multiply the input by the mask, then expand the mask for the attention layers.
            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
            attention_mask = _expand_mask(attention_mask)

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.dropout(hidden_states, training=training)

        for layer_module in self.layer:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = np.random.uniform(0, 1)
            if training and (dropout_probability < self.config.layerdrop):  # skip the layer
                continue

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # The layer norm is applied once, after all layers.
        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def build(self, input_shape=None):
        """Build the embedding, layer norm and every encoder layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "pos_conv_embed", None) is not None:
            with tf.name_scope(self.pos_conv_embed.name):
                self.pos_conv_embed.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                with tf.name_scope(layer.name):
                    layer.build(None)
@keras_serializable
class TFHubertMainLayer(keras.layers.Layer):
    """Core Hubert layer: feature extractor, feature projection, optional SpecAugment masking and encoder."""

    config_class = HubertConfig

    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
        self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")

        # The encoder variant is selected by `config.do_stable_layer_norm`.
        if config.do_stable_layer_norm:
            self.encoder = TFHubertEncoderStableLayerNorm(config, name="encoder")
        else:
            self.encoder = TFHubertEncoder(config, name="encoder")

    def build(self, input_shape=None):
        """Create the `masked_spec_embed` weight and build the sub-layers once."""
        # The guard comes first so that a repeated call cannot create a second
        # `masked_spec_embed` weight.
        if self.built:
            return
        self.built = True

        # Embedding written into masked time steps by `_mask_hidden_states`.
        self.masked_spec_embed = self.add_weight(
            shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
        )

        if getattr(self, "feature_extractor", None) is not None:
            with tf.name_scope(self.feature_extractor.name):
                self.feature_extractor.build(None)
        if getattr(self, "feature_projection", None) is not None:
            with tf.name_scope(self.feature_projection.name):
                self.feature_projection.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

    def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1

        # Apply the formula once per configured (kernel size, stride) pair.
        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: tf.Tensor | None = None):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """
        batch_size, sequence_length, hidden_size = shape_list(hidden_states)

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states = tf.where(
                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
                hidden_states,
            )
        elif self.config.mask_time_prob > 0:
            # generate indices & apply SpecAugment along time axis
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                min_masks=2,
            )
            hidden_states = tf.where(
                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
                hidden_states,
            )

        # apply SpecAugment along feature axis
        if self.config.mask_feature_prob > 0:
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
            )
            # NOTE(review): this keeps values where the mask is set and writes 0 elsewhere —
            # confirm that this is the intended polarity.
            hidden_states = tf.where(mask_feature_indices[:, tf.newaxis, :], hidden_states, 0)

        return hidden_states

    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: tf.Tensor | None = None,
        output_hidden_states: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs: Any,
    ):
        """Extract features from `input_values`, project them, optionally mask them and run the encoder.

        `token_type_ids`, `position_ids`, `head_mask` and `inputs_embeds` are accepted but not used
        in this method. An optional `mask_time_indices` may be supplied through `kwargs`; it is only
        used when `training` is truthy.

        Returns:
            `TFBaseModelOutput`, or a tuple starting with the last hidden state when `return_dict` is falsy.
        """
        hidden_states = self.feature_extractor(tf.cast(input_values, tf.float32), training=training)

        if attention_mask is not None:
            # compute real output lengths according to convolution formula
            output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, -1))

            attention_mask = tf.sequence_mask(
                output_lengths, maxlen=shape_list(hidden_states)[1], dtype=hidden_states.dtype
            )

        hidden_states = self.feature_projection(hidden_states, training=training)

        mask_time_indices = kwargs.get("mask_time_indices", None)
        if training:
            hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states,) + encoder_outputs[1:]

        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
class TFHubertPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = HubertConfig
    base_model_prefix = "hubert"
    main_input_name = "input_values"

    @property
    def input_signature(self):
        """Return the `tf.TensorSpec` mapping describing the model inputs."""
        return {
            "input_values": tf.TensorSpec((None, 16000), tf.float32, name="input_values"),
            "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
            "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
        }

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Warn on every instantiation that training needs an accelerator.
        logger.warning(
            f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish "
            "to train/fine-tune this model, you need a GPU or a TPU"
        )
# Shared introductory docstring; it is passed to `add_start_docstrings` on the TFHubert model classes below.
HUBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_values` only and nothing else: `model(input_values)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_values": input_values, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring for the forward-pass inputs, passed to `add_start_docstrings_to_model_forward` below.
# In this file it contains only a newline.
HUBERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
    "The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.",
    HUBERT_START_DOCSTRING,
)
class TFHubertModel(TFHubertPreTrainedModel):
    def __init__(self, config: HubertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.config = config
        # The main layer holds the whole Hubert computation.
        self.hubert = TFHubertMainLayer(config, name="hubert")

    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run the forward pass on the given inputs and return the model output.

        Args:
            input_values (tf.Tensor): Input tensor holding the input features.
            attention_mask (tf.Tensor, optional): Attention mask tensor. Defaults to None.
            token_type_ids (tf.Tensor, optional): Token type id tensor. Defaults to None.
            position_ids (tf.Tensor, optional): Position id tensor. Defaults to None.
            head_mask (tf.Tensor, optional): Head mask tensor. Defaults to None.
            inputs_embeds (tf.Tensor, optional): Embedded inputs tensor. Defaults to None.
            output_attentions (bool, optional): Whether to return attention weights. When None, the value from
                the config is used.
            output_hidden_states (bool, optional): Whether to return hidden states. When None, the value from
                the config is used.
            return_dict (bool, optional): Whether to return a dict-like output. When None, the value from the
                config is used.
            training (bool, optional): Whether the model runs in training mode. Defaults to False.

        Returns:
            Union[TFBaseModelOutput, Tuple[tf.Tensor]]: The model output.

        Example:

        ```
        >>> from transformers import AutoProcessor, TFHubertModel
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```
        """
        # Fall back to the config only when the caller passed None; an explicit
        # False must not be replaced by a truthy config value.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        outputs = self.hubert(
            input_values=input_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        """Build the `hubert` main layer once, under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "hubert", None) is not None:
            with tf.name_scope(self.hubert.name):
                self.hubert.build(None)
@add_start_docstrings(
    """TFHubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    HUBERT_START_DOCSTRING,
)
class TFHubertForCTC(TFHubertPreTrainedModel):
    def __init__(self, config: HubertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.hubert = TFHubertMainLayer(config, name="hubert")
        self.dropout = keras.layers.Dropout(config.final_dropout)
        # Dense head whose output size is the vocabulary size.
        self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head")
        # Input width of `lm_head`: `output_hidden_size` when the config has a truthy
        # `add_adapter`, otherwise `hidden_size`.
        self.output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        # Deprecated alias: warn, then delegate to `freeze_feature_encoder`.
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.hubert.feature_extractor.trainable = False

    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFCausalLMOutput, config_class=_CONFIG_FOR_DOC)
    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ):
        """
        Call method to process inputs and return outputs, adhering to Hubert model's forward function.
        """
        # NOTE(review): the forward-pass logic is elided in this copy of the file, so this
        # method currently returns None. Restore the real implementation before using the model.
        pass

    def build(self, input_shape=None):
        """Build the `hubert` main layer and the `lm_head` once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "hubert", None) is not None:
            with tf.name_scope(self.hubert.name):
                self.hubert.build(None)
        if getattr(self, "lm_head", None) is not None:
            with tf.name_scope(self.lm_head.name):
                self.lm_head.build([None, None, self.output_hidden_size])
.\models\hubert\__init__.py
# Copyright and license information
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available


# Maps each submodule name to the public names it provides; consumed by `_LazyModule` below.
_import_structure = {"configuration_hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"]}

# Register the PyTorch models only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_hubert"] = [
        "HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "HubertForCTC",
        "HubertForSequenceClassification",
        "HubertModel",
        "HubertPreTrainedModel",
    ]

# Register the TensorFlow models only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_hubert"] = [
        "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFHubertForCTC",
        "TFHubertModel",
        "TFHubertPreTrainedModel",
    ]

if TYPE_CHECKING:
    # Under type checking, import the real names so static tools can see them.
    from .configuration_hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_hubert import (
            HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            HubertForCTC,
            HubertForSequenceClassification,
            HubertModel,
            HubertPreTrainedModel,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_hubert import (
            TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFHubertForCTC,
            TFHubertModel,
            TFHubertPreTrainedModel,
        )

else:
    import sys

    # At runtime, replace this module with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\ibert\configuration_ibert.py
# coding=utf-8
# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" I-BERT configuration"""

from collections import OrderedDict
from typing import Mapping

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging


# Module-level logger.
logger = logging.get_logger(__name__)

# Maps each pretrained model name to the URL of its configuration file.
IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "kssteven/ibert-roberta-base": "https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/config.json",
    "kssteven/ibert-roberta-large": "https://huggingface.co/kssteven/ibert-roberta-large/resolve/main/config.json",
    "kssteven/ibert-roberta-large-mnli": (
        "https://huggingface.co/kssteven/ibert-roberta-large-mnli/resolve/main/config.json"
    ),
}
# (The dictionary above maps each pretrained model name to the URL of its configuration file.)
class IBertConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`IBertModel`]. It is used to instantiate a I-BERT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the IBERT
    [kssteven/ibert-roberta-base](https://huggingface.co/kssteven/ibert-roberta-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    model_type = "ibert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        position_embedding_type="absolute",
        quant_mode=False,
        force_dequant="none",
        **kwargs,
    ):
        """
        Initializes an IBertConfig object with default values for its parameters.

        The three special token ids and any extra keyword arguments are forwarded to the parent
        constructor; every other argument is stored on the instance under the same name.
        """
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        # Quantization settings.
        self.quant_mode = quant_mode
        self.force_dequant = force_dequant
class IBertOnnxConfig(OnnxConfig):
    """ONNX export configuration for I-BERT."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the dynamic axes of `input_ids` and `attention_mask`, keyed by input name."""
        # The "multiple-choice" task has an extra "choice" axis between batch and sequence.
        if self.task == "multiple-choice":
            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            dynamic_axis = {0: "batch", 1: "sequence"}
        return OrderedDict(
            [
                ("input_ids", dynamic_axis),
                ("attention_mask", dynamic_axis),
            ]
        )
.\models\ibert\modeling_ibert.py
# coding=utf-8
# Copyright notice: all rights reserved by the authors and The HuggingFace Inc. team.
# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch I-BERT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
# Module-level logger obtained from the library's logging utility.
logger = logging.get_logger(__name__)
# Names used when generating the code-sample documentation of the model classes.
_CHECKPOINT_FOR_DOC = "kssteven/ibert-roberta-base"
_CONFIG_FOR_DOC = "IBertConfig"
# Identifiers of the pretrained I-BERT checkpoints.
IBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"kssteven/ibert-roberta-base",
"kssteven/ibert-roberta-large",
"kssteven/ibert-roberta-large-mnli",
]
class IBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        """Build the quantized word, token-type and position embeddings plus their normalization.

        Args:
            config: configuration object; this constructor reads `quant_mode`, `vocab_size`, `hidden_size`,
                `pad_token_id`, `type_vocab_size`, `max_position_embeddings`, `layer_norm_eps`,
                `force_dequant`, `hidden_dropout_prob` and (optionally) `position_embedding_type`.
        """
        super().__init__()
        self.quant_mode = config.quant_mode
        # Bit widths used by the quantized sub-modules below.
        self.embedding_bit = 8
        self.embedding_act_bit = 16
        self.act_bit = 8
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.word_embeddings = QuantEmbedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
            weight_bit=self.embedding_bit,
            quant_mode=self.quant_mode,
        )
        self.token_type_embeddings = QuantEmbedding(
            config.type_vocab_size, config.hidden_size, weight_bit=self.embedding_bit, quant_mode=self.quant_mode
        )

        # Non-persistent buffer holding the ids 0..max_position_embeddings-1 with shape (1, max_position_embeddings).
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.padding_idx = config.pad_token_id
        self.position_embeddings = QuantEmbedding(
            config.max_position_embeddings,
            config.hidden_size,
            padding_idx=self.padding_idx,
            weight_bit=self.embedding_bit,
            quant_mode=self.quant_mode,
        )

        # Quantized activations used when summing the individual embeddings.
        # NOTE(review): `embeddings_act2` is created here but `forward` applies `embeddings_act1` twice.
        self.embeddings_act1 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
        self.embeddings_act2 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)

        # The attribute is intentionally not snake-cased so that it matches the TensorFlow variable name.
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # NOTE(review): the `def` line of this method was missing from the file; the signature below is
    # reconstructed from the names the body uses - confirm parameter order and defaults.
    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Compute the summed, normalized and re-quantized input embeddings.

        Args:
            input_ids: token ids; used for the word-embedding lookup and to derive position ids.
            token_type_ids: token type ids; defaults to zeros of the input shape.
            position_ids: position ids; derived from `input_ids` or `inputs_embeds` when `None`.
            inputs_embeds: precomputed embeddings used instead of looking up `input_ids`.
            past_key_values_length: forwarded to `create_position_ids_from_input_ids`.

        Returns:
            Tuple `(embeddings, embeddings_scaling_factor)`.
        """
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds, inputs_embeds_scaling_factor = self.word_embeddings(input_ids)
        else:
            # Externally supplied embeddings come without a scaling factor.
            inputs_embeds_scaling_factor = None
        token_type_embeddings, token_type_embeddings_scaling_factor = self.token_type_embeddings(token_type_ids)

        # Word embeddings + token type embeddings.
        embeddings, embeddings_scaling_factor = self.embeddings_act1(
            inputs_embeds,
            inputs_embeds_scaling_factor,
            identity=token_type_embeddings,
            identity_scaling_factor=token_type_embeddings_scaling_factor,
        )

        if self.position_embedding_type == "absolute":
            position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids)
            # Add the position embeddings on top.
            embeddings, embeddings_scaling_factor = self.embeddings_act1(
                embeddings,
                embeddings_scaling_factor,
                identity=position_embeddings,
                identity_scaling_factor=position_embeddings_scaling_factor,
            )

        embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor)
        embeddings = self.dropout(embeddings)
        embeddings, embeddings_scaling_factor = self.output_activation(embeddings, embeddings_scaling_factor)
        return embeddings, embeddings_scaling_factor

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Sequential ids starting right after the padding index.
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
class IBertSelfAttention(nn.Module):
    """Quantized self-attention layer: query/key/value projections, their activations and the softmax.

    NOTE(review): only the constructor and `transpose_for_scores` are restored here; the `forward`
    signature that follows this block in the file has no body.
    """

    def __init__(self, config):
        """Create the quantized projections and activations.

        Args:
            config: configuration object; reads `hidden_size`, `num_attention_heads`, `quant_mode`,
                `attention_probs_dropout_prob`, `force_dequant` and (optionally) `position_embedding_type`.

        Raises:
            ValueError: if `hidden_size` is not a multiple of `num_attention_heads` (and the config has no
                `embedding_size` attribute), or if the position embedding type is not `"absolute"`.
        """
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.quant_mode = config.quant_mode
        self.weight_bit = 8
        self.bias_bit = 32
        self.act_bit = 8

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Q, K, V linear layers
        self.query = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.key = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.value = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )

        # Quantized activations for Q, K, V and for the attention output.
        self.query_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.key_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.value_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

        # Dropout applied to the attention probabilities.
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type != "absolute":
            raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")

        self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)

    def transpose_for_scores(self, x):
        """Split the last dimension of `x` into `(num_attention_heads, attention_head_size)` and swap axes 1 and 2.

        The `permute(0, 2, 1, 3)` call fixes the input to rank 3 (rank 4 after the view).
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
# 前向传播函数,实现自注意力机制的计算过程defforward(
self,
hidden_states,
hidden_states_scaling_factor,
attention_mask=None,
head_mask=None,
output_attentions=False,
# NOTE(review): the class header was missing from the file; the name is taken from its use in
# `IBertAttention` (`self.output = IBertSelfOutput(config)`) and the base class from the
# `super().__init__()` / sub-module usage - confirm.
class IBertSelfOutput(nn.Module):
    """Quantized dense projection followed by a residual add, layer normalization and re-quantization."""

    def __init__(self, config):
        """Create the sub-modules.

        Args:
            config: configuration object; reads `quant_mode`, `hidden_size`, `layer_norm_eps`,
                `force_dequant` and `hidden_dropout_prob`.
        """
        super().__init__()
        self.quant_mode = config.quant_mode
        # Bit widths used by the quantized sub-modules below.
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.dense = QuantLinear(
            config.hidden_size,
            config.hidden_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        """Project `hidden_states`, combine them with `input_tensor` and normalize.

        Returns:
            Tuple `(hidden_states, hidden_states_scaling_factor)`.
        """
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states = self.dropout(hidden_states)
        # `input_tensor` is handed to the activation as the `identity` branch.
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states,
            hidden_states_scaling_factor,
            identity=input_tensor,
            identity_scaling_factor=input_tensor_scaling_factor,
        )
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)

        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor
class IBertAttention(nn.Module):
    """Self-attention (`IBertSelfAttention`) followed by its output projection (`IBertSelfOutput`)."""

    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.self = IBertSelfAttention(config)
        self.output = IBertSelfOutput(config)
        # Indices of the heads that have already been pruned.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value and output projections.

        Args:
            heads: collection of head indices to prune; nothing happens when it is empty.
        """
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        """Run self-attention and the output projection.

        Returns:
            Tuple `(outputs, outputs_scaling_factor)`. Both are tuples whose first element belongs to the
            attention output; any further elements returned by the self-attention layer are appended.
        """
        self_outputs, self_outputs_scaling_factor = self.self(
            hidden_states,
            hidden_states_scaling_factor,
            attention_mask,
            head_mask,
            output_attentions,
        )
        # The original `hidden_states` is passed as the second (input) tensor of the output layer.
        attention_output, attention_output_scaling_factor = self.output(
            self_outputs[0], self_outputs_scaling_factor[0], hidden_states, hidden_states_scaling_factor
        )
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        outputs_scaling_factor = (attention_output_scaling_factor,) + self_outputs_scaling_factor[1:]
        return outputs, outputs_scaling_factor
class IBertIntermediate(nn.Module):
    """Quantized intermediate layer: dense projection to `intermediate_size`, integer GELU, re-quantization."""

    def __init__(self, config):
        """Create the sub-modules.

        Args:
            config: configuration object; reads `quant_mode`, `hidden_size`, `intermediate_size`,
                `hidden_act` and `force_dequant`.

        Raises:
            ValueError: if `config.hidden_act` is not `"gelu"`.
        """
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.dense = QuantLinear(
            config.hidden_size,
            config.intermediate_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        if config.hidden_act != "gelu":
            raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")
        self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(self, hidden_states, hidden_states_scaling_factor):
        """Apply dense projection, GELU and re-quantization.

        Returns:
            Tuple `(hidden_states, hidden_states_scaling_factor)`.
        """
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.intermediate_act_fn(
            hidden_states, hidden_states_scaling_factor
        )

        # Requantization: 32bit -> 8-bit
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor
class IBertOutput(nn.Module):
    """Quantized feed-forward output layer: dense projection back to `hidden_size`, residual add, layer norm."""

    def __init__(self, config):
        """Create the sub-modules.

        Args:
            config: configuration object; reads `quant_mode`, `intermediate_size`, `hidden_size`,
                `layer_norm_eps`, `force_dequant` and `hidden_dropout_prob`.
        """
        super().__init__()
        self.quant_mode = config.quant_mode
        # Bit widths used by the quantized sub-modules below.
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.dense = QuantLinear(
            config.intermediate_size,
            config.hidden_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        """Project `hidden_states`, combine them with `input_tensor` and normalize.

        Returns:
            Tuple `(hidden_states, hidden_states_scaling_factor)`.
        """
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states = self.dropout(hidden_states)
        # `input_tensor` is handed to the activation as the `identity` branch.
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states,
            hidden_states_scaling_factor,
            identity=input_tensor,
            identity_scaling_factor=input_tensor_scaling_factor,
        )
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)

        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor
class IBertLayer(nn.Module):
    """One I-BERT transformer layer: attention followed by the quantized feed-forward network."""

    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8

        self.seq_len_dim = 1
        self.attention = IBertAttention(config)
        self.intermediate = IBertIntermediate(config)
        self.output = IBertOutput(config)

        # Re-quantization steps applied before the intermediate and the output layers.
        self.pre_intermediate_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.pre_output_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        """Run attention and the feed-forward network.

        Returns:
            Tuple whose first element is the layer output, followed by any extra elements returned by the
            attention module. The scaling factor of the layer output is not part of the result.
        """
        self_attention_outputs, self_attention_outputs_scaling_factor = self.attention(
            hidden_states,
            hidden_states_scaling_factor,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        attention_output_scaling_factor = self_attention_outputs_scaling_factor[0]

        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        layer_output, layer_output_scaling_factor = self.feed_forward_chunk(
            attention_output, attention_output_scaling_factor
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output, attention_output_scaling_factor):
        """Apply the intermediate and output layers to the attention output.

        Returns:
            Tuple `(layer_output, layer_output_scaling_factor)`.
        """
        attention_output, attention_output_scaling_factor = self.pre_intermediate_act(
            attention_output, attention_output_scaling_factor
        )
        intermediate_output, intermediate_output_scaling_factor = self.intermediate(
            attention_output, attention_output_scaling_factor
        )

        intermediate_output, intermediate_output_scaling_factor = self.pre_output_act(
            intermediate_output, intermediate_output_scaling_factor
        )
        # The (re-quantized) attention output is the second input of the output layer.
        layer_output, layer_output_scaling_factor = self.output(
            intermediate_output, intermediate_output_scaling_factor, attention_output, attention_output_scaling_factor
        )
        return layer_output, layer_output_scaling_factor
class IBertEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `IBertLayer` modules."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.quant_mode = config.quant_mode
        self.layer = nn.ModuleList([IBertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Run all layers in order.

        Args:
            hidden_states: input of the first layer.
            hidden_states_scaling_factor: scaling factor handed, unchanged, to every layer.
            attention_mask: passed through to every layer.
            head_mask: indexable by layer; `head_mask[i]` is passed to layer `i` when not `None`.
            output_attentions: collect `layer_outputs[1]` of every layer.
            output_hidden_states: collect the input of every layer plus the final output.
            return_dict: return a `BaseModelOutputWithPastAndCrossAttentions` instead of a tuple.

        Returns:
            Either the output object, or a tuple of the non-`None` values among the last hidden state,
            all hidden states and all self-attentions.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = None  # `config.add_cross_attention` is not supported
        next_decoder_cache = None  # `config.use_cache` is not supported

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                hidden_states_scaling_factor,
                attention_mask,
                layer_head_mask,
                output_attentions,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class IBertPooler(nn.Module):
    """Pool a sequence by passing the hidden state at index 0 through a linear layer and `tanh`."""

    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return the pooled representation of `hidden_states`.

        The slice `hidden_states[:, 0]` selects index 0 along the second dimension
        (the first token, assuming a (batch, sequence, hidden) layout - TODO confirm).
        """
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
class IBertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = IBertConfig
    base_model_prefix = "ibert"

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type.

        Linear and embedding weights are drawn from a normal distribution with standard deviation
        `config.initializer_range`; linear biases and the embedding row at `padding_idx` are zeroed;
        layer-norm modules get bias 0 and weight 1.
        """
        if isinstance(module, (QuantLinear, nn.Linear)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (QuantEmbedding, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def resize_token_embeddings(self, new_num_tokens=None):
        """Always raise: resizing the token embeddings is not supported for I-BERT.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
# Shared introductory docstring passed to `add_start_docstrings` for the I-BERT model classes.
IBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`IBertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring template describing the inputs accepted by the I-BERT `forward` methods. The `{0}`
# placeholder is filled with the input shape via `.format(...)` before the text is attached to a method.
IBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare I-BERT Model transformer outputting raw hidden-states without any specific head on top.",
    IBERT_START_DOCSTRING,
)
class IBertModel(IBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    """

    def __init__(self, config, add_pooling_layer=True):
        """Build embeddings, encoder and (optionally) the pooler.

        Args:
            config: model configuration.
            add_pooling_layer: when `False`, `self.pooler` is `None` and no pooled output is computed.
        """
        super().__init__(config)
        self.config = config
        self.quant_mode = config.quant_mode

        self.embeddings = IBertEmbeddings(config)
        self.encoder = IBertEncoder(config)

        self.pooler = IBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the word embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word embedding module with `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple[torch.FloatTensor]]:
        # The previous body was a bare `pass`, so every head model that indexes the result
        # (`outputs[0]`) failed. Unset flags fall back to the values stored on the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # `get_extended_attention_mask` and `get_head_mask` are inherited from `PreTrainedModel`:
        # they broadcast the 2D/3D mask to all heads and expand the head mask to one entry per layer.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # Keyword arguments are used so the call does not depend on the embeddings' parameter order.
        embedding_output, embedding_output_scaling_factor = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            embedding_output_scaling_factor,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
@add_start_docstrings(
    "I-BERT Model with a `language modeling` head on top.",
    IBERT_START_DOCSTRING,
)
class IBertForMaskedLM(IBertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        # Backbone without pooler: the LM head consumes the per-token hidden states.
        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.lm_head = IBertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder layer of the language modeling head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder layer of the language modeling head with `new_embeddings`."""
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # Flatten scores to (N, vocab_size) and labels to (N,) for the cross-entropy loss.
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class IBertLMHead(nn.Module):
    """I-BERT Head for masked language modeling."""

    def __init__(self, config):
        """Create the head.

        Args:
            config: configuration object; reads `hidden_size`, `layer_norm_eps` and `vocab_size`.
        """
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        # The decoder bias and `self.bias` are the same parameter object.
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        """Map `features` to vocabulary scores: dense -> GELU -> layer norm -> decoder (with bias)."""
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x

    def _tie_weights(self):
        """Re-point `self.bias` at the decoder's bias."""
        # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
        self.bias = self.decoder.bias
@add_start_docstrings(
    """
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    IBERT_START_DOCSTRING,
)
class IBertForSequenceClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # The backbone is built without its pooling layer; the classification head is applied to the
        # backbone's first output instead.
        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.classifier = IBertClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # An explicit `return_dict` wins; otherwise fall back to the configuration default.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Infer the problem type once (it is stored on the config) from the number of labels
            # and the dtype of `labels`.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple output: optional loss, then logits, then everything after the first two backbone outputs.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    I-BERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    IBERT_START_DOCSTRING,
)
class IBertForMultipleChoice(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.ibert = IBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # One score per (example, choice) pair.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # An explicit `return_dict` wins; otherwise fall back to the configuration default.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the second dimension of whichever input was provided.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Merge the leading dimensions so that every choice is fed to the backbone as its own row.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.ibert(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Regroup the per-row scores into one row of `num_choices` scores per example.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            # Tuple output: optional loss, then logits, then everything after the first two backbone outputs.
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    I-BERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    IBERT_START_DOCSTRING,
)
class IBertForTokenClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone without the pooling layer: the classifier is applied to every position.
        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # An explicit `return_dict` wins; otherwise fall back to the configuration default.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Flatten all positions so that the loss is computed per token.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # Tuple output: optional loss, then logits, then everything after the first two backbone outputs.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class IBertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        """
        Build the layers of the head.

        Args:
            config: Model configuration; `hidden_size`, `hidden_dropout_prob` and `num_labels` are read from it.
        """
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """
        Classify each sequence from the hidden state at its first position.

        Args:
            features: 3-D tensor indexed as `[batch, position, hidden]`.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            Tensor of shape `(batch, config.num_labels)`.
        """
        # Take the hidden state of the first token of every sequence (equivalent to [CLS]).
        hidden_states = features[:, 0, :]
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states
@add_start_docstrings(
    """
    I-BERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    IBERT_START_DOCSTRING,
)
class IBertForQuestionAnswering(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone without the pooling layer: the span head is applied to every position.
        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # An explicit `return_dict` wins; otherwise fall back to the configuration default.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Split the last dimension of the head output into the start and end logits.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Positions with more than one dimension get their last dimension squeezed away.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions outside the model inputs are clamped to `ignored_index`, which the loss ignores.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # Tuple output: optional loss, both logits, then everything after the first two backbone outputs.
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
        input_ids (`torch.LongTensor`):
            Indices of input sequence tokens in the vocabulary. Positions are counted along dimension 1.
        padding_idx (`int`):
            Token id that marks padding; padded slots receive position `padding_idx`.
        past_key_values_length (`int`, *optional*, defaults to 0):
            Offset added to the position of every non-padding token.

    Returns: torch.Tensor
    """
    # 1 for every real token, 0 for every padding token.
    mask = input_ids.ne(padding_idx).int()
    # Running count of real tokens per row, shifted by the offset; multiplying by the mask zeroes padded slots.
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
.\models\ibert\quant_modules.py
# coding=utf-8
# Copyright notice, including author and copyright information
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import decimal

import numpy as np
import torch
from torch import nn
from torch.autograd import Function

from ...utils import logging


# Module-level logger used for the informational messages emitted by the quantized layers.
logger = logging.get_logger(__name__)
class QuantEmbedding(nn.Module):
    """
    Quantized version of `torch.nn.Embedding`. Adds quantization-specific arguments on top of `torch.nn.Embedding`.

    Args:
        weight_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the quantized weight.
        momentum (`float`, *optional*, defaults to `0.95`):
            Momentum for updating the activation quantization range.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        weight_bit=8,
        momentum=0.95,
        quant_mode=False,
    ):
        super().__init__()
        # Arguments that are forwarded unchanged to `nn.functional.embedding`.
        self.num_ = num_embeddings
        self.dim = embedding_dim
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

        # Float weight plus buffers that hold its quantized form and the matching scale.
        self.weight = nn.Parameter(torch.zeros([num_embeddings, embedding_dim]))
        self.register_buffer("weight_scaling_factor", torch.zeros(1))
        self.register_buffer("weight_integer", torch.zeros_like(self.weight))

        self.weight_bit = weight_bit
        self.momentum = momentum
        self.quant_mode = quant_mode
        self.percentile_mode = False
        self.weight_function = SymmetricQuantFunction.apply

    def forward(self, x, positions=None, incremental_state=None):
        """
        Look up the embeddings of `x`.

        Args:
            x: Index tensor passed to `nn.functional.embedding`.
            positions: Unused.
            incremental_state: Unused.

        Returns:
            A pair `(embeddings, scaling_factor)`; `scaling_factor` is `None` when `quant_mode` is off.
        """
        if not self.quant_mode:
            return (
                nn.functional.embedding(
                    x,
                    self.weight,
                    self.padding_idx,
                    self.max_norm,
                    self.norm_type,
                    self.scale_grad_by_freq,
                    self.sparse,
                ),
                None,
            )

        # Global (per-tensor) range of the weight, taken outside of the autograd graph.
        w = self.weight
        w_transform = w.data.detach()
        w_min = w_transform.min().expand(1)
        w_max = w_transform.max().expand(1)

        # Scale and integer weight come from helpers defined elsewhere in this module.
        self.weight_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, False)
        self.weight_integer = self.weight_function(
            self.weight, self.weight_bit, self.percentile_mode, self.weight_scaling_factor
        )

        emb_int = nn.functional.embedding(
            x,
            self.weight_integer,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )
        # Rescale the integer lookup and hand the scale to the caller as well.
        return emb_int * self.weight_scaling_factor, self.weight_scaling_factor
class QuantAct(nn.Module):
    """
    Quantizes the given activation.

    Args:
        activation_bit (`int`):
            Bitwidth for the quantized activation.
        act_range_momentum (`float`, *optional*, defaults to `0.95`):
            Momentum for updating the activation quantization range.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether to or not use channel-wise quantization.
        channel_len (`int`, *optional*):
            Specify the channel length when set the *per_channel* True.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    """

    def __init__(self, activation_bit, act_range_momentum=0.95, per_channel=False, channel_len=None, quant_mode=False):
        super().__init__()

        self.activation_bit = activation_bit
        self.act_range_momentum = act_range_momentum
        self.quant_mode = quant_mode
        self.per_channel = per_channel
        self.percentile = False
        self.act_function = SymmetricQuantFunction.apply

        if not self.per_channel:
            self.register_buffer("x_min", torch.zeros(1))
            self.register_buffer("x_max", torch.zeros(1))
            self.register_buffer("act_scaling_factor", torch.zeros(1))
            # Seed the range at -/+1e-5; `forward` recognises this untouched state with its 1.1e-5 check.
            self.x_min -= 1e-5
            self.x_max += 1e-5
        else:
            raise NotImplementedError("per-channel mode is not currently supported for activation.")

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(activation_bit={self.activation_bit}, "
            f"quant_mode: {self.quant_mode}, Act_min: {self.x_min.item():.2f}, "
            f"Act_max: {self.x_max.item():.2f})"
        )

    def forward(
        self,
        x,
        pre_act_scaling_factor=None,
        identity=None,
        identity_scaling_factor=None,
        specified_min=None,
        specified_max=None,
    ):
        """
        Track the activation range and, in quantization mode, quantize the activation.

        Args:
            x: Activation tensor.
            pre_act_scaling_factor: Scaling factor of `x`; when given, `FixedPointMul` is used for requantization.
            identity: Optional tensor that is added to `x` (for example a residual branch).
            identity_scaling_factor: Scaling factor of `identity`, forwarded to `FixedPointMul`.
            specified_min: Overrides the tracked minimum when computing the scaling factor.
            specified_max: Overrides the tracked maximum when computing the scaling factor.

        Returns:
            A pair `(activation, scaling_factor)`; `scaling_factor` is `None` when `quant_mode` is off.
        """
        x_act = x if identity is None else identity + x

        # Collect running statistics while training.
        if self.training:
            assert not self.percentile, "percentile mode is not currently supported for activation."
            assert not self.per_channel, "per-channel mode is not currently supported for activation."
            x_min = x_act.data.min()
            x_max = x_act.data.max()

            assert (
                x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0
            ), "NaN detected when computing min/max of the activation"

            # Initialization: the range still holds its seed values, so adopt the observed range.
            if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5:
                self.x_min = self.x_min + x_min
                self.x_max = self.x_max + x_max

            # A momentum of -1 disables averaging and keeps the widest range seen so far.
            elif self.act_range_momentum == -1:
                self.x_min = torch.min(self.x_min, x_min)
                self.x_max = torch.max(self.x_max, x_max)
            # Exponential moving average (EMA): the momentum prevents the quantized values from
            # changing greatly at every iteration.
            else:
                self.x_min = self.x_min * self.act_range_momentum + x_min * (1 - self.act_range_momentum)
                self.x_max = self.x_max * self.act_range_momentum + x_max * (1 - self.act_range_momentum)

        if not self.quant_mode:
            return x_act, None

        x_min = self.x_min if specified_min is None else specified_min
        x_max = self.x_max if specified_max is None else specified_max

        self.act_scaling_factor = symmetric_linear_quantization_params(
            self.activation_bit, x_min, x_max, per_channel=self.per_channel
        )

        if pre_act_scaling_factor is None:
            # This is an input layer: quantize `x` directly.
            quant_act_int = self.act_function(x, self.activation_bit, self.percentile, self.act_scaling_factor)
        else:
            # Requantize from the previous scale through the fixed-point multiplication helper.
            quant_act_int = FixedPointMul.apply(
                x,
                pre_act_scaling_factor,
                self.activation_bit,
                self.act_scaling_factor,
                identity,
                identity_scaling_factor,
            )

        correct_output_scale = self.act_scaling_factor.view(-1)

        return quant_act_int * correct_output_scale, self.act_scaling_factor
class QuantLinear(nn.Module):
    """
    Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.

    Args:
        weight_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the quantized weight.
        bias_bit (`int`, *optional*, defaults to `32`):
            Bitwidth for the quantized bias.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether or not to use channel-wise quantization.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    """

    def __init__(
        self, in_features, out_features, bias=True, weight_bit=8, bias_bit=32, per_channel=False, quant_mode=False
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Float weight plus buffers for its quantized form and one scaling factor per output feature.
        self.weight = nn.Parameter(torch.zeros([out_features, in_features]))
        self.register_buffer("weight_integer", torch.zeros_like(self.weight))
        self.register_buffer("fc_scaling_factor", torch.zeros(self.out_features))
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_features))
            self.register_buffer("bias_integer", torch.zeros_like(self.bias))
        else:
            # Define both names even without a bias, so that `forward` passes `None` to
            # `nn.functional.linear` instead of failing on a missing attribute.
            self.register_parameter("bias", None)
            self.register_buffer("bias_integer", None)

        self.weight_bit = weight_bit
        self.quant_mode = quant_mode
        self.per_channel = per_channel
        self.bias_bit = bias_bit
        self.percentile_mode = False
        self.weight_function = SymmetricQuantFunction.apply

    def __repr__(self):
        s = super().__repr__()
        s = f"({s} weight_bit={self.weight_bit}, quant_mode={self.quant_mode})"
        return s

    def forward(self, x, prev_act_scaling_factor=None):
        """
        Apply the linear transformation.

        Args:
            x: Input tensor.
            prev_act_scaling_factor: Scaling factor of `x`, a tensor of shape `(1,)`. Required in quantization mode.

        Returns:
            A pair `(output, scaling_factor)`; `scaling_factor` is `None` when `quant_mode` is off.
        """
        if not self.quant_mode:
            return nn.functional.linear(x, weight=self.weight, bias=self.bias), None

        # The incoming activation must carry a single, global scaling factor.
        assert prev_act_scaling_factor is not None and prev_act_scaling_factor.shape == (1,), (
            "Input activation to the QuantLinear layer should be globally (non-channel-wise) quantized. "
            "Please add a QuantAct layer with `per_channel = True` before this QuantAct layer"
        )

        # Range of the weight, taken outside of the autograd graph: per output row or over the whole tensor.
        w = self.weight
        w_transform = w.data.detach()
        if self.per_channel:
            w_min, _ = torch.min(w_transform, dim=1, out=None)
            w_max, _ = torch.max(w_transform, dim=1, out=None)
        else:
            w_min = w_transform.min().expand(1)
            w_max = w_transform.max().expand(1)

        self.fc_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, self.per_channel)
        self.weight_integer = self.weight_function(
            self.weight, self.weight_bit, self.percentile_mode, self.fc_scaling_factor
        )

        # The bias (and therefore the output) lives at the product of the weight and input scales.
        bias_scaling_factor = self.fc_scaling_factor * prev_act_scaling_factor

        if self.bias is not None:
            self.bias_integer = self.weight_function(self.bias, self.bias_bit, False, bias_scaling_factor)

        # Recover the integer representation of the input before the integer matmul.
        prev_act_scaling_factor = prev_act_scaling_factor.view(1, -1)
        x_int = x / prev_act_scaling_factor

        return (
            nn.functional.linear(x_int, weight=self.weight_integer, bias=self.bias_integer) * bias_scaling_factor,
            bias_scaling_factor,
        )
class IntGELU(nn.Module):
    """
    Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.

    Args:
        quant_mode (`bool`, *optional*, defaults to `True`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "gelu" or "nonlinear" is given.
    """

    def __init__(self, quant_mode=True, force_dequant="none"):
        super().__init__()
        self.quant_mode = quant_mode

        if force_dequant in ["nonlinear", "gelu"]:
            logger.info("Force dequantize gelu")
            self.quant_mode = False

        if not self.quant_mode:
            self.activation_fn = nn.GELU()

        self.k = 1.4142
        self.const = 14  # dummy integer constant
        self.coeff = [-0.2888, -1.769, 1]  # [a, b, c] of a(x+b)**2 + c
        self.coeff[2] /= self.coeff[0]  # c is normalized by a, which is folded into the scaling factor

    def int_erf(self, x_int, scaling_factor):
        """
        Integer second-order polynomial used in place of erf.

        Args:
            x_int: Integer-valued input tensor.
            scaling_factor: Scaling factor of `x_int`.

        Returns:
            A pair `(y_int, scaling_factor)` holding the integer result and its scaling factor.
        """
        b_int = torch.floor(self.coeff[1] / scaling_factor)
        c_int = torch.floor(self.coeff[2] / scaling_factor**2)
        sign = torch.sign(x_int)

        # Clip the magnitude at -b_int before evaluating the polynomial.
        abs_int = torch.min(torch.abs(x_int), -b_int)
        y_int = sign * ((abs_int + b_int) ** 2 + c_int)
        scaling_factor = scaling_factor**2 * self.coeff[0]

        # Avoid overflow: divide the integer part by 2**const and move that factor into the scale.
        y_int = floor_ste.apply(y_int / 2**self.const)
        scaling_factor = scaling_factor * 2**self.const

        return y_int, scaling_factor

    def forward(self, x, scaling_factor=None):
        """
        Apply GELU.

        Args:
            x: Input tensor.
            scaling_factor: Scaling factor of `x`; required in quantization mode.

        Returns:
            A pair `(output, scaling_factor)`; `scaling_factor` is `None` when `quant_mode` is off.
        """
        if not self.quant_mode:
            return self.activation_fn(x), None

        x_int = x / scaling_factor
        sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k)

        # Integer representation of the constant 1 at the scale of the erf output.
        shift_int = 1.0 // sigmoid_scaling_factor

        x_int = x_int * (sigmoid_int + shift_int)
        scaling_factor = scaling_factor * sigmoid_scaling_factor / 2

        return x_int * scaling_factor, scaling_factor


class IntSoftmax(nn.Module):
    """
    Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of `torch.nn.Softmax`.

    Args:
        output_bit (`int`):
            Bitwidth for the layer output activation.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "softmax" or "nonlinear" is given.
    """

    def __init__(self, output_bit, quant_mode=False, force_dequant="none"):
        super().__init__()
        self.output_bit = output_bit
        self.max_bit = 32
        self.quant_mode = quant_mode

        if force_dequant in ["nonlinear", "softmax"]:
            logger.info("Force dequantize softmax")
            self.quant_mode = False

        # 16-bit activation quantizer applied to the integer exponentials in `forward`.
        self.act = QuantAct(16, quant_mode=self.quant_mode)
        self.x0 = -0.6931  # -ln2
        self.const = 30  # dummy integer constant
        self.coef = [0.35815147, 0.96963238, 1.0]  # [a, b, c] of ax**2 + bx + c
        # b and c are normalized by a, which is folded into the scaling factor.
        self.coef[1] /= self.coef[0]
        self.coef[2] /= self.coef[0]

    def int_polynomial(self, x_int, scaling_factor):
        """Evaluate the normalized second-order polynomial on integers; returns `(z, scaling_factor)`."""
        # The integer constants are computed without tracking gradients.
        with torch.no_grad():
            b_int = torch.floor(self.coef[1] / scaling_factor)
            c_int = torch.floor(self.coef[2] / scaling_factor**2)
        z = (x_int + b_int) * x_int + c_int
        scaling_factor = self.coef[0] * scaling_factor**2
        return z, scaling_factor

    def int_exp(self, x_int, scaling_factor):
        """Integer approximation of exp for non-positive inputs; returns `(exp_int, scaling_factor)`."""
        with torch.no_grad():
            x0_int = torch.floor(self.x0 / scaling_factor)
        # Bound the input from below at `const` multiples of x0.
        x_int = torch.max(x_int, self.const * x0_int)

        # Decompose x = q * x0 + r; the polynomial handles r and the power of two handles q.
        q = floor_ste.apply(x_int / x0_int)
        r = x_int - x0_int * q
        exp_int, exp_scaling_factor = self.int_polynomial(r, scaling_factor)
        exp_int = torch.clamp(floor_ste.apply(exp_int * 2 ** (self.const - q)), min=0)
        scaling_factor = exp_scaling_factor / 2**self.const
        return exp_int, scaling_factor

    def forward(self, x, scaling_factor):
        """
        Apply softmax over the last dimension.

        Args:
            x: Input tensor.
            scaling_factor: Scaling factor of `x`; only used in quantization mode.

        Returns:
            A pair `(output, scaling_factor)`; `scaling_factor` is `None` when `quant_mode` is off.
        """
        if not self.quant_mode:
            return nn.functional.softmax(x, dim=-1), None

        x_int = x / scaling_factor

        # Subtract the row maximum so that every input to the exponential is non-positive.
        x_int_max, _ = x_int.max(dim=-1, keepdim=True)
        x_int = x_int - x_int_max
        exp_int, exp_scaling_factor = self.int_exp(x_int, scaling_factor)

        # Avoid overflow
        exp, exp_scaling_factor = self.act(exp_int, exp_scaling_factor)
        exp_int = exp / exp_scaling_factor

        exp_int_sum = exp_int.sum(dim=-1, keepdim=True)
        factor = floor_ste.apply(2**self.max_bit / exp_int_sum)
        exp_int = floor_ste.apply(exp_int * factor / 2 ** (self.max_bit - self.output_bit))
        scaling_factor = 1 / 2**self.output_bit
        return exp_int * scaling_factor, scaling_factor


# NOTE(review): the `class` header of this module was missing from the source; the name `IntLayerNorm` is
# inferred from the docstring and the sibling classes (`IntGELU`, `IntSoftmax`) -- confirm against the callers.
class IntLayerNorm(nn.Module):
    """
    Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of `torch.nn.LayerNorm`.

    Args:
        normalized_shape (`int` or `list` or `torch.Size`):
            Shape of the learnable `weight` and `bias` parameters.
        eps (`float`):
            Small value added to the variance for numerical stability (non-quantized path).
        output_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the layer output activation.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "layernorm" or "nonlinear" is given.
    """

    def __init__(self, normalized_shape, eps, output_bit=8, quant_mode=False, force_dequant="none"):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps

        self.weight = nn.Parameter(torch.zeros(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))

        self.quant_mode = quant_mode
        if force_dequant in ["nonlinear", "layernorm"]:
            logger.info("Force dequantize layernorm")
            self.quant_mode = False

        # Power-of-two shift applied before squaring; it is raised when an overflow is detected in training.
        self.register_buffer("shift", torch.zeros(1))
        self.output_bit = output_bit
        self.max_bit = 32
        # Square root of the feature dimension, computed lazily on the first quantized forward pass.
        self.dim_sqrt = None
        self.activation = QuantAct(self.output_bit, quant_mode=self.quant_mode)

    def set_shift(self, y_int):
        """
        Raise `self.shift` so that the sum of squares of `y_int` along dimension 2 fits into `self.max_bit` bits.

        Args:
            y_int (`torch.Tensor`):
                Integer-valued, mean-centered activations.
        """
        with torch.no_grad():
            y_sq_int = y_int**2
            var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
            shift = (torch.log2(torch.sqrt(var_int / 2**self.max_bit)).ceil()).max()
            shift_old = self.shift
            # The shift never decreases.
            self.shift = torch.max(self.shift, shift)
            logger.info(f"Dynamic shift adjustment: {int(shift_old)} -> {int(self.shift)}")

    def overflow_fallback(self, y_int):
        """
        Called when an overflow is detected during training: adjusts `self.shift` and recomputes the sum of squares
        with the new shift.

        Args:
            y_int (`torch.Tensor`):
                Integer-valued, mean-centered activations.

        Returns:
            `torch.Tensor`: The sum of squares of the shifted activations along dimension 2.
        """
        self.set_shift(y_int)  # adjusts `self.shift`
        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
        y_sq_int = y_int_shifted**2
        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
        return var_int

    def forward(self, x, scaling_factor=None):
        """
        Normalize `x` along dimension 2.

        Args:
            x: Input tensor with at least three dimensions.
            scaling_factor: Scaling factor of `x`; required in quantization mode.

        Returns:
            A pair `(output, scaling_factor)`; `scaling_factor` is `None` when `quant_mode` is off.
        """
        if not self.quant_mode:
            mean = x.mean(axis=2, keepdim=True)
            y = x - mean
            var = torch.mean(y**2, axis=2, keepdim=True)
            x = y / torch.sqrt(self.eps + var)
            x = x * self.weight + self.bias
            return x, None

        # Compute the square root of the feature dimension on the first run.
        if self.dim_sqrt is None:
            n = torch.tensor(x.shape[2], dtype=torch.float)
            self.dim_sqrt = torch.sqrt(n).to(x.device)

        # Normalization: integer mean and sum of squares.
        x_int = x / scaling_factor
        mean_int = round_ste.apply(x_int.mean(axis=2, keepdim=True))
        y_int = x_int - mean_int
        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
        y_sq_int = y_int_shifted**2
        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)

        # Overflow handling at training time.
        if self.training:
            # An overflow is detected: enlarge the shift and recompute.
            if var_int.max() >= 2**self.max_bit:
                var_int = self.overflow_fallback(y_int)
                assert var_int.max() < 2**self.max_bit + 0.1, (
                    "Error detected in overflow handling: "
                    "`var_int` exceeds `self.max_bit` (the maximum possible bit width)"
                )

        # To be replaced with an integer-sqrt kernel that produces the same output.
        std_int = floor_ste.apply(torch.sqrt(var_int)) * 2**self.shift
        factor = floor_ste.apply(2**31 / std_int)
        y_int = floor_ste.apply(y_int * factor / 2)
        scaling_factor = self.dim_sqrt / 2**30

        # Scaling and shifting: the bias is expressed at the pre-weight scale, then the weight is folded
        # into the scaling factor.
        bias = self.bias.data.detach() / (self.weight.data.detach())
        bias_int = floor_ste.apply(bias / scaling_factor)

        y_int = y_int + bias_int
        scaling_factor = scaling_factor * self.weight
        x = y_int * scaling_factor

        return x, scaling_factor
def get_percentile_min_max(input, lower_percentile, upper_percentile, output_tensor=False):
    """
    Calculate the percentile max and min values in a given tensor

    Args:
        input (`torch.Tensor`):
            The target tensor to calculate percentile max and min.
        lower_percentile (`float`):
            If 0.1, means we return the value of the smallest 0.1% value in the tensor as percentile min.
        upper_percentile (`float`):
            If 99.9, means we return the value of the largest 0.1% value in the tensor as percentile max.
        output_tensor (`bool`, *optional*, defaults to `False`):
            If True, this function returns tensors, otherwise it returns values.

    Returns:
        `Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of *input*
    """
    input_length = input.shape[0]

    # Ranks (1-based, as expected by `torch.kthvalue`) of the two percentiles.
    lower_index = round(input_length * (1 - lower_percentile * 0.01))
    upper_index = round(input_length * upper_percentile * 0.01)

    upper_bound = torch.kthvalue(input, k=upper_index).values

    if lower_percentile == 0:
        # A lower percentile of exactly 0 yields a zero lower bound (same dtype/device as the upper one).
        lower_bound = upper_bound * 0
    else:
        # k-th smallest of the negated tensor == k-th largest of the original.
        lower_bound = -torch.kthvalue(-input, k=lower_index).values

    if not output_tensor:
        lower_bound = lower_bound.item()
        upper_bound = upper_bound.item()
    return lower_bound, upper_bound
def linear_quantize(input, scale, zero_point, inplace=False):
    """
    Quantize single-precision input tensor to integers with the given scaling factor and zeropoint.

    Args:
        input (`torch.Tensor`):
            Single-precision input tensor to be quantized.
        scale (`torch.Tensor`):
            Scaling factor for quantization.
        zero_point (`torch.Tensor`):
            Shift for quantization.
        inplace (`bool`, *optional*, defaults to `False`):
            Whether to compute inplace or not.

    Returns:
        `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
        When `inplace` is true this is the (mutated) `input` tensor itself.
    """
    # Reshape scale and zero_point so they broadcast along the first dimension of `input`.
    if len(input.shape) == 4:
        # 4-D input (convolution weights and activations)
        scale = scale.view(-1, 1, 1, 1)
        zero_point = zero_point.view(-1, 1, 1, 1)
    elif len(input.shape) == 2:
        # 2-D input (linear weights)
        scale = scale.view(-1, 1)
        zero_point = zero_point.view(-1, 1)
    else:
        scale = scale.view(-1)
        zero_point = zero_point.view(-1)

    # Quantized value = round(float / scale + zero_point)
    if inplace:
        input.mul_(1.0 / scale).add_(zero_point).round_()
        return input
    return torch.round(1.0 / scale * input + zero_point)
def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_max, per_channel=False):
    """
    Compute the scaling factor with the given quantization range for symmetric quantization.

    Args:
        num_bits (`int`):
            Quantization bitwidth; the integer range used is `2 ** (num_bits - 1) - 1`.
        saturation_min (`torch.Tensor`):
            Lower end of the range to represent.
        saturation_max (`torch.Tensor`):
            Upper end of the range to represent.
        per_channel (`bool`, *optional*, defaults to `False`):
            If true, `saturation_min` / `saturation_max` are stacked along dim 1 and reduced there,
            giving one scale per entry; otherwise a single scale is computed.

    Returns:
        `torch.Tensor`: the scaling factor, i.e. `max(|min|, |max|)` clamped to at least 1e-8 and
        divided by the integer range.
    """
    # No gradient is needed for anything computed here.
    with torch.no_grad():
        n = 2 ** (num_bits - 1) - 1

        if per_channel:
            scale, _ = torch.max(torch.stack([saturation_min.abs(), saturation_max.abs()], dim=1), dim=1)
            scale = torch.clamp(scale, min=1e-8) / n
        else:
            scale = max(saturation_min.abs(), saturation_max.abs())
            scale = torch.clamp(scale, min=1e-8) / n

    return scale
class SymmetricQuantFunction(Function):
    """
    Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth.
    """

    @staticmethod
    def forward(ctx, x, k, percentile_mode, scale):
        """
        Args:
            x (`torch.Tensor`):
                Floating point tensor to be quantized.
            k (`int`):
                Quantization bitwidth.
            percentile_mode (`bool`):
                Whether or not to use percentile calibration. Not read by this implementation.
            scale (`torch.Tensor`):
                Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
                requires pre-calculated scaling factor.

        Returns:
            `torch.Tensor`: Symmetric-quantized value of *input*.
        """
        # Symmetric quantization uses a zero point of 0, created on the same device as `scale`.
        zero_point = torch.tensor(0.0).to(scale.device)

        n = 2 ** (k - 1) - 1
        new_quant_x = linear_quantize(x, scale, zero_point, inplace=False)
        new_quant_x = torch.clamp(new_quant_x, -n, n - 1)

        # Kept for the backward pass.
        ctx.scale = scale
        return new_quant_x

    @staticmethod
    def backward(ctx, grad_output):
        """Return `grad_output / scale` for `x`; the remaining entries are `None`."""
        scale = ctx.scale
        # Reshape the scale so it broadcasts along the first dimension of the gradient.
        if len(grad_output.shape) == 4:
            scale = scale.view(-1, 1, 1, 1)
        elif len(grad_output.shape) == 2:
            scale = scale.view(-1, 1)
        else:
            scale = scale.view(-1)

        return grad_output.clone() / scale, None, None, None, None


class floor_ste(Function):
    """
    Straight-through Estimator(STE) for torch.floor()
    """

    @staticmethod
    def forward(ctx, x):
        """Return the element-wise floor of `x`."""
        return torch.floor(x)

    @staticmethod
    def backward(ctx, grad_output):
        """Pass the incoming gradient through unchanged (as a copy)."""
        return grad_output.clone()
class round_ste(Function):
    """
    Straight-through Estimator(STE) for torch.round()
    """

    @staticmethod
    def forward(ctx, x):
        """Return the element-wise rounding of `x`."""
        return torch.round(x)

    @staticmethod
    def backward(ctx, grad_output):
        """Pass the incoming gradient through unchanged (as a copy)."""
        return grad_output.clone()
def batch_frexp(inputs, max_bit=31):
    """
    Decompose the scaling factor into mantissa and twos exponent.

    Args:
        inputs (`torch.Tensor`):
            Target scaling factor to decompose.
        max_bit (`int`, *optional*, defaults to 31):
            Number of bits the mantissa is scaled to: each mantissa returned by `np.frexp` is
            multiplied by `2**max_bit` and rounded half-up to an integer.

    Returns:
        `Tuple(torch.Tensor, torch.Tensor)`: the integer mantissa and the exponent expressed as
        `max_bit - e` (where `e` is the exponent returned by `np.frexp`), both reshaped to the
        shape of `inputs` and placed on its device.
    """
    shape_of_input = inputs.size()

    # Flatten, then decompose on the CPU with NumPy.
    inputs = inputs.view(-1)
    output_m, output_e = np.frexp(inputs.cpu().numpy())

    # `float(m)` converts NumPy scalars of any float width to a Python float first:
    # `decimal.Decimal` rejects e.g. `np.float32`, which is not a `float` subclass.
    scale = 2**max_bit
    one = decimal.Decimal("1")
    tmp_m = [
        int(decimal.Decimal(float(m) * scale).quantize(one, rounding=decimal.ROUND_HALF_UP)) for m in output_m
    ]
    output_m = np.array(tmp_m)

    output_e = float(max_bit) - output_e

    return (
        torch.from_numpy(output_m).to(inputs.device).view(shape_of_input),
        torch.from_numpy(output_e).to(inputs.device).view(shape_of_input),
    )
class FixedPointMul(Function):
    """
    Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.

    Args:
        pre_act (`torch.Tensor`):
            Input tensor.
        pre_act_scaling_factor (`torch.Tensor`):
            Scaling factor of the input tensor *pre_act*.
        bit_num (`int`):
            Quantization bitwidth.
        z_scaling_factor (`torch.Tensor`):
            Scaling factor of the output tensor.
        identity (`torch.Tensor`, *optional*):
            Identity tensor, if exists.
        identity_scaling_factor (`torch.Tensor`, *optional*):
            Scaling factor of the identity tensor *identity*, if exists.

    Returns:
        `torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and
        *identity*), whose scale is rescaled to *z_scaling_factor*.
    """

    @staticmethod
    def forward(
        ctx,
        pre_act,
        pre_act_scaling_factor,
        bit_num,
        z_scaling_factor,
        identity=None,
        identity_scaling_factor=None,
    ):
        # Scaling factors that are not already 3-D are viewed as (1, 1, -1).
        if len(pre_act_scaling_factor.shape) == 3:
            reshape = lambda x: x  # noqa: E731
        else:
            reshape = lambda x: x.view(1, 1, -1)  # noqa: E731
        ctx.identity = identity

        n = 2 ** (bit_num - 1) - 1

        with torch.no_grad():
            pre_act_scaling_factor = reshape(pre_act_scaling_factor)
            if identity is not None:
                identity_scaling_factor = reshape(identity_scaling_factor)

            ctx.z_scaling_factor = z_scaling_factor

            # Integer representation of the input and the rescaling ratio (in double precision).
            z_int = torch.round(pre_act / pre_act_scaling_factor)
            _A = pre_act_scaling_factor.type(torch.double)
            _B = (z_scaling_factor.type(torch.float)).type(torch.double)
            new_scale = _A / _B
            new_scale = reshape(new_scale)

            # Multiply by the integer mantissa, then divide by 2**exponent.
            m, e = batch_frexp(new_scale)

            output = z_int.type(torch.double) * m.type(torch.double)
            output = torch.round(output / (2.0**e))

            if identity is not None:
                # Same rescaling for the identity branch, then add the two integer results.
                wx_int = torch.round(identity / identity_scaling_factor)

                _A = identity_scaling_factor.type(torch.double)
                _B = (z_scaling_factor.type(torch.float)).type(torch.double)
                new_scale = _A / _B
                new_scale = reshape(new_scale)

                m1, e1 = batch_frexp(new_scale)
                output1 = wx_int.type(torch.double) * m1.type(torch.double)
                output1 = torch.round(output1 / (2.0**e1))

                output = output1 + output

            return torch.clamp(output.type(torch.float), -n - 1, n)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return one gradient per `forward` input, in the same order.

        `pre_act` and (when it was given) `identity` both receive `grad_output / z_scaling_factor`;
        every other input receives `None`.
        """
        identity_grad = None
        if ctx.identity is not None:
            identity_grad = grad_output.clone() / ctx.z_scaling_factor

        # Order: pre_act, pre_act_scaling_factor, bit_num, z_scaling_factor, identity,
        # identity_scaling_factor. The identity gradient must sit in slot 4, not slot 5.
        return grad_output.clone() / ctx.z_scaling_factor, None, None, None, identity_grad, None
# coding=utf-8
# Copyright and license notice: this code is based on EleutherAI's GPT-NeoX library and has been
# modified to accommodate minor architectural differences from the models trained by the Meta AI team.
# It may be used under the Apache License, Version 2.0.
from ...configuration_utils import PretrainedConfig
from ...utils import logging


# Logger associated with the current module
logger = logging.get_logger(__name__)

# Maps pretrained model identifiers to the location of their config file
IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "HuggingFaceM4/idefics-9b": "https://huggingface.co/HuggingFaceM4/idefics-9b/blob/main/config.json",
    "HuggingFaceM4/idefics-80b": "https://huggingface.co/HuggingFaceM4/idefics-80b/blob/main/config.json",
}
class IdeficsVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        embed_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer.
        num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing the weight matrices (should be kept at 1.0, used for initialization testing).
        **kwargs:
            Remaining keyword arguments, forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "idefics"
    # Lets `hidden_size` be used as an alias for `embed_dim`.
    attribute_map = {
        "hidden_size": "embed_dim",
    }

    def __init__(
        self,
        embed_dim=768,
        image_size=224,
        intermediate_size=5120,
        patch_size=14,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_channels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.intermediate_size = intermediate_size
        self.patch_size = patch_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act

        # The parent initializer runs last, after all attributes above are set.
        super().__init__(**kwargs)
classIdeficsConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Idefics-9B.
e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import IdeficsModel, IdeficsConfig
>>> # Initializing a Idefics idefics-9b style configuration
>>> configuration = IdeficsConfig()
```
注释:
声明一个名为 IdeficsConfig 的配置类,用于存储 `IdeficsModel` 的配置信息。
该配置类根据指定的参数实例化一个 Idefics 模型,定义模型的架构。
使用默认参数实例化配置类会产生类似 Idefics-9B 模型的配置。
例如,[HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) 提供了相关的预训练模型。
Configuration objects 继承自 [`PretrainedConfig`],可用于控制模型的输出。详细信息请参阅 [`PretrainedConfig`] 的文档。
```
>>> # 从 idefics-9b 风格的配置中初始化一个模型
>>> model = IdeficsModel(configuration)
>>> # 访问模型的配置信息
>>> configuration = model.config
):
# 初始化函数,设置模型的各项参数
self.vocab_size = vocab_size
# 额外词汇表大小
self.additional_vocab_size = additional_vocab_size
# 隐藏层大小
self.hidden_size = hidden_size
# 中间层大小
self.intermediate_size = intermediate_size
# 隐藏层的数量
self.num_hidden_layers = num_hidden_layers
# 注意力头的数量
self.num_attention_heads = num_attention_heads
# dropout 概率
self.dropout = dropout
# 隐藏层激活函数
self.hidden_act = hidden_act
# 初始化范围
self.initializer_range = initializer_range
# alpha 初始化器
self.alpha_initializer = alpha_initializer
# alpha 初始化范围
self.alphas_initializer_range = alphas_initializer_range
# alpha 类型
self.alpha_type = alpha_type
# RMS 规范化的 epsilon
self.rms_norm_eps = rms_norm_eps
# 是否使用缓存
self.use_cache = use_cache
# 交叉层间隔
self.cross_layer_interval = cross_layer_interval
# qk 层归一化
self.qk_layer_norms = qk_layer_norms
# 冻结视觉层
self.freeze_vision_layers = freeze_vision_layers
# 冻结文本层
self.freeze_text_layers = freeze_text_layers
# 冻结文本模块例外
self.freeze_text_module_exceptions = freeze_text_module_exceptions
# 冻结视觉模块例外
self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
# 冻结 LM 头部
self.freeze_lm_head = freeze_lm_head
# 是否使用重采样器
self.use_resampler = use_resampler
# 如果 perceiver_config 为 None,则使用默认配置
if perceiver_config is None:
self.perceiver_config = IdeficsPerceiverConfig()
# 如果 perceiver_config 是字典类型,则使用给定的配置创建 IdeficsPerceiverConfig 对象
elif isinstance(perceiver_config, dict):
self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
# 如果 perceiver_config 已经是 IdeficsPerceiverConfig 类型,则直接使用它
elif isinstance(perceiver_config, IdeficsPerceiverConfig):
self.perceiver_config = perceiver_config
# 如果 vision_config 为 None,则使用默认配置
if vision_config is None:
self.vision_config = IdeficsVisionConfig()
# 如果 vision_config 是字典类型,则使用给定的配置创建 IdeficsVisionConfig 对象
elif isinstance(vision_config, dict):
self.vision_config = IdeficsVisionConfig(**vision_config)
# 如果 vision_config 已经是 IdeficsVisionConfig 类型,则直接使用它
elif isinstance(vision_config, IdeficsVisionConfig):
self.vision_config = vision_config
# 调用父类的初始化方法,设置特殊标记的 token ID 和其他参数
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
# 注意:不要在构造函数中进行任何基于 __init__ 参数的检查,
# 因为 PretrainedConfig.from_dict 首先使用配置字典实例化类,然后
# 仅在 from_pretrained 中使用 from_pretrained 的 kwargs 更新配置对象,
# 所以在实例化此对象时,许多属性具有默认值,尚未被覆盖。
# 请在运行父类的 from_pretrained 后,在 from_pretrained 中执行任何必要的检查。
.\models\idefics\image_processing_idefics.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Idefics."""

from typing import Callable, Dict, List, Optional, Union

from PIL import Image

from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from ...utils import TensorType, is_torch_available


# Default per-channel mean for Idefics image normalization
IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
# Default per-channel standard deviation for Idefics image normalization
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]


def convert_to_rgb(image):
    """
    Return `image` in RGB mode, compositing any transparency onto a white background.

    Args:
        image: An object exposing `mode` and `convert`, e.g. a `PIL.Image.Image`.

    Returns:
        The same object when `image.mode` is already `"RGB"`; otherwise a new RGB image.
    """
    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
    # for transparent images. `alpha_composite` handles the transparent case.
    if image.mode == "RGB":
        return image

    image_rgba = image.convert("RGBA")
    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
    alpha_composite = Image.alpha_composite(background, image_rgba)
    alpha_composite = alpha_composite.convert("RGB")
    return alpha_composite
classIdeficsImageProcessor(BaseImageProcessor):
r"""
Constructs a Idefics image processor.
Args:
image_size (`int`, *optional*, defaults to 224):
Resize to image size
image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
image_num_channels (`int`, *optional*, defaults to 3):
Number of image channels.
"""
model_input_names = ["pixel_values"] # 模型输入的名称列表,此处只有一个像素值的输入# 初始化方法,用于设置图像处理的参数和调用父类的初始化方法def__init__(
self,
image_size: int = 224, # 图像大小,默认为224像素
image_mean: Optional[Union[float, List[float]]] = None, # 图像均值,可以是单个数值或列表形式的均值
image_std: Optional[Union[float, List[float]]] = None, # 图像标准差,可以是单个数值或列表形式的标准差
image_num_channels: Optional[int] = 3, # 图像通道数,默认为3通道(彩色图像)
**kwargs, # 其他关键字参数) -> None:
# 调用父类的初始化方法,处理其他传入的关键字参数super().__init__(**kwargs)
# 设置对象的属性值,用于后续图像预处理使用
self.image_size = image_size # 设置图像大小
self.image_num_channels = image_num_channels # 设置图像通道数
self.image_mean = image_mean # 设置图像均值
self.image_std = image_std # 设置图像标准差# 图像预处理方法,用于对输入图像进行预处理操作defpreprocess(
self,
images: ImageInput, # 输入的图像数据,可以是单张图像或批量图像
image_num_channels: Optional[int] = 3, # 图像通道数,默认为3通道
image_size: Optional[Dict[str, int]] = None, # 图像大小的字典,包含宽和高
image_mean: Optional[Union[float, List[float]]] = None, # 图像均值,可以是单个数值或列表形式的均值
image_std: Optional[Union[float, List[float]]] = None, # 图像标准差,可以是单个数值或列表形式的标准差
transform: Callable = None, # 图像变换函数,用于额外的图像处理
**kwargs, # 其他关键字参数
.\models\idefics\modeling_idefics.py
# coding=utf-8
# Copyright notice and license information: licensed under the Apache License, Version 2.0.
# The full license text is available at http://www.apache.org/licenses/LICENSE-2.0
""" PyTorch Idefics model. """
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PretrainedConfig
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_idefics import IdeficsConfig
from .perceiver import IdeficsPerceiverResampler
from .vision import IdeficsVisionTransformer


# Logger associated with the current module
logger = logging.get_logger(__name__)

# Configuration class name used in generated documentation
_CONFIG_FOR_DOC = "IdeficsConfig"

# Identifiers of the pretrained checkpoints
IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "HuggingFaceM4/idefics-9b",
    "HuggingFaceM4/idefics-80b",
    # See all Idefics models at https://huggingface.co/models?filter=idefics
]
@dataclass
class IdeficsBaseModelOutputWithPast(ModelOutput):
    """
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and, if
            `config.is_encoder_decoder=True`, in the cross-attention blocks) that can be used to speed up
            sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`).

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class IdeficsCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Idefics causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`).

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


def expand_inputs_for_generation(
    input_ids,
    expand_size=1,
    is_encoder_decoder=False,
    attention_mask=None,
    encoder_outputs=None,
    **model_kwargs,
):
    """
    Repeat every batch row of the generation inputs `expand_size` times.

    Args:
        input_ids: Input token ids; rows are repeated along dimension 0.
        expand_size: Number of consecutive copies made of each row.
        is_encoder_decoder: Accepted for signature compatibility; not read by this function.
        attention_mask: Optional attention mask, expanded the same way and stored in the returned kwargs.
        encoder_outputs: Accepted for signature compatibility; not read by this function.
        **model_kwargs: Further model inputs. `token_type_ids`, `image_attention_mask` and the first
            non-`None` of `pixel_values`, `image_encoder_embeddings`, `perceiver_embeddings` are
            expanded as well.

    Returns:
        Tuple `(input_ids, model_kwargs)` with the expanded tensors.
    """
    # Row indices [0, 0, ..., 1, 1, ...]: each batch index repeated `expand_size` times.
    expanded_return_idx = (
        torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
    )
    input_ids = input_ids.index_select(0, expanded_return_idx)

    # Make sure the image-related keys exist (as None when they were not provided).
    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)
    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None)
    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None)
    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None)

    if "token_type_ids" in model_kwargs:
        token_type_ids = model_kwargs["token_type_ids"]
        model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)

    if attention_mask is not None:
        model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)

    if model_kwargs["image_attention_mask"] is not None:
        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
            0, expanded_return_idx
        )

    # Only the first available image representation is expanded.
    if model_kwargs["pixel_values"] is not None:
        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)

    elif model_kwargs["image_encoder_embeddings"] is not None:
        model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select(
            0, expanded_return_idx
        )

    elif model_kwargs["perceiver_embeddings"] is not None:
        model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].index_select(
            0, expanded_return_idx
        )

    return input_ids, model_kwargs
def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
    """
    Assemble the dictionary of model inputs for one generation step.

    Args:
        input_ids: Token ids generated so far.
        past_key_values: Cached key/values. When truthy, only the last position of `input_ids`,
            `token_type_ids` and the derived `position_ids` is kept.
        **kwargs: Optional `token_type_ids`, `attention_mask`, `position_ids`, `use_cache`,
            `pixel_values`, `image_encoder_embeddings`, `perceiver_embeddings`,
            `image_attention_mask` and `interpolate_pos_encoding` (defaults to `False`).

    Returns:
        `dict` with the prepared inputs, keyed by the model's argument names.
    """
    token_type_ids = kwargs.get("token_type_ids", None)
    # Only the last token of input_ids is needed when a cache is available.
    if past_key_values:
        input_ids = input_ids[:, -1].unsqueeze(-1)
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # Create position_ids on the fly for batch generation; masked positions are set to 1.
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -1].unsqueeze(-1)

    pixel_values = kwargs.get("pixel_values", None)
    image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None)
    perceiver_embeddings = kwargs.get("perceiver_embeddings", None)
    image_attention_mask = kwargs.get("image_attention_mask", None)
    interpolate_pos_encoding = kwargs.get("interpolate_pos_encoding", False)

    return {
        "input_ids": input_ids,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "pixel_values": pixel_values,
        "image_encoder_embeddings": image_encoder_embeddings,
        "perceiver_embeddings": perceiver_embeddings,
        "image_attention_mask": image_attention_mask,
        "interpolate_pos_encoding": interpolate_pos_encoding,
    }
def freeze_model(model, module_exceptions=None):
    """
    Disable gradients for every module of `model` except the listed module types.

    Args:
        model: A module exposing `modules()`; it is modified in place.
        module_exceptions: Optional list of module type names to keep trainable. Supported names
            are `"LayerNorm"`, `"Linear"` and `"Embedding"`; any other name raises `KeyError`.
            `None` (the default) or an empty list freezes everything.

    Returns:
        The same `model` object.
    """
    # Maps the supported type names to the corresponding PyTorch classes.
    mapping = {
        "LayerNorm": nn.LayerNorm,
        "Linear": nn.Linear,
        "Embedding": nn.Embedding,
    }
    # `None` default avoids sharing one mutable list object across calls.
    exception_types = tuple(mapping[m] for m in (module_exceptions or []))

    # `modules()` yields a container before its children, so an excepted child is re-enabled
    # after its parent froze it.
    for module in model.modules():
        if exception_types and isinstance(module, exception_types):
            module.requires_grad_(True)  # explicitly set to True to avoid any mistakes
        else:
            module.requires_grad_(False)
    return model
class IdeficsDecoupledEmbedding(nn.Embedding):
    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
    """
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practice, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    """

    def __init__(
        self,
        num_embeddings,
        num_additional_embeddings,
        embedding_dim,
        partially_freeze: Optional[bool] = False,
        device=None,
        dtype=None,
        padding_idx=None,
        **kwargs,
    ) -> None:
        """
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Raises:
            ValueError: If `padding_idx` is not less than `num_embeddings`.

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        """
        # padding_idx must be strictly less than num_embeddings (`>=`, not `>`, so the boundary value is rejected).
        if padding_idx is not None and padding_idx >= num_embeddings:
            raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")

        # Let nn.Embedding build the regular weight.
        super().__init__(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
            device=device,
            dtype=dtype,
            padding_idx=padding_idx,
            **kwargs,
        )
        self.num_embeddings = num_embeddings
        self.padding_idx = padding_idx
        self.num_additional_embeddings = num_additional_embeddings
        self.partially_freeze = partially_freeze

        # Freeze the regular weight when requested.
        if partially_freeze:
            self.weight.requires_grad_(False)

        # The additional rows live in their own embedding table.
        if self.num_additional_embeddings > 0:
            self.additional_embedding = nn.Embedding(
                num_embeddings=self.num_additional_embeddings,
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            )

    def forward(self, input_ids):
        """
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.
        """
        if self.num_additional_embeddings == 0:
            return F.embedding(input_ids, self.weight)

        # Clone so that we don't modify the original input_ids later on
        input_ids = input_ids.clone()
        # Positions whose id belongs to the additional embedding
        additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
        input_ids_additional_vocab = input_ids[additional_vocab_indices]
        # Look them up in the additional table, shifted so that they start at 0
        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)

        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
        input_ids[additional_vocab_indices] = 0
        full_vector = F.embedding(input_ids, self.weight)

        # overwrite the records with high indices
        full_vector[additional_vocab_indices] = additional_embeddings

        return full_vector

    def extra_repr(self) -> str:
        """Return the string of extra attributes shown in the module's repr."""
        return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
            self.num_embeddings,
            self.num_additional_embeddings,
            self.embedding_dim,
            self.partially_freeze,
        )
class IdeficsDecoupledLinear(nn.Linear):
    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
    """
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        out_additional_features: int = 0,
        bias: bool = True,
        partially_freeze: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        """
        super().__init__(in_features, out_features, bias, device, dtype)
        self.out_additional_features = out_additional_features
        self.partially_freeze = partially_freeze

        self.in_features = in_features
        self.out_features = out_features

        # Freeze the regular weight (and bias, when present) if requested.
        if partially_freeze:
            self.weight.requires_grad_(False)
            if bias:
                self.bias.requires_grad_(False)

        # The additional output features are produced by a separate linear layer.
        if out_additional_features > 0:
            self.additional_fc = nn.Linear(
                in_features=in_features,
                out_features=out_additional_features,
                bias=bias,
                device=device,
                dtype=dtype,
            )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the regular projection and append the additional features on the last dimension."""
        output = F.linear(input, self.weight, self.bias)

        if self.out_additional_features > 0:
            additional_features = self.additional_fc(input)
            output = torch.cat((output, additional_features), -1)

        return output

    def extra_repr(self) -> str:
        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
        return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format(
            self.in_features,
            self.out_features,
            self.out_additional_features,
            self.bias is not None,
            self.partially_freeze,
        )
# this was adapted from LlamaRMSNorm
class IdeficsRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        IdeficsRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        # Learnable per-feature scale, initialised to ones.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # Epsilon added to the variance before the reciprocal square root.
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Scale `hidden_states` by the reciprocal RMS over the last dimension, then by `weight`."""
        # The mean of squares is computed in float32, keeping the last dimension.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states
# Append the IdeficsRMSNorm class to the ALL_LAYERNORM_LAYERS list
ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)
# this was adapted from LlamaRotaryEmbedding
class IdeficsEmbedding(torch.nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        """Precompute the inverse frequencies and the cos/sin cache for `max_position_embeddings` positions."""
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # Inverse frequencies used to build the position encoding.
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the `cos_cached` / `sin_cached` buffers for `seq_len` positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the first `seq_len` rows of the cos and sin caches, cast to `x.dtype`.

        The cache is rebuilt when `seq_len` exceeds the cached length. `seq_len` is compared
        against an integer, so callers must pass a value despite the `None` default.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    # Split the last dimension in two halves and return (-second_half, first_half) concatenated.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q: The query tensor.
        k: The key tensor.
        cos: The cosine table, indexed by `position_ids`.
        sin: The sine table, indexed by `position_ids`.
        position_ids: Indices used to select rows of `cos` and `sin`.
        unsqueeze_dim: Dimension along which the selected `cos`/`sin` are unsqueezed so
            that they can broadcast against `q` and `k`. Defaults to 1.

    Returns:
        A tuple `(q_embed, k_embed)` with the rotated query and key tensors.
    """
    # Select the rows for the given positions and add a broadcast dimension.
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    # Rotation: x * cos + rotate_half(x) * sin, applied to both query and key.
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
# this was adapted from LlamaMLP
class IdeficsMLP(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        """Build the gated MLP.

        Args:
            hidden_size: Input and output feature size.
            intermediate_size: Feature size of the gate and up projections.
            hidden_act: Key into `ACT2FN` selecting the activation function.
        """
        super().__init__()
        # Gate projection: hidden_size -> intermediate_size, no bias.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        # Down projection: intermediate_size -> hidden_size, no bias.
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        # Up projection: hidden_size -> intermediate_size, no bias.
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

    def forward(self, x):
        """Return `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
# 这段代码改编自 LlamaAttention
class IdeficsAttention(nn.Module):
"""来自 'Attention Is All You Need' 论文中的多头注意力"""
def __init__(
    self,
    hidden_size: int,
    num_heads: int,
    dropout: float = 0.0,
    is_cross_attention: bool = False,
    config: PretrainedConfig = None,
    qk_layer_norms: bool = False,
):
    """Create the projections, the rotary embedding and the optional query/key norms.

    Args:
        hidden_size: Size of the hidden states; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Dropout probability stored on the module.
        is_cross_attention: When `True`, keys and values are projected from
            `config.vision_config.embed_dim` if that attribute exists, else from `hidden_size`.
        config: Model configuration. It is read when `is_cross_attention` or
            `qk_layer_norms` is `True`.
        qk_layer_norms: When `True`, create RMS norms of size `head_dim` for queries and keys.

    Raises:
        ValueError: If `hidden_size` is not divisible by `num_heads`, or if
            `nn.functional.scaled_dot_product_attention` is not available.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.head_dim = hidden_size // num_heads
    self.dropout = dropout
    self.is_causal = True

    if (self.head_dim * num_heads) != self.hidden_size:
        raise ValueError(
            f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
            f" and `num_heads`: {num_heads})."
        )

    self.is_cross_attention = is_cross_attention

    # scaled_dot_product_attention only exists in PyTorch 2.0 and later.
    if not hasattr(nn.functional, "scaled_dot_product_attention"):
        raise ValueError("this model requires pytorch 2.0 or higher")

    if self.is_cross_attention:
        # Keys/values come from the vision side, whose width may differ from hidden_size.
        kv_input_dim = (
            self.hidden_size if not hasattr(config.vision_config, "embed_dim") else config.vision_config.embed_dim
        )
        self.q_proj = nn.Linear(
            self.hidden_size,
            num_heads * self.head_dim,
            bias=False,
        )
        self.k_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(
            kv_input_dim,
            num_heads * self.head_dim,
            bias=False,
        )
    else:
        self.q_proj = nn.Linear(
            self.hidden_size,
            num_heads * self.head_dim,
            bias=False,
        )
        self.k_proj = nn.Linear(
            self.hidden_size,
            num_heads * self.head_dim,
            bias=False,
        )
        self.v_proj = nn.Linear(
            self.hidden_size,
            num_heads * self.head_dim,
            bias=False,
        )
    self.o_proj = nn.Linear(
        num_heads * self.head_dim,
        hidden_size,
        bias=False,
    )
    self.rotary_emb = IdeficsEmbedding(self.head_dim)

    self.qk_layer_norms = qk_layer_norms
    if self.qk_layer_norms:
        self.q_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    """View `tensor` as (bsz, seq_len, num_heads, head_dim), swap dims 1 and 2, and make it contiguous."""
    return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
# this was adapted from LlamaDecoderLayer
class IdeficsDecoderLayer(nn.Module):
    def __init__(self, config: IdeficsConfig):
        """Build the self-attention, the MLP and the two RMS norms from `config`."""
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = IdeficsAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.dropout,
            config=config,
        )
        self.mlp = IdeficsMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        # Norm applied before self-attention.
        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Norm applied before the MLP.
        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.dropout = config.dropout

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        # Keep the un-normalised input for the first residual connection.
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        # The second residual is the attention block output, taken before the post-attention norm.
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        # Optional extras are appended in this order: attention weights, then the key/value cache.
        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
# 定义自定义的 gated cross-attention 层,继承自 nn.Module
class IdeficsGatedCrossAttentionLayer(nn.Module):
# 前向传播方法定义,接收多个输入参数
def forward(
self,
hidden_states: torch.Tensor, # 输入的隐藏状态张量
attention_mask: Optional[torch.Tensor] = None, # 可选的注意力遮罩张量
image_hidden_states: Optional[torch.Tensor] = None, # 可选的图像隐藏状态张量
image_attention_mask: Optional[torch.Tensor] = None, # 可选的图像注意力遮罩张量
cross_attention_gate: Optional[torch.Tensor] = None, # 可选的交叉注意力门控张量
output_attentions: Optional[bool] = False, # 是否输出注意力权重的标志
use_cache: Optional[bool] = False, # 是否使用缓存的标志
past_key_value: Optional[Tuple[torch.Tensor]] = None, # 可选的过去的键值对元组
# Shared introduction passed to `add_start_docstrings` for the model classes below.
# The fused words left by text extraction ("forall", "classwithall") are split again.
LLAMA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`IdeficsConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class IdeficsPreTrainedModel(PreTrainedModel):
    # Class-level settings for the Idefics models.
    config_class = IdeficsConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
    # SDPA: the attention implementation name "sdpa" set in `_check_and_enable_sdpa` below.
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialise `nn.Linear` and `nn.Embedding` weights from a normal distribution.

        The standard deviation is `config.initializer_range`. Linear biases and the
        embedding row at `padding_idx` are zeroed when present.
        """
        # important: this ported version of Idefics isn't meant for training from scratch - only
        # inference and fine-tuning - so the proper init weights code has been removed - the m4 code
        # base should be used for training from scratch and it contains the correct code.
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
    @classmethod
    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
        """Set `config._attn_implementation` to `"sdpa"` unless only a hard check is requested.

        Returns `config` unchanged when the class has a truthy `use_bettertransformer` attribute.
        """
        # We remove the checks on `is_torch_sdpa_available()` and `cls._supports_sdpa` as Falcon supports SDPA from torch==2.0.0 (no requirement on 2.1).
        _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
        if _is_bettertransformer:
            return config

        if not hard_check_only:
            config._attn_implementation = "sdpa"
        return config
# Docstring describing the model inputs, passed to `add_start_docstrings_to_model_forward`; empty in this copy
LLAMA_INPUTS_DOCSTRING = r""""""
# 使用装饰器为IdeficsModel类添加文档字符串,在输出原始隐藏状态时不添加特定的顶部头信息
@add_start_docstrings(
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
LLAMA_START_DOCSTRING,
)
# 定义IdeficsModel类,继承自IdeficsPreTrainedModel类
class IdeficsModel(IdeficsPreTrainedModel):
"""
Transformer解码器,由`config.num_hidden_layers`层组成。每一层是一个[`IdeficsDecoderLayer`]
Args:
config: IdeficsConfig
"""
def __init__(self, config: IdeficsConfig):
    """Build the token embedding, vision model, optional resampler, decoder and cross-attention layers.

    Args:
        config: The `IdeficsConfig` that provides every size and flag read below.
    """
    super().__init__(config)
    self.config = config
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    # Token embedding with a separately stored table for the additional vocabulary.
    self.embed_tokens = IdeficsDecoupledEmbedding(
        num_embeddings=config.vocab_size,
        num_additional_embeddings=config.additional_vocab_size,
        embedding_dim=config.hidden_size,
        partially_freeze=config.freeze_text_layers,
        padding_idx=self.padding_idx,
    )

    self.image_size = config.vision_config.image_size
    self.vision_config = config.vision_config
    self.vision_model = IdeficsVisionTransformer(config.vision_config)

    # Perceiver Resampler
    if config.use_resampler:
        perceiver_config = config.perceiver_config
        self.perceiver_resampler = IdeficsPerceiverResampler(
            config,
            config.vision_config.embed_dim,
            perceiver_config.resampler_depth,
            perceiver_config.resampler_n_heads,
            perceiver_config.resampler_head_dim,
            perceiver_config.resampler_n_latents,
        )

    self.layers = nn.ModuleList([IdeficsDecoderLayer(config) for _ in range(config.num_hidden_layers)])

    # One gated cross-attention layer per `cross_layer_interval` decoder layers (integer division).
    self.cross_layer_interval = config.cross_layer_interval
    num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
    self.gated_cross_attn_layers = nn.ModuleList(
        [IdeficsGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)]
    )
    self.gradient_checkpointing = False

    self.norm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    # Initialize weights and apply final processing
    self.post_init()

    self.freeze_relevant_params(config)
def freeze_relevant_params(self, config=None):
    """Freeze the text and/or vision layers according to the flags in `config`.

    Args:
        config: Configuration to read the flags from; defaults to `self.config`.
    """
    if config is None:
        config = self.config

    # Freeze the text layers, keeping the configured exception types trainable.
    if config.freeze_text_layers:
        self.freeze_text_layers(config.freeze_text_module_exceptions)

    # Freeze the vision model, keeping the configured exception types trainable.
    if config.freeze_vision_layers:
        freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
def freeze_text_layers(self, module_exceptions=None):
    """Freeze `self.layers` and `self.norm` through `freeze_model`.

    Args:
        module_exceptions: Names of module types to keep trainable; `None` means none.
    """
    # Normalise here so `freeze_model` always receives a list.
    exceptions = [] if module_exceptions is None else module_exceptions
    for module in [self.layers, self.norm]:
        freeze_model(module, module_exceptions=exceptions)
def freeze_vision_layers(self, module_exceptions=None):
    """Freeze `self.vision_model` through `freeze_model`.

    Args:
        module_exceptions: Names of module types to keep trainable; `None` means none.
    """
    # Normalise here so `freeze_model` always receives a list.
    exceptions = [] if module_exceptions is None else module_exceptions
    freeze_model(self.vision_model, module_exceptions=exceptions)
def get_input_embeddings(self):
    """Return the input embedding module (`self.embed_tokens`)."""
    return self.embed_tokens
def set_input_embeddings(self, value):
    """Replace the input embedding module (`self.embed_tokens`) with `value`."""
    self.embed_tokens = value
# 在模型前向传播过程中添加注释到模型文档的装饰器
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None, # 输入的token IDs,类型为LongTensor
attention_mask: Optional[torch.Tensor] = None, # 注意力遮罩,可选的Tensor类型
position_ids: Optional[torch.LongTensor] = None, # 位置IDs,可选的LongTensor类型
past_key_values: Optional[List[torch.FloatTensor]] = None, # 过去的键值对,可选的浮点数张量列表
inputs_embeds: Optional[torch.FloatTensor] = None, # 输入的嵌入表示,可选的浮点数张量
pixel_values: Optional[torch.FloatTensor] = None, # 像素值,可选的浮点数张量
image_encoder_embeddings: Optional[torch.FloatTensor] = None, # 图像编码器嵌入,可选的浮点数张量
perceiver_embeddings: Optional[torch.FloatTensor] = None, # 感知器嵌入,可选的浮点数张量
image_attention_mask: Optional[torch.Tensor] = None, # 图像注意力遮罩,可选的Tensor类型
use_cache: Optional[bool] = None, # 是否使用缓存,可选的布尔类型
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选的布尔类型
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选的布尔类型
interpolate_pos_encoding: Optional[bool] = False, # 是否插值位置编码,布尔类型,默认为False
return_dict: Optional[bool] = None, # 是否返回字典格式的输出,可选的布尔类型
class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
# 在加载时需要忽略的键列表,用于处理缺失情况
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
# 要绑定权重的键列表,指定需要共享权重的模型参数
_tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config, vision_model=None):
    """Build the base `IdeficsModel` and the decoupled language-model head.

    Args:
        config: Model configuration; sizes and freeze flags are read from it.
        vision_model: Accepted but not used by this constructor.
    """
    super().__init__(config)
    self.model = IdeficsModel(config)

    # Output projection with separately stored weights for the additional vocabulary.
    self.lm_head = IdeficsDecoupledLinear(
        in_features=config.hidden_size,
        out_features=config.vocab_size,
        out_additional_features=config.additional_vocab_size,
        bias=False,
        partially_freeze=config.freeze_lm_head,
    )

    # Initialize weights and apply final processing
    self.post_init()
def get_input_embeddings(self):
    """Return the input embedding module of the wrapped model (`self.model.embed_tokens`)."""
    return self.model.embed_tokens
def set_input_embeddings(self, value):
    """Replace the input embedding module of the wrapped model with `value`."""
    self.model.embed_tokens = value
def get_output_embeddings(self):
    """Return the output projection (`self.lm_head`)."""
    return self.lm_head
def set_output_embeddings(self, new_embeddings):
    """Replace the output projection (`self.lm_head`) with `new_embeddings`."""
    self.lm_head = new_embeddings
def set_decoder(self, decoder):
    """Replace the wrapped model (`self.model`) with `decoder`."""
    self.model = decoder
def get_decoder(self):
    """Return the wrapped model (`self.model`)."""
    return self.model
def tie_weights(self):
    """
    Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
    IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
    """
    output_embeddings = self.get_output_embeddings()
    input_embeddings = self.get_input_embeddings()

    # Share the regular weight (and the additional one, if any) when tying is enabled.
    if getattr(self.config, "tie_word_embeddings", True):
        output_embeddings.weight = input_embeddings.weight
        if input_embeddings.num_additional_embeddings > 0:
            assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
            output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight

    # Keep the feature counts recorded on the output layer in sync with the input embedding.
    if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
        output_embeddings.out_features = input_embeddings.num_embeddings
        if hasattr(output_embeddings, "out_additional_features") and hasattr(
            input_embeddings, "num_additional_embeddings"
        ):
            output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=IdeficsCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
# 定义一个方法用于处理前向推断过程中的输入数据
def forward(
self,
input_ids: torch.LongTensor = None, # 输入的token ID序列,默认为None
attention_mask: Optional[torch.Tensor] = None, # 可选的注意力掩码张量,默认为None
position_ids: Optional[torch.LongTensor] = None, # 可选的位置ID张量,默认为None
past_key_values: Optional[List[torch.FloatTensor]] = None, # 可选的过去键值对列表,默认为None
inputs_embeds: Optional[torch.FloatTensor] = None, # 可选的嵌入输入张量,默认为None
pixel_values: Optional[torch.FloatTensor] = None, # 可选的像素值张量,默认为None
image_encoder_embeddings: Optional[torch.FloatTensor] = None, # 可选的图像编码器嵌入张量,默认为None
perceiver_embeddings: Optional[torch.FloatTensor] = None, # 可选的感知器嵌入张量,默认为None
image_attention_mask: Optional[torch.Tensor] = None, # 可选的图像注意力掩码张量,默认为None
labels: Optional[torch.LongTensor] = None, # 可选的标签张量,默认为None
use_cache: Optional[bool] = None, # 可选的缓存使用标志,默认为None
output_attentions: Optional[bool] = None, # 可选的输出注意力张量,默认为None
output_hidden_states: Optional[bool] = None, # 可选的输出隐藏状态标志,默认为None
interpolate_pos_encoding: Optional[bool] = False, # 可选的位置编码插值标志,默认为False
return_dict: Optional[bool] = None, # 可选的返回字典标志,默认为None
):
def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
    """Assemble the model inputs for one generation step.

    When `image_hidden_states` is supplied in `kwargs`, it is forwarded as
    `perceiver_embeddings` (if `config.use_resampler`) or `image_encoder_embeddings`,
    and `pixel_values` is set to `None`. The result of the module-level
    `prepare_inputs_for_generation` is returned with `token_type_ids` removed.
    """
    image_hidden_states = kwargs.pop("image_hidden_states", None)
    if image_hidden_states is not None:
        if self.config.use_resampler:
            kwargs["perceiver_embeddings"] = image_hidden_states
        else:
            kwargs["image_encoder_embeddings"] = image_hidden_states
        kwargs["pixel_values"] = None
    # This call resolves to the module-level function, not to this method.
    inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
    unwanted_kwargs = ["token_type_ids"]
    for kwarg in unwanted_kwargs:
        inputs.pop(kwarg, None)
    return inputs
@staticmethod
def _expand_inputs_for_generation(
    *args,
    **model_kwargs,
):
    """Forward all arguments to the module-level `expand_inputs_for_generation` and return its result."""
    return expand_inputs_for_generation(*args, **model_kwargs)
def _update_model_kwargs_for_generation(
    self,
    outputs: ModelOutput,
    model_kwargs: Dict[str, Any],
    is_encoder_decoder: bool = False,
    standardize_cache_format: bool = False,
) -> Dict[str, Any]:
    """Update `model_kwargs` after a generation step.

    Runs the parent implementation, then keeps only the last entry along dim 1 of
    `image_attention_mask` (when present) and stores `outputs.image_hidden_states`
    under `"image_hidden_states"`.

    Returns:
        The updated `model_kwargs` dictionary.
    """
    model_kwargs = super()._update_model_kwargs_for_generation(
        outputs,
        model_kwargs,
        is_encoder_decoder,
        standardize_cache_format,
    )

    if "image_attention_mask" in model_kwargs:
        image_attention_mask = model_kwargs["image_attention_mask"]
        # Keep the last slice along dim 1 and restore that dimension with size 1.
        last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
        model_kwargs["image_attention_mask"] = last_mask

    # Get the precomputed image_hidden_states
    model_kwargs["image_hidden_states"] = outputs.image_hidden_states
    return model_kwargs
@staticmethod
def _reorder_cache(past, beam_idx):
    """Reorder every cached state along dim 0 according to `beam_idx`.

    Args:
        past: Iterable of per-layer iterables of tensors.
        beam_idx: Index tensor passed to `index_select(0, ...)`.

    Returns:
        A tuple with one tuple of reordered tensors per layer.
    """
    reordered_past = ()
    for layer_past in past:
        reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
    return reordered_past
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 实操Deepseek接入个人知识库
· CSnakes vs Python.NET:高效嵌入与灵活互通的跨语言方案对比
· 【.NET】调用本地 Deepseek 模型
· Plotly.NET 一个为 .NET 打造的强大开源交互式图表库
· 上周热点回顾(2.17-2.23)