# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""

import collections
import os
import unicodedata
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
# Module-level logger
logger = logging.get_logger(__name__)

# Names of the vocabulary files
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Mapping from pretrained model names to their vocabulary file URLs
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt"
),
}
}
# Maximum input lengths (positional embedding sizes) of the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"distilbert-base-uncased": 512,
"distilbert-base-uncased-distilled-squad": 512,
"distilbert-base-cased": 512,
"distilbert-base-cased-distilled-squad": 512,
"distilbert-base-german-cased": 512,
"distilbert-base-multilingual-cased": 512,
}
# Initialization configuration for each pretrained model
PRETRAINED_INIT_CONFIGURATION = {
"distilbert-base-uncased": {"do_lower_case": True},
"distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
"distilbert-base-cased": {"do_lower_case": False},
"distilbert-base-cased-distilled-squad": {"do_lower_case": False},
"distilbert-base-german-cased": {"do_lower_case": False},
"distilbert-base-multilingual-cased": {"do_lower_case": False},
}
# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    # Use an OrderedDict so that token ids follow the file order
    vocab = collections.OrderedDict()
    # Read the vocabulary file with UTF-8 encoding
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    # Map every token to its line index in the file
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    # Strip leading/trailing whitespace
    text = text.strip()
    # Return an empty list if nothing is left after stripping
    if not text:
        return []
    # Split on whitespace to obtain the tokens
    tokens = text.split()
    return tokens
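# A minimal sketch (added for illustration, not part of the original module) showing how the two
# helpers above behave. The temporary vocabulary file and the helper name `_demo_...` are
# hypothetical; token ids simply follow the line order of the file.
def _demo_load_vocab_and_whitespace_tokenize():
    import tempfile

    # Write a tiny vocabulary file, one token per line
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
        f.write("[PAD]\n[UNK]\nhello\nworld\n")
        vocab_path = f.name

    vocab = load_vocab(vocab_path)
    # Each token is mapped to its line index
    assert vocab == {"[PAD]": 0, "[UNK]": 1, "hello": 2, "world": 3}
    # whitespace_tokenize strips the text and splits on runs of whitespace
    assert whitespace_tokenize("  hello   world ") == ["hello", "world"]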
class DistilBertTokenizer(PreTrainedTokenizer):
r"""
Construct a DistilBERT tokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
    # Mapping from pretrained model names to vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Per-model initialization configuration
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Maximum input sizes of the pretrained models
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Inputs expected by the model
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
        # Make sure the vocabulary file exists, otherwise raise an error
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # Load the vocabulary
        self.vocab = load_vocab(vocab_file)
        # Build the reverse mapping from ids to tokens
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # Whether to run basic (pre-WordPiece) tokenization
        self.do_basic_tokenize = do_basic_tokenize
        # Instantiate the basic tokenizer if requested
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        # WordPiece tokenizer built on top of the loaded vocabulary
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
        # Forward the same arguments to the superclass constructor
        super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
    @property
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
    def do_lower_case(self):
        return self.basic_tokenizer.do_lower_case

    @property
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
    def vocab_size(self):
        return len(self.vocab)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
    def get_vocab(self):
        # The vocabulary plus any tokens added after initialization
        return dict(self.vocab, **self.added_tokens_encoder)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
    def _tokenize(self, text, split_special_tokens=False):
        # Tokens produced by basic + WordPiece tokenization
        split_tokens = []
        if self.do_basic_tokenize:
            # Run the basic tokenizer first (punctuation splitting, lower casing, ...)
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # Tokens listed in never_split are kept as-is
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    # Otherwise split the token further into word pieces
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            # Without basic tokenization, apply WordPiece directly to the text
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Fall back to the unknown token id when the token is not in the vocabulary
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Fall back to the unknown token when the index is not in the reverse vocabulary
        return self.ids_to_tokens.get(index, self.unk_token)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Join the tokens and remove the " ##" word-piece markers
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""# 如果没有提供 token_ids_1,则构建单个序列的输入列表,包括特殊 token `[CLS]` 和 `[SEP]`if token_ids_1 isNone:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# 否则,构建序列对的输入列表,包括两个序列的特殊 token `[CLS]`、`[SEP]` 以及分隔符 `[SEP]`
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""if already_has_special_tokens:
# 如果token列表已包含特殊token,则调用父类的方法获取特殊token的掩码returnsuper().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 isnotNone:
# 如果有第二个token列表,则返回一个包含特殊token的掩码列表return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
# 如果只有一个token列表,则返回一个包含特殊token的掩码列表return [1] + ([0] * len(token_ids_0)) + [1]
# 从给定的序列创建token类型ID的方法,用于序列对分类任务defcreate_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
        sep = [self.sep_token_id]  # id of the separator token
        cls = [self.cls_token_id]  # id of the classification token
        if token_ids_1 is None:
            # Only one sequence: all token type ids are 0
            return len(cls + token_ids_0 + sep) * [0]
        # Sequence pair: 0 for the first sequence, 1 for the second
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
    # Save the vocabulary to `save_directory`, optionally prefixing the filename, and return the saved path
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        # Build the vocabulary file path depending on whether a directory or a file path was given
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        # Write the tokens sorted by their indices, one per line
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # Warn if the indices are not consecutive, which indicates a corrupted vocabulary
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
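# A rough usage sketch (added for illustration, not part of the original module). It builds the slow
# tokenizer from a small local vocabulary file instead of a downloaded checkpoint; the file contents,
# the expected ids and the helper name are assumptions made for this example only.
def _demo_distilbert_tokenizer():
    import tempfile

    # Toy vocabulary: special tokens first, then a few word pieces
    tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello", "world", "##s"]
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
        f.write("\n".join(tokens) + "\n")
        vocab_path = f.name

    tokenizer = DistilBertTokenizer(vocab_path, do_lower_case=True)
    # Basic tokenization (lower casing) followed by WordPiece
    assert tokenizer.tokenize("Hello worlds") == ["hello", "world", "##s"]
    # encode() wraps the ids with [CLS] ... [SEP] via build_inputs_with_special_tokens
    assert tokenizer.encode("hello world") == [2, 5, 6, 3]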
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""def__init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
        # Default `never_split` to an empty list if not given
        if never_split is None:
            never_split = []
        # Whether to lowercase the input
        self.do_lower_case = do_lower_case
        # Tokens that must never be split during tokenization
        self.never_split = set(never_split)
        # Whether to add whitespace around CJK characters
        self.tokenize_chinese_chars = tokenize_chinese_chars
        # Whether to strip accents
        self.strip_accents = strip_accents
        # Whether to split on punctuation
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""# union() returns a new set by concatenating the two sets.# 如果给定了 `never_split` 参数,则将其转换为集合并与 `self.never_split` 取并集,否则直接使用 `self.never_split`
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# 清理文本,去除不必要的字符
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese# models. This is also applied to the English models now, but it doesn't# matter since the English models were not trained on any Chinese data# and generally don't have any Chinese data in them (there are Chinese# characters in the vocabulary because Wikipedia does have some Chinese# words in the English Wikipedia.).# 如果开启了 tokenize_chinese_chars 参数,则对文本中的中文字符进行特殊处理if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
# 将文本进行 Unicode 规范化为 NFC 格式,确保字符的一致性
unicode_normalized_text = unicodedata.normalize("NFC", text)
# 将规范化后的文本按空白字符进行分词
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# 对每个 token 进行处理for token in orig_tokens:
# 如果 token 不在 never_split 中,则继续处理if token notin never_split:
# 如果开启了小写化处理,则将 token 转换为小写if self.do_lower_case:
token = token.lower()
# 如果开启了去除重音处理,则去除 token 的重音if self.strip_accents isnotFalse:
token = self._run_strip_accents(token)
# 如果开启了去除重音处理,则去除 token 的重音elif self.strip_accents:
token = self._run_strip_accents(token)
# 将处理后的 token 再进行标点符号分割处理,并加入到 split_tokens 中
split_tokens.extend(self._run_split_on_punc(token, never_split))
# 将处理后的分词按空白字符再次分割,并返回最终的输出 tokens
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD normalization decomposes characters into base characters plus combining marks
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            # Skip nonspacing marks (category "Mn"), i.e. the accents themselves
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        # Return the text unchanged if punctuation splitting is disabled or the text must never be split
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                # A punctuation character becomes its own token and starts a new word
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        # Join the character groups back into strings
        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                # Surround CJK characters with spaces so that they become separate tokens
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # Check whether the codepoint falls inside one of the CJK Unified Ideographs blocks
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

    def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
            # Skip null characters, replacement characters and control characters
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            # Replace any whitespace character with a single space
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
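# A short sketch (added for illustration) of what BasicTokenizer does on its own: lower casing,
# accent stripping and punctuation splitting, while tokens listed in `never_split` are kept intact.
# The helper name is hypothetical.
def _demo_basic_tokenizer():
    basic_tokenizer = BasicTokenizer(do_lower_case=True, never_split=["[UNK]"])
    # Accents are stripped when lower casing, and punctuation becomes separate tokens
    assert basic_tokenizer.tokenize("Héllo, World!") == ["hello", ",", "world", "!"]
    # "[UNK]" is protected from splitting because it is in never_split
    assert basic_tokenizer.tokenize("foo [UNK] bar") == ["foo", "[UNK]", "bar"]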
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab  # vocabulary used to look up word pieces
        self.unk_token = unk_token  # token used for words that cannot be recognized
        self.max_input_chars_per_word = max_input_chars_per_word  # maximum number of characters per word

    def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
        output_tokens = []  # final list of word pieces
        for token in whitespace_tokenize(text):  # iterate over whitespace-separated words
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                # Words longer than the limit are replaced by the unknown token
                output_tokens.append(self.unk_token)
                continue

            is_bad = False  # set to True when no matching word piece can be found
            start = 0
            sub_tokens = []  # word pieces of the current word
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                # Greedy longest-match-first search over the vocabulary
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr  # non-initial pieces carry the "##" prefix
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True  # no piece matched, the whole word is unrecognizable
                    break
                sub_tokens.append(cur_substr)
                start = end  # continue matching after the piece that was just found

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
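# A small sketch (added for illustration) of the greedy longest-match-first algorithm described in
# the docstring above, using a hypothetical toy vocabulary; the helper name is not part of the module.
def _demo_wordpiece_greedy_matching():
    vocab = {"[UNK]": 0, "un": 1, "##aff": 2, "##able": 3, "runn": 4, "##ing": 5}
    wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
    # Each word is split into the longest pieces found in the vocabulary, left to right
    assert wordpiece_tokenizer.tokenize("unaffable running") == ["un", "##aff", "##able", "runn", "##ing"]
    # A word that cannot be decomposed into known pieces becomes the unknown token
    assert wordpiece_tokenizer.tokenize("xyz") == ["[UNK]"]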
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""

import json
from typing import List, Optional, Tuple

from tokenizers import normalizers

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
# The slow DistilBERT tokenizer, used as a fallback
from .tokenization_distilbert import DistilBertTokenizer

# Module-level logger
logger = logging.get_logger(__name__)

# Names of the vocabulary and tokenizer files
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Mapping from pretrained model names to vocabulary and tokenizer file URLs
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json"
),
"distilbert-base-german-cased": (
"https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json"
),
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json"
),
},
}
# Maximum input lengths (positional embedding sizes) of the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "distilbert-base-uncased": 512,
    "distilbert-base-uncased-distilled-squad": 512,
    "distilbert-base-cased": 512,
    "distilbert-base-cased-distilled-squad": 512,
    "distilbert-base-german-cased": 512,
    "distilbert-base-multilingual-cased": 512,
}

# Initialization configuration for each pretrained model
PRETRAINED_INIT_CONFIGURATION = {
    "distilbert-base-uncased": {"do_lower_case": True},
    "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
    "distilbert-base-cased": {"do_lower_case": False},
    "distilbert-base-cased-distilled-squad": {"do_lower_case": False},
    "distilbert-base-german-cased": {"do_lower_case": False},
    "distilbert-base-multilingual-cased": {"do_lower_case": False},
}
class DistilBertTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        clean_text (`bool`, *optional*, defaults to `True`):
            Whether or not to clean the text before tokenization by removing any control characters and replacing all
            whitespaces by the classic one.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
            issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
            The prefix for subwords.
    """

    # Class-level constants and mappings
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Inputs expected by the model
    model_input_names = ["input_ids", "attention_mask"]
    # Corresponding slow tokenizer class
    slow_tokenizer_class = DistilBertTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
        # Forward the files and the special-token settings to the superclass constructor
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
        # Read the state of the backend tokenizer's normalizer as a dictionary
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # If the stored normalizer options differ from the arguments passed here, rebuild the normalizer
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
            # Look up the normalizer class by its stored type name and update its options
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
        # Also record the casing behaviour on this instance
self.do_lower_case = do_lower_case
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # Start with [CLS] token_ids_0 [SEP]; append token_ids_1 [SEP] when a second sequence is given
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
# Define special tokens for separation and classification
sep = [self.sep_token_id] # List containing the separator token ID
cls = [self.cls_token_id] # List containing the classification token ID
# If only one sequence is provided (token_ids_1 is None), return a mask with 0s
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0] # Return a list of zeros representing token type IDs
# If two sequences are provided, concatenate their lengths and return a mask with 0s for the first sequence and 1s for the second sequence
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary files to the specified directory.
Args:
save_directory (str):
Directory path where the vocabulary files will be saved.
filename_prefix (Optional[str]):
Optional prefix for the vocabulary filenames.
Returns:
Tuple[str]: Tuple containing the paths of the saved vocabulary files.
"""
# Call the internal tokenizer's model save method to save vocabulary files
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
# Return the paths of the saved files as a tuple
return tuple(files)
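# A short usage sketch (added for illustration, not part of the original module). It assumes network
# access to the Hugging Face Hub so that `from_pretrained` can fetch the files listed in
# PRETRAINED_VOCAB_FILES_MAP; the helper name is hypothetical.
def _demo_distilbert_tokenizer_fast():
    fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    encoding = fast_tokenizer("Hello world", "How are you?")
    # The pair is encoded as [CLS] hello world [SEP] how are you ? [SEP]
    input_ids = encoding["input_ids"]
    assert input_ids[0] == fast_tokenizer.cls_token_id
    assert input_ids.count(fast_tokenizer.sep_token_id) == 2
    # Note that model_input_names above restricts the outputs to input_ids and attention_mask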
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature extractor class for Donut."""

import warnings

from ...utils import logging
from .image_processing_donut import DonutImageProcessor

# Module-level logger
logger = logging.get_logger(__name__)


# Deprecated alias kept for backward compatibility; it simply subclasses DonutImageProcessor
class DonutFeatureExtractor(DonutImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # Warn that this class is deprecated and will be removed in Transformers v5
        warnings.warn(
            "The class DonutFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use DonutImageProcessor instead.",
            FutureWarning,
        )
        # Delegate the actual initialization to DonutImageProcessor
        super().__init__(*args, **kwargs)
.\models\donut\image_processing_donut.py
# Import PIL only when the vision dependencies are available
if is_vision_available():
    import PIL


# Donut image processor, built on top of BaseImageProcessor
class DonutImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Donut image processor.

Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_thumbnail (`bool`, *optional*, defaults to `True`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `False`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
random amount of padding on each side, up to the largest image size in the batch. Otherwise, all images are
padded to the largest image size in the batch.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Image standard deviation.
"""
    # The model expects only pixel values as input
    model_input_names = ["pixel_values"]

    # Constructor that stores all image-processing options and their defaults
def __init__(
self,
        do_resize: bool = True,  # whether to resize the image
        size: Dict[str, int] = None,  # target size as a {"height": ..., "width": ...} dict
        resample: PILImageResampling = PILImageResampling.BILINEAR,  # resampling filter used when resizing
        do_thumbnail: bool = True,  # whether to downscale with the thumbnail method
        do_align_long_axis: bool = False,  # whether to rotate so the long axes of image and size match
        do_pad: bool = True,  # whether to pad the image
        do_rescale: bool = True,  # whether to rescale pixel values
        rescale_factor: Union[int, float] = 1 / 255,  # factor used for rescaling
        do_normalize: bool = True,  # whether to normalize the image
        image_mean: Optional[Union[float, List[float]]] = None,  # per-channel mean used for normalization
        image_std: Optional[Union[float, List[float]]] = None,  # per-channel std used for normalization
        **kwargs,  # additional keyword arguments forwarded to the base class
) -> None:
        # Forward the remaining keyword arguments to the base class
        super().__init__(**kwargs)
        # Default size used by Donut when none is given
        size = size if size is not None else {"height": 2560, "width": 1920}
        # The previous feature extractor accepted size as a (width, height) tuple/list,
        # so reverse it before converting to a dictionary
        if isinstance(size, (tuple, list)):
            size = size[::-1]
        # Normalize `size` into a standard size dictionary
        size = get_size_dict(size)
        # Store the configuration on the instance
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_thumbnail = do_thumbnail
self.do_align_long_axis = do_align_long_axis
self.do_pad = do_pad
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # Keyword arguments accepted by `preprocess`
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_thumbnail",
"do_align_long_axis",
"do_pad",
"random_padding",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def align_long_axis(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Align the long axis of the image to the longest axis of the specified size.
Args:
image (`np.ndarray`):
The image to be aligned.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to align the long axis to.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
Returns:
`np.ndarray`: The aligned image.
"""
        # Height and width of the input image
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # Target height and width
        output_height, output_width = size["height"], size["width"]

        # Rotate the image by 90 degrees if its long axis does not match the long axis of the target size
        if (output_width < output_height and input_width > input_height) or (
            output_width > output_height and input_width < input_height
        ):
            image = np.rot90(image, 3)

        # Convert the channel dimension format if an output data format was requested
        if data_format is not None:
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)

        return image
def pad_image(
self,
image: np.ndarray,
size: Dict[str, int],
random_padding: bool = False,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad the image to the specified size.
Args:
image (`np.ndarray`):
The image to be padded.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to pad the image to.
random_padding (`bool`, *optional*, defaults to `False`):
Whether to use random padding or not.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Extract output height and width from the size dictionary
output_height, output_width = size["height"], size["width"]
# Obtain input height and width from the input image
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# Calculate the difference between output and input dimensions
delta_width = output_width - input_width
delta_height = output_height - input_height
# Determine padding amounts based on random_padding flag
if random_padding:
pad_top = np.random.randint(low=0, high=delta_height + 1)
pad_left = np.random.randint(low=0, high=delta_width + 1)
else:
pad_top = delta_height // 2
pad_left = delta_width // 2
# Calculate remaining padding amounts to complete the pad
pad_bottom = delta_height - pad_top
pad_right = delta_width - pad_left
# Construct the padding tuple for np.pad function
padding = ((pad_top, pad_bottom), (pad_left, pad_right))
# Apply padding to the image using np.pad
return pad(image, padding, data_format=data_format, input_data_format=input_data_format)
def pad(self, *args, **kwargs):
# Log a deprecation warning for the `pad` method
logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.")
# Redirect to `pad_image` method
return self.pad_image(*args, **kwargs)
def thumbnail(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
corresponding dimension of the specified size.
Args:
image (`np.ndarray`):
The image to be resized.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to resize the image to.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
The resampling filter to use.
data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
        # Height and width of the input image
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # Target height and width
        output_height, output_width = size["height"], size["width"]

        # Never enlarge: the result is bounded by both the input size and the requested size
        height = min(input_height, output_height)
        width = min(input_width, output_width)

        # Nothing to do if the image already has the target size
        if height == input_height and width == input_width:
            return image

        # Preserve the aspect ratio by shrinking the other dimension accordingly
        if input_height > input_width:
            width = int(input_width * height / input_height)
        elif input_width > input_height:
            height = int(input_height * width / input_width)

        # Delegate the actual resizing to `resize`
return resize(
image,
size=(height, width),
resample=resample,
reducing_gap=2.0,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
    ) -> np.ndarray:
"""
Resizes `image` to `(height, width)` specified by `size` using the PIL library.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
        # Normalize `size` into a standard size dictionary
        size = get_size_dict(size)
        # Donut resizes so that the shorter edge matches min(height, width) of `size`
        shortest_edge = min(size["height"], size["width"])
        # Compute the output size while keeping the aspect ratio
output_size = get_resize_output_image_size(
image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
)
        # Resize the image and return it
resized_image = resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return resized_image
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
do_pad: bool = None,
random_padding: bool = False,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\donut\modeling_donut_swin.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_donut_swin import DonutSwinConfig
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DonutSwinConfig"# Base docstring
_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]
DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"naver-clova-ix/donut-base",
# See all Donut Swin models at https://huggingface.co/models?filter=donut
]
@dataclass
# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin
class DonutSwinEncoderOutput(ModelOutput):
    """
    DonutSwin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`, i.e. the attention weights after the attention softmax.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`, i.e. the hidden states reshaped to include the spatial
            dimensions.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin
@dataclass
class DonutSwinModelOutput(ModelOutput):
"""
DonutSwin model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""# 定义类的成员变量,用于存储模型输出的不同部分
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None# 从 transformers.models.swin.modeling_swin.window_partition 复制的函数defwindow_partition(input_feature, window_size):
"""
Partitions the given input into windows.
"""# 获取输入特征的形状信息
batch_size, height, width, num_channels = input_feature.shape
# 将输入特征按窗口大小分割成小窗口,存储在 input_feature 中
input_feature = input_feature.view(
batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
)
# 调整分割后的窗口顺序,并重新整理为一个扁平化的张量
windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
# 返回分割后的窗口张量return windows
# Copied from transformers.models.swin.modeling_swin.window_reverse
def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    """
    # Number of channels in the windows
    num_channels = windows.shape[-1]
    # Undo the window partitioning: first restore the window grid...
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    # ...then reorder the axes and flatten back to (batch_size, height, width, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows
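# A small sketch (added for illustration) showing that window_partition and window_reverse are exact
# inverses of each other; the sizes used here are arbitrary and the helper name is hypothetical.
def _demo_window_partition_roundtrip():
    batch_size, height, width, num_channels, window_size = 2, 8, 8, 3, 4
    feature = torch.randn(batch_size, height, width, num_channels)
    windows = window_partition(feature, window_size)
    # An 8x8 map with 4x4 windows yields 2*2 = 4 windows per image
    assert windows.shape == (batch_size * 4, window_size, window_size, num_channels)
    restored = window_reverse(windows, window_size, height, width)
    assert torch.equal(restored, feature)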
# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin
class DonutSwinEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        # Patch embedding module
        self.patch_embeddings = DonutSwinPatchEmbeddings(config)
        # Number of patches and the patch grid size
        num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        # Learnable mask token, only created when masked patches are used
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        # Optional absolute position embeddings
        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
        else:
            self.position_embeddings = None

        # Layer normalization and dropout applied to the embeddings
        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        # Compute the patch embeddings and the spatial output dimensions
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        # Normalize the embeddings
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        # If mask positions are given, replace the masked visual tokens by the mask token
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # Add absolute position embeddings if they are used
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        # Apply dropout
        embeddings = self.dropout(embeddings)

        return embeddings, output_dimensions
# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->DonutSwin
class DonutSwinPatchEmbeddings(nn.Module):
    """
    Turns pixel values of shape (batch_size, num_channels, height, width) into the initial patch embeddings of shape
    (batch_size, seq_length, hidden_size) that can be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()
        # Image and patch sizes from the configuration
        image_size, patch_size = config.image_size, config.patch_size
        # Number of input channels and embedding dimension
        num_channels, hidden_size = config.num_channels, config.embed_dim
        # Make sure the sizes are (height, width) tuples
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Number of patches in the image
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        # Size of the patch grid
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        # Convolution that projects each patch to the embedding dimension
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def maybe_pad(self, pixel_values, height, width):
        # Pad the width so that it is divisible by the patch width
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        # Pad the height so that it is divisible by the patch height
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        # Shape of the input tensor
        _, num_channels, height, width = pixel_values.shape
        # The number of channels must match the configuration
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # Pad so that height and width are divisible by the patch size
        pixel_values = self.maybe_pad(pixel_values, height, width)
        # Project the patches to the embedding dimension
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        # Spatial dimensions of the patch grid after the projection
        output_dimensions = (height, width)
        # Flatten the spatial dimensions and move the channel dimension last
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, output_dimensions
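# A small sketch (added for illustration) of the shapes produced by DonutSwinPatchEmbeddings; the
# configuration values are chosen explicitly for this example and the helper name is hypothetical.
def _demo_patch_embeddings_shapes():
    config = DonutSwinConfig(image_size=224, patch_size=4, num_channels=3, embed_dim=96)
    patch_embeddings = DonutSwinPatchEmbeddings(config)
    pixel_values = torch.randn(1, 3, 224, 224)
    embeddings, output_dimensions = patch_embeddings(pixel_values)
    # 224 / 4 = 56 patches per side -> 56 * 56 patches, each projected to embed_dim channels
    assert output_dimensions == (56, 56)
    assert embeddings.shape == (1, 56 * 56, 96)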
# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging
class DonutSwinPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        # Store the input resolution and the number of channels
        self.input_resolution = input_resolution
        self.dim = dim
        # Linear layer that reduces 4 * dim channels down to 2 * dim channels, without bias
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        # Normalization layer applied to the 4 * dim concatenated channels
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        # Pad the input feature so that both the height and the width are even
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # Read the batch size, sequence length and number of channels of the input feature
        batch_size, dim, num_channels = input_feature.shape

        # Reshape the input feature into a 4D tensor [batch_size, height, width, num_channels]
        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # Pad the input feature, if necessary, so that the height and width are even
        input_feature = self.maybe_pad(input_feature, height, width)
        # Downsample: take the four 2x2 sub-grids, each covering a quarter of the positions
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # Concatenate the four sub-grids along the channel dimension
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        # Reshape into a 3D tensor [batch_size, height/2 * width/2, 4 * num_channels]
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)
        # Normalize the merged feature
        input_feature = self.norm(input_feature)
        # Reduce the channel dimension: output shape is [batch_size, height/2 * width/2, 2 * dim]
        input_feature = self.reduction(input_feature)

        return input_feature
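# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): patch merging halves
# the spatial resolution and doubles the channel dimension. The sizes used
# here are assumptions for the demo, not values from the Donut configuration.
# ---------------------------------------------------------------------------
import torch

demo_height, demo_width, demo_dim = 8, 8, 96
merging = DonutSwinPatchMerging(input_resolution=(demo_height, demo_width), dim=demo_dim)

tokens = torch.randn(1, demo_height * demo_width, demo_dim)     # (1, 64, 96)
merged = merging(tokens, (demo_height, demo_width))             # (1, 16, 192): 2x fewer tokens per axis, 2x channels
print(merged.shape)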
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    # Sample-level path dropping (stochastic depth), applied in the main path of residual blocks.
    # Arguments: input - input tensor, drop_prob - drop probability (defaults to 0.0),
    # training - whether the module is in training mode (defaults to False).
    # If drop_prob is 0.0 or we are not in training mode, return the input unchanged
    if drop_prob == 0.0 or not training:
        return input
    # Probability of keeping a path
    keep_prob = 1 - drop_prob
    # Shape that broadcasts over all dimensions except the batch one; works for tensors of any rank, not only 2D convnets
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    # Random tensor with the same device and dtype as the input, with values in [keep_prob, 1 + keep_prob)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    # Floor the random tensor to binarize it (1 with probability keep_prob, 0 otherwise)
    random_tensor.floor_()
    # Scale the input by the keep probability and multiply by the binary mask
    output = input.div(keep_prob) * random_tensor
    # Return the resulting tensor
    return output
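# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): during training,
# drop_path zeroes out the whole residual branch for a random subset of
# samples and rescales the survivors by 1 / keep_prob. The sizes and the drop
# probability below are assumptions for the demo.
# ---------------------------------------------------------------------------
import torch

x = torch.ones(8, 4, 16)                               # 8 samples in the batch
out = drop_path(x, drop_prob=0.5, training=True)       # roughly half of the samples become all zeros
print((out.view(8, -1).abs().sum(dim=1) == 0).sum())   # number of dropped samples
print(out[out != 0].unique())                          # surviving values are scaled to 1 / 0.5 = 2.0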
# Copied from transformers.models.swin.modeling_swin.SwinDropPath; DonutSwinDropPath implements per-sample
# Drop Path (stochastic depth).
class DonutSwinDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob  # probability of dropping a path

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)  # apply the drop-path operation

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)  # string description reporting the drop probability


# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->DonutSwin; implements the
# Swin Transformer self-attention mechanism, adapted for the Donut model.
class DonutSwinSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads  # number of attention heads
        self.attention_head_size = int(dim / num_heads)  # size of each attention head
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # total size of all heads
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )  # window size, used for the relative position bias

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )  # relative position bias table, added to the attention scores

        # Get the pairwise relative position index for every token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))  # build the coordinate grid
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # pairwise relative coordinates
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)  # register the index as a buffer

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)  # query projection
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)  # key projection
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)  # value projection

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)  # dropout on the attention probabilities

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)  # reshape and transpose for multi-head attention

    # Forward pass: processes the hidden states together with the optional attention and head masks
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # Read the shape of the input hidden states
        batch_size, dim, num_channels = hidden_states.shape
        # Project the hidden states into the mixed query layer
        mixed_query_layer = self.query(hidden_states)

        # Project the hidden states into keys and reshape them for the attention computation
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        # Project the hidden states into values and reshape them for the attention computation
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        # Reshape the mixed query layer for the attention computation
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        # Normalize the attention scores by the square root of the attention head size
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Look up the relative position bias and reshape it so it can be added to the attention scores
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        # Apply the attention mask if one was provided
        if attention_mask is not None:
            # Reshape the attention scores so the mask can be broadcast over them
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # Normalize the attention scores into probabilities with softmax
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        # Apply dropout, which randomly drops entire tokens to attend to
        attention_probs = self.dropout(attention_probs)

        # Apply the head mask if one was provided
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Compute the context layer: multiply the attention probabilities by the values and reshape
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        # Return the context layer, plus the attention probabilities when requested
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput
class DonutSwinSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # Linear layer whose input and output dimension are both `dim`
        self.dense = nn.Linear(dim, dim)
        # Dropout layer using the attention dropout probability from the config
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Transform the hidden states with the dense layer
        hidden_states = self.dense(hidden_states)
        # Apply dropout to the transformed hidden states
        hidden_states = self.dropout(hidden_states)

        # Return the processed hidden states
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin
class DonutSwinAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        # Self-attention layer
        self.self = DonutSwinSelfAttention(config, dim, num_heads, window_size)
        # Output projection layer
        self.output = DonutSwinSelfOutput(config, dim)
        # Set of attention heads that have been pruned
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # Find the prunable heads and the corresponding indices
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune the linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update the hyperparameters and remember the pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # Run self-attention
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        # Feed the self-attention output through the output layer
        attention_output = self.output(self_outputs[0], hidden_states)
        # Pack the outputs; the attention weights are appended when requested
        outputs = (attention_output,) + self_outputs[1:]
        # Return the outputs
        return outputs
# Copied from transformers.models.swin.modeling_swin.SwinIntermediate
class DonutSwinIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # Linear layer mapping `dim` to `config.mlp_ratio * dim`
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        # Resolve the activation function from the config
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Transform the hidden states with the dense layer
        hidden_states = self.dense(hidden_states)
        # Apply the non-linear activation function
        hidden_states = self.intermediate_act_fn(hidden_states)
        # Return the processed hidden states
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinOutput
class DonutSwinOutput(nn.Module):
    # Initializer, takes the config and the dimension of this stage
    def __init__(self, config, dim):
        super().__init__()
        # Linear layer mapping `config.mlp_ratio * dim` back to `dim`
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        # Dropout layer using the hidden dropout probability from the config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Transform the hidden states with the dense layer
        hidden_states = self.dense(hidden_states)
        # Apply dropout
        hidden_states = self.dropout(hidden_states)
        # Return the processed hidden states
        return hidden_states
# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin
class DonutSwinLayer(nn.Module):
    # Initializer: takes config, dim, input_resolution, num_heads and an optional shift_size
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        # Chunk size for the feed-forward pass
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Shift size of the (shifted) window attention
        self.shift_size = shift_size
        # Window size from the config
        self.window_size = config.window_size
        # Input resolution of this layer
        self.input_resolution = input_resolution
        # Layer norm applied before the attention block, with epsilon from the config
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # Window attention block
        self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
        # DropPath layer if drop_path_rate > 0.0, otherwise an identity layer
        self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        # Layer norm applied after the attention block, with epsilon from the config
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # Intermediate (expansion) layer of the MLP
        self.intermediate = DonutSwinIntermediate(config, dim)
        # Output (projection) layer of the MLP
        self.output = DonutSwinOutput(config, dim)

    # Adjust shift_size and window_size based on the input resolution
    def set_shift_and_window_size(self, input_resolution):
        # If the smallest side of the input is not larger than the window, do not partition into windows
        if min(input_resolution) <= self.window_size:
            self.shift_size = 0
            self.window_size = min(input_resolution)

    # Build the attention mask for the given height and width (see the illustrative sketch after this class)
    def get_attn_mask(self, height, width, dtype):
        # Only shifted windows (SW-MSA) need an attention mask
        if self.shift_size > 0:
            # Zero tensor of shape (1, height, width, 1) used to label the shifted regions
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            # Slices covering the unshifted area, the shifted strip and the wrapped-around strip
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            # Give every height/width region a distinct label in img_mask
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            # Partition img_mask into windows and flatten each window
            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            # Build the attention mask: 0 where two positions share a region, -100.0 otherwise
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        # Return the attention mask
        return attn_mask

    # Pad the hidden states, if necessary, so that their size is divisible by the window size
    def maybe_pad(self, hidden_states, height, width):
        # Number of positions to pad on the right and at the bottom so that width and height become multiples of the window size
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        # Padding tuple for a (batch, height, width, channels) tensor: only the right and the bottom are padded
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        # Pad the hidden states
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        # Return the padded hidden states together with the padding values
        return hidden_states, pad_values

    # Forward pass
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Unless partitioning is forced, adjust the shift size and window size to the input resolution
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            pass
        # Height and width of the input
        height, width = input_dimensions
        # Batch size, sequence length and number of channels of the hidden states
        batch_size, _, channels = hidden_states.size()
        # Keep a shortcut (residual) copy of the hidden states
        shortcut = hidden_states

        # Apply layer norm before the attention block
        hidden_states = self.layernorm_before(hidden_states)
        # Reshape the hidden states into a 4D tensor (batch_size, height, width, channels)
        hidden_states = hidden_states.view(batch_size, height, width, channels)
        # Pad the hidden states so that their size is a multiple of the window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
        # Padded height and width
        _, height_pad, width_pad, _ = hidden_states.shape
        # Apply the cyclic shift, if any
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # Partition the (shifted) hidden states into windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        # Attention mask that keeps tokens from attending across shifted-region boundaries
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        # Run window attention
        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )
        # Attention output of the windows
        attention_output = attention_outputs[0]

        # Reshape the attention output back into windows of shape (num_windows, window_size, window_size, channels)
        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        # Reverse the window partition to recover the full feature map
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # Reverse the cyclic shift, if any
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        # If padding was applied, crop the attention windows back to the original spatial size
        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        # Reshape into a 3D tensor (batch_size, height * width, channels)
        attention_windows = attention_windows.view(batch_size, height * width, channels)
        # Add the residual connection, with drop path applied to the attention branch
        hidden_states = shortcut + self.drop_path(attention_windows)

        # Apply layer norm after the attention block
        layer_output = self.layernorm_after(hidden_states)
        # Apply the intermediate (expansion) layer
        layer_output = self.intermediate(layer_output)
        # Apply the output layer and the second residual connection
        layer_output = hidden_states + self.output(layer_output)

        # Return the layer output, plus the attention weights when requested
        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs
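# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how get_attn_mask marks
# the regions created by the cyclic shift. `_demo_window_partition` below is a
# minimal local re-implementation written only for this demo; the real helper
# lives elsewhere in modeling_donut_swin.py. Sizes are assumptions.
# ---------------------------------------------------------------------------
import torch

def _demo_window_partition(input_feature, window_size):
    # Minimal demo version: split (1, H, W, C) into (num_windows, window_size, window_size, C)
    _, height, width, channels = input_feature.shape
    input_feature = input_feature.view(
        1, height // window_size, window_size, width // window_size, window_size, channels
    )
    return input_feature.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, channels)

window_size, shift_size, height, width = 4, 2, 8, 8
img_mask = torch.zeros((1, height, width, 1))
slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
count = 0
for hs in slices:
    for ws in slices:
        img_mask[:, hs, ws, :] = count
        count += 1

mask_windows = _demo_window_partition(img_mask, window_size).view(-1, window_size * window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
# Pairs of positions that come from different regions get -100.0 and are effectively ignored by softmax
print(attn_mask.shape)  # (4, 16, 16): one mask per window, window_size*window_size tokens per window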
# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->DonutSwin
class DonutSwinStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        # Module list holding `depth` DonutSwinLayer blocks
        self.blocks = nn.ModuleList(
            [
                DonutSwinLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    # Even blocks use regular windows, odd blocks use shifted windows
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # If a downsample class is given, create the patch merging layer; otherwise set it to None
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        # Run every DonutSwinLayer block in sequence
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # Forward pass of the current block
            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            # The hidden states become the output of the current block
            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        # If a downsample layer exists, downsample the hidden states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        # Stage outputs: hidden states, hidden states before downsampling, and the output dimensions
        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        # If attention outputs were requested, append the attentions of the blocks
        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs
# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin
class DonutSwinEncoder(nn.Module):
    # Initializer of the DonutSwin encoder
    def __init__(self, config, grid_size):
        # Call the parent class (nn.Module) initializer
        super().__init__()
        # Number of stages in the model
        self.num_layers = len(config.depths)
        # Keep a reference to the configuration
        self.config = config
        # Drop path rate for every block, linearly spaced over the total depth
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        # List of stages, each one a DonutSwinStage instance
        self.layers = nn.ModuleList(
            [
                DonutSwinStage(
                    config=config,
                    # Embedding dimension of this stage
                    dim=int(config.embed_dim * 2**i_layer),
                    # Input resolution of this stage
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                    # Depth (number of blocks) of this stage
                    depth=config.depths[i_layer],
                    # Number of attention heads of this stage
                    num_heads=config.num_heads[i_layer],
                    # Drop path rates for the blocks of this stage
                    drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    # Every stage except the last one downsamples with DonutSwinPatchMerging
                    downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                # One stage per entry in config.depths
                for i_layer in range(self.num_layers)
            ]
        )

        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False

    # Forward pass of the encoder
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->DonutSwin
class DonutSwinPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class for this model
    config_class = DonutSwinConfig
    # Prefix of the base model
    base_model_prefix = "swin"
    # Name of the main input
    main_input_name = "pixel_values"
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Initialize linear and convolution weights from a normal distribution
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                # Initialize the bias, if any, to zero
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            # For LayerNorm, initialize the bias to zero and the weight to one
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
SWIN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`DonutImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top.",
    SWIN_START_DOCSTRING,
)
# DonutSwinModel inherits from DonutSwinPreTrainedModel
class DonutSwinModel(DonutSwinPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        # Call the parent class initializer with the configuration
        super().__init__(config)
        # Keep a reference to the configuration
        self.config = config
        # Number of encoder stages
        self.num_layers = len(config.depths)
        # Number of features of the last stage
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        # Embedding layer
        self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token)
        # Encoder, built on top of the patch grid of the embedding layer
        self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid)

        # Adaptive average pooling layer if pooling was requested, otherwise None
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize the weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Return the patch embeddings of the embedding layer
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # For every layer, prune the requested attention heads
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=DonutSwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, DonutSwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # Whether to return attention weights; defaults to the model configuration
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # Whether to return hidden states; defaults to the model configuration
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Whether to return a ModelOutput object; defaults to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Raise an error if no pixel values were provided
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare the head mask if needed.
        # 1.0 in head_mask means we keep the head.
        # attention_probs has shape bsz x n_heads x N x N.
        # The input head_mask has shape [num_heads] or [num_hidden_layers x num_heads].
        # head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        # Feed the pixel values and the boolean mask to the embedding layer to get the embeddings and the input dimensions
        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        # Feed the embeddings to the encoder
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Sequence output of the encoder
        sequence_output = encoder_outputs[0]

        # If a pooler exists, pool the sequence output and flatten it
        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        # If a plain tuple was requested, return the outputs as a tuple
        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        # Otherwise return the dedicated model output object
        return DonutSwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )
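# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): instantiating a small
# DonutSwinModel from a fresh config and running random pixel values through
# it. The config values and input size are assumptions chosen for the demo.
# ---------------------------------------------------------------------------
import torch
from transformers import DonutSwinConfig, DonutSwinModel

config = DonutSwinConfig(image_size=224, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24])
model = DonutSwinModel(config)

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # (1, 49, 768): a 7x7 patch grid at the last stage, 96 * 2**3 channels
print(outputs.pooler_output.shape)      # (1, 768)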
.\models\donut\processing_donut.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
"""
Processor class for Donut.
"""

import re  # regular expressions
import warnings  # warnings module
from contextlib import contextmanager  # context manager decorator

from ...processing_utils import ProcessorMixin  # processor mixin base class


class DonutProcessor(ProcessorMixin):
r"""
构造一个 Donut 处理器,将 Donut 图像处理器和 XLMRoBERTa 分词器封装成一个单一处理器。
[`DonutProcessor`] 提供 [`DonutImageProcessor`] 和 [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] 的所有功能。
详见 [`~DonutProcessor.__call__`] 和 [`~DonutProcessor.decode`] 获取更多信息。
Args:
image_processor ([`DonutImageProcessor`], *可选*):
[`DonutImageProcessor`] 的实例。图像处理器是必需的输入。
tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *可选*):
[`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] 的实例。分词器是必需的输入。
"""
attributes = ["image_processor", "tokenizer"] # 类属性列表
image_processor_class = "AutoImageProcessor"# 图像处理器类名
tokenizer_class = "AutoTokenizer"# 分词器类名def__init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = Noneif"feature_extractor"in kwargs:
# 警告:`feature_extractor` 参数已弃用,并将在 v5 中删除,请使用 `image_processor` 替代。
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
# 如果 kwargs 中包含 `feature_extractor`,则将其赋给 feature_extractor 变量
image_processor = image_processor if image_processor isnotNoneelse feature_extractor
# 如果未指定 image_processor,则引发 ValueError 异常if image_processor isNone:
raise ValueError("You need to specify an `image_processor`.")
# 如果未指定 tokenizer,则引发 ValueError 异常if tokenizer isNone:
raise ValueError("You need to specify a `tokenizer`.")
# 调用父类 ProcessorMixin 的构造函数,传入 image_processor 和 tokenizersuper().__init__(image_processor, tokenizer)
# 设置当前处理器为 image_processor
self.current_processor = self.image_processor
# 标记目标上下文管理器未启动
self._in_target_context_manager = Falsedef__call__(self, *args, **kwargs):
"""
当在正常模式下使用时,该方法将所有参数转发给 AutoImageProcessor 的 [`~AutoImageProcessor.__call__`] 并返回其输出。
如果在上下文 [`~DonutProcessor.as_target_processor`] 中使用,则将所有参数转发给 DonutTokenizer 的 [`~DonutTokenizer.__call__`]。
请参阅上述两个方法的文档了解更多信息。
"""# 对于向后兼容性if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
images = kwargs.pop("images", None)
text = kwargs.pop("text", None)
iflen(args) > 0:
images = args[0]
args = args[1:]
if images isNoneand text isNone:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images isnotNone:
# 使用图像处理器处理图像和其他参数
inputs = self.image_processor(images, *args, **kwargs)
if text isnotNone:
# 使用分词器处理文本和其他参数
encodings = self.tokenizer(text, **kwargs)
if text isNone:
return inputs
elif images isNone:
return encodings
else:
# 将标签添加到输入字典中
inputs["labels"] = encodings["input_ids"]
return inputs
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer as the processor for the input. Useful for encoding the labels when
        fine-tuning TrOCR.
        """
        warnings.warn(
            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
            "your images inputs, or in a separate call)."
        )
        self._in_target_context_manager = True
        self.current_processor = self.tokenizer
        yield
        self.current_processor = self.image_processor
        self._in_target_context_manager = False

    def token2json(self, tokens, is_inner_value=False, added_vocab=None):
"""
Convert a (generated) token sequence into an ordered JSON format.
Args:
tokens (str): The token sequence to convert into JSON format.
is_inner_value (bool, optional): Indicates if the function is processing inner values. Defaults to False.
added_vocab (list, optional): List of added vocabulary tokens. Defaults to None.
Returns:
dict or list: Ordered JSON format representing the token sequence.
Converts a sequence of tokens into a structured JSON format. Handles both leaf and non-leaf nodes
in the token sequence recursively.
"""if added_vocab isNone:
added_vocab = self.tokenizer.get_added_vocab()
output = {}
while tokens:
# Locate the start token in the token sequence
start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
if start_token isNone:
break
key = start_token.group(1)
key_escaped = re.escape(key)
# Find the corresponding end token for the current start token
end_token = re.search(rf"</s_{key_escaped}>", tokens, re.IGNORECASE)
start_token = start_token.group()
if end_token isNone:
tokens = tokens.replace(start_token, "")
else:
end_token = end_token.group()
start_token_escaped = re.escape(start_token)
end_token_escaped = re.escape(end_token)
# Extract content between start and end tokens
content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
if content isnotNone:
content = content.group(1).strip()
ifr"<s_"in content andr"</s_"in content: # non-leaf node# Recursively convert inner token sequence to JSON
value = self.token2json(content, is_inner_value=True, added_vocab=added_vocab)
if value:
iflen(value) == 1:
value = value[0]
output[key] = value
else: # leaf nodes
output[key] = []
# Split content into leaf nodes based on separator "<sep/>"for leaf in content.split(r"<sep/>"):
leaf = leaf.strip()
if leaf in added_vocab and leaf[0] == "<"and leaf[-2:] == "/>":
leaf = leaf[1:-2] # for categorical special tokens
output[key].append(leaf)
iflen(output[key]) == 1:
output[key] = output[key][0]
# Remove processed tokens from the sequence
tokens = tokens[tokens.find(end_token) + len(end_token):].strip()
if tokens[:6] == r"<sep/>": # non-leaf nodes# Return a list with current output and recursively processed tokensreturn [output] + self.token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)
# Handle cases where no output is generatediflen(output):
return [output] if is_inner_value else output
else:
return [] if is_inner_value else {"text_sequence": tokens}
    @property
    def feature_extractor_class(self):
        """
        Property accessor for the deprecated feature_extractor_class.

        Returns:
            class: The image processor class.

        Warns:
            FutureWarning: This property is deprecated and will be removed in v5. Use `image_processor_class`
            instead.
        """
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """
        Property accessor for the deprecated feature_extractor.

        Returns:
            object: The image processor instance.

        Warns:
            FutureWarning: This property is deprecated and will be removed in v5. Use `image_processor` instead.
        """
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
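# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): using DonutProcessor to
# prepare an image and to turn a generated token sequence into JSON. The
# checkpoint name and the token string are assumptions chosen for the demo.
# ---------------------------------------------------------------------------
from PIL import Image
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

image = Image.new("RGB", (640, 480), color="white")           # stand-in for a document photo
inputs = processor(images=image, return_tensors="pt")         # pixel_values ready for the vision encoder
print(inputs["pixel_values"].shape)

# token2json walks the <s_...>...</s_...> structure recursively
sequence = "<s_menu><s_nm>Latte</s_nm><s_cnt>2</s_cnt></s_menu>"
print(processor.token2json(sequence))                         # {'menu': {'nm': 'Latte', 'cnt': '2'}}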
.\models\donut\__init__.py
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.

# Import the type-checking constant
from typing import TYPE_CHECKING

# Import the custom exception, the _LazyModule class and the helpers that check whether torch and vision are available
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Declare the import structure of the module
_import_structure = {
    "configuration_donut_swin": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutSwinConfig"],
    "processing_donut": ["DonutProcessor"],
}

# Check whether torch is available; raise OptionalDependencyNotAvailable if it is not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available, so register the modeling module in _import_structure
    _import_structure["modeling_donut_swin"] = [
        "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DonutSwinModel",
        "DonutSwinPreTrainedModel",
    ]

# Check whether vision is available; raise OptionalDependencyNotAvailable if it is not
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # vision is available, so register the feature extraction and image processing modules
    _import_structure["feature_extraction_donut"] = ["DonutFeatureExtractor"]
    _import_structure["image_processing_donut"] = ["DonutImageProcessor"]

# When type checking, import the classes directly from their modules
if TYPE_CHECKING:
    from .configuration_donut_swin import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutSwinConfig
    from .processing_donut import DonutProcessor

    try:
        # Check whether torch is available; raise OptionalDependencyNotAvailable if it is not
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_donut_swin import (
            DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
            DonutSwinModel,
            DonutSwinPreTrainedModel,
        )

    try:
        # Check whether vision is available; raise OptionalDependencyNotAvailable if it is not
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_donut import DonutFeatureExtractor
        from .image_processing_donut import DonutImageProcessor

# Otherwise (at runtime), replace the module with a _LazyModule instance so that submodules are imported on demand
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
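# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): because the package
# module is replaced by a _LazyModule, the classes listed in _import_structure
# are only imported when they are first accessed.
# ---------------------------------------------------------------------------
from transformers.models.donut import DonutProcessor, DonutSwinConfig  # resolved lazily on first access

config = DonutSwinConfig()
print(type(config).__name__)  # DonutSwinConfig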