squad_convert_example_to_features

I have been reading about QA recently and was not very familiar with how the dataset is built, so I walked through squad_convert_example_to_features in the PyTorch version of the Transformers library.

1. squad_convert_example_to_features

Below is the source code (the PyTorch implementation in the Transformers library).

The example argument holds the data to be converted into the dataset (question text, context tokens, answer text and answer positions); it is only partially described here, not shown in full.

def squad_convert_example_to_features(
    example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
):
    # example: the single example to be converted into features
    # doc_stride: if the context is too long it is split into chunks; doc_stride is the step the window
    #     moves by, so adjacent chunks overlap (see the encode_plus examples below)
    # max_seq_length: the maximum sequence length, made up of query_length +
    #     sequence_added_tokens_length + doc_stride + repeat_length,
    #     where repeat_length is the number of tokens repeated in each span
    features = []
    if is_training and not example.is_impossible:
        # is_impossible=False means this is a positive example (the answer exists in the context)
        # Get start and end position
        # start_position: start position of the answer text in the context text (measured in whole words, e.g. "cuting")
        # end_position: end position of the answer text in the context text (measured in whole words)
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        if tokenizer.__class__.__name__ in [
            "RobertaTokenizer",
            "LongformerTokenizer",
            "BartTokenizer",
            "RobertaTokenizerFast",
            "LongformerTokenizerFast",
            "BartTokenizerFast",
        ]:
            # Sub-word tokenization: a word may be split into several sub-tokens, e.g. cuting = cut + ing
            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
        else:
            sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            # all_doc_tokens collects the sub-tokenized context tokens
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        # Get the start and end positions of the answer within the sub-tokenized context
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
        )

    spans = []

    # Tokenize the question and convert it to token ids;
    # add_special_tokens=False means no special tokens such as [CLS] or [SEP] appear in the result
    truncated_query = tokenizer.encode(
        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
    )

    # Tokenizers who insert 2 SEP tokens in-between <context> & <question> need to have special handling
    # in the way they compute mask of added tokens.
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
    )
    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    # If the context is long, split it into chunks that advance by doc_stride tokens each time
    while len(spans) * doc_stride < len(all_doc_tokens):

        # Define the side we want to truncate / pad and the text/pair sorting
        # padding_side determines the relative order of the question and the context chunk when concatenated
        if tokenizer.padding_side == "right":
            texts = truncated_query
            pairs = span_doc_tokens
            truncation = TruncationStrategy.ONLY_SECOND.value
        else:
            texts = span_doc_tokens
            pairs = truncated_query
            truncation = TruncationStrategy.ONLY_FIRST.value

        # With padding_side == "right" the concatenation is question + context chunk,
        # and the total length is max_seq_length (assuming the context is long enough)
        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
            texts,
            pairs,
            truncation=truncation,
            padding=padding_strategy,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        # Length of the context part of this span
        paragraph_len = min(
            # case 1: the remaining context does not fill a whole span
            len(all_doc_tokens) - len(spans) * doc_stride,
            # case 2: the maximum context length of a full span; note this value is larger than doc_stride
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
                )
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict or (
            "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
        ):
            break
        # overflowing_tokens holds the context tokens that did not fit into this span
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (
                j
                if tokenizer.padding_side == "left"
                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
            )
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
        # Original TF implementation also keeps the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
        else:
            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0

        pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                # Record whether this chunk contains the answer and mark its start and end positions
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # Can not set unique_id and example_index here. They will be set after multiple processing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            )
        )
    return features
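For reference, here is a minimal usage sketch of my own (not from the post) for calling this per-example function directly. It assumes the module-level tokenizer global has been set via squad_convert_example_to_features_init, which is how the library shares the tokenizer with worker processes; the question, context, answer and parameter values are invented for illustration.

from transformers import BertTokenizer
from transformers.data.processors.squad import (
    SquadExample,
    squad_convert_example_to_features,
    squad_convert_example_to_features_init,
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Set the module-level `tokenizer` global used inside squad_convert_example_to_features
squad_convert_example_to_features_init(tokenizer)

# A made-up example; start_position_character is the character offset of the answer in the context
example = SquadExample(
    qas_id="demo-0",
    question_text="What is my son doing?",
    context_text="Hello, my son is laughing.",
    answer_text="laughing",
    start_position_character=17,
    title="demo",
)

features = squad_convert_example_to_features(
    example,
    max_seq_length=32,
    doc_stride=16,
    max_query_length=16,
    padding_strategy="max_length",
    is_training=True,
)
print(len(features), features[0].start_position, features[0].end_position)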

A note on encode_plus:

import torch
from transformers import BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

sentence = "Hello, my son is laughing."
sentence2 = "Hello, my son is cuting."

print(tokenizer.encode_plus(sentence,sentence2))

Result:

{'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102, 7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
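Converting the ids back to tokens makes the pair layout easier to read (continuing from the snippet above; the sub-word split of "cuting" shown in the comment is what I expect from bert-base-uncased, given for illustration):

ids = tokenizer.encode_plus(sentence, sentence2)["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
# Expected output, roughly:
# ['[CLS]', 'hello', ',', 'my', 'son', 'is', 'laughing', '.', '[SEP]',
#  'hello', ',', 'my', 'son', 'is', 'cut', '##ing', '.', '[SEP]']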
print(tokenizer.encode_plus(sentence,sentence2,truncation="only_second",padding="max_length",max_length=12,stride=2,return_token_type_ids=True,return_overflowing_tokens=True,))

Result (this call corresponds to doc_stride = 0):

{'overflowing_tokens': [7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012], 'num_truncated_tokens': 6, 'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102, 7592, 1010, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

print(tokenizer.encode_plus(sentence,sentence2,truncation="only_second",padding="max_length",max_length=12,stride=1,return_token_type_ids=True,return_overflowing_tokens=True,))

Result (this call corresponds to doc_stride = 1):

{'overflowing_tokens': [1010, 2026, 2365, 2003, 3013, 2075, 1012], 'num_truncated_tokens': 6, 'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102, 7592, 1010, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

print(tokenizer.encode_plus(sentence,sentence2,truncation="only_second",padding="max_length",max_length=12,stride=0,return_token_type_ids=True,return_overflowing_tokens=True,))

Result (this call corresponds to doc_stride = 2):

{'overflowing_tokens': [2026, 2365, 2003, 3013, 2075, 1012], 'num_truncated_tokens': 6, 'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102, 7592, 1010, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
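Why stride=2, 1, 0 in these three calls corresponds to doc_stride=0, 1, 2: inside squad_convert_example_to_features, the value passed to encode_plus as stride is max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, i.e. the number of context tokens repeated between adjacent spans. A small sanity check with the numbers from the toy example above (the concrete values here are mine, read off from the outputs):

# Values from the toy example above (bert-base-uncased, sentence pair):
max_seq_length = 12
len_truncated_query = 7          # "hello , my son is laughing ." -> 7 word pieces
sequence_pair_added_tokens = 3   # [CLS] ... [SEP] ... [SEP]

for doc_stride in (0, 1, 2):
    stride = max_seq_length - doc_stride - len_truncated_query - sequence_pair_added_tokens
    print(f"doc_stride={doc_stride} -> encode_plus stride={stride}")
# doc_stride=0 -> encode_plus stride=2
# doc_stride=1 -> encode_plus stride=1
# doc_stride=2 -> encode_plus stride=0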

2. Generating the dataset

Source code (PyTorch version):

def squad_convert_examples_to_features(
    examples,
    tokenizer,
    max_seq_length,
    doc_stride,
    max_query_length,
    is_training,
    padding_strategy="max_length",
    return_dataset=False,
    threads=1,
    tqdm_enabled=True,
    args=None,
):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset
        threads: multiple processing threads.


    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Defining helper methods
    features = []
    threads = min(threads, cpu_count())
    if args.use_multiprocessing:
        with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
            annotate_ = partial(
                squad_convert_example_to_features,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                padding_strategy=padding_strategy,
                is_training=is_training,
            )
            features = list(
                tqdm(
                    p.imap(annotate_, examples, chunksize=args.multiprocessing_chunksize),
                    total=len(examples),
                    desc="convert squad examples to features",
                    disable=not tqdm_enabled,
                    position=0, leave=True
                )
            )
    else:
        squad_convert_example_to_features_init(tokenizer)
        annotate_ = partial(
            squad_convert_example_to_features,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            padding_strategy=padding_strategy,
            is_training=is_training,
        )
        features = [annotate_(example) for example in tqdm(examples, disable=not tqdm_enabled, position=0, leave=True)]
    new_features = []
    unique_id = 1000000000
    example_index = 0
    for example_features in tqdm(
        features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled, position=0, leave=True
    ):
        if not example_features:
            continue
        for example_feature in example_features:
            example_feature.example_index = example_index
            example_feature.unique_id = unique_id
            new_features.append(example_feature)
            unique_id += 1
        example_index += 1
    features = new_features
    del new_features

    # Convert to Tensors and build dataset
    max_len = 512
    all_input_ids = torch.tensor([f.input_ids[:max_len] for f in features], dtype=torch.long)
    all_attention_masks = torch.tensor([f.attention_mask[:max_len] for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids[:max_len] for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask[:max_len] for f in features], dtype=torch.float)
    all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)

    if not is_training:
        all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(
            all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
        )
    else:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
        dataset = TensorDataset(
            all_input_ids,
            all_attention_masks,
            all_token_type_ids,
            all_start_positions,
            all_end_positions,
            all_cls_index,
            all_p_mask,
            all_is_impossible,
        )

    return features, dataset

When generating the dataset, if an example's context is too long, it is split into several chunks; each (question + context chunk) pair is treated as one training instance.
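As a quick follow-up sketch (my own addition, not from the original source): with is_training=True the returned TensorDataset can be wrapped in a DataLoader, and the tensors in each batch follow the construction order shown above.

from torch.utils.data import DataLoader, RandomSampler

# `dataset` is the TensorDataset returned by squad_convert_examples_to_features (is_training=True case)
train_loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=8)

batch = next(iter(train_loader))
(input_ids, attention_mask, token_type_ids,
 start_positions, end_positions, cls_index, p_mask, is_impossible) = batch
print(input_ids.shape)   # (batch_size, max_seq_length)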
