A Data Augmentation Method: Non-Core Word Replacement
In the approaches above, the words to be replaced are chosen at random, so the intuition is that if some important words happen to be replaced, the quality of the augmented text will suffer considerably. The method introduced in this section is a word-replacement technique designed to avoid this problem as far as possible; for lack of a better name, we will call it "data augmentation based on non-core word replacement".
We first came across this technique in the Google paper that proposed the UDA algorithm [6]. Whether it appeared in earlier work we have not investigated further; if you know of prior literature, please leave a comment.
The core of the technique is fairly simple: replace a certain proportion of the unimportant words in a text with unimportant words from the dictionary, thereby producing new text.
In information retrieval, the TF-IDF score is commonly used to measure how important a word is to a piece of text. A brief review of its definition:
TF (term frequency) is the number of times a word occurs in a document. Obviously, a word that occurs many times in an article probably plays a significant role; but if that word also occurs frequently in other documents, as "的" or "我" do, its importance should be heavily discounted. The latter effect is what IDF captures.
IDF (inverse document frequency) is an importance-adjustment factor that measures whether a word is a common one. If a word is rare overall yet occurs many times in a given article, it very likely reflects what that article is about, which is exactly the kind of keyword we are after.
TF-IDF = TF × IDF. This product effectively measures a word's importance to a piece of text. Once we know how important each word is, we sample the words in the text for replacement with a probability that is negatively correlated with their TF-IDF scores, which largely prevents keywords from being wrongly replaced or deleted.
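To make the sampling rule concrete, here is a minimal sketch, with toy tokens and made-up TF-IDF scores, following the formulation used in the code below (and in the UDA paper): each token's replacement probability is p*(C - tfidf(w))/Z capped at 1, where C is the largest TF-IDF value in the sentence, Z is the average of (C - tfidf(w)) over the sentence, and p is the expected fraction of tokens to replace. The helper name and scores are illustrative only.

# Sketch: TF-IDF-weighted replacement probabilities (hypothetical helper, toy scores).
def replacement_probs(tokens, tfidf, p=0.3):
    """tfidf maps word -> score; p is the expected fraction of tokens to replace."""
    scores = [tfidf.get(t, 0.0) for t in tokens]
    C = max(scores)                                        # highest score in this sentence
    Z = sum(C - s for s in scores) / len(tokens) or 1.0    # normalizer (guard: all scores equal)
    return {t: min(p * (C - s) / Z, 1.0) for t, s in zip(tokens, scores)}

probs = replacement_probs(['我', '喜欢', '自然', '语言', '处理'],
                          {'我': 0.2, '喜欢': 1.1, '自然': 2.3, '语言': 2.0, '处理': 1.8})
# Low-TF-IDF tokens such as '我' get the highest replacement probability.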
Two concrete implementations of the approach proposed in the UDA paper follow.
Implementation one: this variant does not use the extraction probability over the dictionary vocabulary. Instead, after selecting the words to be replaced in the text under analysis, it looks up the top-1 most similar word in a set of pretrained word vectors (the sgns.merge.word vectors are of quite good quality; thanks to 小萌 for the pointer) and substitutes it to obtain the augmented sample. The code is as follows:
""" Function: data enhance Author: dyx DateTime: 2020.7.16 """ import os import jieba import jieba.analyse from gensim import corpora from gensim import models import pickle import json from gensim.models import KeyedVectors STOPWORDS = open('../resources/stopwords.txt', 'r', encoding='utf8').readlines() STOPWORDS = {each.strip():1 for each in STOPWORDS} CURRENT_FOLDER = os.path.dirname(os.path.abspath(__file__)) WORD2VEC_FILE = r'F:\词向量\sgns.merge.word\sgns.merge.word' ENHANCE_FILE = os.path.join(CURRENT_FOLDER, 'data', 'enhance_train_v1.json') WORD2VEC = models.KeyedVectors.load_word2vec_format(WORD2VEC_FILE, binary=False) ANALYSIS_DATA = os.path.join(CURRENT_FOLDER, 'data', 'analysis_data.txt') PERD_DEV = os.path.join(CURRENT_FOLDER, 'data/error_data/predict_dev') TRUE_DEV = os.path.join(CURRENT_FOLDER, 'data/error_data/dev.json') # wv_from_text.init_sims(replace=True) class DataEnhance(): def __init__(self, data_path): self.data_path = data_path # with open(self.data_path, 'r', encoding='utf8') as fr: # self.content = fr.readlines() with open(PERD_DEV, 'r', encoding='utf8') as fr: self.pred_dev = fr.readlines() with open(TRUE_DEV, 'r', encoding='utf8') as fr: self.true_dev = fr.readlines() def read_data(self): label_count = {'Yes':0, 'No':0, 'Depends':0} word_list = [] analysis_data = [] for each in self.content: yesno_answer = eval(each).get('yesno_answer') answer = eval(each).get('answer') question = eval(each).get('question') tokens = list(jieba.cut(answer)) # 针对answer进行增强 word_list.append(tokens) # tokens = [each for each in tokens if each not in STOPWORDS] # keywords_tfidf = jieba.analyse.extract_tags(answer, topK=30, withWeight=True) # jieba.analyse.set_stop_words(STOPWORDS) # keywords_tfidf_sotpwords = jieba.analyse.extract_tags(answer, topK=30, withWeight=True) # keywords_textrank = jieba.analyse.textrank(answer, topK=30, withWeight=True) # print('question:{}; yesno_answer:{}\nanswer:{}\nsegment:{}\nkeywords_tfidf:{}\n'.format(question, yesno_answer, answer, tokens, keywords_tfidf)) if yesno_answer == "Yes": label_count[yesno_answer] += 1 elif yesno_answer == 'No': label_count[yesno_answer] += 1 elif yesno_answer == "Depends": label_count[yesno_answer] += 1 to_save = 'label:{}\tanswer:{}\n'.format(yesno_answer, answer) analysis_data.append(to_save) print(to_save) with open(ANALYSIS_DATA, 'w', encoding='utf8') as fw: fw.writelines(analysis_data) dictionary = corpora.Dictionary(word_list) new_corpus = [dictionary.doc2bow(w) for w in word_list] tfidf = models.TfidfModel(new_corpus) tfidf.save(os.path.join(CURRENT_FOLDER, 'model', 'tfidf.model')) str_id = dictionary.token2id with open(os.path.join(CURRENT_FOLDER, 'model', 'str_id.pkl'), 'wb') as fw: pickle.dump(str_id, fw) print(label_count) print('tfidf:{}'.format(tfidf)) def unimportant_word_enhance(self): with open(os.path.join(CURRENT_FOLDER, 'model', 'str_id.pkl'), 'rb') as fr: str_id = pickle.load(fr) tfidf_temp = models.TfidfModel.load(os.path.join(CURRENT_FOLDER, 'model', 'tfidf.model')) # 构造文本-tfidf值 tfidf = {} for id in tfidf_temp.dfs: word = list(str_id.keys())[list(str_id.values()).index(id)] tfidf[word] = tfidf_temp.dfs[id] * tfidf_temp.idfs[id] # 最大tfidf值 max_tfidf = max(tfidf.values()) for key, value in tfidf.items(): if (value == max_tfidf): print('{} 词汇最大tfidf值:{}'.format(key, value)) # word was extract prob extract_prob = {} z_sigma = 0 for key in tfidf: z_sigma += max_tfidf-tfidf[key] for key, value in tfidf.items(): extract_prob[key] = (max_tfidf-value)/z_sigma print("每个词被抽取的概率:{}".format(extract_prob)) 
finnal_data = [] p = 0.3 for sample in self.content: text = eval(sample).get('answer') # replace prob # text = r'是的。凡是能够被2整除的整数都是偶数,例如2、4、-6等。偶数当中也有正偶数、0、负偶数。' tokens = jieba.lcut(text.strip()) sent_tfidf = {} for each in tokens: sent_tfidf[each] = tfidf.get(each) C = max(sent_tfidf.values()) Z = 0.0 replace_prob = {} for each in tokens: Z += (C-sent_tfidf[each])/len(tokens) for each in tokens: temp = p*(C-sent_tfidf[each])/Z if temp < 1.0: replace_prob[each] = temp else: replace_prob[each] = 1.0 replace_prob_sorted = sorted(replace_prob.items(), key=lambda x:x[1], reverse=True) print("当前句子词被替换的概率:{}".format(replace_prob_sorted)) replace_word_01 = replace_prob_sorted[0][0] replace_word_02 = replace_prob_sorted[0][1] # similar_word for i in [replace_word_01, replace_word_02]: if i in WORD2VEC.vocab: target_word = WORD2VEC.similar_by_word(i, 1) target_word = target_word[0][0] print('原始词汇:{}\t被替换词汇:{}'.format(i, target_word)) text = text.replace(i, target_word) new_sample = json.loads(sample) new_sample['answer'] = text finnal_data.append(json.dumps(new_sample, ensure_ascii=False)+'\n') with open(ENHANCE_FILE, 'w', encoding='utf8') as fw: fw.writelines(finnal_data) print('ok') def analysis_data(self): label_count = {'Yes': 0, 'No': 0, 'Depends': 0} error_count = 0 length = len(self.true_dev) for a, p in zip(self.true_dev, self.pred_dev): x = json.loads(a) a_yna = eval(a).get('yesno_answer') p_yna = eval(p).get('yesno_answer') if a_yna != p_yna: error_count += 1 if a_yna == "Yes": label_count[a_yna] += 1 elif a_yna == 'No': label_count[a_yna] += 1 elif a_yna == "Depends": label_count[a_yna] += 1 q = eval(a).get('question') a = eval(a).get('answer') title = [each.get('title') for each in x.get('documents')] paragraphs = [each.get('paragraphs') for each in x.get('documents')] print(title) print(paragraphs) print('question:{}\nanswer:{}\ntrue label:{}\npredict label:{}\n\n'.format( q, a, a_yna, p_yna)) print('总预测样本数:{},误判样本数:{},Acc:{:.4f}'.format(length, error_count, (length-error_count)/length)) print('预测错误标签分布:{}'.format(label_count)) if __name__ == '__main__': data_path = os.path.join(CURRENT_FOLDER, 'data', 'train.json') de = DataEnhance(data_path) # de.read_data() # de.unimportant_word_enhance() de.analysis_data()
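For readers who only want the core replacement step without the dataset-specific I/O above, the fragment below is a distilled sketch: take the tokens with the highest replacement probability and swap each for its top-1 neighbour in the pretrained vectors. The vector path and the `replace_prob` table are assumed to come from your own setup; catching KeyError from `similar_by_word` is used as the out-of-vocabulary check so the sketch works with both old and new gensim versions.

# Distilled sketch of implementation one's replacement step (path and inputs are placeholders).
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('sgns.merge.word', binary=False)  # adjust path

def replace_least_important(text, replace_prob, n_words=2):
    """Swap the n_words tokens with the highest replacement probability for their
    nearest neighbours in the pretrained vectors."""
    candidates = sorted(replace_prob, key=replace_prob.get, reverse=True)[:n_words]
    for word in candidates:
        try:
            target = wv.similar_by_word(word, 1)[0][0]
        except KeyError:      # word not covered by the pretrained vectors
            continue
        text = text.replace(word, target)
    return text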
Implementation two: this reproduces the word-extraction method from the original paper. Its advantage is that the replacement words still come from the original corpus, which avoids a certain amount of semantic drift. Preliminary code:
""" Function: data enhance Author: dyx DateTime: 2020.7.16 """ import os import jieba import jieba.analyse from gensim import corpora from gensim import models import pickle import json import numpy as np from gensim.models import KeyedVectors STOPWORDS = open('../resources/stopwords.txt', 'r', encoding='utf8').readlines() STOPWORDS = {each.strip():1 for each in STOPWORDS} CURRENT_FOLDER = os.path.dirname(os.path.abspath(__file__)) # WORD2VEC_FILE = r'F:\词向量\sgns.merge.word\sgns.merge.word' ENHANCE_FILE = os.path.join(CURRENT_FOLDER, 'data', 'enhance_train_v1.json') # WORD2VEC = models.KeyedVectors.load_word2vec_format(WORD2VEC_FILE, binary=False) ANALYSIS_DATA = os.path.join(CURRENT_FOLDER, 'data', 'analysis_data.txt') PERD_DEV = os.path.join(CURRENT_FOLDER, 'data/error_data/predict_dev') TRUE_DEV = os.path.join(CURRENT_FOLDER, 'data/error_data/dev.json') # wv_from_text.init_sims(replace=True) class DataEnhance(): def __init__(self, data_path): self.data_path = data_path with open(self.data_path, 'r', encoding='utf8') as fr: self.content = fr.readlines() with open(PERD_DEV, 'r', encoding='utf8') as fr: self.pred_dev = fr.readlines() with open(TRUE_DEV, 'r', encoding='utf8') as fr: self.true_dev = fr.readlines() def read_data(self): label_count = {'Yes':0, 'No':0, 'Depends':0} word_list = [] analysis_data = [] for each in self.content: yesno_answer = eval(each).get('yesno_answer') answer = eval(each).get('answer') question = eval(each).get('question') tokens = list(jieba.cut(answer)) # 针对answer进行增强 word_list.append(tokens) # tokens = [each for each in tokens if each not in STOPWORDS] # keywords_tfidf = jieba.analyse.extract_tags(answer, topK=30, withWeight=True) # jieba.analyse.set_stop_words(STOPWORDS) # keywords_tfidf_sotpwords = jieba.analyse.extract_tags(answer, topK=30, withWeight=True) # keywords_textrank = jieba.analyse.textrank(answer, topK=30, withWeight=True) # print('question:{}; yesno_answer:{}\nanswer:{}\nsegment:{}\nkeywords_tfidf:{}\n'.format(question, yesno_answer, answer, tokens, keywords_tfidf)) if yesno_answer == "Yes": label_count[yesno_answer] += 1 elif yesno_answer == 'No': label_count[yesno_answer] += 1 elif yesno_answer == "Depends": label_count[yesno_answer] += 1 to_save = 'label:{}\tanswer:{}\n'.format(yesno_answer, answer) analysis_data.append(to_save) print(to_save) with open(ANALYSIS_DATA, 'w', encoding='utf8') as fw: fw.writelines(analysis_data) dictionary = corpora.Dictionary(word_list) new_corpus = [dictionary.doc2bow(w) for w in word_list] tfidf = models.TfidfModel(new_corpus) tfidf.save(os.path.join(CURRENT_FOLDER, 'model', 'tfidf.model')) str_id = dictionary.token2id with open(os.path.join(CURRENT_FOLDER, 'model', 'str_id.pkl'), 'wb') as fw: pickle.dump(str_id, fw) print(label_count) print('tfidf:{}'.format(tfidf)) def unimportant_word_enhance(self): with open(os.path.join(CURRENT_FOLDER, 'model', 'str_id.pkl'), 'rb') as fr: str_id = pickle.load(fr) tfidf_temp = models.TfidfModel.load(os.path.join(CURRENT_FOLDER, 'model', 'tfidf.model')) # 构造文本-tfidf值 tfidf = {} for id in tfidf_temp.dfs: word = list(str_id.keys())[list(str_id.values()).index(id)] tfidf[word] = tfidf_temp.dfs[id] * tfidf_temp.idfs[id] # 最大tfidf值 max_tfidf = max(tfidf.values()) for key, value in tfidf.items(): if (value == max_tfidf): print('{} 词汇最大tfidf值:{}'.format(key, value)) # word was extract prob extract_prob = {} z_sigma = 0 for key in tfidf: z_sigma += max_tfidf-tfidf[key] original_word = [] word_prob = [] for key, value in tfidf.items(): prob = (max_tfidf-value)/z_sigma 
extract_prob[key] = prob original_word.append(key) word_prob.append(prob) print("每个词被抽取的概率:{}".format(extract_prob)) finnal_data = [] p = 0.3 for sample in self.content: text = eval(sample).get('answer') extract_word = original_word[np.random.choice(len(original_word), 1, p=word_prob)[0]] # replace prob # text = r'是的。凡是能够被2整除的整数都是偶数,例如2、4、-6等。偶数当中也有正偶数、0、负偶数。' tokens = jieba.lcut(text.strip()) sent_tfidf = {} for each in tokens: sent_tfidf[each] = tfidf.get(each) C = max(sent_tfidf.values()) Z = 0.0 replace_prob = {} for each in tokens: Z += (C-sent_tfidf[each])/len(tokens) for each in tokens: temp = p*(C-sent_tfidf[each])/Z if temp < 1.0: replace_prob[each] = temp else: replace_prob[each] = 1.0 replace_prob_sorted = sorted(replace_prob.items(), key=lambda x:x[1], reverse=True) print("当前句子词被替换的概率:{}".format(replace_prob_sorted)) replace_word_01 = replace_prob_sorted[0][0] text = text.replace(replace_word_01, extract_word) print('被替换词:{}; 目标词汇:{}'.format(replace_word_01, extract_word)) new_sample = json.loads(sample) new_sample['answer'] = text finnal_data.append(json.dumps(new_sample, ensure_ascii=False)+'\n') with open(ENHANCE_FILE, 'w', encoding='utf8') as fw: fw.writelines(finnal_data) print('ok') def analysis_data(self): label_count = {'Yes': 0, 'No': 0, 'Depends': 0} error_count = 0 length = len(self.true_dev) for a, p in zip(self.true_dev, self.pred_dev): x = json.loads(a) a_yna = eval(a).get('yesno_answer') p_yna = eval(p).get('yesno_answer') if a_yna != p_yna: error_count += 1 if a_yna == "Yes": label_count[a_yna] += 1 elif a_yna == 'No': label_count[a_yna] += 1 elif a_yna == "Depends": label_count[a_yna] += 1 q = eval(a).get('question') a = eval(a).get('answer') title = [each.get('title') for each in x.get('documents')] paragraphs = [each.get('paragraphs') for each in x.get('documents')] print(title) print(paragraphs) print('question:{}\nanswer:{}\ntrue label:{}\npredict label:{}\n\n'.format( q, a, a_yna, p_yna)) print('总预测样本数:{},误判样本数:{},Acc:{:.4f}'.format(length, error_count, (length-error_count)/length)) print('预测错误标签分布:{}'.format(label_count)) if __name__ == '__main__': data_path = os.path.join(CURRENT_FOLDER, 'data', 'train.json') de = DataEnhance(data_path) de.read_data() de.unimportant_word_enhance() # de.analysis_data()
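As a follow-up, the weighted sampling at the heart of implementation two can be reduced to a few lines. The sketch below uses a toy word-to-score table with made-up values: a word's chance of being drawn as the replacement word is proportional to (max_tfidf - score), so the less important a word is in the corpus, the more likely it is to be sampled.

# Sketch: sampling an "unimportant" replacement word from the corpus vocabulary (toy scores).
import numpy as np

tfidf = {'的': 0.1, '例如': 0.4, '偶数': 2.5, '整除': 3.0}   # toy corpus-level scores

words = list(tfidf)
max_tfidf = max(tfidf.values())
weights = np.array([max_tfidf - tfidf[w] for w in words], dtype=float)
probs = weights / weights.sum()                  # this is the extract_prob distribution

extract_word = words[np.random.choice(len(words), p=probs)]
print('sampled replacement word:', extract_word)  # '的' is drawn most often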