# 中文数据增强：同义句扩展 (Chinese data augmentation: synonymous-sentence expansion)

def FuncRecursive(len_curr=0, sen_odd=None, sen_curr=None):
    """
    Append the alternatives of the next ``len_curr`` slots to a list of sentence prefixes.

    Example: ``FuncRecursive(2, [['1', '2'], ['1']], ['1'])`` returns ``['111', '121']``.

    The public name is kept for existing callers, but the expansion is done
    with a loop, so a template with many slots cannot hit the interpreter's
    recursion limit.

    :param len_curr: int, number of leading slots of ``sen_odd`` to consume
    :param sen_odd: list of list of str, slots still to be appended; each slot
        holds its alternatives, eg. [['喜欢', '喜爱', ''], ['米饭']]
    :param sen_curr: list of str, prefixes built so far, eg. ['你是', '你是不是']
    :return: list of str, always a new list. For every consumed slot the
        alternatives vary slowest and the existing prefixes vary fastest,
        eg. prefixes ['a', 'b'] + slot ['x', 'y'] -> ['ax', 'bx', 'ay', 'by']
    :raises IndexError: if ``len_curr`` is negative or larger than the number
        of slots in ``sen_odd``
    """
    # None stands in for "empty" so that no list object is shared between calls.
    slots = [] if sen_odd is None else sen_odd
    sentences = [] if sen_curr is None else list(sen_curr)

    if not 0 <= len_curr <= len(slots):
        raise IndexError(
            "len_curr=%r is out of range for %d slot(s)" % (len_curr, len(slots))
        )

    for alternatives in slots[:len_curr]:
        # Alternative in the outer position, prefix in the inner one: this
        # keeps the output order callers already rely on.
        sentences = [
            prefix + alternative
            for alternative in alternatives
            for prefix in sentences
        ]
    return sentences





def gen_syn_sentences(org_data):
    """
    Expand rule templates into the list of sentences they describe.

    A rule is a string of adjacent bracketed slots whose alternatives are
    separated by ``|``, eg. "[你][喜欢|喜爱|爱]". A string without "][" is
    treated as a single slot, so a plain sentence is returned unchanged.

    :param org_data: list of str, rule templates
    :return: list of str, the expansions of all rules concatenated in rule order
    """

    def split_rule(rule_text):
        # Cutting on "][" separates adjacent slots; whatever "[" / "]"
        # characters are left inside an alternative are then removed.
        return [
            [option.replace("]", "").replace("[", "") for option in chunk.split("|")]
            for chunk in rule_text.split("][")
        ]

    # Phase 1: parse every rule into its list of slots.
    parsed_rules = [split_rule(rule_text) for rule_text in org_data]

    # Phase 2: expand each parsed rule and collect the results.
    expanded = []
    for slots in parsed_rules:
        first_slot, remaining = slots[0], slots[1:]
        if not remaining:
            # A single slot needs no combination step: its alternatives
            # are the sentences.
            expanded.extend(first_slot)
            continue
        expanded.extend(
            FuncRecursive(len_curr=len(remaining),
                          sen_odd=remaining,
                          sen_curr=first_slot)
        )
    return expanded
if __name__ == "__main__":
    # Demo run: two bracketed templates plus one plain sentence without slots.
    org_data = ["[你][喜欢|喜爱|爱][虾米|啥子|什么]", "[1|11][2|22][3|33][44|444]", "大漠帝国"]
    syn_sentences = gen_syn_sentences(org_data)
    print(syn_sentences)

# 原文 (original article): https://blog.csdn.net/rensihui/article/details/94589739

# posted @ 2019-11-20 10:41  cup_leo  阅读(263)  评论(0)  编辑  收藏  举报