LexicalAnalysis
1.概述
Lexical Analysis of Chinese,简称 LAC,是一个联合的词法分析模型,在单个模型中完成中文分词、词性标注、专名识别任务。我们在自建的数据集上对分词、词性标注、专名识别的整体效果进行了评估。这些任务主要通过序列标注的方式来完成。
2.预测和损失函数
序列标注问题一般用 CRF(linear_chain_crf)作为损失函数,然后用 CRF decoding(crf_decoding)来完成预测。
其中 CRF decoding 相当于 Viterbi 算法,用于求解得分最高的标签序列。
def lex_net(word, args, vocab_size, num_labels, for_infer=True, target=None):
    """
    Define the lexical analysis network structure.

    The graph built here is: word embedding -> ``args.bigru_num`` stacked
    bidirectional GRU layers -> fully connected emission layer of width
    ``num_labels`` -> linear-chain CRF (cost and/or decoding).

    Args:
        word: input of the model; it is fed to ``fluid.embedding``.
        args: configuration object. Reads ``word_emb_dim``,
            ``grnn_hidden_dim`` and ``bigru_num``; the optional attributes
            ``emb_learning_rate`` and ``crf_learning_rate`` default to 1.0
            when absent.
        vocab_size: number of rows of the embedding table.
        num_labels: output size of the emission layer.
        for_infer: when True, ``target`` is ignored and only the decoding
            branch is built.
        target: labels passed to ``fluid.layers.linear_chain_crf``; only
            used when ``for_infer`` is False.

    Returns:
        When ``for_infer`` is True (or ``target`` is None): the output of
        ``fluid.layers.crf_decoding``.
        Otherwise: a tuple ``(avg_cost, crf_decode)`` holding the mean CRF
        cost and the decoding output.
    """
    word_emb_dim = args.word_emb_dim
    grnn_hidden_dim = args.grnn_hidden_dim
    emb_lr = getattr(args, 'emb_learning_rate', 1.0)
    # Fixed: this used to read args.emb_learning_rate, so a configured
    # crf_learning_rate was never actually applied to the CRF weights.
    crf_lr = getattr(args, 'crf_learning_rate', 1.0)
    bigru_num = args.bigru_num
    init_bound = 0.1
    is_sparse = True

    def _uniform_l2_attr():
        """Return a fresh ParamAttr: uniform init in [-init_bound, init_bound] with L2 decay 1e-4."""
        return fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-init_bound, high=init_bound),
            regularizer=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=1e-4))

    def _gru_direction(input_feature, is_reverse):
        """Build one fc + dynamic_gru pair running in the given direction."""
        # The fc projects to 3 * hidden size, the input width the original
        # code feeds to dynamic_gru.
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=_uniform_l2_attr())
        return fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            is_reverse=is_reverse,
            param_attr=_uniform_l2_attr())

    def _bigru_layer(input_feature):
        """
        Define the bidirectional GRU layer.

        Builds a forward and a reverse GRU over the same input and
        concatenates their outputs along axis 1.
        """
        # Forward direction is created first, then reverse, matching the
        # original layer creation order.
        gru = _gru_direction(input_feature, is_reverse=False)
        gru_r = _gru_direction(input_feature, is_reverse=True)
        return fluid.layers.concat(input=[gru, gru_r], axis=1)

    def _net_conf(word, target=None):
        """
        Configure the network.

        Returns ``(avg_cost, crf_decode)`` when ``target`` is given,
        otherwise only ``crf_decode``.
        """
        word_embedding = fluid.embedding(
            input=word,
            size=[vocab_size, word_emb_dim],
            dtype='float32',
            is_sparse=is_sparse,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        # Stack bigru_num bidirectional GRU layers. The running feature is
        # kept in input_feature so that bigru_num == 0 no longer fails with
        # an unbound ``bigru_output``.
        input_feature = word_embedding
        for _ in range(bigru_num):
            input_feature = _bigru_layer(input_feature)

        emission = fluid.layers.fc(
            size=num_labels,
            input=input_feature,
            param_attr=_uniform_l2_attr())

        if target is not None:
            crf_cost = fluid.layers.linear_chain_crf(
                input=emission,
                label=target,
                param_attr=fluid.ParamAttr(
                    name='crfw', learning_rate=crf_lr))
            avg_cost = fluid.layers.mean(x=crf_cost)
            # Decoding refers to the parameter by the same name 'crfw' as
            # the CRF cost layer above.
            crf_decode = fluid.layers.crf_decoding(
                input=emission, param_attr=fluid.ParamAttr(name='crfw'))
            return avg_cost, crf_decode

        # No cost layer is built on this path, so the 'crfw' parameter is
        # created explicitly before crf_decoding refers to it by name.
        # NOTE(review): the [size + 2, size] shape presumably matches what
        # linear_chain_crf creates for its transition parameter - confirm
        # against the Paddle documentation.
        size = emission.shape[1]
        fluid.layers.create_parameter(
            shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))
        return crf_decode

    if for_infer:
        return _net_conf(word)
    # NOTE: target is not validated here; if it is None the decoding-only
    # branch is built and a single value (not a tuple) is returned.
    return _net_conf(word, target)
3.lod_level
fluid.data 参数中的 lod_level 指的是传入数据 LoD 的层数:如果为 1,表示一个 batch 是二维的;如果为 2,表示一个 batch 是三维的。