金融领域预训练模型用于分类任务,大模型应用参考
在 BERT 的基础上增加一个分类层:
代码实现:
# Classification head stacked on top of the BERT encoder.
sequence_output = bert.model.output

# Keep only the vector at position 0 of the sequence axis (the layer is
# named 'CLS-token' by the author).
cls_vector = Lambda(lambda x: x[:, 0], name='CLS-token')(sequence_output)

# Softmax layer producing one probability per class.
class_probs = Dense(
    units=num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer,
)(cls_vector)

# End-to-end model: BERT inputs -> class probabilities.
model = keras.models.Model(bert.model.input, class_probs)
然后就是利用bert的输出训练一个分类任务了!!!
完整代码如下:
#! -*- coding:utf-8 -*-
# FinWoBERT: a Chinese pre-trained language model enhanced for the
# financial domain.
'''
Ming Kang. Pretraining Language Models in Deep Learning: A Case Study of
Chinese Sentiment Classification for Financial Text. Beijing: Tsinghua
University Press, 2022.
'''

import os
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import jieba_fast as jieba

jieba.initialize()

num_classes = 3
maxlen = 512
batch_size = 32

# BERT configuration
# path = "/Users/sssdjj/bert_source/"
config_path = 'data/chinese_wobert_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'data/chinese_wobert_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'data/chinese_wobert_L-12_H-768_A-12/vocab.txt'

# Single location for the best checkpoint. The original saved to
# 'train/...' but loaded from the working directory, so the weights written
# during training were never the ones loaded on import.
best_weights_path = 'train/best_model_sentiment.weights'

# Label string (as it appears in the data files) -> class id.
# Roughly: "other" / "bullish" / "bearish".
labels = {"其他":0,"利多":1,"利空":2}

stop_words = []
# Optional stop-word loading (disabled by the author):
# with open("data/cn_stopwords.txt") as f:
#     for i in f:
#         stop_words.append(i.strip())


def load_data(filename):
    """Load a labelled data file.

    Each usable line has the form ``label|||text``. Lines that do not split
    into exactly two fields on ``|||`` are skipped.

    Args:
        filename: path of the UTF-8 encoded data file.

    Returns:
        A list of ``(text, label_id)`` tuples.

    Raises:
        KeyError: if a line carries a label that is not a key of ``labels``.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            # Split once and reuse the result instead of splitting twice.
            fields = l.strip().split('|||')
            if len(fields) != 2:
                continue
            label, text = fields
            # Optional stop-word removal (disabled by the author):
            # for i in stop_words:
            #     text = str(text).replace(i, " ")
            D.append((text, labels[label]))
    return D


path = "data/"

# Load the datasets.
train_data = load_data(path+'train.txt')
valid_data = load_data(path+'test.txt')

# Register the custom dictionary with jieba (the author's note: word.txt is
# the base word list, word_zhengf.txt adds positive/negative words).
jieba.load_userdict(path+"word_zhengf_buzai_vocab.txt")

# Build the tokenizer; text is pre-segmented with jieba before BERT
# tokenization.
tokenizer = Tokenizer(
    dict_path,
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False)
)


class data_generator(DataGenerator):
    """Batch generator yielding ``([token_ids, segment_ids], labels)``."""

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            # Emit a batch when it is full or the data is exhausted.
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# Token-id sequences for every entry of the custom dictionary, with the
# first and last ids of each encoding stripped; passed to the model builder
# as compound tokens.
ctokens = []
# Read as UTF-8 explicitly: jieba.load_userdict reads this same file as
# UTF-8, and relying on the locale default can break on other platforms.
with open(path+"word_zhengf_buzai_vocab.txt", encoding='utf-8') as f:
    for i in f:
        ctokens.append(tokenizer.encode(i.strip())[0][1:-1])

bert = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False,
    compound_tokens=ctokens
)

# Classification head: take position 0 of the encoder output and feed it to
# a softmax layer with one unit per class.
output = bert.model.output
output = Lambda(lambda x: x[:, 0], name='CLS-token')(output)
output = Dense(
    units=num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer
)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()

# AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-6),  # use a sufficiently small learning rate
    metrics=['accuracy'],
)

# Wrap the datasets in batch generators.
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)


def norm_index(y_true, y_pred):
    """Compute the evaluation metrics for a set of predictions.

    Args:
        y_true: sequence of true class ids.
        y_pred: sequence of predicted class ids.

    Returns:
        The tuple ``(acc, macro_prec, micro_prec, macro_recall,
        micro_recall, macro_f1, micro_f1, cm)`` where ``cm`` is the
        confusion matrix.
    """
    acc = accuracy_score(y_true, y_pred)
    macro_prec = precision_score(y_true, y_pred, average='macro')
    micro_prec = precision_score(y_true, y_pred, average='micro')
    macro_recall = recall_score(y_true, y_pred, average='macro')
    micro_recall = recall_score(y_true, y_pred, average='micro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    cm = confusion_matrix(y_true, y_pred)
    return (acc, macro_prec, micro_prec, macro_recall, micro_recall,
            macro_f1, micro_f1, cm)


def evaluate(data):
    """Run the model over ``data`` and return the metrics of ``norm_index``.

    Args:
        data: iterable of ``(inputs, labels)`` batches, e.g. a
            ``data_generator`` instance.
    """
    pred_list, true_list = [], []
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        # Labels are batched as one-element rows; take the first column.
        y_true = y_true[:, 0]
        pred_list.extend(y_pred)
        true_list.extend(y_true)
    return norm_index(true_list, pred_list)


class Evaluator(keras.callbacks.Callback):
    """Evaluate on the validation set after each epoch, keep the best weights."""

    def __init__(self):
        # Initialise the base Callback state as well.
        super(Evaluator, self).__init__()
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        (val_acc, macro_prec, micro_prec, macro_recall, micro_recall,
         macro_f1, micro_f1, cm) = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # Make sure the target directory exists before the first save.
            weights_dir = os.path.dirname(best_weights_path)
            if weights_dir and not os.path.isdir(weights_dir):
                os.makedirs(weights_dir)
            model.save_weights(best_weights_path)
        print(
            u'val_acc: %.15f, best_val_acc: %.15f,loss:%s\n' %
            (val_acc, self.best_val_acc, logs)
        )
        print(
            u'macro_prec: %.15f, micro_prec: %.15f\n' %
            (macro_prec, micro_prec)
        )
        print(
            u'macro_recall: %.15f, micro_recall: %.15f\n' %
            (macro_recall, micro_recall)
        )
        print(
            u'macro_f1: %.15f, micro_f1: %.15f\n' %
            (macro_f1, micro_f1)
        )
        print(cm)


if __name__ == '__main__':

    evaluator = Evaluator()

    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=100,
        callbacks=[evaluator]
    )

else:
    # When imported as a module, load the checkpoint written by training.
    model.load_weights(best_weights_path)
为了提升金融领域大模型的效果,还可以在预训练阶段加入金融领域特有的语料库:
关键技术:
灾难性遗忘
几个文章可以深入阅读下:
Yuqing Zhao, Divya Saxena, Jiannong Cao. Revisiting Parameter Reuse to Overcome Catastrophic Forgetting in Neural Networks. arXiv:2207.11005v1 [cs.LG], 2022.
Matteo Boschini, Lorenzo Bonicelli, Angelo Porrello, et al. Transfer without Forgetting // Computer Vision – ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23–27, 2022, Proceedings, Part I. Cham: Springer, 2022.
Yabin Wang, Zhiwu Huang, Xiaopeng Hong. S-Prompts Learning with Pre-trained Transformers: An Occam's Razor for Domain Incremental Learning. arXiv:2207.12819v1 [cs.CV], 2022.
另外,为了增强可解释性,预训练的语料库要和分类任务保持一致。
最后,为了增强模型的健壮性,还可以加入对抗训练(基于 FGSM 等方法生成的对抗样本,而非生成对抗网络 GAN):
Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy. Explaining and Harnessing Adversarial Examples. arXiv:1412.6572v3 [stat.ML], 2015.
Christian Szegedy, Wojciech Zaremba, Ilya Sutskever, et al. Intriguing Properties of Neural Networks. arXiv:1312.6199v4 [cs.CV], 2014.
TensorFlow. Adversarial example using FGSM. https://tensorflow.google.cn/tutorials/generative/adversarial_fgsm, 2021.
Nathan Inkawhich. Adversarial Example Generation. https://pytorch.org/tutorials/beginner/fgsm_tutorial.html, 2021.
样本生成用的是该文章的方法:
另外,为了防止过拟合,在输出层可以加入L1正则化!