tf2.0-bert4keras BERT-of-Theseus Model Compression
Code debugged and verified under TF 2.0
#! -*- coding:utf-8 -*-
# Model compression for a text classification task
# Method: BERT-of-Theseus
# Paper: https://arxiv.org/abs/2002.02925
# Blog: https://kexue.fm/archives/7575

import json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Input, Lambda, Dense, Layer
from keras.models import Model

num_classes = 119
maxlen = 128
batch_size = 32

# BERT base
config_path = '/models/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/models/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/models/chinese_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    """Load data. One sample per line: (text, label id)."""
    D = []
    with open(filename) as f:
        for i, l in enumerate(f):
            l = json.loads(l)
            text, label = l['sentence'], l['label']
            D.append((text, int(label)))
    return D


# Load the datasets
train_data = load_data('/data/train.json')
valid_data = load_data('/data/dev.json')

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# Wrap the datasets
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)


class BinaryRandomChoice(Layer):
    """Randomly choose one of two inputs."""
    def __init__(self, **kwargs):
        super(BinaryRandomChoice, self).__init__(**kwargs)
        self.supports_masking = True

    def compute_mask(self, inputs, mask=None):
        if mask is not None:
            return mask[1]

    def call(self, inputs):
        source, target = inputs
        mask = K.random_binomial(shape=[1], p=0.5)
        output = mask * source + (1 - mask) * target
        # Random replacement during training; always the successor at inference
        return K.in_train_phase(output, target)

    def compute_output_shape(self, input_shape):
        return input_shape[1]


def bert_of_theseus(predecessor, successor, classfier):
    """BERT-of-Theseus."""
    inputs = predecessor.inputs
    # Freeze the already-trained layers
    for layer in predecessor.model.layers:
        layer._name = 'Predecessor-stable-' + layer.name
        layer.trainable = False
    classfier.trainable = False
    # Replace the embedding layer
    predecessor_outputs = predecessor.apply_embeddings(inputs)
    successor_outputs = successor.apply_embeddings(inputs)
    outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs])
    # Replace the transformer layers: each successor layer competes with a
    # module of layers_per_module predecessor layers
    layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers
    for index in range(successor.num_hidden_layers):
        predecessor_outputs = outputs
        for sub_index in range(layers_per_module):
            predecessor_outputs = predecessor.apply_main_layers(
                predecessor_outputs, layers_per_module * index + sub_index
            )
        successor_outputs = successor.apply_main_layers(outputs, index)
        outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs])
    # Build the final model
    outputs = classfier(outputs)
    model = Model(inputs, outputs)
    return model


def evaluate(data, model):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the best model."""
    def __init__(self, savename):
        self.best_val_acc = 0.
        self.savename = savename

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator, self.model)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.model.save_weights(self.savename)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f\n' %
            (val_acc, self.best_val_acc)
        )


# Load the pretrained model (12 layers)
predecessor = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    prefix='Predecessor-'
)
# predecessor.name = "Predecessor"

# Load the pretrained model (truncated to 3 layers)
successor = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    num_hidden_layers=3,
    prefix='Successor-'
)
# successor.name = "Successor"

# Classifier head, shared by predecessor and successor
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classfier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classfier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

successor_model = Model(successor.inputs, classfier(successor.output))
successor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
successor_model.summary()

theseus_model = bert_of_theseus(predecessor, successor, classfier)
theseus_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
theseus_model.summary()


if __name__ == '__main__':
    # Stage 1: train the predecessor
    predecessor_evaluator = Evaluator('best_predecessor.weights')
    predecessor_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=5,
        callbacks=[predecessor_evaluator]
    )

    # Stage 2: train theseus (random module replacement)
    theseus_evaluator = Evaluator('best_theseus.weights')
    theseus_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=10,
        callbacks=[theseus_evaluator]
    )
    theseus_model.load_weights('best_theseus.weights')

    # Stage 3: fine-tune the successor
    successor_evaluator = Evaluator('best_successor.weights')
    successor_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=5,
        callbacks=[successor_evaluator]
    )
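After the three training stages, the compressed successor can be used on its own. A minimal inference sketch (not part of the original script; the sample sentence is made up), assuming the objects defined above are in scope:

# Hedged sketch: load the best successor weights and classify one sentence.
successor_model.load_weights('best_successor.weights')
token_ids, segment_ids = tokenizer.encode(u'这是一条测试文本', max_length=maxlen)
probs = successor_model.predict([np.array([token_ids]), np.array([segment_ids])])
print(probs.argmax(axis=1)[0])  # predicted label id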
Reference: https://kexue.fm/archives/7575
Corrections, 2022-07-26:
1) When attaching external structures such as TextCNN or RNN on top of bert4keras, the code needs hand modification.
2) Conv1D, MaxPooling and the like must be handled separately (made mask-aware), or you need a package version where this is already supported.
The main changes are as follows:
# Additional imports required by the changes below
import random
import tensorflow as tf
from keras.layers import Dropout


def get_data(filename):
    res = []
    print(filename)
    with open(filename, 'r') as fr:
        for e in fr:
            item = e.split("\t")
            res.append((item[1].strip(), int(item[0])))
    return res


def train_test_split(data):
    data = get_data(data)
    random.shuffle(data)
    nums = int(len(data) * 0.05)
    return data[nums:], data[:nums]


# train_path / valid_path are defined elsewhere in the full script
train_data, valid_data = get_data(train_path), get_data(valid_path)
print(f"Training samples: {len(train_data)}, validation samples: {len(valid_data)}")

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

    def fortest(self, random=False):
        # Yield inputs only, for prediction
        for d in self.__iter__(random):
            yield d[0]


# Wrap the datasets
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)


class BinaryRandomChoice(Layer):
    """Randomly choose one of two inputs."""
    def __init__(self, **kwargs):
        super(BinaryRandomChoice, self).__init__(**kwargs)
        self.supports_masking = True

    def compute_mask(self, inputs, mask=None):
        if mask is not None:
            return mask[1]

    def call(self, inputs):
        source, target = inputs
        mask = K.random_bernoulli(shape=[1], p=0.5)
        output = mask * source + (1 - mask) * target
        return K.in_train_phase(output, target)

    def compute_output_shape(self, input_shape):
        return input_shape[1]


def bert_of_theseus(predecessor, successor, classfier):
    """BERT-of-Theseus."""
    inputs = predecessor.inputs
    # Freeze the already-trained layers
    for layer in predecessor.model.layers:
        layer._name = 'Predecessor-stable-' + layer.name
        layer.trainable = False
    classfier.trainable = False
    # Replace the embedding layer
    predecessor_outputs = predecessor.apply_embeddings(inputs)
    successor_outputs = successor.apply_embeddings(inputs)
    outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs])
    # Replace the transformer layers
    layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers
    for index in range(successor.num_hidden_layers):
        predecessor_outputs = outputs
        for sub_index in range(layers_per_module):
            predecessor_outputs = predecessor.apply_main_layers(
                predecessor_outputs, layers_per_module * index + sub_index
            )
        successor_outputs = successor.apply_main_layers(outputs, index)
        outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs])
    # Build the final model
    outputs = classfier(outputs)
    model = Model(inputs, outputs)
    return model


def evaluate(data, model):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the best model as a TF SavedModel."""
    def __init__(self, savename):
        self.best_val_acc = 0.
        self.savename = savename

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator, self.model)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            tf.saved_model.save(self.model, self.savename)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f\n' %
            (val_acc, self.best_val_acc)
        )


class MyConv1D(keras.layers.Conv1D):
    """Mask-aware Conv1D: zero out masked positions before convolving."""
    def __init__(self, *args, **kwargs):
        super(MyConv1D, self).__init__(*args, **kwargs)
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            inputs = inputs * mask[:, :, None]
        return super(MyConv1D, self).call(inputs)


class MyMaxPooling1D(keras.layers.MaxPooling1D):
    """Mask-aware MaxPooling1D: zero out masked positions before pooling."""
    def __init__(self, *args, **kwargs):
        super(MyMaxPooling1D, self).__init__(*args, **kwargs)
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            inputs = inputs * mask[:, :, None]
        return super(MyMaxPooling1D, self).call(inputs)


def textCnn(embed):
    # 1-D convolutions with three kernel sizes
    cnn1 = MyConv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
    cnn1 = MyMaxPooling1D(pool_size=48)(cnn1)
    cnn2 = MyConv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
    cnn2 = MyMaxPooling1D(pool_size=47)(cnn2)
    cnn3 = MyConv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
    cnn3 = MyMaxPooling1D(pool_size=46)(cnn3)
    cnn = Lambda(lambda x: K.concatenate(x, axis=-1))([cnn1, cnn2, cnn3])
    cnn_shape = K.int_shape(cnn)
    cnn = Lambda(lambda x: K.reshape(x, [-1, cnn_shape[1] * cnn_shape[2], 1]))(cnn)
    flat = Lambda(lambda x: K.squeeze(x, -1))(cnn)
    drop = Dropout(0.2)(flat)
    return drop


# Load the pretrained model (12 layers)
predecessor = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    prefix='Predecessor-',
    sequence_length=maxlen
)

# Load the pretrained model (truncated to 2 layers)
successor = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    num_hidden_layers=2,
    prefix='Successor-',
    sequence_length=maxlen
)

# Classifier head; the 768-dim encoder output is its input
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
# x = textCnn(x_in)
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax', name="out")(x)
classfier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classfier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
# predecessor_model.summary()

# The student (successor) gets an identical head
successor_model = Model(successor.inputs, classfier(successor.output))
successor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
# successor_model.summary()

theseus_model = bert_of_theseus(predecessor, successor, classfier)
theseus_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
# theseus_model.summary()


if __name__ == '__main__':
    # Train the predecessor
    predecessor_evaluator = Evaluator('keras/model/predecessor')
    predecessor_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=2,
        callbacks=[predecessor_evaluator]
    )

    # Train theseus
    theseus_evaluator = Evaluator('keras/model/theseus')
    theseus_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=3,
        callbacks=[theseus_evaluator]
    )

    # Train the successor
    successor_evaluator = Evaluator('keras/model/successor')
    successor_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=2,
        callbacks=[successor_evaluator]
    )
    print("training over!")
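Because the revised Evaluator exports a full TF SavedModel via tf.saved_model.save rather than raw weights, inference reloads it with tf.saved_model.load. A minimal sketch, assuming training has produced keras/model/successor; the serving-signature input key names depend on how the Keras input layers were named, so inspect them before calling:

import tensorflow as tf

# Load the exported SavedModel and grab the default serving signature
loaded = tf.saved_model.load('keras/model/successor')
infer = loaded.signatures['serving_default']

# Discover the actual input/output tensor names before calling infer(...)
print(infer.structured_input_signature)
print(infer.structured_outputs)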
Always keep in mind the person you want to become!