1-3 Text Data Modeling Workflow Example
Prepare the Data
The goal of the IMDB dataset is to predict the sentiment label of a movie review from its text.
The training set contains 20,000 movie review texts and the test set contains 5,000, with positive and negative reviews each making up half.
Preprocessing text data is fairly involved: it includes word segmentation for Chinese text (not needed in this example), building a vocabulary, encoding tokens as integers, padding sequences to a fixed length, constructing a data pipeline, and so on.
There are two common approaches to text preprocessing in TensorFlow. The first uses the Tokenizer vocabulary-building tool in tf.keras.preprocessing together with tf.keras.utils.Sequence to build a text data generator pipeline.
The second uses tf.data.Dataset together with the tf.keras.layers.experimental.preprocessing.TextVectorization preprocessing layer.
The first approach is more involved; a usage example can be found in the following article.
https://zhuanlan.zhihu.com/p/67697840
The second approach is TensorFlow-native and somewhat simpler.
It is the one we introduce here.
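For reference, the core of the first (Tokenizer-based) approach looks roughly like this (a minimal sketch on a made-up two-line corpus; the tf.keras.utils.Sequence generator part described in the linked article is omitted):
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

toy_texts = ['the movie was great', 'the movie was terrible']  # made-up corpus
tokenizer = Tokenizer(num_words=10000)               # build the word index
tokenizer.fit_on_texts(toy_texts)
toy_seqs = tokenizer.texts_to_sequences(toy_texts)   # encode words as integer ids
toy_padded = pad_sequences(toy_seqs, maxlen=200)     # pad/truncate to a fixed length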
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing, optimizers, losses, metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re, string
train_data_path = './data/imdb/train.csv'
test_data_path = './data/imdb/test.csv'

MAX_WORDS = 10000  # only consider the 10000 most frequent words
MAX_LEN = 200      # keep 200 words per sample
BATCH_SIZE = 20
# Build the data pipeline
def split_line(line):
    # each line of the file is '<label>\t<text>'
    arr = tf.strings.split(line, '\t')
    label = tf.expand_dims(tf.cast(tf.strings.to_number(arr[0]), tf.int32), axis=0)
    text = tf.expand_dims(arr[1], axis=0)
    return text, label
# num_parallel_calls parallelizes the map transformation to speed up preprocessing;
# tf.data.experimental.AUTOTUNE lets the runtime choose the optimal degree of parallelism automatically.
# prefetch pre-buffers a number of batches in the data queue to improve GPU utilization.
ds_train_raw = tf.data.TextLineDataset(filenames=[train_data_path]) \
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE) \
    .shuffle(buffer_size=1000).batch(BATCH_SIZE) \
    .prefetch(tf.data.experimental.AUTOTUNE)

ds_test_raw = tf.data.TextLineDataset(filenames=[test_data_path]) \
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE) \
    .shuffle(buffer_size=1000).batch(BATCH_SIZE) \
    .prefetch(tf.data.experimental.AUTOTUNE)
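As a quick optional check, one batch can be pulled from the raw pipeline; text is a string tensor of shape (20, 1) and label an int32 tensor of shape (20, 1):
for text, label in ds_train_raw.take(1):
    print(text.shape, label.shape)  # (20, 1) (20, 1)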
# Build the vocabulary
def clean_text(text):
    lowercase = tf.strings.lower(text)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    """
    re.escape(pattern) escapes every character in a string that could be
    interpreted as a regex operator. It is useful when a pattern is long and
    full of special characters you do not want to backslash by hand, or when
    the string comes from user input and has to be embedded in a regular expression.
    string.punctuation is the string of all ASCII punctuation characters.
    """
    cleaned_punctuation = tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
    return cleaned_punctuation
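A quick eager sanity check of clean_text on a made-up fragment (the sample string is an assumption for illustration):
print(clean_text(tf.constant('Great movie!<br />Loved it...')))
# tf.Tensor(b'great movie loved it', shape=(), dtype=string)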
vectorize_layer = TextVectorization(
    standardize=clean_text,
    split='whitespace',
    max_tokens=MAX_WORDS - 1,  # reserve one slot for the padding placeholder
    output_mode='int',
    output_sequence_length=MAX_LEN
)
ds_text = ds_train_raw.map(lambda text, label: text)
vectorize_layer.adapt(ds_text)
print(vectorize_layer.get_vocabulary()[0:100])
"""
['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'his', 'are', 'have', 'be', 'he', 'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so', 'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some', 'there', 'what', 'good', 'more', 'when', 'very', 'she', 'even', 'my', 'no', 'would', 'up', 'time', 'only', 'which', 'story', 'really', 'their', 'were', 'had', 'see', 'can', 'me', 'than', 'we', 'much', 'well', 'get', 'been', 'will', 'into', 'people', 'also', 'other', 'do', 'bad', 'because', 'great', 'first', 'how', 'him', 'most', 'dont', 'made', 'then', 'them', 'films', 'movies', 'way', 'make', 'could', 'too', 'any']
"""
# Encode words as integer ids
ds_train = ds_train_raw.map(lambda text, label: (vectorize_layer(text), label)) \
    .prefetch(tf.data.experimental.AUTOTUNE)
ds_test = ds_test_raw.map(lambda text, label: (vectorize_layer(text), label)) \
    .prefetch(tf.data.experimental.AUTOTUNE)
Define the Model
The Keras interface offers three ways to build a model: the Sequential API for layer-by-layer stacks, the functional API for arbitrary architectures, and subclassing the Model base class for fully custom models.
Here we subclass the Model base class to build a custom model.
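For comparison, the same network could be written with the Sequential API (a sketch mirroring the subclassed model below; not used in this example):
model_seq = models.Sequential([
    layers.Embedding(MAX_WORDS, 7, input_length=MAX_LEN),
    layers.Conv1D(16, kernel_size=5, activation='relu'),
    layers.MaxPool1D(),
    layers.Conv1D(128, kernel_size=2, activation='relu'),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])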
# A demonstration of a custom model; in practice, prefer the Sequential or functional API
tf.keras.backend.clear_session()
class CnnModel(models.Model):
    def __init__(self):
        super(CnnModel, self).__init__()

    def build(self, input_shape):
        self.embedding = layers.Embedding(MAX_WORDS, 7, input_length=MAX_LEN)
        self.conv_1 = layers.Conv1D(16, kernel_size=5, name='conv_1', activation='relu')
        self.pool = layers.MaxPool1D()
        self.conv_2 = layers.Conv1D(128, kernel_size=2, name='conv_2', activation='relu')
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(1, activation='sigmoid')
        super(CnnModel, self).build(input_shape)

    def call(self, x):
        x = self.embedding(x)
        x = self.conv_1(x)
        x = self.pool(x)
        x = self.conv_2(x)
        x = self.flatten(x)
        x = self.dense(x)
        return x
model = CnnModel()
model.build(input_shape=(None, MAX_LEN))
model.summary()
"""
Model: "cnn_model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) multiple 70000
conv_1 (Conv1D) multiple 576
max_pooling1d (MaxPooling1D) multiple 0
conv_2 (Conv1D) multiple 4224
flatten (Flatten) multiple 0
dense (Dense) multiple 12417
=================================================================
Total params: 87,217
Trainable params: 87,217
Non-trainable params: 0
_________________________________________________________________
"""
Train the Model
There are usually three ways to train a model: the built-in fit method, the built-in train_on_batch method, and a custom training loop. Here we train the model with a custom training loop.
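For comparison, the built-in fit approach would be just the following (a sketch, not executed here; it requires compiling the model first):
model.compile(optimizer=optimizers.Nadam(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])
model.fit(ds_train, validation_data=ds_test, epochs=6)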
# Print a timestamped divider line
@tf.function
def printbar():
    ts = tf.timestamp()
    today_ts = ts % (24*60*60)

    hour = tf.cast(today_ts//3600 + 8, tf.int32) % tf.constant(24)  # +8: UTC+8 timezone
    minute = tf.cast((today_ts%3600)//60, tf.int32)
    second = tf.cast(tf.floor(today_ts%60), tf.int32)

    def timeformat(m):
        # zero-pad single-digit values
        if tf.strings.length(tf.strings.format("{}", m)) == 1:
            return tf.strings.format("0{}", m)
        else:
            return tf.strings.format("{}", m)

    timestring = tf.strings.join([timeformat(hour), timeformat(minute),
                                  timeformat(second)], separator=':')
    tf.print("=========="*6, end="")
    tf.print(timestring)
optimizer = optimizers.Nadam()
loss_func = losses.BinaryCrossentropy()

train_loss = metrics.Mean(name='train_loss')
train_metric = metrics.BinaryAccuracy(name='train_accuracy')
valid_loss = metrics.Mean(name='valid_loss')
valid_metric = metrics.BinaryAccuracy(name='valid_accuracy')
@tf.function
def train_step(model, features, labels):
    with tf.GradientTape() as tape:
        predictions = model(features, training=True)
        loss = loss_func(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss.update_state(loss)
    train_metric.update_state(labels, predictions)

@tf.function
def valid_step(model, features, labels):
    predictions = model(features, training=False)
    batch_loss = loss_func(labels, predictions)
    valid_loss.update_state(batch_loss)
    valid_metric.update_state(labels, predictions)
def train_model(model, ds_train, ds_valid, epochs):
    for epoch in tf.range(1, epochs+1):

        for features, labels in ds_train:
            train_step(model, features, labels)

        for features, labels in ds_valid:
            valid_step(model, features, labels)

        # The logs template should be adjusted to match the metrics being tracked
        logs = 'Epoch={},Loss:{},Accuracy:{},Valid Loss:{},Valid Accuracy:{}'

        if epoch%1 == 0:  # log every epoch; raise 1 to log less often
            printbar()
            tf.print(tf.strings.format(logs,
                (epoch, train_loss.result(), train_metric.result(),
                 valid_loss.result(), valid_metric.result())))
            tf.print("")

        train_loss.reset_states()
        train_metric.reset_states()
        valid_loss.reset_states()
        valid_metric.reset_states()
train_model(model, ds_train, ds_test, epochs=6)
"""
============================================================17:17:01
Epoch=1,Loss:0.436266869,Accuracy:0.7774,Valid Loss:0.316734344,Valid Accuracy:0.866
============================================================17:17:06
Epoch=2,Loss:0.242585599,Accuracy:0.90305,Valid Loss:0.316047966,Valid Accuracy:0.8678
============================================================17:17:11
Epoch=3,Loss:0.170076191,Accuracy:0.93525,Valid Loss:0.364521772,Valid Accuracy:0.871
============================================================17:17:16
Epoch=4,Loss:0.112194523,Accuracy:0.961,Valid Loss:0.445208907,Valid Accuracy:0.8624
============================================================17:17:22
Epoch=5,Loss:0.0685834885,Accuracy:0.97625,Valid Loss:0.602242172,Valid Accuracy:0.8612
============================================================17:17:28
Epoch=6,Loss:0.0407535546,Accuracy:0.9865,Valid Loss:0.770922959,Valid Accuracy:0.8564
"""
Evaluate the Model
A model trained through a custom training loop has never been compiled, so the model.evaluate(ds_valid) method cannot be used on it directly.
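Alternatively, one could compile the already-trained model and then call evaluate (a sketch, not used here):
model.compile(loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])
model.evaluate(ds_test)
This example instead reuses valid_step in a small evaluation function: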
def evaluate_model(model, ds_valid):
    for features, labels in ds_valid:
        valid_step(model, features, labels)
    logs = 'Valid Loss:{}, Valid Accuracy:{}'
    tf.print(tf.strings.format(logs, (valid_loss.result(), valid_metric.result())))

    valid_loss.reset_states()
    valid_metric.reset_states()
evaluate_model(model, ds_test)
"""
Valid Loss:0.770922899, Valid Accuracy:0.8564
"""
Use the Model
Any of the following methods can be used to obtain predictions:
- model.predict(ds_test)
- model(x_test)
- model.call(x_test)
- model.predict_on_batch(x_test)
The model.predict(ds_test) method is recommended, since it can be applied to both a Dataset and a Tensor.
model.predict(ds_test)
"""
array([[3.5137840e-02],
[1.0000000e+00],
[1.4748545e-07],
...,
[6.2157035e-09],
[4.2759150e-01],
[9.9991405e-01]], dtype=float32)
"""
for x_test, _ in ds_test.take(1):
    print(model(x_test))
    # The following are equivalent:
    # print(model.call(x_test))
    # print(model.predict_on_batch(x_test))
"""
tf.Tensor(
[[1.0000000e+00]
[3.8479683e-01]
[3.8026985e-05]
[2.7880540e-01]
[9.9999988e-01]
[9.9999976e-01]
[1.0000000e+00]
[9.9999964e-01]
[4.3148658e-01]
[9.9726164e-01]
[5.0594736e-02]
[7.1867041e-02]
[1.4818518e-07]
[1.0000000e+00]
[9.6764386e-01]
[6.9687414e-01]
[1.0000000e+00]
[5.9486812e-01]
[3.8141350e-04]
[3.2319189e-05]], shape=(20, 1), dtype=float32)
"""
Save the Model
Saving the model in the TensorFlow-native SavedModel format is recommended.
model.save('./data/tf_model_savedmodel/', save_format='tf')
print('export saved model.')
model_loaded = tf.keras.models.load_model('./data/tf_model_savedmodel/')
model_loaded.predict(ds_test)
"""
INFO:tensorflow:Assets written to: ./data/tf_model_savedmodel/assets
export saved model.
WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. Compile it manually.
array([[1.3959842e-05],
[9.8147720e-01],
[9.9960679e-01],
...,
[2.6070832e-03],
[2.2976701e-01],
[9.9999046e-01]], dtype=float32)
"""