TensorFlow: Writing Tang Poetry with an LSTM Neural Network
Recently I read quite a few blog posts about poem generation and made some small changes on top of that earlier work. Since I like to feed in a long opening phrase in one go, I condensed the robot's output so that each opening character generates one couplet (two lines), in either five-character or seven-character form; if you want longer poems, the code is easy to modify further.
I also changed the input handling to strip any punctuation typed in by mistake (a short standalone sketch of this cleanup step follows the sample output below). For example, given the input:
怒发冲冠,凭栏处,潇潇雨歇。抬望眼,仰天长啸,壮怀激烈。
the robot writes the following:
怒漠多无度袍小,巡管山明恰见偷。
发杵共鸿莼散暮,家山曾住上阳台。
冲钩麻衣隐步障,楼舟复别赤轮楼。
冠盘一线倾中令,音信长思两足阴。
凭栏十字送月沈,莫待长筵韦与兵。
栏湿地闲鱼脚吠,匣中虚伴虎前羊。
处向梅香千万里,石城心中明碧簟。
潇湘夜塘独思不,几处深笼不尽迟。
潇湘十二海云天,五月今留报主人。
雨后青青沧练过,月明南渡雁来新。
歇鞭千里知贤哭,清镜草浮麋没繁。
抬吾谢安犹带减,醉来赤此不才齐。
望台已待隋家咏,楼上不能犹扫成。
眼看药炉香岭上,比惟无言旧青春。
仰归安得衰无事,三笑出师湘水春。
天初碧玉衣襟淡,国药满川霞彩寒。
长叹榆关家已远,从来父子屈襄郎。
啸花青石速望尽,宛逐汀洲随并年。
壮时天下还如旧,生计孤吟去杀无。
怀哉却寄终拘束,莫道人来有也才。
激眼剑旗喧并髻,新菰麦落破门骄。
烈灵不识槛西间,芳草青天有五禽。
怒搜温液切,若近太阳香。
发欲奔宾影,争得频人怜。
冲腾临缺曙,谢豹出红残。
冠盖若移在,想得绛皇皇。
凭高晋家雨,才不寄黄金。
栏落临巨浸,根孔恨仙桃。
处世愿越游,飘扬共行之。
潇湘南北洞,蜀国湘江湄。
潇湘弦管绝,上月洞庭时。
雨历道中朔,不起列仙风。
歇毂须江道,家歌住忽依。
抬山弄弓寞,装束岛霞裙。
望闻拜天子,幸有窦金狐。
眼看尽无些,香雨不兴天。
仰阮子不笔,不敢相思逸。
天与十二胆,彩笔取时七。
长安陇波归,银没乌方地。
啸作胡人船,大作康庄匠。
壮何时七两,隐括为匡庐。
怀君青纪肥,常带牺胆圣。
激逐鸱子子,人来儆此处。
烈太仓毛黄,家长受德吉。
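The punctuation cleanup mentioned above is a single re.sub call. Here is a minimal standalone sketch, using the same regex that appears later in main.py:

import re

# same character class used in main.py: ASCII letters/digits/punctuation
# plus full-width Chinese punctuation
r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'

begin_word = '怒发冲冠,凭栏处,潇潇雨歇。抬望眼,仰天长啸,壮怀激烈。'
print(re.sub(r1, '', begin_word))
# -> 怒发冲冠凭栏处潇潇雨歇抬望眼仰天长啸壮怀激烈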
The code is as follows:
main.py
import collections
import os
import sys
import re
import numpy as np
import tensorflow as tf
from model import rnn_model
from poems import process_poems, generate_batch

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

tf.flags.DEFINE_integer('batch_size', 64, 'batch size.')
tf.flags.DEFINE_float('learning_rate', 0.01, 'learning rate.')
# set this to 'main.py' relative path
tf.flags.DEFINE_string('checkpoints_dir', './checkpoints/', 'checkpoints save path.')
tf.flags.DEFINE_string('file_path', './data/poetry.txt', 'file name of poems.')
tf.flags.DEFINE_integer('epochs', 50, 'train how many epochs.')

FLAGS = tf.flags.FLAGS

start_token = 'G'
end_token = 'E'


# training
def run_training():
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)

    # poems as integer vectors, the character-to-id dict, and the vocabulary
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    # inputs and targets
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    # data placeholders
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=FLAGS.batch_size, learning_rate=FLAGS.learning_rate)

    # saver for checkpointing
    saver = tf.train.Saver(tf.global_variables())
    # initialize all global and local variables
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)

        start_epoch = 0
        # resume from an earlier checkpoint if one exists
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("[INFO] restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('[INFO] start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                # number of batches = number of poems // batch size
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    loss, _, _ = sess.run([
                        end_points['total_loss'],  # loss
                        end_points['last_state'],  # final state
                        end_points['train_op']     # optimizer step
                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
                    n += 1
                    print('[INFO] Epoch: %d , batch: %d , training loss: %.6f' % (epoch, batch, loss))
                if epoch % 6 == 0:
                    # save a checkpoint every 6 epochs
                    saver.save(sess, FLAGS.checkpoints_dir, global_step=epoch)
        except KeyboardInterrupt:
            print('[INFO] Interrupt manually, try saving checkpoint for now...')
            saver.save(sess, FLAGS.checkpoints_dir, global_step=epoch)
            print('[INFO] Last epoch were saved, next time will start from epoch {}.'.format(epoch))


def to_word(predict, vocabs):
    # sample an index from the predicted distribution via its cumulative sum
    t = np.cumsum(predict)
    s = np.sum(predict)
    sample = int(np.searchsorted(t, np.random.rand(1) * s))
    # sample = np.argmax(predict)
    if sample >= len(vocabs):
        sample = len(vocabs) - 1
    return vocabs[sample]


# generate poems with the trained model
def gen_poem(begin_words, num):
    batch_size = 1
    print('[INFO] loading corpus from %s' % FLAGS.file_path)
    poems_vector, word_int_map, vocabularies = process_poems(FLAGS.file_path)

    # a single character is fed at a time during generation
    input_data = tf.placeholder(tf.int32, [batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=None,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=64, learning_rate=FLAGS.learning_rate)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)

        # restore the latest trained checkpoint
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        saver.restore(sess, checkpoint)

        # feed the start token to obtain an initial prediction and state
        x = np.array([list(map(word_int_map.get, start_token))])
        [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                         feed_dict={input_data: x})

        poem = ''
        for begin_word in begin_words:
            while True:
                if begin_word:
                    word = begin_word
                else:
                    word = to_word(predict, vocabularies)
                sentence = ''
                while word != end_token:
                    sentence += word
                    x = np.zeros((1, 1))
                    x[0, 0] = word_int_map[word]
                    [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                                     feed_dict={input_data: x,
                                                                end_points['initial_state']: last_state})
                    word = to_word(predict, vocabularies)
                # accept only a well-formed couplet: num characters, a comma,
                # num characters, a full stop, and no stray punctuation or '□'
                if (len(sentence) == 2 + 2 * num
                        and ',' not in sentence[:num] and '?' not in sentence[:num]
                        and ',' not in sentence[num + 1:-1] and '?' not in sentence[num + 1:-1]
                        and sentence[num] == ',' and '□' not in sentence):
                    poem += sentence
                    break
                else:
                    print("我正在写诗呢")  # still composing, try again
        return poem


# print the generated poem in classical couplet form,
# which also makes it easy to feed into an application
def pretty_print_poem(poem):
    poem_sentences = poem.split('。')
    for s in poem_sentences:
        if s != '' and len(s) > 10:
            print(s + '。')


def main():
    if len(sys.argv) == 2:
        if sys.argv[1] == '1':
            print('[INFO] train tang poem...')
            run_training()
        elif sys.argv[1] == '2':
            num = int(input("请输入训练诗句(5:五言,7:七言):"))
            if num == 5 or num == 7:
                print('[INFO] write tang poem...')
                begin_word = input('开始作诗,请输入起始字:')
                if len(begin_word) == 0:
                    print("请输入词句")
                    return
                # strip punctuation and other non-poem characters from the input
                r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
                begin_word = re.sub(r1, '', begin_word)
                poem2 = gen_poem(begin_word, num)
                pretty_print_poem(poem2)
            else:
                print('输入有误')
        else:
            print('a', sys.argv[1])
            print("请按照以下方式执行:")
            print("python xxxx.py 1(1:训练,2:写诗)")
    else:
        print(len(sys.argv))
        print("请按照以下方式执行:")
        print("python xxxx.py 1(1:训练,2:写诗)")


if __name__ == '__main__':
    main()
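Note that to_word does weighted sampling rather than a plain argmax: it builds the cumulative distribution of the softmax output and uses np.searchsorted to find where a uniform random draw lands. A minimal self-contained sketch of the same idea on a toy distribution:

import numpy as np

vocabs = ('春', '月', '风', '花', ' ')
predict = np.array([0.5, 0.2, 0.15, 0.1, 0.05])  # toy softmax output

t = np.cumsum(predict)                 # [0.5, 0.7, 0.85, 0.95, 1.0]
s = np.sum(predict)
sample = int(np.searchsorted(t, np.random.rand(1) * s))
if sample >= len(vocabs):              # guard against falling off the end
    sample = len(vocabs) - 1
print(vocabs[sample])                  # '春' about half the time

This randomness is why the retry loop in gen_poem works: rejected couplets are simply resampled until one passes the format check.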
Converting poems and characters into one-to-one corresponding numbers (poems.py):
def process_poems(file_name):
    # the poem collection
    poems = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                title, content = line.strip().split(':')
                content = content.replace(' ', '')
                if '_' in content or '(' in content or '《' in content or '[' in content or \
                        start_token in content or end_token in content:
                    continue
                if len(content) < 5 or len(content) > 79:
                    continue
                content = start_token + content + end_token
                poems.append(content)
            except ValueError as e:
                pass
    # sort the poems by length
    poems = sorted(poems, key=lambda l: len(l))
    # count how often each character occurs
    all_words = []
    for poem in poems:
        all_words += [word for word in poem]
    # Counter maps each character to its frequency, e.g. Counter({'我': 3, '你': 2, '他': 1})
    counter = collections.Counter(all_words)
    # items() as a list of tuples sorted by frequency, e.g. [('他', 1), ('你', 2), ('我', 3)]
    count_pairs = sorted(counter.items(), key=lambda x: x[-1])
    # ('他', '你', '我'), (1, 2, 3)
    words, _ = zip(*count_pairs)
    # keep every character and append a space as the padding character
    words = words[:len(words)] + (' ',)
    # map each character to an integer ID, e.g. {'他': 0, '你': 1, '我': 2}
    word_int_map = dict(zip(words, range(len(words))))
    # turn each poem into its list of integer IDs;
    # characters without an ID fall back to len(words)
    poems_vector = [list(map(lambda word: word_int_map.get(word, len(words)), poem)) for poem in poems]
    return poems_vector, word_int_map, words
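A quick way to see what process_poems returns is to run the same Counter/zip steps on a tiny corpus. The two poems below are a made-up example, not part of the original code:

import collections

poems = ['G床前明月光E', 'G明月出天山E']
all_words = [w for poem in poems for w in poem]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: x[-1])
words, _ = zip(*count_pairs)
words = words + (' ',)                      # space as the padding character
word_int_map = dict(zip(words, range(len(words))))
poems_vector = [[word_int_map.get(w, len(words)) for w in poem] for poem in poems]
print(word_int_map)   # every distinct character gets one integer ID
print(poems_vector)   # each poem becomes a list of those IDs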
Building the training inputs and their target values:
def generate_batch(batch_size, poems_vec, word_to_int):
    # train on batch_size (64) poems at a time:
    # compute how many full batches there are
    n_chunk = len(poems_vec) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        batches = poems_vec[start_index:end_index]
        # length of the longest poem in this batch
        length = max(map(len, batches))
        # fill a (batch_size, length) matrix with the padding character (space)
        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)
        for row in range(batch_size):
            # each row is one poem, copied over the padding
            x_data[row, :len(batches[row])] = batches[row]
        y_data = np.copy(x_data)
        # y is x shifted one position to the left
        y_data[:, :-1] = x_data[:, 1:]
        """
        x_data             y_data
        [6,2,4,6,9]        [2,4,6,9,9]
        [1,4,2,8,5]        [4,2,8,5,5]
        """
        x_batches.append(x_data)
        y_batches.append(y_data)
    return x_batches, y_batches
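The shift illustrated in the comment above can be checked directly on a toy batch; for every position, y holds the character that should come next:

import numpy as np

x_data = np.array([[6, 2, 4, 6, 9],
                   [1, 4, 2, 8, 5]], dtype=np.int32)
y_data = np.copy(x_data)
y_data[:, :-1] = x_data[:, 1:]   # shift left; the last column keeps its value
print(y_data)
# [[2 4 6 9 9]
#  [4 2 8 5 5]]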
Defining the training model (model.py):
# -*- coding: utf-8 -*-
import os
import tensorflow as tf
import numpy as np

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress most TF log messages


def rnn_model(model, input_data, output_data, vocab_size, rnn_size=128, num_layers=2, batch_size=64,
              learning_rate=0.01):
    """
    Construct an RNN sequence-to-sequence model.
    :param model: model class ('rnn', 'gru' or 'lstm')
    :param input_data: input data placeholder
    :param output_data: output data placeholder
    :param vocab_size: vocabulary size
    :param rnn_size: size of one RNN cell
    :param num_layers: number of stacked RNN layers
    :param batch_size: batch size
    :param learning_rate: learning rate
    :return: dict of the model's endpoint tensors
    """
    end_points = {}

    def rnn_cell():
        if model == 'rnn':
            cell_fun = tf.contrib.rnn.BasicRNNCell
        elif model == 'gru':
            cell_fun = tf.contrib.rnn.GRUCell
        elif model == 'lstm':
            # the basic LSTM cell
            cell_fun = tf.contrib.rnn.BasicLSTMCell
        # state_is_tuple=True (the default) returns the cell state C
        # and the hidden output H as a tuple
        cell = cell_fun(rnn_size, state_is_tuple=True)
        return cell

    # stack num_layers basic cells
    cell = tf.contrib.rnn.MultiRNNCell([rnn_cell() for _ in range(num_layers)], state_is_tuple=True)
    # cell = tf.contrib.rnn.MultiRNNCell([rnn_cell()] * num_layers, state_is_tuple=True)

    if output_data is not None:
        # training: zero initial state for a full batch
        initial_state = cell.zero_state(batch_size, tf.float32)
    else:
        # generation: batch of one
        initial_state = cell.zero_state(1, tf.float32)

    # run the embedding lookup on the CPU
    with tf.device("/cpu:0"):
        # embed each character ID into an rnn_size-dimensional vector in [-1, 1]
        embedding = tf.get_variable('embedding', initializer=tf.random_uniform(
            [vocab_size + 1, rnn_size], -1.0, 1.0))
        # embedding = tf.Variable(tf.random_uniform([vocab_size + 1, rnn_size], -1.0, 1.0))
        # look up only the characters that actually occur in the input
        # [batch_size, ?, rnn_size] = [64, ?, 128]
        inputs = tf.nn.embedding_lookup(embedding, input_data)

    # outputs at every step, plus the final state
    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)
    # flatten to rows of rnn_size hidden features
    output = tf.reshape(outputs, [-1, rnn_size])

    # project the rnn_size-dimensional hidden state onto the vocabulary
    weights = tf.Variable(tf.truncated_normal([rnn_size, vocab_size + 1]))
    bias = tf.Variable(tf.zeros(shape=[vocab_size + 1]))
    # add the bias to every row to get the predictions
    logits = tf.nn.bias_add(tf.matmul(output, weights), bias=bias)  # [?, vocab_size+1]

    if output_data is not None:
        # output_data must be one-hot encoded; depth is vocab_size + 1
        labels = tf.one_hot(tf.reshape(output_data, [-1]), depth=vocab_size + 1)  # [?, vocab_size+1]
        # cross-entropy between targets and predictions
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        # average the loss
        total_loss = tf.reduce_mean(loss)
        # training op that minimizes the loss
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

        end_points['initial_state'] = initial_state
        end_points['output'] = output
        end_points['train_op'] = train_op
        end_points['total_loss'] = total_loss
        end_points['loss'] = loss
        end_points['last_state'] = last_state
    else:
        # generation: softmax over the vocabulary
        prediction = tf.nn.softmax(logits)

        end_points['initial_state'] = initial_state
        end_points['last_state'] = last_state
        end_points['prediction'] = prediction

    return end_points
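Under TensorFlow 1.x you can smoke-test this graph on random data before training on the real corpus. This is a hypothetical sanity check with made-up sizes, assuming the file above is saved as model.py:

import numpy as np
import tensorflow as tf
from model import rnn_model

vocab_size, batch_size, seq_len = 100, 64, 10   # made-up sizes
input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])
end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                       vocab_size=vocab_size, rnn_size=128, num_layers=2,
                       batch_size=batch_size, learning_rate=0.01)

x = np.random.randint(0, vocab_size, (batch_size, seq_len))
y = np.random.randint(0, vocab_size, (batch_size, seq_len))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss, _ = sess.run([end_points['total_loss'], end_points['train_op']],
                       feed_dict={input_data: x, output_targets: y})
    # untrained loss should be roughly log(vocab_size + 1)
    print('one training step, loss = %.4f' % loss)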
Training took roughly half a day on a CPU; I stopped once the loss had dropped to between 2 and 3. The corpus mixes some Song ci in with the Tang poems, so the model occasionally produces ci-style lines; whenever that happens, the line is filtered out and the robot writes it again.
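That filtering is exactly what the length-and-comma acceptance test inside gen_poem accomplishes, since ci lines have irregular lengths. Pulled out as a standalone helper (is_valid_sentence is a hypothetical name, not in the original code), it reads:

def is_valid_sentence(sentence, num):
    """Accept only a couplet of exactly 2*num characters plus a comma in
    the middle and a full stop at the end, with no stray punctuation or
    unknown characters ('□') anywhere else."""
    return (len(sentence) == 2 + 2 * num
            and ',' not in sentence[:num] and '?' not in sentence[:num]
            and ',' not in sentence[num + 1:-1] and '?' not in sentence[num + 1:-1]
            and sentence[num] == ','
            and '□' not in sentence)

print(is_valid_sentence('怒漠多无度袍小,巡管山明恰见偷。', 7))   # True
print(is_valid_sentence('怒搜温液切,若近太阳香。', 5))           # True

Here is another seven-character sample generated from the same opening phrase after this filtering: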
怒浩叛奴风入襄,相思犹记浇秋木。
发夜在关江伯均,屏帏倾酒醺醺时。
冲棱出与征轮钓,北阙朱门注胜囚。
冠子弄衣诗句苦,雁归羊祜得行戎。
凭仗眼巡分一已,拂眉下枕风吹鬓。
栏畔秋风听别情,无将日月悲卮花。
处飞更醉紫骝出,莫怕黄芽头孤酒。
潇湘似人风共恶,月影湿金横向波。
潇湘湘水涵红团,拖蜡桐花过酒楼。
雨丰寒鸟没疏角,雨过嵩门逐几层。
歇山地在秦栖狎,借问纯山色色微。
抬普高花量且白,也遭纱帽却离歌。
望华衔泥不复相,时时避缴如红兰。
眼眼南天南岳畔,十年章岸一何当。
仰头忽要天台绿,九没街边雪里临。
天下别容兵起力,怀王过写复何人。
长江九泽分明月,应逐征帆转汉阳。
啸骨立亡何所赠,别时身去坦云中。
壮时烟雨分雪发,战马翩翩一千尺。
怀乡生抛怀甲得,终竟颠应年子人。
激尔饷于无一事,岛层明月上高台。
烈士不能还续竹,应经此去虽迷定。