深度学习:文本预测
代码:
# coding=utf-8
"""Character-level text generation with stacked (bidirectional) LSTMs.

The script reads a plain-text corpus, builds (previous ``sent_len``
characters -> next character) training pairs, trains a Keras model (or loads
a previously saved one) and then samples 100 characters of continuation for a
few seed strings.
"""
import os
import sys
import re
import random

import numpy as np

# Keras is imported lazily inside the functions that need it, so that the
# pure-NumPy helpers (e.g. ``sample``) can be imported without TensorFlow.

DATA_PATH = r'data/sanguoyanyi.txt'
MODEL_PATH = 'models/wenben_yuce_lstm.h5'


def sample(preds, gamma=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        gamma: Temperature. Values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        The sampled index (result of ``np.argmax`` over a single multinomial
        draw).

    Raises:
        ValueError: If ``gamma`` is not positive.
    """
    if gamma <= 0:
        raise ValueError('gamma must be positive')
    # Work on a fresh float64 array: the caller's buffer must not be modified,
    # and float64 keeps the normalised sum close enough to 1 for multinomial().
    scaled = np.asarray(preds, dtype=np.float64) ** (1.0 / gamma)
    scaled /= np.sum(scaled)
    # One draw from the multinomial distribution -> one-hot row.
    probas = np.random.multinomial(1, scaled, 1)
    return np.argmax(probas)


def getyucetext(generated_text, sent_len, N, char_indices, char_set, model,
                gamma=1.0, length=100):
    """Generate a continuation for ``generated_text`` one character at a time.

    Args:
        generated_text: Seed string. Only its last ``sent_len`` characters are
            used as the initial context window.
        sent_len: Number of context characters the model was trained on.
        N: Vocabulary size (length of each one-hot vector).
        char_indices: Mapping character -> vocabulary index.
        char_set: Sequence mapping vocabulary index -> character.
        model: Object with a Keras-style ``predict(x, verbose=0)`` method that
            returns one probability vector per input row.
        gamma: Sampling temperature forwarded to ``sample``.
        length: Number of characters to generate.

    Returns:
        The generated characters joined into one string (seed not included).

    Raises:
        KeyError: If the context contains a character missing from
            ``char_indices``.
    """
    # A seed longer than the window would index past the time axis below.
    generated_text = generated_text[-sent_len:]
    next_chars = []
    for _ in range(length):
        # Leading dimension of 1: a batch holding a single sample.
        sampled = np.zeros((1, sent_len, N))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1
        # One probability per vocabulary entry.
        preds = model.predict(sampled, verbose=0)[0]
        preds = preds.astype(float)
        next_index = sample(preds, gamma)
        next_char = char_set[next_index]
        # Slide the context window forward by one character.
        generated_text = (generated_text + next_char)[1:]
        next_chars.append(next_char)
    return ''.join(next_chars)


def _read_corpus(path):
    """Return the whole UTF-8 text file at ``path`` as one string."""
    with open(path, 'r', encoding='utf-8') as fp:
        return fp.read()


def _build_samples(text, sent_len, step):
    """Slide a window over ``text``; return (context strings, next characters)."""
    starts = range(0, len(text) - sent_len, step)
    sentences = [text[i:i + sent_len] for i in starts]
    next_chars = [text[i + sent_len] for i in starts]
    return sentences, next_chars


def _vectorize(sentences, next_chars, char_indices, N):
    """One-hot encode the contexts (x) and their target characters (y)."""
    sent_len = len(sentences[0]) if sentences else 0
    x = np.zeros((len(sentences), sent_len, N), dtype=bool)
    y = np.zeros((len(sentences), N), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return x, y


def _build_model(sent_len, N):
    """Build and compile the LSTM stack that predicts the next character."""
    from keras.models import Sequential
    from keras.layers import Dense, LSTM, Bidirectional
    from keras.optimizers import Adam

    model = Sequential(name='text_generation')
    # Input: sent_len time steps, each an N-dimensional one-hot vector;
    # the first LSTM has 300 units and keeps the full sequence.
    model.add(LSTM(300, return_sequences=True, input_shape=(sent_len, N)))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    # Softmax over the N characters of the vocabulary.
    model.add(Dense(N, activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01))
    return model


def main():
    """Train (or load) the model and print generated text for sample seeds."""
    from keras.models import load_model

    text = _read_corpus(DATA_PATH)
    print('语料总长度:', len(text))

    sent_len = 4  # the previous 4 characters predict the 5th
    step = 1      # stride of the sliding window over the corpus
    sentences, next_chars = _build_samples(text, sent_len, step)
    sentenceslen = len(sentences)
    print('训练样本数目:', sentenceslen)

    # Sorted vocabulary gives every character a stable index.
    char_set = sorted(set(text))
    N = len(char_set)
    print('字典中字的个数:', N)
    char_indices = {char: index for index, char in enumerate(char_set)}

    model_path = MODEL_PATH
    if os.path.exists(model_path):
        model = load_model(model_path)
    else:
        # The dense one-hot tensors are large (samples x sent_len x N
        # booleans), so they are only built when training is really needed.
        x, y = _vectorize(sentences, next_chars, char_indices, N)
        split = int(len(x) * 0.95)  # first 95% for training, rest for validation
        x_data, x_test_data = x[:split], x[split:]
        y_data, y_test_data = y[:split], y[split:]

        model = _build_model(sent_len, N)
        model.fit(x_data, y_data, batch_size=128, epochs=10,
                  validation_data=(x_test_data, y_test_data))
        # model.save() does not create missing parent directories.
        os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)
        model.save(model_path)
    model.summary()

    generated_texts = ['诸葛村夫', '安敢望此', '一将飞出', '背义之徒', '你大爷的']
    for generated_text in generated_texts:
        print('输入值:%s' % generated_text)
        yucetext = getyucetext(generated_text, sent_len, N, char_indices, char_set, model)
        print('预测值:%s' % yucetext)
        print()


if __name__ == '__main__':
    main()
说明:data/sanguoyanyi.txt 是《三国演义》的纯文本文件(UTF-8 编码)。
执行输出:
语料总长度: 603705
训练样本数目: 603701
字典中字的个数: 3843
Model: "text_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 lstm (LSTM)                 (None, 4, 300)            4972800
 bidirectional (Bidirectiona (None, 4, 128)            186880
 l)
 bidirectional_1 (Bidirectio (None, 64)                41216
 nal)
 dense (Dense)               (None, 3843)              249795
=================================================================
Total params: 5,450,691
Trainable params: 5,450,691
Non-trainable params: 0
_________________________________________________________________
输入值:诸葛村夫
预测值:里,用将鹤威瘤矣,故其成州,何显伐何?”昭曰:“令然引相扶看,闻被袁瓒按两州。孙宫暗见迷间。曹德好到也,足期于中,曜休前意!”默笑,图教而去曰:“主面有此幕,不欲而走。布同、赵咨回柩,则恨骂姬女两处,

输入值:安敢望此
预测值:,必敢容怒,果得轮运爪若,非第民成义来章礼矣。前弟词保天舅兮之者宗,安可图此之?”孔明曰:“败不若当官居城!但相起士。此急怕师,正得所商明耳。”绍见入之。”玄德曰:“国化众近刘褚。襄议霸而去。尽正附到

输入值:一将飞出
预测值:而走。突空败,鼓三里围刺,翻见李曰:“涓相夜成将口。手大惊灵于,姓刘大江谷关,胜足欲用邳之,用后悲失,彤不携吞下,黄言有诸将于,傲连英苦,荆更追卒,引百万擒疑,所投东面所神,忽饮国地主,与陈音回书统卜

输入值:背义之徒
预测值:。此处后国炎众也。字昭鼓,旌面攻回,岂要举功,止后惊盔而无言。今日连一大将势佯瓒,鼻受三尺,随李砍往傍海,背崇陆主,足兵为此,都败击钺。只说张权去马,使进世军旬唇者也。”其人叹曰:“伯兄轻辅为疾之兵,

输入值:你大爷的
预测值:杀,景赛敢到术,欲勿辞也。”二笑曰:“此从之?”赵芳皆退,乃须唤敌子出中。魏延等可教曰:“前将有父。”又藏三个住平之,房南相居兵无说,何三处急肉段,有一常索五阳,生曰:“马不可如魏耳。以曰:“公孙春观
还是挺好玩的
参考:https://blog.csdn.net/zzjcymbq/article/details/125666251
本文来自博客园,作者:河北大学-徐小波,转载请注明原文链接:https://www.cnblogs.com/xuxiaobo/p/17264852.html