深度学习:文本预测

代码:

#coding=utf-8

import os
import sys
import re
import random
import numpy as np
from keras.models import Sequential,load_model
from keras.layers import Dense, LSTM, Bidirectional
from keras.optimizers import Adam

def sample(preds, gamma=1.0):
    """Draw one index from a temperature-scaled probability vector.

    Every entry is raised to the power ``1 / gamma`` and the result is
    renormalised to sum to 1 before a single multinomial draw, so
    ``gamma < 1`` sharpens the distribution and ``gamma > 1`` flattens it.

    Args:
        preds: 1-D array-like of non-negative weights. It is not modified.
        gamma: Temperature; the exponent applied is ``1 / gamma``.

    Returns:
        The index of the sampled entry, as returned by ``np.argmax``.
    """
    # Work on float64 values in a fresh array: the previous in-place
    # ``**=`` / ``/=`` rewrote the caller's array and failed on integer or
    # list input.
    probs = np.asarray(preds, dtype=np.float64)
    scaled = probs ** (1 / gamma)
    scaled = scaled / np.sum(scaled)
    # A single multinomial trial yields a one-hot row; argmax recovers the
    # position of the 1.
    draw = np.random.multinomial(1, scaled, 1)
    return np.argmax(draw)


def getyucetext(generated_text, sent_len, N, char_indices, char_set, model,
                length=100, gamma=1.0):
    """Generate text one character at a time from a seed string.

    On each step the current window is one-hot encoded, passed to
    ``model.predict``, and the next character is drawn with ``sample``. The
    window then drops its first character and appends the new one.

    Args:
        generated_text: Seed string. If it is longer than ``sent_len`` only
            its last ``sent_len`` characters are used (previously this raised
            ``IndexError``).
        sent_len: Number of time steps in the model input.
        N: Size of the one-hot dimension (vocabulary size).
        char_indices: Mapping from character to its one-hot index.
        char_set: Sequence mapping a one-hot index back to its character.
        model: Object with a Keras-style ``predict(batch, verbose=0)`` method;
            row 0 of its result is expected to be a length-``N`` probability
            vector.
        length: Number of characters to generate (default 100, the value that
            used to be hard-coded).
        gamma: Sampling temperature forwarded to ``sample`` (default 1.0).

    Returns:
        The generated characters joined into one string; the seed itself is
        not included.

    Raises:
        KeyError: If the seed contains a character missing from
            ``char_indices`` (assuming it is a plain dict).
    """
    # The input tensor only has sent_len time steps, so keep just the most
    # recent characters of an over-long seed.
    if len(generated_text) > sent_len:
        generated_text = generated_text[len(generated_text) - sent_len:]

    next_chars = []
    for _ in range(length):
        sampled = np.zeros((1, sent_len, N))  # leading 1 = batch of one sample
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1
        preds = model.predict(sampled, verbose=0)[0]
        # Cast to Python float (float64) before sampling.
        preds = preds.astype(float)
        next_index = sample(preds, gamma)
        next_char = char_set[next_index]
        # Slide the window: append the new character, drop the oldest.
        generated_text = (generated_text + next_char)[1:]
        next_chars.append(next_char)

    return ''.join(next_chars)


# Read the whole corpus into memory. The context manager closes the file even
# if read() raises (e.g. on a decoding error).
with open(r'data/sanguoyanyi.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

print('语料总长度:', len(text))
sent_len = 4  # window length: the preceding 4 characters predict the 5th
step = 1  # stride of the sliding window over the corpus

# Start offsets of every window that still has a following character.
window_starts = range(0, len(text) - sent_len, step)
# Inputs (X): the sent_len characters beginning at each offset.
sentences = [text[start:start + sent_len] for start in window_starts]
# Targets (Y): the character immediately after each window.
next_chars = [text[start + sent_len] for start in window_starts]

sentenceslen = len(sentences)
print('训练样本数目:', sentenceslen)

# Vocabulary: every distinct character of the corpus, in sorted order.
char_set = sorted(set(text))
N = len(char_set)
print('字典中字的个数:', N)

# Reverse lookup: character -> its position in char_set.
char_indices = {char: index for index, char in enumerate(char_set)}

# Boolean one-hot tensors: x is (samples, sent_len, N), y is (samples, N).
x = np.zeros((sentenceslen, sent_len, N), dtype=bool)
y = np.zeros((sentenceslen, N), dtype=bool)

# Set the single "hot" cell for every input character and for each target.
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    for pos, char in enumerate(window):
        x[row, pos, char_indices[char]] = 1
    y[row, char_indices[target]] = 1

# The first 95% of the samples form the training split; the rest is the
# test split.
xsplit = int(len(x) * 0.95)
ysplit = int(len(y) * 0.95)
x_data, x_test_data = x[:xsplit], x[xsplit:]
y_data, y_test_data = y[:ysplit], y[ysplit:]

model_path = 'models/wenben_yuce_lstm.h5'
if not os.path.exists(model_path):
    # No saved model yet: build one, train it and persist it.
    model = Sequential(
        [
            # Input is (sent_len, N): sent_len characters, each a one-hot
            # vector of size N. This layer emits 300 features per time step.
            LSTM(300, return_sequences=True, input_shape=(sent_len, N)),
            # Bidirectional LSTM that keeps the per-step sequence.
            Bidirectional(LSTM(64, return_sequences=True)),
            # Bidirectional LSTM that returns only the final output.
            Bidirectional(LSTM(32, return_sequences=False)),
            # Softmax over the N characters of the vocabulary.
            Dense(N, activation="softmax"),
        ],
        name='text_generation',
    )
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01))
    model.fit(x_data, y_data, batch_size=128, epochs=10, validation_data=(x_test_data, y_test_data))
    model.save(model_path)
else:
    # Reuse the previously trained model.
    model = load_model(model_path)

model.summary()

# Four-character seed phrases used to prompt the model.
generated_texts = ['诸葛村夫', '安敢望此', '一将飞出', '背义之徒', '你大爷的']
for seed in generated_texts:
    print('输入值:%s' % seed)
    yucetext = getyucetext(
        generated_text=seed,
        sent_len=sent_len,
        N=N,
        char_indices=char_indices,
        char_set=char_set,
        model=model,
    )
    print('预测值:%s' % yucetext)
    print()

 

data/sanguoyanyi.txt 是《三国演义》的纯文本版(代码中以 UTF-8 编码读取)。

 

 

 

执行输出:

语料总长度: 603705
训练样本数目: 603701
字典中字的个数: 3843

Model: "text_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm (LSTM)                 (None, 4, 300)            4972800   
                                                                 
 bidirectional (Bidirectiona  (None, 4, 128)           186880    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 3843)              249795    
                                                                 
=================================================================
Total params: 5,450,691
Trainable params: 5,450,691
Non-trainable params: 0
_________________________________________________________________
输入值:诸葛村夫
预测值:里,用将鹤威瘤矣,故其成州,何显伐何?”昭曰:“令然引相扶看,闻被袁瓒按两州。孙宫暗见迷间。曹德好到也,足期于中,曜休前意!”默笑,图教而去曰:“主面有此幕,不欲而走。布同、赵咨回柩,则恨骂姬女两处,

输入值:安敢望此
预测值:,必敢容怒,果得轮运爪若,非第民成义来章礼矣。前弟词保天舅兮之者宗,安可图此之?”孔明曰:“败不若当官居城!但相起士。此急怕师,正得所商明耳。”绍见入之。”玄德曰:“国化众近刘褚。襄议霸而去。尽正附到

输入值:一将飞出
预测值:而走。突空败,鼓三里围刺,翻见李曰:“涓相夜成将口。手大惊灵于,姓刘大江谷关,胜足欲用邳之,用后悲失,彤不携吞下,黄言有诸将于,傲连英苦,荆更追卒,引百万擒疑,所投东面所神,忽饮国地主,与陈音回书统卜

输入值:背义之徒
预测值:。此处后国炎众也。字昭鼓,旌面攻回,岂要举功,止后惊盔而无言。今日连一大将势佯瓒,鼻受三尺,随李砍往傍海,背崇陆主,足兵为此,都败击钺。只说张权去马,使进世军旬唇者也。”其人叹曰:“伯兄轻辅为疾之兵,

输入值:你大爷的
预测值:杀,景赛敢到术,欲勿辞也。”二笑曰:“此从之?”赵芳皆退,乃须唤敌子出中。魏延等可教曰:“前将有父。”又藏三个住平之,房南相居兵无说,何三处急肉段,有一常索五阳,生曰:“马不可如魏耳。以曰:“公孙春观

 

还是挺好玩的

参考:https://blog.csdn.net/zzjcymbq/article/details/125666251

posted @ 2023-03-28 13:38  河北大学-徐小波  阅读(126)  评论(0)  编辑  收藏  举报