[Tensorflow] RNN - 01. Spam Prediction with BasicRNNCell

Ref: http://blog.csdn.net/mebiuw/article/details/60780813

Ref: https://medium.com/@erikhallstrm/hello-world-rnn-83cd7105b767 [Nice]

Ref: https://medium.com/@erikhallstrm/tensorflow-rnn-api-2bb31821b185 [Nice]

 

Code Analysis 

Download and preprocess

# Implementing an RNN in Tensorflow
#----------------------------------
#
# We implement an RNN in Tensorflow to predict spam/ham from texts
#
# Jeffrey: the data preprocessing for NLP here is fairly advanced.

import os
import re
import io
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile
import urllib.request

from tensorflow.python.framework import ops
ops.reset_default_graph()

# Start a graph
sess = tf.Session()

# Set RNN parameters
epochs              = 30
batch_size          = 250
max_sequence_length = 40
rnn_size            = 10
embedding_size      = 50
min_word_frequency  = 10
learning_rate       = 0.0005
dropout_keep_prob   = tf.placeholder(tf.float32)


# Download or open data
data_dir = 'temp'
data_file = 'text_data.txt'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

if not os.path.isfile(os.path.join(data_dir, data_file)):
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    page = urllib.request.urlopen(zip_url)
    html_content = page.read()
    z = ZipFile(io.BytesIO(html_content))
    
    file = z.read('SMSSpamCollection')
    
    # Format Data
    text_data = file.decode()
    text_data = text_data.encode('ascii',errors='ignore')
    text_data = text_data.decode().split('\n')

    # Save data to text file
    with open(os.path.join(data_dir, data_file), 'w') as file_conn:
        for text in text_data:
            file_conn.write("{}\n".format(text))
else:
    # Open data from text file
    text_data = []
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        for row in file_conn:
            text_data.append(row)
    text_data = text_data[:-1]

text_data = [x.split('\t') for x in text_data if len(x)>=1]
[text_data_target, text_data_train] = [list(x) for x in zip(*text_data)]


# Create a text cleaning function
def clean_text(text_string):
    text_string = re.sub(r'([^\s\w]|_|[0-9])+', '', text_string)
    text_string = " ".join(text_string.split())
    text_string = text_string.lower()
    return(text_string)
    
# Clean texts
text_data_train = [clean_text(x) for x in text_data_train]

#Jeffrey
#print("[x]:", text_data_train[:10][:10])
#print("[y]:", text_data_target[:10])

Stage result: 

print("[x]:", text_data_train[:10])
print("[y]:", text_data_target[:10])
[x]: ['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat', 'ok lar joking wif u oni', 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry questionstd txt ratetcs apply overs', 'u dun say so early hor u c already then say', 'nah i dont think he goes to usf he lives around here though', 'freemsg hey there darling its been weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send to rcv', 'even my brother is not like to speak with me they treat me like aids patent', 'as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press to copy your friends callertune', 'winner as a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hours only', 'had your mobile months or more u r entitled to update to the latest colour mobiles with camera for free call the mobile update co free on']
[y]: [1 1 0 1 1 0 1 1 0 0]

 

Change texts into numeric vectors

# Change texts into numeric vectors
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sequence_length, min_frequency=min_word_frequency)
text_processed  = np.array(list(vocab_processor.fit_transform(text_data_train)))

# Shuffle and split data
text_processed = np.array(text_processed)
text_data_target = np.array([1 if x=='ham' else 0 for x in text_data_target])

Stage result: each text is now a fixed-length row of integer word indices (padded with 0 up to max_sequence_length) -- index vectors, not one-hot encodings

#Jeffrey
#print("[text_processed]:", text_processed.shape)
#print("[text_data_target]:", text_data_target.shape)
##[text_processed]:   (5574, 40)
##[text_data_target]: (5574,)

#print("[text_processed]:", text_processed)
#print("[text_data_target]:", text_data_target)
[text_processed]: [[ 44 455   0 ...,   0   0   0]
 [ 47 315   0 ...,   0   0   0]
 [ 46 465   9 ...,   0   0   0]
 ...,
 [  0  59   9 ...,   0   0   0]
 [  5 493 108 ...,   0   0   0]
 [  0  40 474 ...,   0   0   0]]
[text_data_target]: [1 1 0 ..., 1 1 1]
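These rows are integer word ids; the one-hot/embedding step only happens later in the embedding lookup. A minimal sketch of what VocabularyProcessor does, run on a made-up toy corpus (assumes TF 1.x with tf.contrib available):

import numpy as np
import tensorflow as tf

# Each word gets an integer id; sentences are cropped/padded with 0
# up to max_document_length.
toy_corpus = ['free entry win a prize', 'ok see you later', 'win a free prize now']
vp = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length=6, min_frequency=0)
ids = np.array(list(vp.fit_transform(toy_corpus)))
print(ids.shape)  # (3, 6): one fixed-length row of word ids per sentence
print(ids)        # e.g. [[1 2 3 4 5 0] ...]; id 0 doubles as the padding/unknown id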

 

Shuffle, split, and vocabulary statistics

shuffled_ix = np.random.permutation(np.arange(len(text_data_target)))
x_shuffled = text_processed[shuffled_ix]
y_shuffled = text_data_target[shuffled_ix]

# Split train/test set
ix_cutoff = int(len(y_shuffled)*0.80)
x_train, x_test = x_shuffled[:ix_cutoff], x_shuffled[ix_cutoff:]
y_train, y_test = y_shuffled[:ix_cutoff], y_shuffled[ix_cutoff:]


print(vocab_processor.vocabulary_)

vocab_size = len(vocab_processor.vocabulary_)
print("Vocabulary Size: {:d}".format(vocab_size))
print("80-20 Train Test split: {:d} -- {:d}".format(len(y_train), len(y_test)))

Stage result (the split is deterministic given the 5,574 examples):

80-20 Train Test split: 4459 -- 1115

 

Build Graph

###############################################################################

# Create placeholders
x_data   = tf.placeholder(tf.int32, [None, max_sequence_length])
y_output = tf.placeholder(tf.int32, [None])

# Create embedding
embedding_mat    = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
embedding_output = tf.nn.embedding_lookup(embedding_mat, x_data)  # Here, this x_data is ids! <---- [termIdx1, termIdx2, ...]
#embedding_output_expanded = tf.expand_dims(embedding_output, -1)
Create the embedding matrix (embedding_mat) and the embedding lookup operation for the x-input data.
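A small sketch of what the lookup does: tf.nn.embedding_lookup gathers rows of the embedding matrix by id, so the [batch, max_sequence_length] tensor of word ids becomes a [batch, max_sequence_length, embedding_size] tensor of word vectors. The tiny matrix and ids below are made up for illustration:

import tensorflow as tf

emb = tf.constant([[0.0, 0.0],   # row for id 0 (padding)
                   [1.0, 1.0],   # row for id 1
                   [2.0, 2.0]])  # row for id 2
ids = tf.constant([[1, 2, 0]])   # one "sentence" of three word ids
looked_up = tf.nn.embedding_lookup(emb, ids)
with tf.Session() as demo_sess:
    print(demo_sess.run(looked_up))  # [[[1. 1.] [2. 2.] [0. 0.]]], shape (1, 3, 2)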

# Define the RNN cell
cell   = tf.nn.rnn_cell.BasicRNNCell(num_units = rnn_size)
output, state = tf.nn.dynamic_rnn(cell, embedding_output, dtype=tf.float32)
output = tf.nn.dropout(output, dropout_keep_prob)  # dropout_keep_prob is a placeholder; a concrete keep probability (0.5 for training, 1.0 for testing) is fed at run time

# Get output of RNN sequence
output = tf.transpose(output, [1, 0, 2])                    # [batch, time, units] -> [time, batch, units]
last   = tf.gather(output, int(output.get_shape()[0]) - 1)  # pick the output at the last time step
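The transpose + gather idiom above selects the output at the final time step. A quick sanity check on a made-up tensor, showing that a strided slice on the time axis gives the same result (TF 1.x):

import numpy as np
import tensorflow as tf

demo = tf.constant(np.arange(24, dtype=np.float32).reshape(2, 3, 4))  # [batch=2, time=3, units=4]
via_transpose = tf.gather(tf.transpose(demo, [1, 0, 2]), 2)           # the idiom used above (last index = T-1 = 2)
via_slice     = demo[:, -1, :]                                        # strided-slice equivalent
with tf.Session() as s:
    a, b = s.run([via_transpose, via_slice])
    print(np.allclose(a, b))  # True: both pick the last time step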

 

API: rnn_cell

From: "Tensorflow RNN source code analysis, notes 1: the basic implementation of RNNCell" (Tensorflow RNN源代码解析笔记1:RNNCell的基本实现)

TensorFlow defines an abstract class RNNCell; every concrete RNN cell type is built on top of this class.

On top of RNNCell, TensorFlow implements a series of commonly used cells, such as LSTM and GRU, supports features including Dropout, and also supports building multi-layer RNN networks.

class BasicRNNCell(RNNCell):
  """The most basic RNN cell.

  Args:
    num_units: int, The number of units in the RNN cell.
    activation: Nonlinearity to use.  Default: `tanh`.
    reuse: (optional) Python boolean describing whether to reuse variables
     in an existing scope.  If not `True`, and the existing scope already has
     the given variables, an error is raised.
  """

  def __init__(self, num_units, activation=None, reuse=None):
    super(BasicRNNCell, self).__init__(_reuse=reuse)
    self._num_units = num_units
    self._activation = activation or math_ops.tanh

  @property
  def state_size(self):
    return self._num_units

  @property
  def output_size(self):
    return self._num_units

  def call(self, inputs, state):
    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
    output = self._activation(_linear([inputs, state], self._num_units, True))
    return output, output
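The notes above say cells compose with dropout and multi-layer stacking. A minimal sketch of how that looks with the TF 1.x wrappers (the unit counts and placeholder shapes below are arbitrary, chosen to echo this post's hyperparameters):

import tensorflow as tf

# DropoutWrapper and MultiRNNCell are themselves RNNCells,
# so the composed cell plugs into dynamic_rnn unchanged.
keep_prob = tf.placeholder(tf.float32)

def make_cell(num_units):
    base = tf.nn.rnn_cell.BasicRNNCell(num_units=num_units)
    return tf.nn.rnn_cell.DropoutWrapper(base, output_keep_prob=keep_prob)

stacked = tf.nn.rnn_cell.MultiRNNCell([make_cell(10), make_cell(10)])
inputs  = tf.placeholder(tf.float32, [None, 40, 50])  # [batch, time, embedding]
outputs, final_state = tf.nn.dynamic_rnn(stacked, inputs, dtype=tf.float32)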

 

 

Learned parameters:

From: YJango's "Recurrent Neural Networks: An Introduction" (YJango的循环神经网络——介绍)

The weight matrices are shared across all time steps. This is the most prominent advantage of recurrent networks over feed-forward networks.

A recurrent neural network is a neural-network variant whose parameters are shared across the temporal structure. This temporal sharing is the absolute core of recurrent networks.
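To make the sharing concrete, here is a NumPy sketch of BasicRNNCell's call formula, output = new_state = tanh(W * input + U * state + b), unrolled over time with the same matrices reused at every step (shapes mirror this post's hyperparameters; the values are random):

import numpy as np

rng = np.random.RandomState(0)
input_dim, units, T = 50, 10, 40       # embedding_size, rnn_size, max_sequence_length
W = rng.randn(input_dim, units) * 0.1  # input-to-hidden weights (shared over time)
U = rng.randn(units, units) * 0.1      # hidden-to-hidden weights (shared over time)
b = np.zeros(units)

xs = rng.randn(T, input_dim)           # one sequence of T embedded words
h  = np.zeros(units)                   # initial state
for x_t in xs:                         # the SAME W, U, b are applied at every step
    h = np.tanh(x_t @ W + h @ U + b)   # output_t = new_state_t
print(h.shape)                         # (10,): the final state fed to the classifier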

 

Output layer on the final hidden state (h_state):

# Variables
weight = tf.Variable(tf.truncated_normal([rnn_size, 2], stddev=0.1))
bias   = tf.Variable(tf.constant(0.1, shape=[2]))
# Keep these as raw logits: sparse_softmax_cross_entropy_with_logits applies
# softmax internally, so wrapping tf.nn.softmax here would apply it twice.
logits_out = tf.matmul(last, weight) + bias

 

 

# Loss function
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_out, labels=y_output) # logits=float32, labels=int32
loss   = tf.reduce_mean(losses)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits_out, 1), tf.cast(y_output, tf.int64)), tf.float32))

optimizer  = tf.train.RMSPropOptimizer(learning_rate)
train_step = optimizer.minimize(loss)

init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated
sess.run(init)
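Since sparse_softmax_cross_entropy_with_logits applies softmax internally (which is why logits_out above must stay as raw logits), here is a tiny NumPy sketch of the per-example computation, with made-up numbers:

import numpy as np

logits = np.array([2.0, 0.5])              # raw scores for classes [spam=0, ham=1]
label  = 1                                 # true class
p = np.exp(logits) / np.exp(logits).sum()  # softmax, done inside the TF op
loss = -np.log(p[label])                   # cross-entropy at the true class
print(loss)                                # ~1.70; pre-softmaxing the logits would
                                           # silently distort this value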

 

###############################################################################
###############################################################################

train_loss     = []
test_loss      = []
train_accuracy = []
test_accuracy  = []

# Start training
for epoch in range(epochs):

    # Shuffle training data
    shuffled_ix = np.random.permutation(np.arange(len(x_train)))
    # Sort x_train and y_train based on shuffled_ix
    x_train = x_train[shuffled_ix]
    y_train = y_train[shuffled_ix]
    
    num_batches = int(np.ceil(len(x_train) / batch_size))  # ceil gives the exact batch count, with no empty trailing batch
    # For each batch.
    for i in range(num_batches):
        # Select train data
        min_ix = i * batch_size
        max_ix = np.min([len(x_train), ((i+1) * batch_size)])
        x_train_batch = x_train[min_ix:max_ix]
        y_train_batch = y_train[min_ix:max_ix]
        
        # Run train step
        train_dict = {x_data: x_train_batch, y_output: y_train_batch, dropout_keep_prob:0.5}
        sess.run(train_step, feed_dict=train_dict)
        
    # Run loss and accuracy for training
    temp_train_loss, temp_train_acc = sess.run([loss, accuracy], feed_dict=train_dict)
    train_loss.append(temp_train_loss)
    train_accuracy.append(temp_train_acc)
    
    # Run Eval Step
    test_dict = {x_data: x_test, y_output: y_test, dropout_keep_prob:1.0}
    temp_test_loss, temp_test_acc = sess.run([loss, accuracy], feed_dict=test_dict)
    test_loss.append(temp_test_loss)
    test_accuracy.append(temp_test_acc)
    print('Epoch: {}, Test Loss: {:.2}, Test Acc: {:.2}'.format(epoch+1, temp_test_loss, temp_test_acc))
    
# Plot loss over time
epoch_seq = np.arange(1, epochs+1)
plt.plot(epoch_seq, train_loss, 'k--', label='Train Set')
plt.plot(epoch_seq, test_loss, 'r-', label='Test Set')
plt.title('Softmax Loss')
plt.xlabel('Epochs')
plt.ylabel('Softmax Loss')
plt.legend(loc='upper left')
plt.show()

# Plot accuracy over time
plt.plot(epoch_seq, train_accuracy, 'k--', label='Train Set')
plt.plot(epoch_seq, test_accuracy, 'r-', label='Test Set')
plt.title('Train/Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()

 
