Movie Review Mining with Word2Vec
0. At Night, in a Faint Dream, I Suddenly Returned Home
Let me say up front: I'm the artsy kind of tech blogger, hahaha. The plan was to finish this post and then learn how to tweak script permissions so I could embed a song. Forget it, I can't figure it out; so much for that.
For various reasons I haven't updated in a long while. Since I need to lay some groundwork for a new paper, I'm planning two posts on movie-review sentiment analysis with TensorFlow (let's just call it TF-boys, hahaha): one on Word2Vec and one on Doc2Vec. I actually studied this material almost two months ago and have forgotten a bit, so this is also a good chance to review. As always, the post is code plus experimental results, pure substance. I don't have the time to walk through all the theory, so I'll assume anyone reading this already has some background in word vectors and NLP. Meow, enough preamble: I'll describe the task, sketch the approach, and then give the fully commented code and the results.
First, a few quick concepts. Word2Vec is a tool that turns text into the kind of vectors or matrices that a computer and a neural network can actually consume. That modifier chain got long; I always worry about being unclear, so my modifiers run on and on, which used to give people headaches back when I did translation. Word2Vec achieves this with two models, each with its own strengths and biases in the results: the CBOW model and the Skip-gram model. For the details, search CSDN or Baidu yourself, because I don't like copy-pasting other people's work.
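To make the difference between the two models concrete, here is a minimal, self-contained sketch of the training pairs each one consumes: CBOW predicts the center word from its surrounding context, while Skip-gram predicts each context word from the center word. This is plain Python for illustration only; the function names are my own and are not part of the code below.

# Illustrative sketch of CBOW vs. Skip-gram training pairs (hypothetical helper names).

def cbow_pairs(tokens, window=2):
    # CBOW: (context words) -> center word
    pairs = []
    for i, center in enumerate(tokens):
        context = [tokens[j]
                   for j in range(max(0, i - window), min(len(tokens), i + window + 1))
                   if j != i]
        pairs.append((context, center))
    return pairs

def skipgram_pairs(tokens, window=2):
    # Skip-gram: center word -> each context word
    pairs = []
    for i, center in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                pairs.append((center, tokens[j]))
    return pairs

tokens = 'this movie is really great'.split()
print(cbow_pairs(tokens))      # first pair: (['movie', 'is'], 'this')
print(skipgram_pairs(tokens))  # first pairs: ('this', 'movie'), ('this', 'is'), ...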
There are already CSDN posts on the binary classification version of this problem (negative vs. positive reviews), where the model learns to sort user reviews into the two classes automatically; look those up if you're interested. The review dataset comes from http://www.cs.cornell.edu/people/pabo/movie-review-data/. The problem in this post is a little different: we analyze the review data and then make a simple judgment about whether the movie is good or bad. Sentiment analysis is genuinely hard; deeper semantics such as sarcasm and mockery simply cannot be recovered without properly mining the context. In everyday speech, "great" may well be sarcastic, as in "Thanos is just so damn great."
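For reference, here is a minimal sketch of how the Cornell polarity data can be read into the (texts, target) lists that the load_movie_data call in the main script below is expected to return. The file names rt-polarity.pos / rt-polarity.neg, the directory layout, and the encoding are assumptions based on the downloadable archive, not something defined in this post.

import os

# Hypothetical loader sketch: assumes the rt-polaritydata archive from the
# Cornell page above has already been downloaded and unpacked under 'temp/'.
def load_movie_data_sketch(data_dir='temp'):
    pos_file = os.path.join(data_dir, 'rt-polaritydata', 'rt-polarity.pos')
    neg_file = os.path.join(data_dir, 'rt-polaritydata', 'rt-polarity.neg')
    with open(pos_file, 'r', encoding='latin-1') as f:
        pos_texts = [line.strip() for line in f if line.strip()]
    with open(neg_file, 'r', encoding='latin-1') as f:
        neg_texts = [line.strip() for line in f if line.strip()]
    texts = pos_texts + neg_texts
    target = [1] * len(pos_texts) + [0] * len(neg_texts)  # 1 = positive, 0 = negative
    return texts, target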
1. By the Small Window, Combing Her Hair; We Gaze at Each Other, Wordless
The code comes in two parts: text_helpers.py, which handles the data cleaning (tokenization, stop-word removal, and so on), and the main script.
# Movie Review Mining with Word2Vec
#---------------------------------------
#
# Author: Allen_ZQH
# Date: 2018.3.20
#

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import requests
import collections
import io
import tarfile
import urllib.request
import text_helpers
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
ops.reset_default_graph()

os.chdir(os.path.dirname(os.path.realpath(__file__)))

# Start a graph session
sess = tf.Session()

# Set CBOW model parameters
embedding_size = 200
vocabulary_size = 2000
batch_size = 100
max_words = 100

# Load the English stop-word list from nltk
stops = stopwords.words('english')

# Load the data
print('Loading Data')
data_folder_name = 'temp'
texts, target = text_helpers.load_movie_data()

# Normalize the text data with text_helpers
print('Normalizing Text Data')
texts = text_helpers.normalize_text(texts, stops)

# Keep only reviews that contain at least 3 words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

# Split the data into train and test sets (80/20)
train_indices = np.random.choice(len(target), round(0.8*len(target)), replace=False)
test_indices = np.array(list(set(range(len(target))) - set(train_indices)))
texts_train = [x for ix, x in enumerate(texts) if ix in train_indices]
texts_test = [x for ix, x in enumerate(texts) if ix in test_indices]
target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
target_test = np.array([x for ix, x in enumerate(target) if ix in test_indices])

# Load the vocabulary dictionary built during the CBOW embedding step
dict_file = os.path.join('..', '05_Working_With_CBOW_Embeddings', 'temp', 'movie_vocab.pkl')
word_dictionary = pickle.load(open(dict_file, 'rb'))

# Convert the loaded sentences into numeric numpy arrays via the dictionary
text_data_train = np.array(text_helpers.text_to_numbers(texts_train, word_dictionary))
text_data_test = np.array(text_helpers.text_to_numbers(texts_test, word_dictionary))

# Reviews differ in length, so fix every review at 100 words, zero-padding the short ones
text_data_train = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_train]])
text_data_test = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_test]])

print('Creating Model')

# Embedding layer (Word2Vec related; TF can also call Google's Word2Vec implementation directly)
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# Define the embedding-layer model:
# declare the logistic regression variables and placeholders
A = tf.Variable(tf.random_normal(shape=[embedding_size, 1]))
b = tf.Variable(tf.random_normal(shape=[1, 1]))
x_data = tf.placeholder(shape=[None, max_words], dtype=tf.int32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# Add an embedding lookup to the graph and average the embeddings of all words in a sentence
embed = tf.nn.embedding_lookup(embeddings, x_data)
embed_avg = tf.reduce_mean(embed, 1)

# Declare the model operation and the loss function
model_output = tf.add(tf.matmul(embed_avg, A), b)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))

# Prediction and accuracy functions
prediction = tf.round(tf.sigmoid(model_output))
predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
accuracy = tf.reduce_mean(predictions_correct)

my_opt = tf.train.AdagradOptimizer(0.005)
train_step = my_opt.minimize(loss)

# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)

# Replace the randomly initialized word embeddings with the trained CBOW embeddings
model_checkpoint_path = os.path.join('..', '05_Working_With_CBOW_Embeddings', 'temp', 'cbow_movie_embeddings.ckpt')
saver = tf.train.Saver({"embeddings": embeddings})
saver.restore(sess, model_checkpoint_path)

# Start training: record train/test loss and accuracy every 100 generations,
# and print the model status every 500 generations
print('Starting Model Training')
train_loss = []
test_loss = []
train_acc = []
test_acc = []
i_data = []
for i in range(10000):
    rand_index = np.random.choice(text_data_train.shape[0], size=batch_size)
    rand_x = text_data_train[rand_index]
    rand_y = np.transpose([target_train[rand_index]])
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})

    # Only record loss and accuracy every 100 generations
    if (i+1) % 100 == 0:
        i_data.append(i+1)
        train_loss_temp = sess.run(loss, feed_dict={x_data: rand_x, y_target: rand_y})
        train_loss.append(train_loss_temp)
        test_loss_temp = sess.run(loss, feed_dict={x_data: text_data_test, y_target: np.transpose([target_test])})
        test_loss.append(test_loss_temp)
        train_acc_temp = sess.run(accuracy, feed_dict={x_data: rand_x, y_target: rand_y})
        train_acc.append(train_acc_temp)
        test_acc_temp = sess.run(accuracy, feed_dict={x_data: text_data_test, y_target: np.transpose([target_test])})
        test_acc.append(test_acc_temp)
    if (i+1) % 500 == 0:
        acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp, test_acc_temp]
        acc_and_loss = [np.round(x, 2) for x in acc_and_loss]
        print('Generation # {}. Train Loss (Test Loss): {:.2f} ({:.2f}). Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))

# Plot the loss over generations
plt.plot(i_data, train_loss, 'k-', label='Train Loss')
plt.plot(i_data, test_loss, 'r--', label='Test Loss', linewidth=4)
plt.title('Cross Entropy Loss per Generation')
plt.xlabel('Generation')
plt.ylabel('Cross Entropy Loss')
plt.legend(loc='upper right')
plt.show()

# Plot train and test accuracy
plt.plot(i_data, train_acc, 'k-', label='Train Set Accuracy')
plt.plot(i_data, test_acc, 'r--', label='Test Set Accuracy', linewidth=4)
plt.title('Train and Test Accuracy')
plt.xlabel('Generation')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# Text Helper Functions
#---------------------------------------
#

import string
import os
import urllib.request
import io
import tarfile
import collections
import numpy as np
import requests
import gzip

# Normalize text
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]
    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    # Remove stopwords
    texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]
    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    return(texts)

# Build dictionary of words
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]
    # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    # Now create the dictionary
    word_dict = {}
    # For each word that we want in the dictionary, add it, then make it
    # the value of the prior dictionary length
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    return(word_dict)

# Turn text data into lists of integers from dictionary
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, either use selected index or rare word index
        for word in sentence.split():
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return(data)
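To see how these helpers feed the main script, here is a small toy run that builds a dictionary, maps a review to indices, and pads it to max_words the same way the training script does. The example sentences are my own and the snippet assumes the helper code above is saved as text_helpers.py; the printed values in the comments are only indicative.

# Toy walk-through of the helper functions (illustrative example data).
from text_helpers import build_dictionary, text_to_numbers

sentences = ['great movie great acting', 'terrible plot terrible pacing', 'great fun']
word_dict = build_dictionary(sentences, vocabulary_size=5)
print(word_dict)   # e.g. {'RARE': 0, 'great': 1, 'terrible': 2, ...}

numbers = text_to_numbers(['great acting terrible ending'], word_dict)[0]
print(numbers)     # unknown words (e.g. 'ending') map to index 0, the RARE bucket

max_words = 10
padded = (numbers + [0] * max_words)[:max_words]
print(padded)      # fixed-length, zero-padded row, as fed to the x_data placeholder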