# R语言迪士尼点评文本挖掘

# Set the working directory; the data and dictionary files used below are
# referenced by paths relative to it.
# NOTE(review): this is a machine-specific absolute Windows path -- adjust it
# (or start R from the project directory) before running on another machine.
setwd("D:\\迪士尼点评文本挖掘")

# getwd() shows the current working directory.

#加载包()

library(DBI)
#library(RMySQL)
library(rJava)
library(openxlsx)
library(stringr)
library(xlsxjars)
library(reshape)
library(readxl)
library(xlsx)
library(sqldf)
library(wordcloud)
library(Rwordseg) #加载分词包
library(tm)
library(tmcn)
library(jiebaRD)
library(jiebaR) #里面的segmentC用于分词
#library(wordcloud2)

###########################读入表格###############################
# Read the raw reviews from the first sheet of the workbook.
comment_01 <- read_excel("点评文本挖掘.xlsx", 1)

# Keep only the review-text column, as a plain vector with one element per
# review. read_excel() returns a tibble, and `[, "col"]` on a tibble yields a
# one-column tibble rather than a vector; passing that to gsub() would coerce
# the whole column into a single deparsed string. `[[` extracts the vector.
# Fail early if the expected column is missing.
stopifnot("点评内容" %in% names(comment_01))
comment_01 <- comment_01[["点评内容"]]

# Read the stop-word list from the first sheet of its workbook.
stopwords_01 <- read_excel("停用词汇总.xlsx", 1)

#str函数，即structure,紧凑的显示对象内部结构，即对象里有什么。作用跟head相似
#str(comment_01)

# Install the dictionaries used for segmentation (two Sogou "scel" cell
# dictionaries and one custom text dictionary). The files are given by
# relative path. listDict() shows the installed dictionaries;
# uninstallDict() removes one.
installDict(dictpath = '旅游词汇大全【官方推荐】.scel',dictname = 'Vocabulary_books', dicttype = 'scel') # travel vocabulary
installDict(dictpath = 'disney.scel',dictname = 'disney', dicttype = 'scel') # Disney vocabulary
installDict(dictpath = '自定义词典.txt',dictname = 'dictionary_01') # custom dictionary (text file; dicttype left at its default)
# To remove the dictionaries again:
#uninstallDict("disney")
#uninstallDict("Vocabulary_books")
#uninstallDict("dictionary_01")
# Show the dictionaries that are currently installed.
listDict()
# Add individual words (sentiment phrases and Disney attraction/area names),
# presumably so that the segmenter keeps each of them as a single token.
# NOTE(review): the list contains both "飞越地平线" and "飞跃地平线" -- two
# spellings of what looks like the same attraction; confirm this is intended.
insertWords(c("排队","不满意","非常满意","很好","不方便","非常好","很棒","驴妈妈","不舒服","不值","七个小矮人","飞越地平线","创极速光轮","米奇大街","奇想花园","梦幻世界","探险岛","宝藏湾","明日世界","巴斯光年星际营救","喷气背包飞行器","太空幸会史迪奇","星球大战远征基地","皮克斯玩具总动员","快速通道","加勒比海盗","灯光秀","飞跃地平线"))

# Strip ASCII letters and digits from the review text.
comment_02 <- gsub(pattern = '[0-9a-zA-Z]', replacement = '', x = comment_01)

# Segment each entry with segmentCN() and flatten the per-entry results into
# one vector of tokens.
segword <- unlist(
  lapply(comment_02, function(one_comment) segmentCN(one_comment))
)
#创建停止词
#head(stopwords_01) #查看数据
#class(stopwords_01) #查看变量类型，可知是属于数据框类型

#segword[1:10]

# Convert the stop-word table into a plain character vector taken from its
# first column. The is.data.frame() guard makes the step safe to re-run when
# stopwords_01 has already been converted.
if (is.data.frame(stopwords_01)) {
  stopwords_01 <- stopwords_01[[1]]
}
stopwords_01 <- as.character(stopwords_01)
# Drop NA entries (e.g. blank spreadsheet cells): NA is never a real stop
# word, and comparing a token against NA yields NA rather than FALSE.
stopwords_01 <- stopwords_01[!is.na(stopwords_01)]

#' Remove stop words from a vector of tokens.
#'
#' @param x Vector of tokens (typically character); may be empty or NULL.
#' @param stopwords Vector of stop words to drop from `x`.
#' @return The elements of `x` that are not in `stopwords`, in their original
#'   order, as a character vector. `NA` tokens are dropped. Returns
#'   `character(0)` when nothing remains.
removeStopWords <- function(x, stopwords) {
  # %in% never returns NA, so an NA inside `stopwords` cannot make every
  # token look like a stop word (which `stopwords[stopwords == token]` did).
  keep <- !is.na(x) & !(x %in% stopwords)

  # Prepending character(0) keeps the result a character vector even when
  # `x` is empty or NULL.
  c(character(0), x[keep])
}

# Remove stop words from the segmented tokens.
segword3 <- lapply(X = segword, FUN = removeStopWords, stopwords = stopwords_01)

# Build the word-frequency table (createWordFreq() comes from the tmcn package).
word_freq <- createWordFreq(unlist(segword3))

# Draw the word cloud on a black background, limited to the 100 most frequent
# words, then restore the previous graphics parameters.
opar <- par(no.readonly = TRUE)
par(bg = 'black')
wordcloud(
  words = word_freq$word,
  freq = word_freq$freq,
  max.words = 100,
  random.color = TRUE,
  colors = rainbow(n = 7)
)
par(opar)

############################## Data output ##############################
# Write the frequency table to "<today's date>数据.xlsx", sheet "明细".
result_filename <- paste0(Sys.Date(), '数据', ".xlsx")
write.xlsx(x = word_freq, file = result_filename, sheetName = '明细')

# posted on 2019-05-10 18:31 阿柔公主 阅读(354) 评论(0) 编辑 收藏 举报
#
# 会员力量，点亮园子希望
#
# 刷新页面 返回顶部
#
# 阿柔公主
#
# 公告