# 豆瓣评论的爬取和词云展示 — scrape Douban book comments and render them as a word cloud
import jieba
import nltk
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import etree
import pandas as pd
import re
import wordcloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from selenium.webdriver.common.keys import Keys
def start_search(key, page):
    """Search Douban for *key*, open the first result and scrape *page* pages
    of short comments.

    Args:
        key: search keyword typed into Douban's front-page search box.
        page: number of comment pages to scrape.

    Returns:
        pandas.DataFrame with columns name / score / date / zan / comment.
        ``score`` is the digit 0-5 parsed from the rating span's CSS class,
        or -1 when the reviewer gave no rating.
    """
    driver = webdriver.Chrome(r'E:\chrome\new\chromedriver.exe')  # r'E:\chrome\chromedriver.exe'
    try:
        driver.maximize_window()
        driver.get('https://www.douban.com/')
        wait = WebDriverWait(driver, 20)
        # Locate the search box, type the keyword, and submit.
        search_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, './/div[@class="anony-srh"]/form/span[1]/input')))
        search_box.send_keys(key)
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, './/div[@class="anony-srh"]/form/span[2]/input')))
        submit.send_keys(Keys.ENTER)
        old_handles = driver.window_handles
        # Click the first search result; it opens in a new window.
        result = wait.until(EC.element_to_be_clickable(
            (By.XPATH, './/div[@class="search-result"]/div[2]/div[1]/div[2]/div/h3/a')))
        result.click()
        all_handles = driver.window_handles  # all windows, including the new one
        driver.implicitly_wait(2)
        opened = [h for h in all_handles if h not in old_handles]
        driver.switch_to.window(opened[0])  # switch to the newly opened window
        time.sleep(2)
        # Jump from the detail page to the full comment listing.
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, './/div[@id="comments-section"]/div/h2/span[2]/a'))).click()
        # wait.until(EC.element_to_be_clickable((By.XPATH, './/div[@class="comment-filter"]/label[2]/input'))).click()
        time.sleep(5)
        itemlist = []
        # BUG FIX: the inner comment loop previously reused ``i`` and shadowed
        # this page counter; the loops now use distinct names.
        for page_no in range(page):
            itemlist.extend(_parse_comment_page(driver.page_source))
            # Only advance when another page is still needed; the original
            # clicked "next" even after the final page, which could raise.
            if page_no < page - 1:
                wait.until(EC.element_to_be_clickable(
                    (By.XPATH, './/div[@id="paginator"]/a'))).click()
                time.sleep(5)
        return pd.DataFrame(itemlist)
    finally:
        driver.quit()  # always release the browser, even if scraping fails


def _parse_comment_page(html):
    """Parse one rendered comment page; return a list of per-comment dicts."""
    selector = etree.HTML(html)
    rows = []
    for node in selector.xpath('.//div[@id="comments"]/div/ul/li'):
        # The star rating is encoded as a digit inside the span's class name.
        score_cls = ''.join(node.xpath('.//div[2]/h3/span[2]/span[1]/@class'))
        digits = re.findall(r'[0-5]', score_cls)
        rows.append({
            'name': ''.join(node.xpath('.//div[2]/h3/span[2]/a/text()')),
            'score': digits[0] if digits else -1,  # -1 marks "no rating given"
            'date': ''.join(node.xpath('.//div[2]/h3/span[2]/span[2]/text()')),
            'zan': ''.join(node.xpath('.//div[2]/h3/span[1]/span/text()')),
            'comment': ''.join(node.xpath('.//div[2]/p/span/text()')),
        })
    return rows
def write_html_to_csv(s):
    """Persist the scraped comment DataFrame to comment.csv (UTF-8, no index)."""
    out_path = 'comment.csv'
    s.to_csv(out_path, index=False)
def read_html_from_csv():
    """Load previously scraped comments from comment.csv into a DataFrame."""
    return pd.read_csv("comment.csv")
def read_from_file(file):
    """Return every line of *file* (UTF-8 text), trailing newlines included."""
    with open(file, encoding='utf-8') as fh:
        lines = fh.readlines()
    return lines
def fen_ci(all_content, top_n=99):
    """Segment *all_content* with jieba and return the most frequent words.

    Args:
        all_content: raw text to segment.
        top_n: how many of the highest-frequency words to return
            (default 99, matching the original hard-coded slice).

    Returns:
        List of up to ``top_n`` words, most frequent first; words listed in
        stopwords.txt (plus newline and space) are excluded.
    """
    # Load stop-word data. BUG FIX: use a distinct name — the original local
    # ``stopwords`` shadowed the module-level nltk.corpus.stopwords import.
    stop_words = {line.strip() for line in read_from_file('stopwords.txt')}
    stop_words.update(['\n', ' '])
    # Word-frequency count over the jieba segmentation.
    segments = {}
    for word in jieba.cut(all_content):
        if word not in stop_words:
            segments[word] = segments.get(word, 0) + 1
    # Rank by descending frequency and keep only the words themselves.
    ranked = sorted(segments.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _count in ranked[:top_n]]
def make_wordclude(words_on_list):
    """Render *words_on_list* as a word cloud and show it with matplotlib."""
    cloud = wordcloud.WordCloud(
        font_path="msyh.ttc",
        width=1000,
        height=700,
        background_color="white",
        max_words=100,
    )
    cloud.generate(' '.join(words_on_list))
    # cloud.to_file("hot_word.jpg")
    plt.figure(figsize=(8, 8.5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Most Popular Words in Title', fontsize=30)
    plt.show()
if __name__ == '__main__':
    # To re-crawl fresh data, uncomment:
    # write_html_to_csv(start_search("平凡的世界", 3))
    comments = read_html_from_csv()
    top_words = fen_ci(comments['comment'].to_string())
    make_wordclude(top_words)