周报10

豆瓣评论的爬取和词云展示
import jieba
import nltk
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import etree
import pandas as pd
import re
import wordcloud
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

from selenium.webdriver.common.keys import Keys


def _parse_comments(html):
    """Extract one record per comment ``<li>`` from a rendered comments page."""
    records = []
    for node in etree.HTML(html).xpath('.//div[@id="comments"]/div/ul/li'):
        # The rating is only available through a CSS class name: take the
        # first digit 0-5 found in it, or -1 when the class holds no digit.
        rating_class = ''.join(node.xpath('.//div[2]/h3/span[2]/span[1]/@class'))
        digits = re.findall(r'[0-5]', rating_class)
        records.append({
            'name': ''.join(node.xpath('.//div[2]/h3/span[2]/a/text()')),
            'score': digits[0] if digits else -1,
            'date': ''.join(node.xpath('.//div[2]/h3/span[2]/span[2]/text()')),
            'zan': ''.join(node.xpath('.//div[2]/h3/span[1]/span/text()')),
            'comment': ''.join(node.xpath('.//div[2]/p/span/text()')),
        })
    return records


def start_search(key, page):
    """Search douban.com for ``key`` and scrape comments of the first result.

    Drives a Chrome browser: types ``key`` into the home-page search box,
    opens the first search result (which is expected to open in a new
    window), follows the link in the comments section header and then parses
    ``page`` consecutive comment pages.

    Args:
        key: Search text typed into the Douban search box.
        page: Number of comment pages to parse.

    Returns:
        A pandas DataFrame with one row per comment and the columns
        ``name``, ``score``, ``date``, ``zan`` and ``comment``. ``score`` is
        the digit string found in the rating class, or the int ``-1``.

    Raises:
        selenium.common.exceptions.TimeoutException: If an expected element
            or the new result window does not appear within 20 seconds.
    """
    driver = webdriver.Chrome(r'E:\chrome\new\chromedriver.exe')  # r'E:\chrome\chromedriver.exe'
    try:
        driver.maximize_window()

        driver.get('https://www.douban.com/')
        wait = WebDriverWait(driver, 20)
        # Locate the search box, type the query and submit it.
        _input = wait.until(EC.presence_of_element_located((By.XPATH, './/div[@class="anony-srh"]/form/span[1]/input')))
        _input.send_keys(key)

        submit = wait.until(EC.element_to_be_clickable((By.XPATH, './/div[@class="anony-srh"]/form/span[2]/input')))
        submit.send_keys(Keys.ENTER)

        handles_before = driver.window_handles
        result = wait.until(EC.element_to_be_clickable((By.XPATH, './/div[@class="search-result"]/div[2]/div[1]/div[2]/div/h3/a')))
        result.click()

        # Wait for the new window to exist before looking it up; reading
        # window_handles right after the click can race with the browser.
        wait.until(EC.new_window_is_opened(handles_before))
        new_handles = [handle for handle in driver.window_handles if handle not in handles_before]
        driver.switch_to.window(new_handles[0])  # switch to the newly opened window
        time.sleep(2)

        wait.until(EC.element_to_be_clickable((By.XPATH, './/div[@id="comments-section"]/div/h2/span[2]/a'))).click()
        time.sleep(5)

        itemlist = []
        for page_index in range(page):
            itemlist.extend(_parse_comments(driver.page_source))
            # Only move on when another page is still to be parsed, so the
            # last requested page does not trigger a pointless navigation.
            if page_index < page - 1:
                # NOTE(review): this clicks the first <a> under #paginator;
                # confirm it is the "next page" link on every page.
                wait.until(EC.element_to_be_clickable((By.XPATH, './/div[@id="paginator"]/a'))).click()
                time.sleep(5)
        return pd.DataFrame(itemlist)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()


def write_html_to_csv(s, path='comment.csv'):
    """Save the scraped comments as a CSV file, without the index column.

    Args:
        s: Object exposing ``to_csv`` (e.g. the DataFrame returned by
            ``start_search``).
        path: Destination file. Defaults to ``comment.csv`` in the current
            working directory, which was previously the hard-coded target.
    """
    s.to_csv(path, index=False)


def read_html_from_csv(path="comment.csv"):
    """Load previously saved comments from a CSV file.

    Args:
        path: CSV file to read. Defaults to ``comment.csv`` in the current
            working directory, which was previously the hard-coded source.

    Returns:
        The file contents as a pandas DataFrame.
    """
    return pd.read_csv(path)


def read_from_file(file):
    """Return every line of the UTF-8 text file ``file`` as a list.

    Line terminators are kept on the returned strings.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        return list(handle)


def fen_ci(all_content):
    """Segment text with jieba and return its most frequent tokens.

    Tokens listed in ``stopwords.txt`` (one per line, surrounding whitespace
    stripped), as well as newline and single-space tokens, are ignored.

    Args:
        all_content: The text to segment.

    Returns:
        A list of at most 99 distinct tokens ordered by descending
        frequency; tokens with equal counts keep first-seen order.
    """
    # A set gives O(1) membership tests inside the token loop. The local name
    # also no longer shadows the ``stopwords`` imported from nltk.
    stop_words = {line.strip() for line in read_from_file('stopwords.txt')}
    stop_words.update(('\n', ' '))

    # Count how often each remaining token occurs.
    segments = {}
    for word in jieba.cut(all_content):
        if word not in stop_words:
            segments[word] = segments.get(word, 0) + 1

    # Rank by frequency; sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(segments.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _count in ranked[:99]]


def make_wordclude(words_on_list):
    """Build a word cloud from ``words_on_list`` and show it with matplotlib.

    The words are joined with single spaces and handed to a 1000x700
    WordCloud that uses the ``msyh.ttc`` font on a white background.

    NOTE(review): when every word occurs once in the list (as in the list
    returned by ``fen_ci``), the joined text carries no frequency
    information - confirm that equal weighting is intended.
    """
    cloud = wordcloud.WordCloud(
        font_path="msyh.ttc",
        width=1000,
        height=700,
        background_color="white",
        max_words=100,
    )
    cloud.generate(' '.join(words_on_list))

    # Display the rendered cloud without axes.
    plt.figure(figsize=(8, 8.5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Most Popular Words in Title', fontsize=30)
    plt.show()


if __name__ == '__main__':
    # To refresh comment.csv, run the scraper first:
    # write_html_to_csv(start_search("平凡的世界", 3))
    comments = read_html_from_csv()
    # Join the raw comment strings. Series.to_string() would also feed the
    # row index labels and "NaN" placeholders for empty comments to jieba.
    all_text = '\n'.join(comments['comment'].dropna().astype(str))
    make_wordclude(fen_ci(all_text))
posted @ 2022-05-08 08:50 我的未来姓栗山 阅读(18) 评论(0) 编辑 收藏 举报
刷新页面返回顶部
我的未来姓栗山

周报10

公告