# 用selenium抓取网易云音乐评论 (Scrape NetEase Cloud Music comments with Selenium)

import time
import random
from selenium import webdriver


def crawl(url='https://music.163.com/#/song?id=574919767', pages=1000,
          out_path='comment.txt'):
    """Scrape song comments from a NetEase Cloud Music page into a text file.

    A headless Chrome session loads ``url``, switches into the ``g_iframe``
    frame that holds the comment list, and then walks the pager: on every
    page each comment body is written as one line to ``out_path`` and the
    running count is printed.

    Args:
        url: Song page to open. The default is the song page the script was
            originally written for.
        pages: Number of pager iterations to perform.
        out_path: File that receives the comments (overwritten, UTF-8).

    Raises:
        selenium.common.exceptions.WebDriverException: propagated from any
            failing browser operation (for example a missing element); the
            browser and the output file are still released in that case.
    """
    # Local import: selenium is already a dependency of this file, and the
    # By-based locators work on both Selenium 3 and 4, unlike the removed
    # find_element_by_* helpers.
    from selenium.webdriver.common.by import By

    # Headless Chrome with the GPU disabled.
    opt = webdriver.ChromeOptions()
    opt.add_argument('--headless')
    opt.add_argument('--disable-gpu')
    web = webdriver.Chrome(options=opt)
    try:
        web.get(url)
        # Implicit wait: element lookups poll for up to 5 seconds.
        web.implicitly_wait(5)
        # The comment data lives inside this iframe, so switch into it first
        # (passing the frame name 'contentFrame' would work as well).
        iframe = web.find_element(By.ID, 'g_iframe')
        web.switch_to.frame(iframe)
        # Scroll to the bottom so the pager is in view.
        web.execute_script('scrollTo(0,document.body.scrollHeight)')

        count = 0  # number of comments written so far
        with open(out_path, 'w', encoding='utf-8') as f:
            for _ in range(pages):
                # Every element with class 'itm' is one comment entry of the
                # current page.
                for div in web.find_elements(By.CLASS_NAME, 'itm'):
                    text = div.find_element(By.XPATH, './div[2]/div/div').text
                    # Presumably the text is "<nickname>：<comment>" (confirm
                    # against the page). Split only on the first separator so
                    # a comment containing '：' is kept whole, and fall back
                    # to the full text when no separator is present.
                    _, sep, body = text.partition('：')
                    content = body if sep else text
                    f.write(content + '\n')
                    count += 1
                    print(count)
                # Go to the next page; clicking through JavaScript avoids
                # needing the link to be interactable in the viewport.
                next_page = web.find_element(
                    By.XPATH, '//a[contains(text(),"下一页")]')
                web.execute_script("arguments[0].click();", next_page)
                # Sleep 0.2 to 1 second so the next page can finish loading.
                time.sleep(random.uniform(0.2, 1))
    finally:
        # Always shut the browser down, even when scraping fails midway.
        web.quit()


if __name__ == '__main__':
    # Run the crawl and report how many wall-clock seconds it took.
    began = time.time()
    crawl()
    elapsed = time.time() - began
    print('用时{:.2f}s'.format(elapsed))
# posted @ 2021-11-30 23:19 xuecl 阅读(148) 评论(0) 编辑收藏举报
# 会员力量，点亮园子希望
# 刷新页面返回顶部