Crawling NetEase Cloud Music comments with Scrapy + Selenium

Posted by 中国铁马

Without further ado, let's look at the main spider code first.

```python
import scrapy
from selenium.webdriver import Chrome
from CodeNav.items import CodenavItem


class CodeSpider(scrapy.Spider):
    name = 'code'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://music.163.com/#/playlist?id=2329680016']
    # One shared browser instance; the downloader middleware drives it and
    # hands the rendered HTML back to Scrapy.
    bro = Chrome()

    def parse(self, response, **kwargs):
        # Each div is one comment row in the page rendered inside the iframe.
        div_list = response.xpath('/html/body/div[3]/div[1]/div/div/div[3]/div/div[2]/div[2]/div')
        for div in div_list:
            title = div.xpath('./div[2]/div[1]/div/a/text()').extract_first()
            cnt = div.xpath('./div[2]/div[1]/div/text()').extract_first()
            item = CodenavItem()
            item['title'] = title
            item['cnt'] = cnt
            # print(title + '   ' + cnt)
            yield item

    def close(self, spider):
        # Shut down the browser when the spider finishes.
        self.bro.quit()
```
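The spider imports `CodenavItem` from `CodeNav.items`. The original post doesn't show `items.py`, but a minimal sketch matching the two fields used above might look like this:

```python
# items.py (assumed, not shown in the original post)
import scrapy


class CodenavItem(scrapy.Item):
    title = scrapy.Field()  # text extracted from the <a> node of each comment row
    cnt = scrapy.Field()    # accompanying text of the same row
```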

 

Now let's look at the downloader middleware code.

```python
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
from time import sleep


class CodenavDownloaderMiddleware:

    def process_request(self, request, spider):
        # Reuse the browser instance created on the spider.
        web = spider.bro
        web.get(request.url)
        sleep(2)
        # Scroll to the bottom so lazily loaded content is rendered.
        web.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(2)
        # The comments are nested inside an iframe; the argument is the iframe's id.
        web.switch_to.frame('g_iframe')
        sleep(2)
        page_count = web.page_source
        # print(page_count)
        # Hand the Selenium-rendered HTML back to Scrapy in place of the normal download.
        return HtmlResponse(url=request.url, body=page_count, encoding='utf-8', request=request)
        # return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
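For the middleware to actually intercept requests, it has to be enabled in `settings.py`. The original post doesn't show the settings, but assuming the default project layout for a project named `CodeNav`, the relevant entries would look roughly like this:

```python
# settings.py (assumed, not shown in the original post)
DOWNLOADER_MIDDLEWARES = {
    'CodeNav.middlewares.CodenavDownloaderMiddleware': 543,
}
ROBOTSTXT_OBEY = False   # often needed for practice crawls like this one
LOG_LEVEL = 'ERROR'      # optional: keep the console output readable
```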

***Note***: NetEase Cloud Music's comments are all nested inside an iframe, so you must switch to the corresponding iframe; after making that change you can print the page source to verify it.
The rest is standard Scrapy work. Since my goal here was only to practice using Selenium inside Scrapy, I crawled just the first page of comments; crawling the following pages requires some modifications, roughly along the lines of the sketch below.
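One way to handle later pages is to have the browser (already switched into the iframe) click the "下一页" (next page) control and hand back each refreshed `page_source`. This is only a hypothetical sketch, not part of the original post; the `By.PARTIAL_LINK_TEXT` selector and the page count are assumptions and should be checked against the real markup:

```python
# Hypothetical helper (not from the original post): page through the comments
# inside the iframe that spider.bro has already switched into.
from time import sleep

from selenium.webdriver.common.by import By


def iter_comment_pages(web, max_pages=3):
    """Yield the rendered HTML of the current comment page and a few more."""
    yield web.page_source
    for _ in range(max_pages - 1):
        try:
            # Assumed selector for the "next page" control -- verify it in the DOM.
            next_btn = web.find_element(By.PARTIAL_LINK_TEXT, '下一页')
        except Exception:
            break  # no further pages
        next_btn.click()
        sleep(2)  # crude wait for the next page of comments to render
        yield web.page_source
```

Each yielded page could then be wrapped in an `HtmlResponse` the same way `process_request` does above.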
