Web scraping with Scrapy: crawling the comments of a specified Weibo post

1. Create the Scrapy project

The project used throughout this post is named scrapy_xinlangweibo; it can be created with `scrapy startproject scrapy_xinlangweibo`.

2. Create the weibo.py spider file

Inside the project, generate the spider file weibo.py (for example with `scrapy genspider weibo weibo.com`). Observing the browser's network requests shows that each page of comments is fetched from an endpoint of the following form:

https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4794437614699837&is_show_bulletin=2&is_mix=0&max_id=138998668569843&count=20&uid=5145725878
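The parameters that matter are id (the id of the post whose comments are being fetched), uid (the user id of the post's author), max_id (the pagination cursor returned by each response; 0 or absent for the first page), count (page size) and flow (sort order). A minimal standalone sketch for fetching one page with the requests library, just to inspect the JSON, might look like this (it is not part of the Scrapy project, and the cookie value is a placeholder you must replace with your own):

# Standalone sketch (not part of the Scrapy project): fetch one page of comments
# to inspect the JSON; the cookie value below is a placeholder.
import requests

params = {
    'flow': 0,                  # 0 = sort by popularity, 1 = sort by time
    'is_reload': 1,
    'id': 4794437614699837,     # id of the Weibo post
    'is_show_bulletin': 2,
    'is_mix': 0,
    'max_id': 0,                # pagination cursor from the previous response (0 for page 1)
    'count': 20,
    'uid': 5145725878,          # user id of the post's author
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'cookie': '<your logged-in weibo.com cookie>',
    'referer': 'https://weibo.com/',
}
resp = requests.get('https://weibo.com/ajax/statuses/buildComments',
                    params=params, headers=headers, timeout=10)
payload = resp.json()
print(payload.get('max_id'), len(payload.get('data', [])))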
The code for weibo.py:
import json
from datetime import datetime
import time
import scrapy
from bs4 import BeautifulSoup
from scrapy_xinlangweibo.items import ScrapyXinlangweiboItem


class WeiboSpider(scrapy.Spider):
    # Spider name, used when running the crawler (command: scrapy crawl weibo)
    # https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4794437614699837&is_show_bulletin=2&is_mix=0&max_id=138998668569843&count=20&uid=5145725878
    name = 'weibo'
    # Domains the spider may crawl; use the bare domain so the paginated
    # requests to weibo.com are not filtered as off-site
    allowed_domains = ['weibo.com']

    # id of the target Weibo post, e.g. 4794437614699837 or 4763201759608894
    id = 4763201759608894
    # user id of the post's author, e.g. 5145725878 or 5993531560
    uid = 5993531560
    # pagination cursor (the max_id returned by the previous response)
    max_id = 0

    # sort order: 0 = by popularity, 1 = by time
    flow = 0
    # start URL: the first request the spider issues
    start_urls = ['https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={}&is_show_bulletin=2&is_mix=0&count=10&uid={}'.format(id, uid)]
    #start_urls=['https://weibo.com/ajax/statuses/buildComments?flow={}&is_reload=1&id={}&is_show_bulletin=2&is_mix=0&max_id={}&count=20&uid={}'.format(flow,id,max_id,uid)]
    # URL template for the paginated requests
    con_url = 'https://weibo.com/ajax/statuses/buildComments?flow={}&is_reload=1&id={}&is_show_bulletin=2&is_mix=0&max_id={}&count=20&uid={}'
    page = 1

    def parse(self, response):
        # anti-scraping checks: inspect the UA / cookie actually sent
        #print(response.request.headers['User-Agent'])
        #print(response.request.headers['cookie'])
        # parse the raw JSON response
        jsonObj = json.loads(response.text)
        #print(jsonObj)

        # list of comments on this page
        data = jsonObj["data"]

        # proceed only if this page contains comments
        if len(data) > 0:
            # the pagination cursor is always present; when the page has data,
            # take the new max_id as the parameter for the next request
            self.max_id = jsonObj["max_id"]
            for item in data:
                try:
                    # comment id
                    commentId = item["id"]
                    # comment text (strip the HTML markup)
                    commentContent = BeautifulSoup(item["text"], "html.parser").text
                    # comment time, e.g. 'Tue Jul 26 10:30:00 +0800 2022'
                    time_format = datetime.strptime(item["created_at"], '%a %b %d %H:%M:%S %z %Y')
                    time_format = str(time_format)
                    times = time_format[0:19]
                    createdAt = times
                    # client the comment was posted from
                    source = item["source"]
                    # number of likes
                    fabulous = item["like_counts"]
                    # number of replies to this comment
                    comment = item["total_number"]
                    # commenter's screen name
                    name = item["user"]["screen_name"]
                    # crawl time
                    createTime = datetime.now()
                    # pagination cursor recorded with the item
                    maxId = self.max_id

                    # hand the item to the pipelines for persistence
                    wb = ScrapyXinlangweiboItem(name=name, createdAt=createdAt, source=source, comment=comment, fabulous=fabulous,
                                                createTime=createTime, commentContent=commentContent, maxId=maxId)
                    yield wb
                except Exception:
                    # skip comments that are missing expected fields
                    continue
        # keep paginating while the response reports a next page
        if self.max_id > 0:
            try:
                self.page = self.page + 1
                print('MAX_ID>>>', self.max_id)
                url = self.con_url.format(self.flow, self.id, self.max_id, self.uid)
                #print('REQUEST_URL>>>', url)
                # throttle between pages (note that time.sleep blocks Scrapy's reactor;
                # DOWNLOAD_DELAY in settings.py is the idiomatic alternative)
                time.sleep(20)
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
            except Exception:
                print("Error while crawling page {}!".format(self.page))
            else:
                print("Crawled page {} successfully, valid comments: {}".format(self.page, len(data)))
        else:
            print("All comments crawled!")
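For reference, the created_at field returned by the endpoint is an English-locale timestamp, which is why parse() uses the '%a %b %d %H:%M:%S %z %Y' format and keeps only the first 19 characters of the result. A tiny standalone check (the sample value is illustrative, not real data):

# Illustrative check of the created_at parsing used in parse(); the sample
# timestamp is made up but follows the format the endpoint returns.
from datetime import datetime

created_at = 'Tue Jul 26 10:30:00 +0800 2022'
parsed = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
print(str(parsed)[0:19])   # -> 2022-07-26 10:30:00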

3. items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# Data model for the Sina Weibo comment records
class ScrapyXinlangweiboItem(scrapy.Item):
    # define the fields for your item here like:
    # screen name (in this spider: the commenter's)
    name = scrapy.Field()
    # gender (not used by this spider)
    sex = scrapy.Field()
    # post time (not used by this spider)
    time = scrapy.Field()
    # source (client the comment was posted from)
    source = scrapy.Field()
    # post text (not used by this spider)
    txt = scrapy.Field()
    # number of reposts (not used by this spider)
    forward = scrapy.Field()
    # number of replies to the comment
    comment = scrapy.Field()
    # number of likes
    fabulous = scrapy.Field()
    # comment text
    commentContent = scrapy.Field()
    # crawl time
    createTime = scrapy.Field()
    # comment time
    createdAt = scrapy.Field()
    # pagination cursor
    maxId = scrapy.Field()
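Scrapy items behave like dictionaries, which is how both the spider and the MySQL pipeline below address fields by name. A quick illustration with made-up values (ItemAdapter is the wrapper already imported in the generated pipeline and middleware files):

# Quick illustration with made-up values: items support dict-style access,
# and ItemAdapter wraps them with a uniform interface.
from itemadapter import ItemAdapter
from scrapy_xinlangweibo.items import ScrapyXinlangweiboItem

wb = ScrapyXinlangweiboItem(name='某网友', commentContent='不错', fabulous=3, maxId=0)
print(wb['name'])
print(ItemAdapter(wb).asdict())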

4. UA pool, proxy IP pool and cookie pool (middlewares.py)

The random rotation of User-Agent, cookie and proxy happens in the downloader middleware below; the spider middleware generated by Scrapy is left unchanged.

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapyXinlangweiboSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapyXinlangweiboDownloaderMiddleware:
    # User-Agent pool (each entry is one complete UA string)
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
        'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24'
    ]

    # HTTP proxy pool
    PROXY_http = [
        'http://62.234.158.92:80',
        'http://39.100.106.214:80',
        'http://39.99.210.228:80',
        'http://112.101.93.4:8888',
        'http://153.180.102.104:80',
        'http://195.208.131.189:56055',
        'http://61.135.217.7:80',
        'http://219.141.153.38:80',
        'http://583349285:2zectsyx@139.196.76.78:16816'
    ]
    PROXY_https = [
        'https://62.234.158.92:80',
        'https://39.100.106.214:80',
        'https://39.99.210.228:80',
        'https://120.83.49.90:9000',
        'https://95.189.112.214:35508',
        'https://113.226.18.243:80',
        'https://121.31.100.209:8123',
        'https://14.117.177.135:808',
        'https://171.223.230.46:61234',
        'https://117.57.90.121:25435',
        'https://175.11.214.29:808',
        'https://118.190.145.138:9001',
        'https://182.112.89.23:8118',
        'https://221.228.17.172:8181',
        'https://115.46.70.48:8123',
        'https://110.88.30.36:808',
        'https://110.87.104.153:8118',
        'https://1.195.25.204:61234',
        'https://119.186.241.31:61234',
        'https://175.155.152.41:61234',
        'https://27.31.103.233:21973',
        'https://125.105.110.4:3128',
        'https://114.222.24.111:808',
        'https://140.250.180.229:61234',
        'https://120.83.98.216:61234',
        'https://175.155.223.179:61234',
        'https://115.198.37.56:6666',
        'https://115.46.74.192:8123',
        'https://106.56.102.39:8070',
        'https://125.121.121.155:6666',
        'https://219.157.147.113:8118',
        'https://117.66.167.57:8118',
        'https://183.128.242.93:6666',
        'https://115.198.39.24:6666',
        'https://114.223.162.171:8118',
        'https://115.46.89.82:8123',
        'https://58.208.16.70:37436',
        'https://123.188.6.176:1133',
        'https://112.195.51.225:61234',
        'https://112.193.131.17:8118',
        'https://221.234.250.204:8010',
        'https://49.79.67.119:61234',
        'https://220.184.215.223:6666',
        'https://180.121.134.176:808',
        'https://122.246.48.118:8010',
        'https://119.7.59.13:61234',
        'https://27.54.248.42:8000',
        'https://59.32.37.99:8010',
        'https://220.191.100.253:6666',
        'https://112.193.70.85:61234',
        'https://60.167.128.91:48963',
        'https://119.4.70.128:61234',
        'https://182.88.166.148:8123',
        'https://113.117.65.112:61234',
        'https://115.226.129.195:61234',
        'https://106.75.71.122:80',
        'https://125.122.171.167:6666',
        'https://125.118.144.247:6666',
        'https://60.184.173.221:8070',
        'https://60.190.250.120:8080',
        'https://36.6.146.199:47025',
        'https://106.56.102.78:808',
        'https://119.7.225.218:61234',
        'https://583349285:2zectsyx@139.196.76.78:16816'
    ]

    # cookie pool (sample cookies from logged-in weibo.com sessions; substitute your own)
    cookie_list=[
        'SUB=_2A25PiFDGDeRhGeRH7lYR9C_LzTmIHXVtc3COrDV8PUJbkNAKLRH2kW1NTbHM8wP7UPnTjvTAmox62rVYXbj0cyiW; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5oz75NHD95QE1K-XehBpS0qfWs4Dqcj1i--Xi-iFiKnpehnp9sMt; SINAGLOBAL=4669199551025.64.1653350613001; _s_tentry=-; Apache=9310493089723.533.1658795311962; ULV=1658795312024:3:2:2:9310493089723.533.1658795311962:1658709963132; XSRF-TOKEN=PMY7HEw_E1A51awyzJEml7gS; WBPSESS=Q8ByW2eX66YxwLnF-6h7NbWzJdod3892U-ZXN9pdE9QrbKDsJsoAQn3RCW8SlWji79oYfXnJFhKrQwCUGGOZXF5ZAuRZsyVYCnw6mBAvWTX5sYbh4j4N_skQz50KRJ8JDI487rDXbHNazhVdKMJ4Ug==',
        'XSRF-TOKEN=aHnFGRklNEU4rdayhzyEhoZb; PC_TOKEN=f714a6b772; login_sid_t=8d0e4f48b2ca90b52181f0b23b277a16; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=weibo.com; Apache=8185391731761.271.1658820521668; SINAGLOBAL=8185391731761.271.1658820521668; ULV=1658820521671:1:1:1:8185391731761.271.1658820521668:; wb_view_log=3440*14401; WBtopGlobal_register_version=2022072615; crossidccode=CODE-yf-1Ogf0O-463hJp-xz3rYFEpMBzBQ8D3cc764; SSOLoginState=1658820594; SUB=_2A25P2-eiDeRhGeNJ71YS9S7Jwz2IHXVtJ4nqrDV8PUJbkNANLUetkW1NS9gGQVNCt2O718FvhzkNoj-tGVOvwedC; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW_z_5hzMdm7L6VJJrXnUf45NHD95QfS0BXe0-7SKnpWs4DqcjgehBfeCH8SC-R1F-41FH8SbHWBEHWentt; WBPSESS=KHlQbysJR7-_YNKdbmdSub9zk3fLWZIS4gEZ7V7aipfCOcZk0-5iV9g4Li4-DK3ur5W-8J2x02MCBEQ7TOYrcjJ34r2RndtyULSYCcvDvJwRz8MSmcO9WCIqGKwYO-ivKC2t9jlFlM-Yisjv_HbNuw==',
        'XSRF-TOKEN=VHs0rjmnY88BHKKR8STwXyKa; PC_TOKEN=600b614632; _s_tentry=www.baidu.com; UOR=www.baidu.com,open.weibo.com,www.baidu.com; Apache=4628955910613.021.1658820786220; SINAGLOBAL=4628955910613.021.1658820786220; ULV=1658820786226:1:1:1:4628955910613.021.1658820786220:; login_sid_t=b82084eadbb38ebc50eeebfaa383b342; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; wb_view_log=1920*10801; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFF90r4yx1dYiQLs_GgQzOX5JpX5o275NHD95QcSh5XShM41KMfWs4DqcjUB.HkB.vexFH81CHW1CHWBEH8SE-4SFHFeoqR1hnt; SSOLoginState=1658820861; SUB=_2A25P2-lQDeRhGeBK61YV9yfFzDmIHXVskV2YrDV8PUNbmtB-LUbWkW9NR_ov0TIBeUK5-S-1R4SovXtuDCjk8OPS; ALF=1690356861; WBPSESS=_yD7d1e0Lx4Q-bSq9ElPWdiywqHVlO6SWR-pLheG9PfKKZe_DQqOGIXITXz70o28K-G7t1QHVTpZmfjSSOjAHkkB-7Eho3zU53JSHhuELaj0iYRXPIuPS6cLm-CYtc5h1Jto6Sd59cNSGTFT1uZHCg==',
        'SINAGLOBAL=1115780942584.974.1658635968559; ULV=1658635968565:1:1:1:1115780942584.974.1658635968559:; XSRF-TOKEN=sT1VFPal6Esg0qCjb8tVD8Mq; SUB=_2A25P2-vfDeRhGeBJ6VoV9ynNyj2IHXVskVoXrDV8PUNbmtB-LWn5kW9NRkZg4J5iNTEgfo49HotkSza3PQkTaHdG; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFXRGd-qUob5L7Bm_6kME4P5JpX5KzhUgL.FoqNeonXS0MpeK22dJLoI0YLxK-L1-zL1-zLxKqL1-eL1hnLxKBLB.zLB.BLxKML1-2L1hBLxK-LBo5L12qLxKnL1-zL12zLxKBLB.zL1KBt; ALF=1690357519; SSOLoginState=1658821519; WBPSESS=vsdYZqdoCl1sU3su5usyXiWJQIXJDgJxdJcFSAC-EtdryyK_v8pHujB9YV92Pc8W9RdrvuTKHytoQZ0miMn2Nkg5ksGlZ4Ofy1XunAhJHyILZlio-HHpXCNRMsTyov0vYZDagqIA_Q2xQOQOATr6sg==; PC_TOKEN=8b446a2d7c',
        'SINAGLOBAL=4669199551025.64.1653350613001; _s_tentry=-; Apache=9310493089723.533.1658795311962; ULV=1658795312024:3:2:2:9310493089723.533.1658795311962:1658709963132; XSRF-TOKEN=PMY7HEw_E1A51awyzJEml7gS; login_sid_t=d946353afd8df8809cf52b84c9d56e62; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; wb_view_log=1920*10801; SUB=_2A25P29F2DeRhGeFK4loT-S3NzjWIHXVskUW-rDV8PUNbmtAKLUGjkW9NQqQjrxHSUSK14a35RIh8zYbU7z-Rr_GG; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4Dbhgg_lARclhPbOYA5xm5JpX5KzhUgL.FoMX1KnE1KepSK.2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNSh.Reo.0eK-4; ALF=1690358950; SSOLoginState=1658822950; PC_TOKEN=18dbaa03eb; wvr=6; wb_view_log_7498293159=1920*10801; webim_unReadCount=%7B%22time%22%3A1658822966290%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D; WBPSESS=Dt2hbAUaXfkVprjyrAZT_NVqfvP7CID8jIb0n0yBlmOtM57xlaAyx9zUEpyxzH25quqtSoCuMUPVbND4gXMuunuX3C3LAAdZ1T3_PYXo9aJlUl_g2zwsjVo9BxyPsQuoB_Q8GdWCbC0yg3bu9Pve0PfGGcLlsX-p9i4rSnCGw5g=',
        'PC_TOKEN=6844a497ab; login_sid_t=80940089e5c99a3eecfec30d09123bc2; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=passport.weibo.com; Apache=7108406010788.637.1658823054338; SINAGLOBAL=7108406010788.637.1658823054338; ULV=1658823054340:1:1:1:7108406010788.637.1658823054338:; wb_view_log=1707*10671.5; WBtopGlobal_register_version=2022072616; crossidccode=CODE-tc-1OgfEG-24jNO2-26adT2x3JOUCXHL4a50d5; UOR=,,graph.qq.com; SUB=_2A25P29G0DeRhGeRL7lQX8ifEyDmIHXVskUR8rDV8PUNbmtAKLUfMkW9NUxIqr1Ql1TsFoO7-bUG4g9-WNiIlkoKv; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFU3-ypVYr3Nx9PSyu4g9VZ5JpX5KzhUgL.FozfSKqceo.Re0-2dJLoIEnLxK.LB.-L1K.LxKqLBo2L1-qLxK.L1-BL1KzLxK-LBo.LBoB41hzc; ALF=1690359138; SSOLoginState=1658823140; wvr=6',
        'XSRF-TOKEN=0hLGEnnKp5G9LN5YcjImd8ws',
        'SINAGLOBAL=4669199551025.64.1653350613001; wb_view_log=1920*10801; wvr=6; wb_view_log_7498293159=1920*10801; webim_unReadCount=%7B%22time%22%3A1658822966290%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D; XSRF-TOKEN=JBlRQkpft06UZV7iraI0uErM; PC_TOKEN=83467e908a; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5KMhUgL.Foz4SKB7Sh2NSo-2dJLoI79-KLU1xHzt; ALF=1690372094; SSOLoginState=1658836094; SCF=AqGF443jTI-VnS382ShffcHWQtfFQOpHNwMOX8cXFlasO4rTnDyDrnElJ_LLl2Ds7NTvIUsdguQyrchkMZqBfCY.; SUB=_2A25P26QuDeRhGeRH7lYR9C_LzTmIHXVskJLmrDV8PUNbmtAKLW72kW9NTbHM80kR_n2kXr3TqD4At7-q0D0_uxpF; _s_tentry=weibo.com; Apache=2356552200861.233.1658836108863; ULV=1658836108904:4:3:3:2356552200861.233.1658836108863:1658795312024; WBPSESS=Dt2hbAUaXfkVprjyrAZT_ChHS3X1TW-AxSmMe79akhJ6zWbLhSGfgAlvYpclUtRZTDkEqbTwyBiyymCqtMzosRwY7DuzYTrtbFieO-tZNJTDHyfrTmjNAJIwnGRrHRZDt_viO5WKWsUkQx4H8xge1ACYKutmbhgaY40kD1T9OKTeupUgTD0oNMcuJrSAJrgWOtNxE6AfEQSDjC6nzNjIxA=='
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Intercept every normal (non-exception) request and apply the UA pool and cookie pool
    def process_request(self, request, spider):
        #print('process_request')
        # disguise the request with a random User-Agent from the pool
        # request.headers behaves like a dict
        # this step is optional: a single shared User-Agent could instead be set in settings.py
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.headers['cookie'] = random.choice(self.cookie_list)

        request.headers['referer']='https://weibo.com/5993531560/LDCKJjIDP'
        request.headers['sec-ch-ua']='".Not/A)Brand";v = "99", "Google Chrome";v = "103", "Chromium";v = "103"'
        request.headers['sec-ch-ua-mobile']='?0'
        request.headers['sec-ch-ua-platform']='"Windows"'
        request.headers['client-version']='v2.34.73'
        request.headers['accept']='application/json,text/plain,*/*'

        #print(request.headers['User-Agent'])
        #print(request.headers['cookie'])
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
    # Intercept requests that raised an exception and route the retry through a proxy IP
    def process_exception(self, request, exception, spider):
        # needed once the site starts refusing requests after too many hits:
        # switch to a proxy IP and keep crawling
        # pick a proxy whose scheme matches the request URL
        # request.url is the URL of the failed request
        #print('process_exception')
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_https)
        # return the request so that Scrapy reschedules it through the chosen proxy
        return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
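Free proxies such as the ones hard-coded above go stale quickly, so it is worth pruning the pools before a long crawl. A minimal sketch of such a check (it uses the requests library, an extra dependency that is not part of the Scrapy project; any reachable URL can serve as the probe target):

# Minimal sketch for dropping dead proxies before a crawl; requests is an
# extra dependency, not part of the Scrapy project above.
import requests

def alive(proxy, timeout=5):
    try:
        r = requests.get('https://weibo.com',
                         proxies={'http': proxy, 'https': proxy},
                         timeout=timeout)
        return r.ok
    except requests.RequestException:
        return False

# usage: PROXY_https = [p for p in PROXY_https if alive(p)]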

5. Pipelines (pipelines.py)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrapyXinlangweiboPipeline:
    def process_item(self, item, spider):
        return item

# load the project settings
from scrapy.utils.project import get_project_settings
# MySQL driver
import pymysql
# Pipeline: persist items to MySQL
class MysqlPipeline:
    def open_spider(self,spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn =pymysql.connect(
                            host=self.host,
                            port=self.port,
                            user=self.user,
                            password=self.password,
                            db=self.database,
                            charset=self.charset
        )

        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # insert the item into the database
        # sql = 'insert into xinlangweibo(name,time,source,txt,forward,comment,fabulous,createTime,commentContent) values("{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(
        #     item['name'], item['time'].strip(), item['source'], item['txt'], item['forward'], item['comment'],
        #     item['fabulous'], item['createTime'], item['commentContent'])
        # parameterised query, so quotes inside the comment text cannot break the statement
        sql = ('insert into xinlangweibo1(name,createdAt,source,comment,fabulous,createTime,commentContent,maxId) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s)')
        # execute the statement
        self.cursor.execute(sql, (item['name'], item['createdAt'], item['source'], item['comment'],
                                  item['fabulous'], item['createTime'], item['commentContent'], item['maxId']))
        # commit the transaction
        self.conn.commit()
        return item
    def close_spider(self,spider):
        self.cursor.close()
        self.conn.close()
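The MysqlPipeline assumes a table named xinlangweibo1 already exists in the configured database. The original post does not show the schema; the column types below are an assumption, and any schema whose column names match the INSERT statement will work (host, user, password and database are the placeholders from settings.py):

# Assumed schema for the xinlangweibo1 table. Column names follow the INSERT
# in MysqlPipeline; the column types are a guess, not from the original post.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS xinlangweibo1 (
    id             INT AUTO_INCREMENT PRIMARY KEY,
    name           VARCHAR(255),
    createdAt      VARCHAR(32),
    source         VARCHAR(255),
    comment        INT,
    fabulous       INT,
    createTime     DATETIME,
    commentContent TEXT,
    maxId          BIGINT
)
"""

conn = pymysql.connect(host='Ip', port=3306, user='root',
                       password='密码', db='数据库', charset='utf8')
with conn.cursor() as cur:
    cur.execute(DDL)
conn.commit()
conn.close()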

6. Settings (settings.py)

# Scrapy settings for scrapy_xinlangweibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_xinlangweibo'

SPIDER_MODULES = ['scrapy_xinlangweibo.spiders']
NEWSPIDER_MODULE = 'scrapy_xinlangweibo.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_xinlangweibo (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ignore robots.txt by leaving the default setting commented out
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False


# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
#   'Accept-Language': 'zh-CN,zh;q=0.9',
#   'cookie': 'SUB=_2A25PiFDGDeRhGeRH7lYR9C_LzTmIHXVtc3COrDV8PUJbkNAKLRH2kW1NTbHM8wP7UPnTjvTAmox62rVYXbj0cyiW; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5oz75NHD95QE1K-XehBpS0qfWs4Dqcj1i--Xi-iFiKnpehnp9sMt; SINAGLOBAL=4669199551025.64.1653350613001; _s_tentry=-; Apache=9310493089723.533.1658795311962; ULV=1658795312024:3:2:2:9310493089723.533.1658795311962:1658709963132; XSRF-TOKEN=PMY7HEw_E1A51awyzJEml7gS; WBPSESS=Q8ByW2eX66YxwLnF-6h7NbWzJdod3892U-ZXN9pdE9QrbKDsJsoAQn3RCW8SlWji79oYfXnJFhKrQwCUGGOZXF5ZAuRZsyVYCnw6mBAvWTX5sYbh4j4N_skQz50KRJ8JDI487rDXbHNazhVdKMJ4Ug==',
#   'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
# }
HTTPERROR_ALLOWED_CODES = [400]

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_xinlangweibo.middlewares.ScrapyXinlangweiboSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   'scrapy_xinlangweibo.middlewares.ScrapyXinlangweiboDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# database configuration (the setting names must be upper-case)
DB_HOST = 'Ip'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = '密码'
DB_NAME = '数据库'
# write the charset as 'utf8'; the dash in 'utf-8' is not accepted and raises an error
DB_CHARSET = 'utf8'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'scrapy_xinlangweibo.pipelines.ScrapyXinlangweiboPipeline': 300,
   'scrapy_xinlangweibo.pipelines.MysqlPipeline': 301
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
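With all six pieces in place, start the crawl from the project root with `scrapy crawl weibo`. If you prefer launching it from an IDE, a small runner script (a common convenience, not part of the original post) also works:

# run.py at the project root: starts the 'weibo' spider without using the command line.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('weibo')   # the spider name defined in weibo.py
    process.start()          # blocks until the crawl finishes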

 
