Scrapy crawler: scraping the comments of a specified Weibo post
1. Create the Scrapy project (scrapy startproject scrapy_xinlangweibo)
2. Create the weibo.py spider file (for example with scrapy genspider weibo www.weibo.com inside the project directory)
By watching the browser's network requests, you can see that each page of comments is fetched from an API endpoint like:
https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4794437614699837&is_show_bulletin=2&is_mix=0&max_id=138998668569843&count=20&uid=5145725878
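Before wiring this into Scrapy, it helps to call the endpoint once by hand and inspect the JSON it returns. The snippet below is a minimal standalone check (it assumes you paste in a cookie from a logged-in weibo.com session and any desktop User-Agent; without them Weibo returns an empty data list). The data and max_id fields it prints are exactly what the spider relies on.

import requests

url = ('https://weibo.com/ajax/statuses/buildComments'
       '?is_reload=1&id=4794437614699837&is_show_bulletin=2&is_mix=0&count=20&uid=5145725878')
headers = {
    'user-agent': 'Mozilla/5.0 ...',      # any desktop browser UA
    'cookie': 'SUB=...; SUBP=...',        # copy from a logged-in weibo.com session
}
obj = requests.get(url, headers=headers, timeout=10).json()
print(len(obj['data']), 'comments on this page')
print('next max_id:', obj['max_id'])      # pass as max_id to request the following page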
The code for weibo.py:
import json
import time
from datetime import datetime

import scrapy
from bs4 import BeautifulSoup

from scrapy_xinlangweibo.items import ScrapyXinlangweiboItem


class WeiboSpider(scrapy.Spider):
    # Spider name, used on the command line: scrapy crawl weibo
    # Target endpoint:
    # https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4794437614699837&is_show_bulletin=2&is_mix=0&max_id=138998668569843&count=20&uid=5145725878
    name = 'weibo'
    # Domains the spider is allowed to visit
    allowed_domains = ['www.weibo.com']
    # Weibo post id (e.g. 4794437614699837 or 4763201759608894)
    id = 4763201759608894
    # Blogger user id (e.g. 5145725878 or 5993531560)
    uid = 5993531560
    # Paging cursor returned by the API
    max_id = 0
    # Sort order: 0 = by popularity, 1 = by time
    flow = 0
    # First URL to request
    start_urls = ['https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={}&is_show_bulletin=2&is_mix=0&count=10&uid={}'.format(id, uid)]
    # start_urls = ['https://weibo.com/ajax/statuses/buildComments?flow={}&is_reload=1&id={}&is_show_bulletin=2&is_mix=0&max_id={}&count=20&uid={}'.format(flow, id, max_id, uid)]
    # URL template for the follow-up (paged) requests
    con_url = 'https://weibo.com/ajax/statuses/buildComments?flow={}&is_reload=1&id={}&is_show_bulletin=2&is_mix=0&max_id={}&count=20&uid={}'
    page = 1
    # Kept on the class so parse() can sleep between pages
    time = time

    def parse(self, response):
        # Anti-scraping checks (uncomment to verify the UA / cookie actually sent)
        # print(response.request.headers['User-Agent'])
        # print(response.request.headers['cookie'])
        # Raw response for inspection
        jsonObj = json.loads(response.text)
        # print(jsonObj)
        # List of comments on this page
        data = jsonObj["data"]
        # Stop when the page is empty
        if len(data) > 0:
            # max_id is always present; when there is data, take the new max_id
            # as the paging parameter for the next request
            self.max_id = jsonObj["max_id"]
            for item in data:
                try:
                    # Comment id
                    commentId = item["id"]
                    # Comment text, with HTML tags stripped
                    commentContent = BeautifulSoup(item["text"], "html.parser").text
                    # Comment time
                    time_format = datetime.strptime(item["created_at"], '%a %b %d %H:%M:%S %z %Y')
                    time_format = str(time_format)
                    createdAt = time_format[0:19]
                    # Client the comment was posted from
                    source = item["source"]
                    # Number of likes
                    fabulous = item["like_counts"]
                    # Number of replies
                    comment = item["total_number"]
                    # Commenter nickname
                    name = item["user"]["screen_name"]
                    # Collection time
                    createTime = datetime.now()
                    # Paging cursor stored with the record
                    maxId = self.max_id
                    # Hand the item to the pipelines for persistence
                    wb = ScrapyXinlangweiboItem(name=name, createdAt=createdAt, source=source,
                                                comment=comment, fabulous=fabulous,
                                                createTime=createTime,
                                                commentContent=commentContent, maxId=maxId)
                    yield wb
                except Exception:
                    continue
            # Keep paging while the API returns a non-zero max_id
            if self.max_id > 0:
                try:
                    self.page = self.page + 1
                    print('MAX_ID>>>', self.max_id)
                    url = self.con_url.format(self.flow, self.id, self.max_id, self.uid)
                    # print('REQUEST_URL>>>', url)
                    # Sleep briefly to slow the crawl down
                    self.time.sleep(20)
                    yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
                except Exception:
                    print("Error while processing page {}!".format(self.page))
            else:
                print("Successfully crawled page {}, valid comments: {}".format(self.page, len(data)))
        else:
            print("All data crawled!")
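A note on the created_at handling inside parse(): Weibo returns an English, timezone-aware timestamp, and the spider converts it to a plain "YYYY-MM-DD HH:MM:SS" string before storing it. A tiny worked example (the sample value is made up for illustration):

from datetime import datetime

raw = 'Tue Jul 26 10:30:15 +0800 2022'             # format used by the Weibo API
parsed = datetime.strptime(raw, '%a %b %d %H:%M:%S %z %Y')
print(str(parsed)[0:19])                           # -> 2022-07-26 10:30:15, stored as createdAt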
3. items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


# Data structure for the Sina Weibo comment records
class ScrapyXinlangweiboItem(scrapy.Item):
    # define the fields for your item here like:
    # Blogger / commenter nickname
    name = scrapy.Field()
    # Gender
    sex = scrapy.Field()
    # Post time
    time = scrapy.Field()
    # Source client
    source = scrapy.Field()
    # Post text
    txt = scrapy.Field()
    # Forwards
    forward = scrapy.Field()
    # Reply count
    comment = scrapy.Field()
    # Likes
    fabulous = scrapy.Field()
    # Comment text
    commentContent = scrapy.Field()
    # Collection time
    createTime = scrapy.Field()
    # Comment time
    createdAt = scrapy.Field()
    # Paging cursor
    maxId = scrapy.Field()
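Scrapy items behave like dictionaries, which is what the MySQL pipeline further down relies on. A quick sketch of how the fields are filled and read back (the sample values are invented):

from scrapy_xinlangweibo.items import ScrapyXinlangweiboItem

item = ScrapyXinlangweiboItem(name='some_user', commentContent='a test comment', fabulous=3)
print(item['name'], item['fabulous'])    # dict-style access, as used in pipelines.py
print(dict(item))                        # convert to a plain dict if needed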
4. User-Agent pool, proxy IP pool, and cookie pool (middlewares.py)
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapyXinlangweiboSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapyXinlangweiboDownloaderMiddleware:
    # User-Agent pool
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
        'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    ]
    # HTTP proxy pool
    PROXY_http = [
        'http://62.234.158.92:80',
        'http://39.100.106.214:80',
        'http://39.99.210.228:80',
        'http://112.101.93.4:8888',
        'http://153.180.102.104:80',
        'http://195.208.131.189:56055',
        'http://61.135.217.7:80',
        'http://219.141.153.38:80',
        'http://583349285:2zectsyx@139.196.76.78:16816',
    ]
    # HTTPS proxy pool
    PROXY_https = [
        'https://62.234.158.92:80', 'https://39.100.106.214:80', 'https://39.99.210.228:80', 'https://120.83.49.90:9000',
        'https://95.189.112.214:35508', 'https://113.226.18.243:80', 'https://121.31.100.209:8123', 'https://14.117.177.135:808',
        'https://171.223.230.46:61234', 'https://117.57.90.121:25435', 'https://175.11.214.29:808', 'https://118.190.145.138:9001',
        'https://182.112.89.23:8118', 'https://221.228.17.172:8181', 'https://115.46.70.48:8123', 'https://110.88.30.36:808',
        'https://110.87.104.153:8118', 'https://1.195.25.204:61234', 'https://119.186.241.31:61234', 'https://175.155.152.41:61234',
        'https://27.31.103.233:21973', 'https://125.105.110.4:3128', 'https://114.222.24.111:808', 'https://140.250.180.229:61234',
        'https://120.83.98.216:61234', 'https://175.155.223.179:61234', 'https://115.198.37.56:6666', 'https://115.46.74.192:8123',
        'https://106.56.102.39:8070', 'https://125.121.121.155:6666', 'https://219.157.147.113:8118', 'https://117.66.167.57:8118',
        'https://183.128.242.93:6666', 'https://115.198.39.24:6666', 'https://114.223.162.171:8118', 'https://115.46.89.82:8123',
        'https://58.208.16.70:37436', 'https://123.188.6.176:1133', 'https://112.195.51.225:61234', 'https://112.193.131.17:8118',
        'https://221.234.250.204:8010', 'https://49.79.67.119:61234', 'https://220.184.215.223:6666', 'https://180.121.134.176:808',
        'https://122.246.48.118:8010', 'https://119.7.59.13:61234', 'https://27.54.248.42:8000', 'https://59.32.37.99:8010',
        'https://220.191.100.253:6666', 'https://112.193.70.85:61234', 'https://60.167.128.91:48963', 'https://119.4.70.128:61234',
        'https://182.88.166.148:8123', 'https://113.117.65.112:61234', 'https://115.226.129.195:61234', 'https://106.75.71.122:80',
        'https://125.122.171.167:6666', 'https://125.118.144.247:6666', 'https://60.184.173.221:8070', 'https://60.190.250.120:8080',
        'https://36.6.146.199:47025', 'https://106.56.102.78:808', 'https://119.7.225.218:61234',
        'https://583349285:2zectsyx@139.196.76.78:16816',
    ]
    # Cookie pool
    cookie_list = [
        'SUB=_2A25PiFDGDeRhGeRH7lYR9C_LzTmIHXVtc3COrDV8PUJbkNAKLRH2kW1NTbHM8wP7UPnTjvTAmox62rVYXbj0cyiW; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5oz75NHD95QE1K-XehBpS0qfWs4Dqcj1i--Xi-iFiKnpehnp9sMt; SINAGLOBAL=4669199551025.64.1653350613001; _s_tentry=-; Apache=9310493089723.533.1658795311962; ULV=1658795312024:3:2:2:9310493089723.533.1658795311962:1658709963132; XSRF-TOKEN=PMY7HEw_E1A51awyzJEml7gS; WBPSESS=Q8ByW2eX66YxwLnF-6h7NbWzJdod3892U-ZXN9pdE9QrbKDsJsoAQn3RCW8SlWji79oYfXnJFhKrQwCUGGOZXF5ZAuRZsyVYCnw6mBAvWTX5sYbh4j4N_skQz50KRJ8JDI487rDXbHNazhVdKMJ4Ug==',
        'XSRF-TOKEN=aHnFGRklNEU4rdayhzyEhoZb; PC_TOKEN=f714a6b772; login_sid_t=8d0e4f48b2ca90b52181f0b23b277a16; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=weibo.com; Apache=8185391731761.271.1658820521668; SINAGLOBAL=8185391731761.271.1658820521668; ULV=1658820521671:1:1:1:8185391731761.271.1658820521668:; wb_view_log=3440*14401; WBtopGlobal_register_version=2022072615; crossidccode=CODE-yf-1Ogf0O-463hJp-xz3rYFEpMBzBQ8D3cc764; SSOLoginState=1658820594; SUB=_2A25P2-eiDeRhGeNJ71YS9S7Jwz2IHXVtJ4nqrDV8PUJbkNANLUetkW1NS9gGQVNCt2O718FvhzkNoj-tGVOvwedC; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW_z_5hzMdm7L6VJJrXnUf45NHD95QfS0BXe0-7SKnpWs4DqcjgehBfeCH8SC-R1F-41FH8SbHWBEHWentt; WBPSESS=KHlQbysJR7-_YNKdbmdSub9zk3fLWZIS4gEZ7V7aipfCOcZk0-5iV9g4Li4-DK3ur5W-8J2x02MCBEQ7TOYrcjJ34r2RndtyULSYCcvDvJwRz8MSmcO9WCIqGKwYO-ivKC2t9jlFlM-Yisjv_HbNuw==',
        'XSRF-TOKEN=VHs0rjmnY88BHKKR8STwXyKa; PC_TOKEN=600b614632; _s_tentry=www.baidu.com; UOR=www.baidu.com,open.weibo.com,www.baidu.com; Apache=4628955910613.021.1658820786220; SINAGLOBAL=4628955910613.021.1658820786220; ULV=1658820786226:1:1:1:4628955910613.021.1658820786220:; login_sid_t=b82084eadbb38ebc50eeebfaa383b342; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; wb_view_log=1920*10801; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFF90r4yx1dYiQLs_GgQzOX5JpX5o275NHD95QcSh5XShM41KMfWs4DqcjUB.HkB.vexFH81CHW1CHWBEH8SE-4SFHFeoqR1hnt; SSOLoginState=1658820861; SUB=_2A25P2-lQDeRhGeBK61YV9yfFzDmIHXVskV2YrDV8PUNbmtB-LUbWkW9NR_ov0TIBeUK5-S-1R4SovXtuDCjk8OPS; ALF=1690356861; WBPSESS=_yD7d1e0Lx4Q-bSq9ElPWdiywqHVlO6SWR-pLheG9PfKKZe_DQqOGIXITXz70o28K-G7t1QHVTpZmfjSSOjAHkkB-7Eho3zU53JSHhuELaj0iYRXPIuPS6cLm-CYtc5h1Jto6Sd59cNSGTFT1uZHCg==',
        'SINAGLOBAL=1115780942584.974.1658635968559; ULV=1658635968565:1:1:1:1115780942584.974.1658635968559:; XSRF-TOKEN=sT1VFPal6Esg0qCjb8tVD8Mq; SUB=_2A25P2-vfDeRhGeBJ6VoV9ynNyj2IHXVskVoXrDV8PUNbmtB-LWn5kW9NRkZg4J5iNTEgfo49HotkSza3PQkTaHdG; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFXRGd-qUob5L7Bm_6kME4P5JpX5KzhUgL.FoqNeonXS0MpeK22dJLoI0YLxK-L1-zL1-zLxKqL1-eL1hnLxKBLB.zLB.BLxKML1-2L1hBLxK-LBo5L12qLxKnL1-zL12zLxKBLB.zL1KBt; ALF=1690357519; SSOLoginState=1658821519; WBPSESS=vsdYZqdoCl1sU3su5usyXiWJQIXJDgJxdJcFSAC-EtdryyK_v8pHujB9YV92Pc8W9RdrvuTKHytoQZ0miMn2Nkg5ksGlZ4Ofy1XunAhJHyILZlio-HHpXCNRMsTyov0vYZDagqIA_Q2xQOQOATr6sg==; PC_TOKEN=8b446a2d7c',
        'SINAGLOBAL=4669199551025.64.1653350613001; _s_tentry=-; Apache=9310493089723.533.1658795311962; ULV=1658795312024:3:2:2:9310493089723.533.1658795311962:1658709963132; XSRF-TOKEN=PMY7HEw_E1A51awyzJEml7gS; login_sid_t=d946353afd8df8809cf52b84c9d56e62; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; wb_view_log=1920*10801; SUB=_2A25P29F2DeRhGeFK4loT-S3NzjWIHXVskUW-rDV8PUNbmtAKLUGjkW9NQqQjrxHSUSK14a35RIh8zYbU7z-Rr_GG; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4Dbhgg_lARclhPbOYA5xm5JpX5KzhUgL.FoMX1KnE1KepSK.2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNSh.Reo.0eK-4; ALF=1690358950; SSOLoginState=1658822950; PC_TOKEN=18dbaa03eb; wvr=6; wb_view_log_7498293159=1920*10801; webim_unReadCount=%7B%22time%22%3A1658822966290%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D; WBPSESS=Dt2hbAUaXfkVprjyrAZT_NVqfvP7CID8jIb0n0yBlmOtM57xlaAyx9zUEpyxzH25quqtSoCuMUPVbND4gXMuunuX3C3LAAdZ1T3_PYXo9aJlUl_g2zwsjVo9BxyPsQuoB_Q8GdWCbC0yg3bu9Pve0PfGGcLlsX-p9i4rSnCGw5g=',
        'PC_TOKEN=6844a497ab; login_sid_t=80940089e5c99a3eecfec30d09123bc2; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=passport.weibo.com; Apache=7108406010788.637.1658823054338; SINAGLOBAL=7108406010788.637.1658823054338; ULV=1658823054340:1:1:1:7108406010788.637.1658823054338:; wb_view_log=1707*10671.5; WBtopGlobal_register_version=2022072616; crossidccode=CODE-tc-1OgfEG-24jNO2-26adT2x3JOUCXHL4a50d5; UOR=,,graph.qq.com; SUB=_2A25P29G0DeRhGeRL7lQX8ifEyDmIHXVskUR8rDV8PUNbmtAKLUfMkW9NUxIqr1Ql1TsFoO7-bUG4g9-WNiIlkoKv; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFU3-ypVYr3Nx9PSyu4g9VZ5JpX5KzhUgL.FozfSKqceo.Re0-2dJLoIEnLxK.LB.-L1K.LxKqLBo2L1-qLxK.L1-BL1KzLxK-LBo.LBoB41hzc; ALF=1690359138; SSOLoginState=1658823140; wvr=6',
        'XSRF-TOKEN=0hLGEnnKp5G9LN5YcjImd8ws',
        'SINAGLOBAL=4669199551025.64.1653350613001; wb_view_log=1920*10801; wvr=6; wb_view_log_7498293159=1920*10801; webim_unReadCount=%7B%22time%22%3A1658822966290%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D; XSRF-TOKEN=JBlRQkpft06UZV7iraI0uErM; PC_TOKEN=83467e908a; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5KMhUgL.Foz4SKB7Sh2NSo-2dJLoI79-KLU1xHzt; ALF=1690372094; SSOLoginState=1658836094; SCF=AqGF443jTI-VnS382ShffcHWQtfFQOpHNwMOX8cXFlasO4rTnDyDrnElJ_LLl2Ds7NTvIUsdguQyrchkMZqBfCY.; SUB=_2A25P26QuDeRhGeRH7lYR9C_LzTmIHXVskJLmrDV8PUNbmtAKLW72kW9NTbHM80kR_n2kXr3TqD4At7-q0D0_uxpF; _s_tentry=weibo.com; Apache=2356552200861.233.1658836108863; ULV=1658836108904:4:3:3:2356552200861.233.1658836108863:1658795312024; WBPSESS=Dt2hbAUaXfkVprjyrAZT_ChHS3X1TW-AxSmMe79akhJ6zWbLhSGfgAlvYpclUtRZTDkEqbTwyBiyymCqtMzosRwY7DuzYTrtbFieO-tZNJTDHyfrTmjNAJIwnGRrHRZDt_viO5WKWsUkQx4H8xge1ACYKutmbhgaY40kD1T9OKTeupUgTD0oNMcuJrSAJrgWOtNxE6AfEQSDjC6nzNjIxA==',
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Intercepts every normal (non-exception) request: apply the UA pool and cookie pool
    def process_request(self, request, spider):
        # print('process_request')
        # Disguise the request with a random User-Agent from the pool.
        # request.headers behaves like a dict. This step is optional if you
        # set a shared User-Agent in settings.py instead.
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.headers['cookie'] = random.choice(self.cookie_list)
        request.headers['referer'] = 'https://weibo.com/5993531560/LDCKJjIDP'
        request.headers['sec-ch-ua'] = '".Not/A)Brand";v = "99", "Google Chrome";v = "103", "Chromium";v = "103"'
        request.headers['sec-ch-ua-mobile'] = '?0'
        request.headers['sec-ch-ua-platform'] = '"Windows"'
        request.headers['client-version'] = 'v2.34.73'
        request.headers['accept'] = 'application/json,text/plain,*/*'
        # print(request.headers['User-Agent'])
        # print(request.headers['cookie'])
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercepts failed requests: switch to a proxy IP here
    def process_exception(self, request, exception, spider):
        # When the site starts rejecting you for requesting too often,
        # retry through a random proxy from the pool.
        # request.url is the URL of the request that failed.
        # print('process_exception')
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_https)
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
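The free proxies in the pool above go stale quickly, so it is worth probing one before trusting process_exception to rescue failed requests. A small standalone check (a sketch; it is not part of the middleware itself):

import random
import requests
from scrapy_xinlangweibo.middlewares import ScrapyXinlangweiboDownloaderMiddleware

proxy = random.choice(ScrapyXinlangweiboDownloaderMiddleware.PROXY_https)
try:
    r = requests.get('https://weibo.com', proxies={'https': proxy}, timeout=5)
    print(proxy, '->', r.status_code)
except requests.RequestException as exc:
    print(proxy, 'failed:', exc)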
5. Pipelines (pipelines.py)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

# load the settings file
from scrapy.utils.project import get_project_settings
# import pymysql
import pymysql


class ScrapyXinlangweiboPipeline:
    def process_item(self, item, spider):
        return item


# Pipeline for persisting items to MySQL
class MysqlPipeline:
    def open_spider(self, spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.database,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Insert the record into the database
        # sql = 'insert into xinlangweibo(name,time,source,txt,forward,comment,fabulous,createTime,commentContent) values("{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(
        #     item['name'], item['time'].strip(), item['source'], item['txt'], item['forward'], item['comment'],
        #     item['fabulous'], item['createTime'], item['commentContent'])
        sql = 'insert into xinlangweibo1(name,createdAt,source,comment,fabulous,createTime,commentContent,maxId) values("{}","{}","{}","{}","{}","{}","{}","{}")'.format(
            item['name'], item['createdAt'], item['source'], item['comment'], item['fabulous'],
            item['createTime'], item['commentContent'], item['maxId'])
        # Execute the SQL statement
        self.cursor.execute(sql)
        # Commit
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
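Building the INSERT with str.format breaks as soon as a comment contains a double quote and leaves the crawler open to SQL injection. If that becomes a problem, a drop-in replacement for MysqlPipeline.process_item that lets pymysql escape the values is sketched below (same table and columns as above):

    def process_item(self, item, spider):
        # Placeholders let pymysql quote and escape each value safely
        sql = ('insert into xinlangweibo1'
               '(name,createdAt,source,comment,fabulous,createTime,commentContent,maxId) '
               'values (%s,%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (
            item['name'], item['createdAt'], item['source'], item['comment'],
            item['fabulous'], item['createTime'], item['commentContent'], item['maxId'],
        ))
        self.conn.commit()
        return item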
6. Settings (settings.py)
# Scrapy settings for scrapy_xinlangweibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_xinlangweibo'

SPIDER_MODULES = ['scrapy_xinlangweibo.spiders']
NEWSPIDER_MODULE = 'scrapy_xinlangweibo.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_xinlangweibo (+http://www.yourdomain.com)'

# Obey robots.txt rules
# Disable the default robots.txt check by commenting it out
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
#     'Accept-Language': 'zh-CN,zh;q=0.9',
#     'cookie': 'SUB=_2A25PiFDGDeRhGeRH7lYR9C_LzTmIHXVtc3COrDV8PUJbkNAKLRH2kW1NTbHM8wP7UPnTjvTAmox62rVYXbj0cyiW; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5oz75NHD95QE1K-XehBpS0qfWs4Dqcj1i--Xi-iFiKnpehnp9sMt; SINAGLOBAL=4669199551025.64.1653350613001; _s_tentry=-; Apache=9310493089723.533.1658795311962; ULV=1658795312024:3:2:2:9310493089723.533.1658795311962:1658709963132; XSRF-TOKEN=PMY7HEw_E1A51awyzJEml7gS; WBPSESS=Q8ByW2eX66YxwLnF-6h7NbWzJdod3892U-ZXN9pdE9QrbKDsJsoAQn3RCW8SlWji79oYfXnJFhKrQwCUGGOZXF5ZAuRZsyVYCnw6mBAvWTX5sYbh4j4N_skQz50KRJ8JDI487rDXbHNazhVdKMJ4Ug==',
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
# }

HTTPERROR_ALLOWED_CODES = [400]

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_xinlangweibo.middlewares.ScrapyXinlangweiboSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_xinlangweibo.middlewares.ScrapyXinlangweiboDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Database configuration (the setting names must be upper case)
DB_HOST = 'your MySQL host'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = 'your password'
DB_NAME = 'your database'
# Write 'utf8' without the dash; 'utf-8' raises an error
DB_CHARSET = 'utf8'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_xinlangweibo.pipelines.ScrapyXinlangweiboPipeline': 300,
    'scrapy_xinlangweibo.pipelines.MysqlPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
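With the settings in place, the crawl is started with "scrapy crawl weibo" from the project root. If you prefer launching it from a script (for debugging in an IDE, say), the equivalent using Scrapy's own API looks like this:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # picks up settings.py from the project
process.crawl('weibo')                             # spider name defined in weibo.py
process.start()                                    # blocks until the crawl finishes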