Scrapy: looping over pages to crawl Sehuatang thread titles and view counts
Spider code
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'sehuatang'
    start_urls = ['https://rtuytuytuewr.xyz/forum-2-2.html']

    def parse(self, response):
        # Thread rows of the Discuz forum table; skip the header/sticky rows
        # at the top and the footer rows at the bottom.
        tr_list = response.xpath('//table//tr')[5:-2]
        for tr in tr_list:
            item = {}
            # Thread title
            item["common"] = tr.xpath('./th/a[@onclick="atarget(this)"]/text()').extract_first()
            # View count
            item["num"] = tr.xpath('./td[@class="num"]/em/text()').extract_first()
            yield item

        # Total page count comes from the "/ N 页" label; strip it down to the number.
        page_count = str(response.xpath('//*[@id="fd_page_bottom"]/div/label/span/text()')
                         .extract_first()).replace('/', '').replace('页', '').strip()
        current_page = str(response.xpath('//*[@id="fd_page_bottom"]/div/strong/text()').extract_first())
        # Keep following the "next page" link until the last page is reached.
        if int(page_count) != int(current_page):
            next_url = response.xpath('//*[@id="fd_page_bottom"]/div/a[@class="nxt"]/@href').extract_first()
            # Resolve the relative href against the current page URL.
            next_url = response.urljoin(next_url)
            print(next_url, int(page_count), int(current_page))
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
            )