scrapy 递归解析和post请求
实现方案:使用Request方法手动发起请求。
示例一
import scrapy from moviePro.items import MovieproItem class MovieSpider(scrapy.Spider): name = 'movie' # allowed_domains = ['www.xxx.com'] start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/9.html'] #接收一个请求传递过来的数据 def detail_parse(self,response): item = response.meta['item'] desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()')
.extract_first() item['desc'] = desc yield item def parse(self, response): li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li') for li in li_list: name = li.xpath('.//h4[@class="title text-overflow"]/a/text()').extract_first() detail_url = 'https://www.4567tv.tv'+li.xpath('.//h4[@class="title text-overflow"]/a/@href').extract_first() item = MovieproItem() item['name'] = name #meta是一个字典,字典中所有的键值对都可以传递给指定好的回调函数 yield scrapy.Request(url=detail_url,callback=self.detail_parse,meta={'item':item})
实例二
import scrapy from qiushibaike.items import QiushibaikeItem # scrapy.http import Request class QiushiSpider(scrapy.Spider): name = 'qiushi' allowed_domains = ['www.qiushibaike.com'] start_urls = ['https://www.qiushibaike.com/text/'] #爬取多页 pageNum = 1 #起始页码 url = 'https://www.qiushibaike.com/text/page/%s/' #每页的url def parse(self, response): div_list=response.xpath('//*[@id="content-left"]/div') for div in div_list: #//*[@id="qiushi_tag_120996995"]/div[1]/a[2]/h2 author=div.xpath('.//div[@class="author clearfix"]//h2/text()').extract_first() author=author.strip('\n') content=div.xpath('.//div[@class="content"]/span/text()').extract_first() content=content.strip('\n') item=QiushibaikeItem() item['author']=author item['content']=content yield item #提交item到管道进行持久化 #爬取所有页码数据 if self.pageNum <= 13: #一共爬取13页(共13页) self.pageNum += 1 url = format(self.url % self.pageNum) #递归爬取数据:callback参数的值为回调函数(将url请求后,得到的相应数据继续进行parse解析),递归调用parse函数 yield scrapy.Request(url=url,callback=self.parse)
post请求
重写start_requests方法,让其发起post请求
def start_requests(self): #请求的url post_url = 'http://fanyi.baidu.com/sug' # post请求参数 formdata = { 'kw': 'wolf', } # 发送post请求 yield scrapy.FormRequest(url=post_url, formdata=formdata, callback=self.parse) def parse(self, response): print(response.text)