Scrapy crawler framework (2): POST requests, passing items between callbacks, speeding up the crawler, UA pool and proxy pool
Sending POST requests with Scrapy
import scrapy

class PostSpider(scrapy.Spider):
    name = 'post'
    start_urls = ['https://fanyi.baidu.com/sug']

    # Original behaviour: send a GET request for every url in start_urls.
    # Override the parent method as below so POST requests are sent instead.
    def start_requests(self):
        data = {
            'kw': 'dog'
        }
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)

    def parse(self, response):
        print(response.text)
The key point: override the parent class's start_requests method. The default implementation issues GET requests of the form yield scrapy.Request(url=url, callback=self.parse) (note that scrapy.Request does not accept a formdata argument). Rewrite it as yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse), where formdata carries the POST parameters.
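Since fanyi.baidu.com/sug responds with JSON, the parse callback can also decode the body instead of printing raw text. A minimal sketch:

import json

def parse(self, response):
    # Decode the JSON body; on Scrapy >= 2.1, response.json() is equivalent.
    result = json.loads(response.text)
    print(result)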
Passing an item between callbacks: add a meta argument to the request
def parse(self, response):
    div_list = response.xpath('//div[@class="col-xs-1-5 movie-item"]')
    for div in div_list:
        item = MovieproItem()
        item['title'] = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
        item['score'] = div.xpath('.//div[@class="meta"]/h1/em/text()').extract_first()
        detail_url = 'https:' + div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
        # Hand the half-filled item to the next callback through meta
        yield scrapy.Request(url=detail_url, callback=self.getdata, meta={'item': item})

def getdata(self, response):
    # Retrieve the item passed along by the previous request
    item = response.meta['item']
    item['deactor'] = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
    item['desc'] = response.xpath('//div[@class="col-xs-12 movie-introduce"]/p/text()').extract_first()
    yield item
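The code above assumes a MovieproItem defined in the project's items.py. A minimal definition matching the fields used in the spider (field names taken as-is, including the 'deactor' spelling) might look like this:

import scrapy

class MovieproItem(scrapy.Item):
    title = scrapy.Field()
    score = scrapy.Field()
    deactor = scrapy.Field()  # field name kept as spelled in the spider
    desc = scrapy.Field()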
Speeding up the crawler: configure the following in settings.py
CONCURRENT_REQUESTS = 10   # maximum number of concurrent requests (handled asynchronously, not one thread each)
LOG_LEVEL = 'ERROR'        # only log at ERROR level, reducing console output
COOKIES_ENABLED = False    # skip cookie handling when the site doesn't need cookies
RETRY_ENABLED = False      # don't retry failed requests
DOWNLOAD_TIMEOUT = 5       # drop a download that takes longer than 5 seconds
UA pool and proxy pool
Configure the downloader middleware (middlewares.py) as follows. Free proxies can be found at www.goubanjia.com; note the distinction between http and https proxies.
import random

class ProxyproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # Proxy pools, split by scheme; http and https proxies must not be mixed
    proxy_http = [
        'http://39.137.168.229:8080',
        'http://103.218.240.182:80',
        'http://80.26.152.146:60133'
    ]
    proxy_https = [
        'https://221.6.201.18:9999',
        'https://220.180.50.14:53281',
        'https://140.227.200.38:3128'
    ]

    # User-Agent pool
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    def process_request(self, request, spider):
        print('downloader middleware', request)
        # Pick a proxy that matches the scheme of the outgoing request
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = random.choice(self.proxy_http)
        else:
            request.meta['proxy'] = random.choice(self.proxy_https)
        # Rotate the User-Agent on every request
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
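The middleware only takes effect once it is enabled in settings.py. Assuming the project is named proxyPro (inferred from the class name; adjust the dotted path to your own project):

# settings.py -- register the custom downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'proxyPro.middlewares.ProxyproDownloaderMiddleware': 543,
}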