Loading

Scrapy 框架 手动发送请求 POST 请求的发送

 

手动发送请求

import scrapy

from choutiSpider.items import ChoutispiderItem


class ChoutiSpider(scrapy.Spider):
    """Scrape the chouti "scoff" hot list, paginating via manually issued requests."""
    name = 'chouti'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.****.com/r/scoff/hot/1']

    # URL template; %s is substituted with the page number.
    url = 'https://dig.****.com/r/scoff/hot/%s'
    page_num = 1

    def parse(self, response):
        """Yield one item per entry on the page, then schedule the next page (up to 120)."""
        print(self.page_num)
        for entry in response.xpath('//div[@id="content-list"]/div'):
            item = ChoutispiderItem()
            item['content'] = entry.xpath('./div[@class="news-content"]/div[1]/a/text()').extract_first().strip()
            item['author'] = entry.xpath('./div[@class="news-content"]/div[2]/a[4]/b/text()').extract_first()
            # Hand the item to the pipeline.
            yield item

        # Stop paginating after page 120.
        if self.page_num >= 120:
            return
        self.page_num += 1
        # Manually send the request for the next page.
        yield scrapy.Request(url=self.url % self.page_num, callback=self.parse)

POST 请求的发送

# 在scrapy框架中默认情况下cookie会被自动处理,无需手动!

class PostdemoSpider(scrapy.Spider):
    """Send a POST request with FormRequest.

    Scrapy handles cookies automatically by default — no manual handling needed.
    """
    name = 'postdemo'
    allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.****.com/sug']

    def start_requests(self):
        # Override the inherited entry point so the start URLs are POSTed, not GET.
        form = {
            'kw': 'cat'
        }
        for target in self.start_urls:
            yield scrapy.FormRequest(url=target, formdata=form, callback=self.parse)

    def parse(self, response):
        print(response.text)

在scrapy框架中默认情况下cookie会被自动处理,无需手动!

settings 配置:

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

请求传参:

  • 二级详情页面 的 item 传递
import scrapy

from boosPro.items import BoosproItem


class BoosSpider(scrapy.Spider):
    """Crawl Boss Zhipin job listings and follow each posting's detail page.

    Demonstrates passing an Item between callbacks via ``Request.meta``
    (request "parameter passing"): ``parse`` fills the listing-level fields
    and ``parse_detail`` completes the item before yielding it.
    """
    name = 'boos'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1']

    # URL template for pagination; %s is replaced with the page number.
    url = 'https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=%s'
    page_num = 1

    def parse(self, response):
        """Extract listing fields for each job on a results page, then paginate."""
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            item = BoosproItem()
            title = li.xpath('.//div[@class="job-title"]/text()').extract_first()
            salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
            # NOTE(review): extract_first() may return None, which would make the
            # '+' below raise TypeError — assumes every <li> carries a detail link.
            detail_url = 'https://www.zhipin.com' + li.xpath('.//div[@class="info-primary"]/h3/a/@href').extract_first()
            item['title'] = title
            item['salary'] = salary
            item['company'] = company

            # Manually request the detail page, carrying the partial item in meta.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

        # Pagination: crawl the first few pages only.
        if self.page_num <= 3:
            self.page_num += 1
            new_url = self.url % self.page_num
            yield scrapy.Request(url=new_url, callback=self.parse)

    # Fixed misspelled callback name: parsrDetail -> parse_detail (referenced
    # only via self.parse_detail above, so no external callers break).
    def parse_detail(self, response):
        """Parse a detail page and complete the item received via meta."""
        item = response.meta['item']
        job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        company_content = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[2]/div/text()').extract_first()

        item['job_desc'] = ' '.join(job_desc)
        item['company_content'] = company_content

        yield item
posted @ 2019-07-01 11:54  Ellisonzhang  阅读(525)  评论(0编辑  收藏  举报