蜜罐之url动手脚

路径陷阱:列表页接口返回的 url 被动了手脚,与实际详情页的 url 不一致;需要把路径的最后一段替换为 detail 之后再请求详情页。

 

 

import json

import scrapy
from yangguang.items import GuSuItem



class GusuSpider(scrapy.Spider):
    """Spider that posts a list query and then scrapes each record's detail page.

    The list endpoint answers with JSON containing an ``infolist`` array.  The
    ``consult_link`` of each entry is not requested as-is: its last path
    segment is swapped for ``detail`` first (see ``_detail_url``), and the
    resulting page is parsed by ``parse_detail``.
    """

    name = 'gusu'
    # allowed_domains = ['gusu.gov.cn']
    # start_urls = ['http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/']

    def start_requests(self):
        """Yield the single form POST that fetches the list JSON.

        Yields:
            scrapy.FormRequest: POST to the list endpoint, handled by ``parse``.
        """
        url = 'http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/'
        # Form fields are sent verbatim; FormRequest url-encodes them.
        data = {
            'type': '12',
            'pagesize': '10',
            'keywords': '',
            'currpage': '2',
            'deptcode': '014152419',
            'check': 'do'
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
        }
        yield scrapy.FormRequest(url, formdata=data, headers=headers, callback=self.parse)

    @staticmethod
    def _detail_url(link):
        """Return ``link`` with its last path segment replaced by ``detail``.

        Only the final segment of the path is touched; the query string (if
        any) is carried over unchanged.  Splitting off the query first keeps a
        ``/`` inside the query from being mistaken for a path separator, and
        rebuilding the string (instead of ``str.replace``) avoids rewriting
        other parts of the URL that happen to contain the same text.

        Args:
            link: The ``consult_link`` string taken from the list JSON.

        Returns:
            The rewritten URL string.
        """
        path, question, query = link.partition('?')
        parent, slash, _ = path.rpartition('/')
        return parent + slash + 'detail' + question + query

    def parse(self, response):
        """Turn the list JSON into one detail-page request per entry.

        Args:
            response: Response whose body is the list JSON.

        Yields:
            scrapy.Request: Request for the rewritten detail URL, carrying a
            ``GuSuItem`` (with ``url`` filled in) under ``meta['item']``.
        """
        data = json.loads(response.text)
        infolist = data.get('infolist')
        if not infolist:
            # An absent or empty list usually means the payload shape changed;
            # report it rather than failing with a KeyError.
            self.logger.warning('No "infolist" entries in response from %s', response.url)
            return
        for info in infolist:
            link = info.get('consult_link')
            if not link:
                # Nothing to follow for this entry.
                continue
            item = GuSuItem()
            item['url'] = self._detail_url(link)
            self.logger.debug('Queued detail page: %s', item['url'])
            yield scrapy.Request(url=item['url'], callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """Fill the item carried in ``meta`` from the detail table and yield it.

        Args:
            response: Detail-page response; ``response.meta['item']`` is the
                item created in ``parse``.

        Yields:
            GuSuItem: The item with ``title``, ``publish_date`` and ``content``
            set (each is ``None`` when its cell is not found).
        """
        item = response.meta['item']
        # NOTE(review): the selector requires an explicit <tbody>; confirm the
        # raw HTML (not just the browser-rendered DOM) contains one.
        table = response.xpath('//table[@class="tablecon"]/tbody')
        # Rows 1-3 of the table: second cell holds the value.
        item['title'] = table.xpath('./tr[1]/td[2]/text()').extract_first()
        item['publish_date'] = table.xpath('./tr[2]/td[2]/text()').extract_first()
        item['content'] = table.xpath('./tr[3]/td[2]/text()').extract_first()
        self.logger.debug('Scraped item: %r', item)
        yield item

 

posted @ 2021-04-26 16:58  Eliphaz  阅读(91)  评论(0)  编辑  收藏  举报