# 蜜罐之 url 动手脚 (honeypot: the URLs are tampered with)
# 路径陷阱: 列表页获取的 url 被动手脚, 与实际详情页地址不一致 —
# 必须把最后一段路径改写为 'detail' 才能请求到真实页面.
# (Path trap: detail links from the list page are tampered; the last path
# segment differs from the real one and must be rewritten to 'detail'.)
import json

import scrapy

from yangguang.items import GuSuItem


class GusuSpider(scrapy.Spider):
    """Spider for the Suzhou gov consultation list API.

    Anti-honeypot note: the list API returns detail links whose last path
    segment is deliberately tampered with; the real detail page lives at
    '.../detail', so the segment is rewritten before the detail request.
    """

    name = 'gusu'
    # allowed_domains = ['gusu.gov.cn']
    # start_urls = ['http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/']

    def start_requests(self):
        """POST the paginated list query as a form-encoded request."""
        url = 'http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/'
        data = {
            'type': '12',
            'pagesize': '10',
            'keywords': '',
            'currpage': '2',  # NOTE(review): page number is hard-coded — confirm whether pagination is intended
            'deptcode': '014152419',
            'check': 'do',
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
        }
        yield scrapy.FormRequest(url, formdata=data, headers=headers,
                                 callback=self.parse)

    @staticmethod
    def _detail_url(link):
        """Rewrite the tampered last path segment of *link* to 'detail'.

        Fix: the original used ``link.replace(segment, 'detail')``, which
        replaces EVERY occurrence of the segment anywhere in the URL — and
        for links ending in '/' the segment is '', so ``replace('', ...)``
        would insert 'detail' between every character. Here only the final
        path segment is replaced; any query string is preserved.
        """
        path, qmark, query = link.partition('?')
        prefix, slash, _segment = path.rpartition('/')
        return prefix + slash + 'detail' + qmark + query

    def parse(self, response):
        """Parse the JSON list payload and schedule one detail request per row."""
        data = json.loads(response.text)
        for info in data['infolist']:
            item = GuSuItem()
            item['url'] = self._detail_url(info['consult_link'])
            # logger instead of bare print() so output goes through scrapy's logging
            self.logger.debug('list item: %s', item)
            yield scrapy.Request(url=item['url'], callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Extract title / publish date / content from the detail-page table."""
        item = response.meta['item']
        # NOTE(review): 'tbody' is often inserted by browsers and absent from
        # the raw HTML scrapy receives — confirm the server actually emits it.
        table = response.xpath('//table[@class="tablecon"]/tbody')
        item['title'] = table.xpath('./tr[1]/td[2]/text()').extract_first()
        item['publish_date'] = table.xpath('./tr[2]/td[2]/text()').extract_first()
        item['content'] = table.xpath('./tr[3]/td[2]/text()').extract_first()
        self.logger.debug('detail item: %s', item)
        yield item