A basic Scrapy spider that crawls multiple pages

# -*- coding: utf-8 -*-
import scrapy


class GjSpider(scrapy.Spider):
    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def parse(self, response):
        # Each listing lives under f-main-list; the first two child
        # divs are not listings, hence position()>2.
        house_list = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
        for house in house_list:
            title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()  # orientation
            price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

            # Either address part may be None if the node is missing;
            # filter(None, ...) drops missing parts before joining.
            address = "-".join(filter(None, [address1, address2]))
            yield {'title': title, 'size': size, 'chaoxiang': chaoxiang,
                   'price': price, 'address': address}

        # Follow the "next page" link, if present, and parse it with this
        # same callback so every page goes through the extraction above.
        next_link = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract_first()
        if next_link:
            self.logger.info('following next page: %s', next_link)
            # urljoin turns a relative href into an absolute URL.
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse)
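
Since the extracted href may be relative, the request above resolves it with response.urljoin. A slightly shorter variant of the same pagination step could use response.follow (available since Scrapy 1.4), which resolves relative URLs by itself; a minimal sketch:

        # Inside parse(), the pagination step written with response.follow:
        next_link = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract_first()
        if next_link:
            yield response.follow(next_link, callback=self.parse)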

  
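To actually save the scraped items, the quickest route is Scrapy's built-in feed export, e.g. running scrapy crawl gj -o zufang.csv from the project directory. For more control, an item pipeline can write the rows itself; below is a minimal sketch using the standard csv module. The class name CsvWriterPipeline and the output filename zufang.csv are illustrative choices, and the pipeline must be enabled through the ITEM_PIPELINES setting in settings.py.

# pipelines.py -- a minimal CSV-writing pipeline (illustrative sketch)
import csv


class CsvWriterPipeline:
    def open_spider(self, spider):
        # Open the output file once, when the spider starts.
        self.file = open('zufang.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(
            self.file,
            fieldnames=['title', 'size', 'chaoxiang', 'price', 'address'])
        self.writer.writeheader()

    def process_item(self, item, spider):
        # Each dict yielded by the spider becomes one CSV row.
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.file.close()

Enable it in settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.CsvWriterPipeline': 300}, replacing myproject with the actual project package name.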

posted @ 2020-03-16 17:39  brady-wang