A basic Scrapy spider: crawling multiple pages
# -*- coding: utf-8 -*-
import scrapy


class GjSpider(scrapy.Spider):
    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def parse(self, response):
        # Each listing lives in the f-main-list container; the first two
        # child divs are filters/ads, so position()>2 skips them.
        house_list = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
        for house in house_list:
            title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            # chaoxiang = the orientation the unit faces
            chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
            price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()
            yield {
                'title': title,
                'size': size,
                'chaoxiang': chaoxiang,
                'price': price,
                'address': str(address1) + '-' + str(address2),
            }

        # Keep following the "next page" link until there is none, so the
        # spider walks through every page of results.
        next_links = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract()
        if len(next_links) > 0:
            # urljoin resolves relative hrefs against the current page URL.
            yield scrapy.Request(response.urljoin(next_links[0]), callback=self.parse)
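A note on running it: the original snippet imported csv but never used it; Scrapy's built-in feed exports already serialize the yielded dicts, so CSV output needs no extra code. Assuming a standard project layout, the spider can be run with `scrapy crawl gj -o zufang.csv` (the output filename here is just an example). On Scrapy 1.4 and later, the pagination step can also be written more compactly with response.follow, which joins relative URLs itself; a minimal sketch of the same tail of parse:

        # Equivalent pagination using response.follow (Scrapy >= 1.4),
        # which resolves relative hrefs for us.
        next_link = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract_first()
        if next_link:
            yield response.follow(next_link, callback=self.parse)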