初识scrapy

参考:https://www.cnblogs.com/wupeiqi/articles/6229292.html

上面的文章使用的是 HtmlXPathSelector,但我安装的 Scrapy 版本(运行在 Python 3.7 上)已经没有这个类了,导入时会报下面的错误,所以我直接改用 Selector:

ImportError: cannot import name 'HtmlXPathSelector' from 'scrapy.selector'

# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
from scrapy.selector import Selector
from scrapy.http import Request

class XiaohuarSpider(scrapy.Spider):
    """Spider that walks the xiaohuar.com listing pages.

    For every page it prints the text of each ``span[@class='price']`` found
    inside an item card, then schedules the pagination links found on that
    page so they are parsed by the same callback.
    """

    name = 'xiaohuar'
    allowed_domains = ['xiaohuar.com']
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']
    # URLs that were already parsed or already scheduled (class-level, shared).
    visited_set = set()

    def parse(self, response):
        """Print the price text of each item and follow pagination links.

        Args:
            response: the downloaded listing page.

        Yields:
            ``Request`` objects for listing pages not seen before, with this
            method as their callback.
        """
        # Remember the current page so links pointing back to it are skipped.
        self.visited_set.add(response.url)
        hxs = Selector(response)

        # 1. Print the price text of every item card on the current page
        #    (div elements whose class is exactly "item masonry_brick").
        item_list = hxs.xpath("//div[@class='item masonry_brick']")
        for item in item_list:
            # ".//" searches all descendants of the current node.
            v = item.xpath(".//span[@class='price']/text()").extract_first()
            print(v)

        # 2. Collect the pagination links present on the current page.
        #    Raw string: "\d" must reach the regex engine untouched, and the
        #    dots are escaped so they only match a literal ".".
        page_list = hxs.xpath(
            r'//a[re:test(@href,"http://www\.xiaohuar\.com/list-1-\d+\.html")]/@href'
        ).extract()
        for url in page_list:
            if url in self.visited_set:
                continue
            # Mark the URL as soon as it is scheduled; otherwise the same link
            # would be yielded again before its response ever reaches parse().
            self.visited_set.add(url)
            # Yielding hands the request to the engine for download; callback
            # names the method that handles the downloaded page.
            yield Request(url=url, method="GET", callback=self.parse)






posted @ 2019-10-16 01:07  智、心  阅读(614)  评论(0)  编辑  收藏  举报