初识scrapy
参考:https://www.cnblogs.com/wupeiqi/articles/6229292.html
上面的文章使用了HtmlXPathSelector,但我安装的Scrapy版本中已经没有这个类(这与Scrapy版本有关,而不是Python3.7的问题),导入时会报下面的错误,所以我直接改用Selector:
ImportError: cannot import name 'HtmlXPathSelector' from 'scrapy.selector'
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
from scrapy.selector import Selector
from scrapy.http import Request
class XiaohuarSpider(scrapy.Spider):
    """Spider that walks the xiaohuar.com listing pages.

    For every listing page it prints the text of each item's ``price``
    span, then follows the links to other listing pages found on that
    page, using :meth:`parse` as the callback again.
    """

    name = 'xiaohuar'
    allowed_domains = ['xiaohuar.com']
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']
    # URLs that have already been parsed or scheduled. Defined on the class,
    # so it is shared by every instance of this spider.
    visited_set = set()

    def parse(self, response):
        """Print the item prices on ``response`` and follow listing links.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``Request`` for every listing-page URL linked from this page
            that has not been visited or scheduled yet.
        """
        # Remember this page so links pointing back to it are not re-queued.
        self.visited_set.add(response.url)

        # 1. Print the price text of every item block on the current page.
        for item in response.xpath("//div[@class='item masonry_brick']"):
            # ".//" searches all descendants of the current item node.
            price = item.xpath(".//span[@class='price']/text()").extract_first()
            print(price)

        # 2. Collect the links to the other listing pages. The pattern is a
        # raw string so the regex escapes reach the XPath engine unchanged,
        # and the dots are escaped so they only match a literal ".".
        page_urls = response.xpath(
            r'//a[re:test(@href,"http://www\.xiaohuar\.com/list-1-\d+\.html")]/@href'
        ).extract()
        for url in page_urls:
            if url in self.visited_set:
                continue
            # Mark the URL as soon as it is scheduled, so the same link found
            # again on this page (or on another page before the download
            # finishes) is not requested a second time.
            self.visited_set.add(url)
            # Yielding hands the request to Scrapy for download; `callback`
            # names the method that processes the downloaded page.
            yield Request(url=url, method="GET", callback=self.parse)