Scrapy middleware: Downloader Middlewares and developing a proxy middleware
1. Create a Scrapy project
scrapy startproject SpiderAnything
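For reference, the command above generates a project skeleton roughly like the following (file names come from Scrapy's default template; the layout may vary slightly between versions):

SpiderAnything/
    scrapy.cfg              # deployment configuration
    SpiderAnything/
        __init__.py
        items.py            # item definitions
        middlewares.py      # spider / downloader middlewares
        pipelines.py        # item pipelines
        settings.py         # project settings
        spiders/            # spiders live here
            __init__.py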
2. Generate a spider (itcast is the spider name, itcast.cn is the crawl scope)
scrapy genspider itcast 'itcast.cn'
# Run the spider
scrapy list    # list the spiders in the project
scrapy crawl itcast
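As a point of reference, scrapy genspider produces a bare template along these lines, which step 3 below then fills in (the exact stub may differ slightly between Scrapy versions):

import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://itcast.cn/']

    def parse(self, response):
        pass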
3. Extract data
Complete the spider, using xpath:
# -*- coding: utf-8 -*-
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'  # spider name
    allowed_domains = ['itcast.cn']  # crawl scope
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']  # initial request URL

    def parse(self, response):
        # Handle the response for start_urls; extract() pulls out the text
        # res = response.xpath("//div[@class='li_txt']//h3/text()").extract()
        # print(res)

        # Group by teacher block
        li_list = response.xpath("//div[@class='li_txt']")
        for li in li_list:
            item = {}  # sent to the pipeline; enable ITEM_PIPELINES in settings
            item['tea_name'] = li.xpath(".//h3/text()").extract_first()  # first element of the list, or None if empty
            item['tea_position'] = li.xpath(".//h4/text()").extract_first()
            # yield keeps memory usage low; each item is passed to the pipeline
            yield item  # may yield a Request, a BaseItem object, a dict, or None
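The items yielded above are handed to the item pipelines enabled in settings (next step). A minimal sketch of what pipelines.py might contain; the strip/print logic here is illustrative, not from the original project:

class SpideranythingPipeline:
    def process_item(self, item, spider):
        # Runs first (weight 300); e.g. clean up the fields
        if item.get('tea_name'):
            item['tea_name'] = item['tea_name'].strip()
        return item  # must return the item so later pipelines receive it


class SpideranythingPipeline1:
    def process_item(self, item, spider):
        # Runs second (weight 301); e.g. print or persist the item
        print(item)
        return item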
In settings.py, enable the pipelines:

ITEM_PIPELINES = {
    'SpiderAnything.pipelines.SpideranythingPipeline': 300,   # weight: the smaller the number, the higher the priority, so this pipeline runs first
    'SpiderAnything.pipelines.SpideranythingPipeline1': 301,
}
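Finally, on the topic named in the title: a downloader middleware sits between the engine and the downloader and can rewrite every outgoing request, which is how a proxy middleware works. A minimal sketch, assuming it lives in middlewares.py and uses placeholder proxy addresses (the class name, proxy list, and weight 543 are illustrative choices, not from the original project):

# middlewares.py
import random


class RandomProxyMiddleware:
    """Downloader middleware that assigns a random proxy to each request."""

    PROXIES = [
        'http://127.0.0.1:8888',  # placeholder proxy addresses
        'http://127.0.0.1:8889',
    ]

    def process_request(self, request, spider):
        # Called for every request before it reaches the downloader;
        # setting request.meta['proxy'] routes the request through that proxy.
        request.meta['proxy'] = random.choice(self.PROXIES)
        # Returning None lets the request continue through the remaining middlewares.

It is enabled in settings.py the same way pipelines are, via DOWNLOADER_MIDDLEWARES:

DOWNLOADER_MIDDLEWARES = {
    'SpiderAnything.middlewares.RandomProxyMiddleware': 543,
}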