yield && Scrapy case study: scraping Dangdang data && enabling multiple download pipelines
yield

A function that contains `yield` is no longer an ordinary function but a generator. `yield` hands a value back to the caller like `return` does, but it also remembers where it left off: on the next iteration, execution resumes on the line after the `yield`. Scrapy relies on this: `parse()` yields items (passed to the pipelines) and requests (scheduled for download) one at a time instead of building up a list.
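A minimal sketch of the mechanism (not part of the Dangdang project; `count_pages` is a made-up example):

```python
# A function with yield is a generator; iterating it resumes
# execution right after the last yield each time.
def count_pages(last_page):
    page = 1
    while page <= last_page:
        # Pause here and hand `page` to the caller; resume on the
        # next line when the caller asks for the next value.
        yield page
        page += 1

gen = count_pages(3)
print(next(gen))   # 1
print(next(gen))   # 2
print(list(gen))   # [3] - iteration continues where it paused
```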
To enable multiple pipelines, register one more entry in settings.py and write the class itself in pipelines.py, as the snippet below shows. Also note the image URL: it comes off the page without the http: prefix, so the pipeline has to add it.
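Concretely, each extra pipeline is one more entry in the `ITEM_PIPELINES` dict (the full settings.py appears at the end of this post):

```python
# In settings.py - lower numbers run first (valid range 1-1000).
ITEM_PIPELINES = {
    'scrapy_dangdang_095.pipelines.ScrapyDangdang095Pipeline': 300,
    'scrapy_dangdang_095.pipelines.DangDangDownloadPipeline': 301,
}
```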
Dangdang scraping example
```python
import scrapy

from scrapy_dangdang_095.items import ScrapyDangdang095Item


class DangSpider(scrapy.Spider):
    name = 'dang'
    # For multi-page crawling, allowed_domains must cover every page's domain;
    # in general, list only the bare domain.
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        # pipelines: download the data
        # items: define the data structure
        #   src   = //ul[@id="component_59"]/li//img/@src
        #   alt   = //ul[@id="component_59"]/li//img/@alt
        #   price = //ul[@id="component_59"]/li//p[@class="price"]/span[1]/text()

        # Every Selector object can call xpath() again.
        li_list = response.xpath('//ul[@id="component_59"]/li')

        for li in li_list:
            # The first image's tag differs from the others: its address sits
            # in src, while the lazy-loaded rest keep theirs in data-original.
            src = li.xpath('.//img/@data-original').extract_first()
            if not src:
                src = li.xpath('.//img/@src').extract_first()

            name = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()

            book = ScrapyDangdang095Item(src=src, name=name, price=price)

            # As soon as one book is built, hand it to the pipelines.
            yield book

        # The scraping logic is identical on every page, so we only need to
        # request the next page and route it back through parse():
        #   http://category.dangdang.com/pg2-cp01.01.02.00.00.00.html
        #   http://category.dangdang.com/pg3-cp01.01.02.00.00.00.html
        #   http://category.dangdang.com/pg4-cp01.01.02.00.00.00.html
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'

            # scrapy.Request is Scrapy's GET request:
            #   url      - the address to request
            #   callback - the function to run on the response (no parentheses)
            yield scrapy.Request(url=url, callback=self.parse)
```
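With `name = 'dang'` declared on the spider, the crawl is started from the project root with `scrapy crawl dang`; every book that `parse()` yields then flows through the pipelines configured below.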
items.py: defining the data structure
```python
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyDangdang095Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Put simply: one field per piece of data you want to download.
    # image
    src = scrapy.Field()
    # title
    name = scrapy.Field()
    # price
    price = scrapy.Field()
```
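A quick sketch of how the item behaves (interactive, not one of the project files; the sample values are made up): a `scrapy.Item` is built with keyword arguments and read like a dict, which is exactly what the pipeline's `item.get('src')` relies on.

```python
from scrapy_dangdang_095.items import ScrapyDangdang095Item

# Hypothetical field values, just to show the dict-like access.
book = ScrapyDangdang095Item(src='//img3m0.ddimg.cn/x.jpg',
                             name='some book', price='24.00')
print(book['name'])      # 'some book'
print(book.get('src'))   # '//img3m0.ddimg.cn/x.jpg'
print(dict(book))        # plain dict with all three fields
```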
pipelines.py: first uncomment the pipeline entry in settings.py
```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import urllib.request


# To use a pipeline, it must be enabled in settings.
class ScrapyDangdang095Pipeline:

    # Runs once, before the spider starts.
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider.
    def process_item(self, item, spider):
        # The pattern below is NOT recommended: it touches the file once per
        # item, which is far too frequent.
        # (1) write() takes a string, not an arbitrary object.
        # (2) 'w' mode would reopen the file per item and overwrite the
        #     previous contents.
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))

        self.fp.write(str(item))
        return item

    # Runs once, after the spider finishes.
    def close_spider(self, spider):
        self.fp.close()


# Enabling a second pipeline:
# (1) define the pipeline class
# (2) enable it in settings:
#     'scrapy_dangdang_095.pipelines.DangDangDownloadPipeline': 301
class DangDangDownloadPipeline:

    def process_item(self, item, spider):
        # The page's src has no scheme, so prepend it here.
        url = 'http:' + item.get('src')
        # Assumes the ./books directory already exists (see the variant below).
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
```
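One caveat: `urllib.request.urlretrieve` raises `FileNotFoundError` when the `./books` directory does not exist yet. A minimal sketch of a safer variant of the download pipeline (same shape as the class above; the `os.makedirs` call is the only addition):

```python
import os
import urllib.request


class DangDangDownloadPipeline:

    def open_spider(self, spider):
        # Create the target directory up front; exist_ok avoids an
        # error when the directory is already there.
        os.makedirs('./books', exist_ok=True)

    def process_item(self, item, spider):
        url = 'http:' + item.get('src')
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
```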
settings.py
```python
# Scrapy settings for scrapy_dangdang_095 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_dangdang_095'

SPIDER_MODULES = ['scrapy_dangdang_095.spiders']
NEWSPIDER_MODULE = 'scrapy_dangdang_095.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_dangdang_095 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_dangdang_095.middlewares.ScrapyDangdang095SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_dangdang_095.middlewares.ScrapyDangdang095DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # There can be many pipelines, and each has a priority.
    # Priorities range from 1 to 1000; the lower the value, the higher the priority.
    'scrapy_dangdang_095.pipelines.ScrapyDangdang095Pipeline': 300,
    # DangDangDownloadPipeline
    'scrapy_dangdang_095.pipelines.DangDangDownloadPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
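Two details in this file are easy to miss: the priority values decide the order in which the two pipelines run (300 before 301), and ROBOTSTXT_OBEY = True is kept from the project template, so if the site's robots.txt disallows these category pages Scrapy will drop the requests with a "Forbidden by robots.txt" log line; practice projects often set it to False for that reason.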