19-03-13 Site-wide crawl of huanqiu.com with the Scrapy framework (results stored in a MongoDB database)
The spider itself is written the same way in this framework whether or not a database is used for storage.
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy

from yang_guan.items import YangGuanItem


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['huanqiu.com']
    start_urls = ['http://www.huanqiu.com/']

    def parse(self, response):
        # Top-level page; parse() is the default callback that receives the start_urls responses
        item = YangGuanItem()
        class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
        for class_news_url in class_news_urls_li:
            item["class_tittle"] = class_news_url.xpath("./text()").extract_first()
            new_url = class_news_url.xpath("./@href").extract_first()
            yield scrapy.Request(
                new_url,
                callback=self.second_class,
                # requests are handled concurrently, so pass a deep copy of the item
                meta={"item": deepcopy(item)},
            )

    def second_class(self, response):
        # second-level (channel) page
        item = response.meta["item"]
        second_urls = response.xpath(".//div/h2/em")
        for second_url in second_urls:
            second_news_url = second_url.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                second_news_url,
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(item)},
            )

    def parse_detail_analyze(self, response):
        # third-level list page with the actual details, e.g. http://china.huanqiu.com/leaders/
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='listPicBox']/li")
        for li in li_list:
            item["title"] = li.xpath("./h3/a/text()").extract_first()
            item["img_url"] = li.xpath("./a/img/@src").extract_first()
            item["detail"] = li.xpath("./h5/text()").extract_first()
            yield item

        # pagination: follow the "next page" link with the same callback
        next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse_detail_analyze,
                meta={"item": response.meta["item"]},
            )
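A quick aside on the deepcopy: meta passes the same item object to every request, so without a copy every callback would end up seeing whatever category was assigned last. A minimal standalone sketch (plain Python, not project code) of the difference:

    from copy import deepcopy

    item = {}

    # Without a copy: every queued entry references the SAME dict
    queued = []
    for title in ["china", "world", "mil"]:
        item["class_tittle"] = title
        queued.append(item)
    print([d["class_tittle"] for d in queued])   # ['mil', 'mil', 'mil']

    # With deepcopy: each entry gets its own snapshot of the dict
    queued = []
    for title in ["china", "world", "mil"]:
        item["class_tittle"] = title
        queued.append(deepcopy(item))
    print([d["class_tittle"] for d in queued])   # ['china', 'world', 'mil']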
Pipeline setup (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class YangGuanPipeline(object):
    def __init__(self):
        # open the MongoDB connection
        client = pymongo.MongoClient('127.0.0.1', 27017)
        # select the database scrapy_huan_qiu
        db = client['scrapy_huan_qiu']
        # select the collection used for storage
        self.post = db['zong_huan_qiu']

    def process_item(self, item, spider):
        post_item = dict(item)
        # insert_one replaces the deprecated Collection.insert
        self.post.insert_one(post_item)
        return item
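To confirm that the pipeline actually wrote documents, a quick standalone check with pymongo works; this is a sketch rather than project code, and it assumes the same local MongoDB instance and pymongo 3.7+ (for count_documents):

    import pymongo

    client = pymongo.MongoClient('127.0.0.1', 27017)
    collection = client['scrapy_huan_qiu']['zong_huan_qiu']

    print(collection.count_documents({}))        # how many items were stored
    for doc in collection.find().limit(3):       # peek at a few stored items
        print(doc.get('class_tittle'), doc.get('title'))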
Settings (settings.py)
# -*- coding: utf-8 -*-

# Scrapy settings for yang_guan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Proxy IP pool (only takes effect if a proxy downloader middleware uses it)
PROXIES = [
    {'ip_port': '111.11.228.75:80', 'user_pass': ''},
    {'ip_port': '120.198.243.22:80', 'user_pass': ''},
    {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
    {'ip_port': '101.71.27.120:80', 'user_pass': ''},
    {'ip_port': '122.96.59.104:80', 'user_pass': ''},
    {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]

BOT_NAME = 'yang_guan'

SPIDER_MODULES = ['yang_guan.spiders']
NEWSPIDER_MODULE = 'yang_guan.spiders'

# LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# A browser user-agent string helps avoid basic anti-crawler checks
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# Obey robots.txt rules
# Do not obey robots.txt for this crawl
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'yang_guan.middlewares.YangGuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'yang_guan.middlewares.YangGuanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the pipeline; without it (and the items definition) nothing is saved to the database
ITEM_PIPELINES = {
    'yang_guan.pipelines.YangGuanPipeline': 300,
}

# Log level and log file output
# LOG_FILE = "dg.log"
# LOG_LEVEL = "DEBUG"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
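The PROXIES list above is not consumed by anything in this project as written; it would only take effect through a downloader middleware enabled in DOWNLOADER_MIDDLEWARES. A hedged sketch of what such a middleware could look like (RandomProxyMiddleware is a hypothetical name, and it would live in yang_guan/middlewares.py):

    import random

    from yang_guan.settings import PROXIES


    class RandomProxyMiddleware(object):
        def process_request(self, request, spider):
            # pick a random proxy from the pool defined in settings.py
            proxy = random.choice(PROXIES)
            # Scrapy's built-in HttpProxyMiddleware reads request.meta['proxy']
            request.meta['proxy'] = 'http://' + proxy['ip_port']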
Item definition (items.py). This file is required: the spider yields this item to pass the dict of fields through to the pipeline.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YangGuanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    class_tittle = scrapy.Field()
    img_url = scrapy.Field()
    detail = scrapy.Field()
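For reference, a YangGuanItem behaves like a dict restricted to the declared fields, which is why the spider can assign to it and yield it directly. A small sketch (not project code):

    from yang_guan.items import YangGuanItem

    item = YangGuanItem()
    item["class_tittle"] = "china"
    item["title"] = "some headline"
    print(dict(item))          # {'class_tittle': 'china', 'title': 'some headline'}
    # item["author"] = "x"     # would raise KeyError: undeclared field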