将 swagger 上的接口信息爬取并写入数据库
一、依赖
virtualenv -p python3.6 xx
pip install scrapy
pip install pymysql
二、实现步骤
1、创建项目和spider1
scrapy startproject scraw_swagger
scrapy genspider spider1 xxx.com (执行之后在项目的spiders目录下会生成一个spider1.py的文件)
以下代码主要实现了将 swagger 的第一级目录爬下来，存到一个名为 interfaces_path 的文件中
# -*- coding: utf-8 -*-
import scrapy
import json
from scraw_swagger import settings


class Spider1Spider(scrapy.Spider):
    """Crawl the swagger top-level ``/api-docs`` index and write every
    sub-API URL to the local file ``interfaces_path``.

    Each URL is written with a leading comma, so the file is one long
    comma-separated string whose first split element is empty; spider2
    relies on that format when it reads the file back.
    """

    name = 'spider1'
    # Fix: the original had a stray quote -> ['xxx.com''] (SyntaxError).
    allowed_domains = ['xxx.com']
    scrawl_domain = settings.interface_domain + '/api-docs'
    start_urls = [scrawl_domain]

    def parse(self, response):
        """Parse the api-docs index and persist each sub-API path.

        Expects the response body to be JSON with an ``apis`` list whose
        entries each carry a ``path`` key.
        """
        response_dict = json.loads(response.body)
        apis = response_dict['apis']
        domain = settings.interface_domain + '/api-docs'
        # Fix: use a context manager so the file handle is closed
        # (the original opened the file and never closed it); also
        # dropped the unused 'temppath' accumulator.
        with open('interfaces_path', 'w') as out_file:
            for subapi in apis:
                # Leading comma acts as the separator between entries.
                out_file.write(',' + domain + subapi['path'])
2、创建spider2
scrapy genspider spider2 xxx.com (执行之后在项目的spiders目录下会生成一个spider2.py的文件)
以下代码主要实现了获取interfaces_path的文件下的地址对应的内容
# -*- coding: utf-8 -*-
import scrapy
from scraw_swagger.items import ScrawSwaggerItem
import json
from scraw_swagger import settings


class Spider2Spider(scrapy.Spider):
    """Fetch every sub-API document listed in ``interfaces_path`` (the
    file written by spider1) and yield one ScrawSwaggerItem per document,
    with parallel lists describing each interface it contains."""

    name = 'spider2'
    allowed_domains = ['xxx.com']

    # Runs at class-definition time. The file is one comma-separated
    # string with a leading comma, so the first split element is empty
    # and is dropped with the [1:] slice.
    start_urls = open('interfaces_path', 'r').read().split(',')[1:]

    def parse(self, response):
        """Turn one swagger sub-API JSON document into a single item.

        The item's fields are parallel lists: element j of each list
        describes the j-th entry of the document's ``apis`` array.
        """
        outitem = ScrawSwaggerItem()
        out_interface = []
        out_domain = []
        out_method = []
        out_param_name = []
        out_data_type = []
        out_param_required = []

        response_dict = json.loads(response.body)
        for api in response_dict['apis']:
            # Only the first operation of each api entry is recorded,
            # matching the original implementation.
            operation = api['operations'][0]
            parameters = operation['parameters']
            out_interface.append(api['path'])
            out_domain.append(settings.interface_domain)
            out_method.append(operation['method'])
            out_param_name.append([p['name'] for p in parameters])
            out_param_required.append([p['required'] for p in parameters])
            out_data_type.append([p['type'] for p in parameters])

        outitem['interface'] = out_interface
        outitem['domain'] = out_domain
        outitem['method'] = out_method
        outitem['param_name'] = out_param_name
        outitem['param_required'] = out_param_required
        outitem['data_type'] = out_data_type
        yield outitem
3、settings.py文件
# -*- coding: utf-8 -*-

# Scrapy settings for scraw_swagger project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Target swagger host and auth token, read by the spiders and pipelines.
# 'test' is a placeholder; set real values before running.
interface_domain = 'test'
token = 'test'
# Debug values (example of real settings):
# interface_domain = 'http://xxxx..net'
# token = 'xxxxxx'
# ///////////////

BOT_NAME = 'scraw_swagger'

SPIDER_MODULES = ['scraw_swagger.spiders']
NEWSPIDER_MODULE = 'scraw_swagger.spiders'
FEED_EXPORT_ENCODING = 'utf-8'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scraw_swagger (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scraw_swagger.middlewares.ScrawSwaggerSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scraw_swagger.middlewares.ScrawSwaggerDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scraw_swagger.pipelines.ScrawSwaggerPipeline': 300,
    # 'scraw_swagger.pipelines.MysqlTwistedPipline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings consumed by pipelines.py.
# Empty placeholders; fill in before enabling the DB pipeline.
MYSQL_HOST = ''
MYSQL_DBNAME = ''
MYSQL_USER = ''
MYSQL_PASSWD = ''
MYSQL_PORT = 3306
4、存入数据库。编写pipelines.py文件。提取返回的item,并将对应的字段存入数据库
# -*- coding: utf-8 -*-
import pymysql
from scraw_swagger import settings
from twisted.enterprise import adbapi
import pymysql.cursors


# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class ScrawSwaggerPipeline(object):
    """Insert every interface carried by an item into the MySQL table
    ``interfaces``.

    Item fields are parallel lists (see spider2): element i of each list
    describes one interface, so the rows are built index by index.
    """

    def __init__(self):
        # One connection for the lifetime of the pipeline; credentials
        # come from settings.py (MYSQL_* values).
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            port=settings.MYSQL_PORT,
            charset='utf8',
            use_unicode=True)
        # Cursor used for all inserts.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Write each interface row; on error, log and keep the item
        flowing (best-effort persistence, as in the original)."""
        try:
            sql = """ insert into interfaces (domain, interface, method, param_name ,data_type, param_required) VALUES (%s, %s, %s, %s, %s, %s) """
            n = len(item['domain'])
            for i in range(0, n):
                row = (
                    str(item['domain'][i]),
                    str(item['interface'][i]),
                    str(item['method'][i]),
                    str(item['param_name'][i]),
                    str(item['data_type'][i]),
                    str(item['param_required'][i]),
                )
                self.cursor.execute(sql, row)
                # Commit per row so earlier rows survive a later failure
                # (preserves the original semantics).
                self.connect.commit()
        except Exception as error:
            # Deliberate broad catch: a bad row must not kill the crawl.
            print(error)
        return item

    def close_spider(self, spider):
        """Scrapy shutdown hook — fix: the original never closed the
        connection, leaking it when the spider finished."""
        self.cursor.close()
        self.connect.close()