Asynchronous Configuration in Scrapy

Scrapy's engine runs on the Twisted reactor, so a blocking pymysql call inside a pipeline stalls the whole crawl. twisted.enterprise.adbapi avoids this by running each database operation on a thread pool and returning a Deferred, which is the pattern used below.
pipelines.py configuration
import pymysql
from twisted.enterprise import adbapi

from yangguang.items import GuSuItem


class YangguangPipeline:
    '''
    Add the following to settings.py:

    MYSQL_HOST = 'localhost'
    MYSQL_PORT = '3306'
    MYSQL_USER = 'root'
    MYSQL_PASS = '123456'
    MYSQL_DB = 'open_source_intelligence'
    '''

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Connection parameters read from settings.py
        # (the default port 3306 is used; MYSQL_PORT is not passed here)
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASS'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor
        )
        # adbapi runs the pymysql calls in a thread pool so the reactor never blocks
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Schedule the insert asynchronously; runInteraction returns a Deferred
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # errors raised in do_insert land here
        return item

    def do_insert(self, cursor, item):
        # News table
        if isinstance(item, GuSuItem):
            # No explicit commit needed: twisted commits the interaction itself.
            # A parameterized query avoids quoting problems and SQL injection.
            select_sql_news = "select title from op_news where title=%s"
            cursor.execute(select_sql_news, (item['title'],))
            data = cursor.fetchall()
            if not data:
                insert_sql = """insert into op_news(title, content, publish_date, url)
                                values (%s, %s, %s, %s)"""
                cursor.execute(insert_sql, (
                    item['title'], item['content'],
                    item['publish_date'], item['url']))
                print("News row inserted!")
            else:
                print(item['title'], ': already exists!')

    def handle_error(self, failure):
        print(failure)
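The pipeline assumes an op_news table already exists. A minimal sketch of a matching schema, created once with plain pymysql; the column types and lengths here are assumptions inferred from the insert statement above, not taken from the original project:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='123456',
                       db='open_source_intelligence', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Assumed schema: adjust types/lengths to the real table
        cursor.execute("""
            create table if not exists op_news (
                id int primary key auto_increment,
                title varchar(255),
                content text,
                publish_date varchar(64),
                url varchar(512)
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()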
settings.py configuration
ITEM_PIPELINES = {
    'yangguang.pipelines.YangguangPipeline': 300,  # enable the pipeline
}

# MySQL connection settings
MYSQL_HOST = 'localhost'
MYSQL_PORT = '3306'
MYSQL_USER = 'root'
MYSQL_PASS = '123456'
MYSQL_DB = 'open_source_intelligence'
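Scrapy looks for a from_crawler classmethod on each pipeline first and falls back to from_settings, so the pipeline above is constructed automatically once it is listed in ITEM_PIPELINES. If you prefer the explicit from_crawler hook, here is a sketch of the equivalent wiring on a stripped-down pipeline; MySQLPoolPipeline is a hypothetical name used for illustration:

from twisted.enterprise import adbapi


class MySQLPoolPipeline:
    """Illustrative skeleton: same construction pattern as YangguangPipeline."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        settings = crawler.settings
        dbpool = adbapi.ConnectionPool(
            'pymysql',
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASS'],
            charset='utf8',
        )
        return cls(dbpool)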
gusu.py
import json

import scrapy

from yangguang.items import GuSuItem


class GusuSpider(scrapy.Spider):
    name = 'gusu'
    # allowed_domains = ['gusu.gov.cn']
    # Keep allowed_domains commented out, otherwise the follow-up detail request
    # is filtered as off-site; alternatively pass dont_filter=True on that request.
    # start_urls = ['http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/']

    def start_requests(self):
        url = 'http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/'
        data = {
            'type': '12',
            'pagesize': '10',
            'keywords': '',
            'currpage': '2',
            'deptcode': '014152419',
            'check': 'do'
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
        }
        yield scrapy.FormRequest(url, formdata=data, headers=headers,
                                 callback=self.parse)

    def parse(self, response):
        data = json.loads(response.text)
        infolist = data['infolist']
        for info in infolist:
            item = GuSuItem()
            # Swap the trailing page id for 'detail' to build the detail-page URL
            page_id = info['consult_link'].split('/')[-1]
            text = page_id.split('?')[0]
            item['url'] = info['consult_link'].replace(text, 'detail')
            yield scrapy.Request(url=item['url'], callback=self.parse_detail,
                                 meta={"item": item})

    def parse_detail(self, response):
        item = response.meta['item']
        table = response.xpath('//table[@class="tablecon"]/tbody')
        item['title'] = table.xpath('./tr[1]/td[2]/text()').extract_first()
        item['publish_date'] = table.xpath('./tr[2]/td[2]/text()').extract_first()
        item['content'] = table.xpath('./tr[3]/td[2]/text()').extract_first()
        yield item
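Note that start_requests above fetches only page 2 (currpage is hardcoded). A sketch of walking several pages, assuming the endpoint simply accepts sequential currpage values; the page count of 5 and the spider name are arbitrary illustration values, not properties of the real API:

import scrapy


class PagedGusuSpider(scrapy.Spider):
    """Hypothetical variant of GusuSpider that requests pages 1-5."""
    name = 'gusu_paged'

    def start_requests(self):
        url = 'http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/'
        headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
        for page in range(1, 6):  # 5 pages: illustration only
            data = {
                'type': '12', 'pagesize': '10', 'keywords': '',
                'currpage': str(page),  # the only field that changes per request
                'deptcode': '014152419', 'check': 'do'
            }
            yield scrapy.FormRequest(url, formdata=data, headers=headers,
                                     callback=self.parse)

    def parse(self, response):
        pass  # same JSON handling as GusuSpider.parse above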
items.py
import scrapy


class YangguangItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    title = scrapy.Field()  # each scrapy.Field() is a dict-like field slot
    content = scrapy.Field()
    publish_date = scrapy.Field()
    picture = scrapy.Field()


class GuSuItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    publish_date = scrapy.Field()
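As the comment notes, an Item behaves like a dict restricted to its declared fields; a quick sketch of that behavior:

from yangguang.items import GuSuItem

item = GuSuItem()
item['title'] = 'demo'        # fine: 'title' is a declared Field
print(item.get('content'))    # None: declared but not yet assigned
print(dict(item))             # {'title': 'demo'}: items convert to plain dicts
try:
    item['author'] = 'x'      # 'author' was never declared
except KeyError as e:
    print(e)                  # GuSuItem does not support field: author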