canal同步数据到es
canal介绍:https://gitee.com/mirrors/canal?utm_source=alading&utm_campaign=repo
1. docker安装canal
docker pull canal/canal-server:latest
启动命令脚本
# vim start.sh
docker run --name canal \
  -e canal.instance.master.address=192.168.3.215:13307 \
  -e canal.instance.dbUsername=root \
  -e canal.instance.dbPassword=123456 \
  -p 13333:11111 \
  -d canal/canal-server:latest
2. 编写脚本同步逻辑
from canal.client import Client
from canal.protocol import EntryProtocol_pb2
import random
import datetime
import time
import logging.handlers
import pymysql
import threading
from elasticsearch import Elasticsearch
import traceback
import logging
import os


def getLogger():
    """Build and return the root logger with a console handler plus a
    size-rotating file handler writing to output/log/canal_watch.log."""
    logger = logging.getLogger()
    logger.setLevel('INFO')
    formatter = logging.Formatter(
        '%(asctime)s:%(filename)s:%(funcName)s:[line:%(lineno)d] %(levelname)s %(message)s')
    # Console handler.
    chlr = logging.StreamHandler()
    chlr.setFormatter(formatter)
    chlr.setLevel('INFO')  # optional: defaults to the logger's level if unset
    LOG_FILE = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '../../../output/log', 'canal_watch' + '.log'))
    # File handler. NOTE(review): mode='w' truncates the current log on every
    # process start; 'a' is the usual choice with RotatingFileHandler — confirm.
    fhlr = logging.handlers.RotatingFileHandler(
        LOG_FILE, mode='w', maxBytes=8 * 1024 * 1024, encoding="utf-8",
        backupCount=300)
    fhlr.setFormatter(formatter)
    logger.addHandler(chlr)
    logger.addHandler(fhlr)
    return logger


logger = getLogger()


class ElasticOpt:
    """Thin wrapper around the Elasticsearch client for the el_hub_* indices
    (station / module / defect)."""

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.es = Elasticsearch([{"host": "{0}".format(host), "port": port}])

    @property
    def es_cli(self):
        """Expose the underlying Elasticsearch client."""
        return self.es

    def create_station(self):
        """Create the el_hub_station index (run once by hand; the index can
        also be created from code)."""
        mapping = {
            "mappings": {
                "properties": {
                    "id": {"type": "integer"},
                    "name": {"type": "keyword"},
                    "product_line": {"type": "keyword"},
                    "facility": {"type": "keyword"},
                    "city": {"type": "keyword"},
                    "country": {"type": "keyword"},
                    "device_provider": {"type": "keyword"},
                    "type": {"type": "keyword"},
                    "stage": {"type": "keyword"},
                    "creat_time": {
                        "type": "date",
                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                    }
                }
            }
        }
        self.es.indices.create(index='el_hub_station', body=mapping)

    def insert_station(self, data=None):
        """Index one station row; creat_time is stamped with 'now' on insert."""
        body = {
            "id": int(data['id']),
            "name": data['name'],
            "product_line": data['product_line'],
            "facility": data['facility'],
            "city": data['city'],
            "country": data['country'],
            "device_provider": data['device_provider'],
            "type": data['type'],
            "stage": data['stage'],
            "creat_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        self.es.index(index="el_hub_station", doc_type="_doc", body=body)
        self.es.indices.refresh(index="el_hub_station")

    def insert_module(self, data=None):
        """Index one module row (binlog column values arrive as strings, so
        the numeric fields are converted explicitly)."""
        body = {
            "id": int(data['id']),
            "station_id": int(data['station_id']),
            "serial_id": data['serial_id'],
            "img_name": data['img_name'],
            "grid_lines_by_row": data['grid_lines_by_row'],
            "grid_lines_by_col": data['grid_lines_by_col'],
            "shift": data['shift'],
            "rows": int(data['rows']),
            "cols": int(data['cols']),
            "monocrystal": int(data['monocrystal']),
            "half_plate": int(data['half_plate']),
            "module_type": data['module_type'],
            "is_ng": int(data['is_ng']),
            "is_ng_ai": int(data['is_ng_ai']),
            "creat_time": data['creat_time']
        }
        self.es.index(index="el_hub_module", doc_type="_doc", body=body)
        self.es.indices.refresh(index="el_hub_module")

    def create_module(self):
        """Create the el_hub_module index (run once by hand; the index can
        also be created from code)."""
        mapping = {
            "mappings": {
                "properties": {
                    "id": {"type": "integer"},
                    "station_id": {"type": "integer"},
                    "serial_id": {"type": "keyword"},
                    "img_name": {"type": "keyword"},
                    "grid_lines_by_row": {"type": "keyword"},
                    "grid_lines_by_col": {"type": "keyword"},
                    "shift": {"type": "text"},
                    "rows": {"type": "integer"},
                    "cols": {"type": "integer"},
                    "monocrystal": {"type": "integer"},
                    "half_plate": {"type": "integer"},
                    "module_type": {"type": "keyword"},
                    "is_ng": {"type": "integer"},
                    "is_ng_ai": {"type": "integer"},
                    "creat_time": {
                        "type": "date",
                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                    }
                }
            }
        }
        self.es.indices.create(index='el_hub_module', body=mapping)

    def update_module(self, data=None, is_ng=False, is_ng_ai=False):
        """Flag a module document as NG and/or AI-NG via update_by_query.

        The document is matched on (station_id, serial_id) and filtered on id.
        Returns the raw update_by_query response.
        """
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"station_id": int(data['station_id'])}},
                        {"term": {"serial_id": data['serial_id']}}
                    ],
                    "filter": {"term": {"id": int(data['id'])}}
                }
            },
            "script": {
                # Filled in below, e.g. "ctx._source.is_ng=1;ctx._source.is_ng_ai=1;"
                "source": ""
            }
        }
        if is_ng:
            if is_ng_ai:
                query['script']['source'] = "ctx._source.is_ng=1;ctx._source.is_ng_ai=1;"
            else:
                query['script']['source'] = "ctx._source.is_ng=1;"
        else:
            if is_ng_ai:
                query['script']['source'] = "ctx._source.is_ng_ai=1;"
        logger.info("query1233:%s" % query)
        # Result variable renamed so the `data` parameter is not shadowed.
        result = self.es.update_by_query(
            index="el_hub_module", doc_type="_doc", body=query,
            wait_for_completion=True, timeout='2m')
        self.es.indices.refresh(index="el_hub_module")
        logger.info("data:%s" % result)
        return result

    def insert_defect(self, data=None, table=None):
        """Index one defect row. The station id is encoded as the last
        '_'-separated token of the source table name."""
        station_id = table.split('_')[-1]
        body = {
            "id": int(data['id']),
            "module_id": int(data['module_id']),
            "station_id": int(station_id),
            "name": data['name'],
            "display_name": data['display_name'],
            "confidence": float(data['confidence']),
            "area_ratio": float(data['area_ratio']),
            "coord_x1": int(data['coord_x1']),
            "coord_y1": int(data['coord_y1']),
            "coord_x2": int(data['coord_x2']),
            "coord_y2": int(data['coord_y2']),
            "row": int(data['row']),
            "col": int(data['col']),
            "is_ai": 1 if data['is_ai'] is True else 0,
            "creat_time": data['creat_time']
        }
        self.es.index(index="el_hub_defect", doc_type="_doc", body=body)
        self.es.indices.refresh(index="el_hub_defect")

    def create_defect(self):
        """Create the el_hub_defect index (run once by hand; the index can
        also be created from code)."""
        mapping = {
            "mappings": {
                "properties": {
                    "id": {"type": "integer"},
                    "module_id": {"type": "integer"},
                    # Added: insert_defect writes station_id, so it must be
                    # mapped explicitly instead of being dynamic-mapped.
                    "station_id": {"type": "integer"},
                    "name": {"type": "keyword"},
                    "display_name": {"type": "keyword"},
                    "confidence": {"type": "float"},
                    "area_ratio": {"type": "float"},
                    "coord_x1": {"type": "integer"},
                    "coord_y1": {"type": "integer"},
                    "coord_x2": {"type": "integer"},
                    "coord_y2": {"type": "integer"},
                    "row": {"type": "integer"},
                    "col": {"type": "integer"},
                    "is_ai": {"type": "integer"},
                    "creat_time": {
                        "type": "date",
                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                    }
                }
            }
        }
        self.es.indices.create(index='el_hub_defect', body=mapping)


class CanalOpt:
    """Polls a canal server for MySQL binlog events and mirrors INSERT/UPDATE
    events on the station/module/defect tables into Elasticsearch."""

    def __init__(self, host, port):
        self.host = host
        self.port = port

    def excute(self):
        es_host, es_port = os.getenv('ES_ADDR', "192.168.3.215:9215").split(':')
        es = ElasticOpt(es_host, es_port)
        while True:
            try:
                client = Client()
                client.connect(host=self.host, port=self.port)
                client.check_valid(username=b'', password=b'')
                client.subscribe(client_id=b'1001', destination=b'example',
                                 filter=b'.*\\..*')
                message = client.get(50)
                entries = message['entries']
                for entry in entries:
                    entry_type = entry.entryType
                    # Skip transaction boundary markers; only row data matters.
                    if entry_type in [EntryProtocol_pb2.EntryType.TRANSACTIONBEGIN,
                                      EntryProtocol_pb2.EntryType.TRANSACTIONEND]:
                        continue
                    row_change = EntryProtocol_pb2.RowChange()
                    row_change.MergeFromString(entry.storeValue)
                    header = entry.header
                    database = header.schemaName
                    table = header.tableName
                    event_type = header.eventType
                    for row in row_change.rowDatas:
                        format_data = dict()
                        logger.info('event_type:%s' % event_type)
                        logger.info('EntryProtocol_pb2.EventType:%s'
                                    % EntryProtocol_pb2.EventType)
                        if event_type == EntryProtocol_pb2.EventType.INSERT:
                            for column in row.afterColumns:
                                if column.isNull is True:
                                    format_data[column.name] = None
                                else:
                                    format_data[column.name] = column.value
                            # Rows from the defect_ai_* tables were produced by AI.
                            if table.startswith('defect_ai'):
                                format_data['is_ai'] = True
                            else:
                                format_data['is_ai'] = False
                            bin_log_data = dict(
                                db=database,
                                table=table,
                                event_type=event_type,
                                data=format_data
                            )
                            logger.info('table:%s' % table)
                            logger.info('bin_log_data:%s' % bin_log_data)
                            logger.info('format_data:%s' % format_data)
                            if table.startswith('Module'):
                                es.insert_module(bin_log_data['data'])
                                logger.info('insert es module:%s' % bin_log_data['data'])
                            elif table.startswith('Defect') or table.startswith('defect_ai'):
                                es.insert_defect(bin_log_data['data'], table)
                                logger.info('insert es defect:%s' % bin_log_data['data'])
                            elif table.startswith('station'):
                                es.insert_station(bin_log_data['data'])
                                logger.info('insert es station:%s' % bin_log_data['data'])
                            else:
                                pass
                        elif event_type == EntryProtocol_pb2.EventType.UPDATE:
                            for column in row.afterColumns:
                                if column.isNull is True:
                                    format_data[column.name] = None
                                else:
                                    format_data[column.name] = column.value
                            if table.startswith('defect_ai'):
                                format_data['is_ai'] = True
                            else:
                                format_data['is_ai'] = False
                            bin_log_data = dict(
                                db=database,
                                table=table,
                                event_type=event_type,
                                data=format_data
                            )
                            logger.info('table:%s' % table)
                            logger.info('update bin_log_data:%s' % bin_log_data)
                            logger.info('update format_data:%s' % format_data)
                            if table.startswith('Module'):
                                # FIX: `result` used to be referenced without being
                                # assigned when neither flag was '1' (NameError).
                                result = None
                                if bin_log_data['data']['is_ng'] == '1':
                                    if bin_log_data['data']['is_ng_ai'] == '1':
                                        result = es.update_module(
                                            bin_log_data['data'],
                                            is_ng=True, is_ng_ai=True)
                                    else:
                                        result = es.update_module(
                                            bin_log_data['data'],
                                            is_ng=True, is_ng_ai=False)
                                else:
                                    if bin_log_data['data']['is_ng_ai'] == '1':
                                        result = es.update_module(
                                            bin_log_data['data'],
                                            is_ng=False, is_ng_ai=True)
                                logger.info('update es module:%s' % result)
                time.sleep(1)
                client.disconnect()
            except Exception as e:
                logger.info('canal 记录出错:{0}'.format(e))
                # File in which the exception was raised.
                logger.info(e.__traceback__.tb_frame.f_globals["__file__"])
                # Line number at which the exception was raised.
                logger.info(e.__traceback__.tb_lineno)
                # Back off before reconnecting so a persistent failure
                # (e.g. canal down) does not busy-spin the loop.
                time.sleep(1)


def canal_consumer():
    """Start the canal consumer in a background daemon thread."""
    try:
        canal_list = os.getenv('CANAL_HOST', '192.168.3.215:32121')
        canal_host, canal_port = canal_list.split(':')
        canal = CanalOpt(canal_host, int(canal_port))
        logger.info('start canal , ready backup mysql data to es .....')
        threading.Thread(target=canal.excute, daemon=True).start()
    except Exception as e:
        logger.error(e)
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    # canal = CanalOpt('192.168.3.215', 32121)
    # canal.excute()
    canal_consumer()
    # FIX: the consumer runs in a daemon thread; without this keep-alive the
    # main thread would exit immediately and kill the worker.
    while True:
        time.sleep(60)