canal同步数据到es

canal同步数据到es

canal介绍:https://gitee.com/mirrors/canal?utm_source=alading&utm_campaign=repo

1. docker安装canal

docker pull canal/canal-server:latest

  启动命令脚本

# vim start.sh

docker run --name canal \
-e canal.instance.master.address=192.168.3.215:13307 \
-e canal.instance.dbUsername=root \
-e canal.instance.dbPassword=123456 \
-p 13333:11111 \
-d canal/canal-server:latest

2. 编写脚本同步逻辑

from canal.client import Client
from canal.protocol import EntryProtocol_pb2
import random
import datetime
import time
import logging.handlers
import pymysql
import threading
from elasticsearch import Elasticsearch
import traceback
import logging
import logging.handlers
import os


def getLogger():
    logger = logging.getLogger()
    logger.setLevel('INFO')
    formatter = logging.Formatter('%(asctime)s:%(filename)s:%(funcName)s:[line:%(lineno)d] %(levelname)s %(message)s')
    chlr = logging.StreamHandler()  # 输出到控制台的handler
    chlr.setFormatter(formatter)
    chlr.setLevel('INFO')  # 也可以不设置,不设置就默认用logger的level
    LOG_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../output/log', 'canal_watch'+'.log'))
    fhlr = logging.handlers.RotatingFileHandler(
        LOG_FILE, mode='w', maxBytes=8 * 1024 * 1024, encoding="utf-8", backupCount=300)  # 输出到文件的handler
    fhlr.setFormatter(formatter)
    logger.addHandler(chlr)
    logger.addHandler(fhlr)
    return logger


logger = getLogger()


class ElasticOpt:
    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.es = Elasticsearch([{"host": "{0}".format(host), "port": port}])

    @property
    def es_cli(self):
        return self.es
    
    # 手动创建索引,也可以使用代码创建索引
    def create_station(self):
        mapping = {
            "mappings": {
                "properties": {
                    "id": {
                        "type": "integer"
                    },
                    "name": {
                        "type": "keyword"
                    },
                    "product_line": {
                        "type": "keyword"
                    },
                    "facility": {
                        "type": "keyword"
                    },
                    "city": {
                        "type": "keyword"
                    },
                    "country": {
                        "type": "keyword"
                    },
                    "device_provider": {
                        "type": "keyword"
                    },
                    "type": {
                        "type": "keyword"
                    },
                    "stage": {
                        "type": "keyword"
                    },
                    "creat_time": {
                        "type":   "date",
                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                    }
                }
            }
        }
        self.es.indices.create(index='el_hub_station', body=mapping)

    def insert_station(self, data=None):
        body = {
            "id": int(data['id']),
            "name": data['name'],
            "product_line": data['product_line'],
            "facility": data['facility'],
            "city": data['city'],
            "country": data['country'],
            "device_provider": data['device_provider'],
            "type": data['type'],
            "stage": data['stage'],
            "creat_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        self.es.index(index="el_hub_station", doc_type="_doc", body=body)
        self.es.indices.refresh(index="el_hub_station")

    def insert_module(self, data=None):
        body = {
            "id": int(data['id']),
            "station_id": int(data['station_id']),
            "serial_id": data['serial_id'],
            "img_name": data['img_name'],
            "grid_lines_by_row": data['grid_lines_by_row'],
            "grid_lines_by_col": data['grid_lines_by_col'],
            "shift": data['shift'],
            "rows": int(data['rows']),
            "cols": int(data['cols']),
            "monocrystal": int(data['monocrystal']),
            "half_plate": int(data['half_plate']),
            "module_type": data['module_type'],
            "is_ng": int(data['is_ng']),
            "is_ng_ai": int(data['is_ng_ai']),
            "creat_time": data['creat_time']
        }
        self.es.index(index="el_hub_module", doc_type="_doc", body=body)
        self.es.indices.refresh(index="el_hub_module")

    # 手动创建索引,也可以使用代码创建索引
    def create_module(self):
        mapping = {
            "mappings": {
                "properties": {
                    "id": {
                        "type": "integer"
                    },
                    "station_id": {
                        "type": "integer"
                    },
                    "serial_id": {
                        "type": "keyword"
                    },
                    "img_name": {
                        "type": "keyword"
                    },
                    "grid_lines_by_row": {
                        "type": "keyword"
                    },
                    "grid_lines_by_col": {
                        "type": "keyword"
                    },
                    "shift": {
                        "type": "text"
                    },
                    "rows": {
                        "type": "integer"
                    },
                    "cols": {
                        "type": "integer"
                    },
                    "monocrystal": {
                        "type": "integer"
                    },
                    "half_plate": {
                        "type": "integer"
                    },
                    "module_type": {
                        "type": "keyword"
                    },
                    "is_ng": {
                        "type": "integer"
                    },
                    "is_ng_ai": {
                        "type": "integer"
                    },
                    "creat_time": {
                        "type":   "date",
                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                    }
                }
            }
        }
        self.es.indices.create(index='el_hub_module', body=mapping)

    def update_module(self, data=None, is_ng=False, is_ng_ai=False):
        # time.sleep(1)
        query = {
            "query": {
                "bool": {
                    "must": [{
                        "term": {"station_id": int(data['station_id'])}
                    },
                        {
                            "term": {"serial_id": data['serial_id']}
                    }
                    ],
                    "filter": {
                        "term": {"id": int(data['id'])}}
                }
            },
            "script": {
                "source": ""  # ctx._source.is_ng=1;ctx._source.is_ng_ai=1;
            }
        }
        if is_ng:
            if is_ng_ai:
                query['script']['source'] = "ctx._source.is_ng=1;ctx._source.is_ng_ai=1;"
            else:
                query['script']['source'] = "ctx._source.is_ng=1;"
        else:
            if is_ng_ai:
                query['script']['source'] = "ctx._source.is_ng_ai=1;"
        logger.info("query1233:%s"%query)
        data = self.es.update_by_query(index="el_hub_module",doc_type="_doc", body=query,wait_for_completion=True,timeout='2m')
        self.es.indices.refresh(index="el_hub_module")
        logger.info("data:%s"%data)
        return data

    def insert_defect(self, data=None, table=None):
        station_id = table.split('_')[-1]
        body = {
            "id": int(data['id']),
            "module_id": int(data['module_id']),
            "station_id": int(station_id),
            "name": data['name'],
            "display_name": data['display_name'],
            "confidence": float(data['confidence']),
            "area_ratio": float(data['area_ratio']),
            "coord_x1": int(data['coord_x1']),
            "coord_y1": int(data['coord_y1']),
            "coord_x2": int(data['coord_x2']),
            "coord_y2": int(data['coord_y2']),
            "row": int(data['row']),
            "col": int(data['col']),
            "is_ai": 1 if data['is_ai'] is True else 0,
            "creat_time": data['creat_time']
        }
        self.es.index(index="el_hub_defect", doc_type="_doc", body=body)
        self.es.indices.refresh(index="el_hub_defect")

    # 手动创建索引,也可以使用代码创建索引
    def create_defect(self):
        mapping = {
            "mappings": {
                "properties": {
                    "id": {
                        "type": "integer"
                    },
                    "module_id": {
                        "type": "integer"
                    },
                    "name": {
                        "type": "keyword"
                    },
                    "display_name": {
                        "type": "keyword"
                    },
                    "confidence": {
                        "type": "float"
                    },
                    "area_ratio": {
                        "type": "float"
                    },
                    "coord_x1": {
                        "type": "integer"
                    },
                    "coord_y1": {
                        "type": "integer"
                    },
                    "coord_x2": {
                        "type": "integer"
                    },
                    "coord_y2": {
                        "type": "integer"
                    },
                    "row": {
                        "type": "integer"
                    },
                    "col": {
                        "type": "integer"
                    },
                    "is_ai": {
                        "type": "integer"
                    },
                    "creat_time": {
                        "type":   "date",
                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                    }
                }
            }
        }
        self.es.indices.create(index='el_hub_defect', body=mapping)


class CanalOpt:
    def __init__(self, host, port):
        self.host = host
        self.port = port

    def excute(self):
        es_list = os.getenv(
            'ES_ADDR', "192.168.3.215:9215")
        es_host, es_port = es_list.split(':')
        es = ElasticOpt(es_host, es_port)
        while True:
            try:
                client = Client()
                client.connect(host=self.host, port=self.port)
                client.check_valid(username=b'', password=b'')
                client.subscribe(client_id=b'1001',
                                 destination=b'example', filter=b'.*\\..*')

                message = client.get(50)
                entries = message['entries']
                for entry in entries:
                    entry_type = entry.entryType
                    if entry_type in [EntryProtocol_pb2.EntryType.TRANSACTIONBEGIN,
                                      EntryProtocol_pb2.EntryType.TRANSACTIONEND]:
                        continue
                    row_change = EntryProtocol_pb2.RowChange()
                    row_change.MergeFromString(entry.storeValue)
                    # event_type = row_change.eventType
                    header = entry.header
                    database = header.schemaName
                    table = header.tableName
                    event_type = header.eventType
                    for row in row_change.rowDatas:
                        format_data = dict()
                        logger.info('event_type:%s' % event_type)
                        logger.info('EntryProtocol_pb2.EventType:%s' %
                                    EntryProtocol_pb2.EventType)

                        if event_type == EntryProtocol_pb2.EventType.INSERT:
                            for column in row.afterColumns:
                                if column.isNull is True:
                                    format_data[column.name] = None
                                else:
                                    format_data[column.name] = column.value

                            if table.startswith('defect_ai'):
                                format_data['is_ai'] = True
                            else:
                                format_data['is_ai'] = False
                            bin_log_data = dict(
                                db=database,
                                table=table,
                                event_type=event_type,
                                data=format_data
                            )
                            logger.info('table:%s' % table)
                            logger.info('bin_log_data:%s' % bin_log_data)
                            logger.info('format_data:%s' % format_data)
                            if table.startswith('Module'):
                                es.insert_module(bin_log_data['data'])
                                logger.info('insert es module:%s' %
                                            bin_log_data['data'])
                            elif table.startswith('Defect') or table.startswith('defect_ai'):
                                es.insert_defect(bin_log_data['data'], table)
                                logger.info('insert es defect:%s' %
                                            bin_log_data['data'])
                            elif table.startswith('station'):
                                es.insert_station(bin_log_data['data'])
                                logger.info('insert es station:%s' %
                                            bin_log_data['data'])
                            else:
                                pass
                        elif event_type == EntryProtocol_pb2.EventType.UPDATE:
                            for column in row.afterColumns:
                                if column.isNull is True:
                                    format_data[column.name] = None
                                else:
                                    format_data[column.name] = column.value
                            if table.startswith('defect_ai'):
                                format_data['is_ai'] = True
                            else:
                                format_data['is_ai'] = False
                            bin_log_data = dict(
                                db=database,
                                table=table,
                                event_type=event_type,
                                data=format_data
                            )
                            logger.info('table:%s' % table)
                            logger.info('update bin_log_data:%s' %
                                        bin_log_data)
                            logger.info('update format_data:%s' % format_data)
                            if table.startswith('Module'):
                                if bin_log_data['data']['is_ng'] == '1':
                                    if bin_log_data['data']['is_ng_ai'] == '1':
                                        result = es.update_module(
                                            bin_log_data['data'], is_ng=True, is_ng_ai=True)
                                    else:
                                        result = es.update_module(
                                            bin_log_data['data'], is_ng=True, is_ng_ai=False)
                                else:
                                    if bin_log_data['data']['is_ng_ai'] == '1':
                                        result = es.update_module(
                                            bin_log_data['data'], is_ng=False, is_ng_ai=True)
                                logger.info('update es module:%s' % result)

                time.sleep(1)

                client.disconnect()
            except Exception as e:
                logger.info('canal 记录出错:{0}'.format(e))
                # 发生异常所在的文件
                logger.info(e.__traceback__.tb_frame.f_globals["__file__"])
                # 发生异常所在的行数
                logger.info(e.__traceback__.tb_lineno)


def canal_consumer():
    try:
        canal_list = os.getenv('CANAL_HOST', '192.168.3.215:32121')
        canal_host, canal_port = canal_list.split(':')
        canal = CanalOpt(canal_host, int(canal_port))
        logger.info('start canal , ready backup mysql data to es .....')
        threading.Thread(target=canal.excute, daemon=True).start()
    except Exception as e:
        logger.error(e)
        logger.error(traceback.format_exc())




if __name__ == "__main__":
    # canal = CanalOpt('192.168.3.215', 32121)
    # canal.excute()
    canal_consumer()

 

posted @ 2021-12-06 14:16  Mr沈  阅读(742)  评论(0编辑  收藏  举报