Syncing MySQL data to Redis
The sync is driven by the MySQL binlog: the script below tails row events with python-mysql-replication and replays every insert, update, and delete into Redis. Row-based binary logging (binlog_format = ROW) is required for the row events to carry column values.
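Before starting the sync it can be worth checking that the server really logs in row format. This is a minimal pre-flight sketch, not part of the original script; check_binlog_format is a hypothetical helper and the connection parameters mirror the 'xxx' placeholders used below.

import pymysql

def check_binlog_format(host, port, user, password):
    """Return the server's binlog_format; python-mysql-replication needs 'ROW'."""
    conn = pymysql.connect(host=host, port=port, user=user, password=password)
    try:
        with conn.cursor() as cur:
            cur.execute("SHOW VARIABLES LIKE 'binlog_format'")
            _, value = cur.fetchone()
            return value
    finally:
        conn.close()

# Expected output: ROW
print(check_binlog_format('xxx', 3306, 'root', 'xxx'))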
import json
from urllib.parse import quote

import pymysql
import redis
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import (
    WriteRowsEvent,
    UpdateRowsEvent,
    DeleteRowsEvent,
)

# Processing flow:
# 1. Set the mysql_to_redis_sign column of every row to the current timestamp.
#    This forces an UPDATE binlog event for each row, so the whole table gets
#    pushed into Redis even though only binlog changes are replayed.
# 2. Flush the target Redis database.
# 3. Replay binlog events (insert / update / delete) into Redis.

mysql_settings = {'host': 'xxx', 'port': 3306, 'user': 'root', 'password': 'xxx'}
binlog_position_file = 'binlog_position.json'
schema = 'analyze'
table = 'web_link_data'
redis_settings = {'host': 'xxx', 'port': 6379, 'db': 9, 'password': 'xxx'}

# The binlog stream reports columns as UNKNOWN_COL<n> when column metadata is
# not available, so map the positional names back to the real field names.
# (UNKNOWN_COL14 is intentionally skipped, matching the original mapping.)
COLUMN_MAP = {
    'id': 'UNKNOWN_COL0',
    'host': 'UNKNOWN_COL1',
    'url': 'UNKNOWN_COL2',
    'class': 'UNKNOWN_COL3',
    'class_sec': 'UNKNOWN_COL4',
    'url_host': 'UNKNOWN_COL5',
    'http': 'UNKNOWN_COL6',
    'title': 'UNKNOWN_COL7',
    'ip': 'UNKNOWN_COL8',
    'ip_area': 'UNKNOWN_COL9',
    'jump_url': 'UNKNOWN_COL10',
    'import_source': 'UNKNOWN_COL11',
    'is_gather': 'UNKNOWN_COL12',
    'import_time': 'UNKNOWN_COL13',
    'gather_time': 'UNKNOWN_COL15',
}

# Fields that get a reverse index in Redis (a set of record ids per value).
INDEX_FIELDS = ['host', 'url', 'class', 'class_sec', 'url_host', 'http',
                'title', 'ip', 'ip_area', 'jump_url', 'import_source',
                'is_gather', 'import_time', 'gather_time']


def to_record(values):
    """Map a binlog row (UNKNOWN_COL* keys) to a Redis hash, replacing None with ''."""
    record = {}
    for field, col in COLUMN_MAP.items():
        value = values.get(col)
        record[field] = '' if value is None else value
    # URL-encode the URL fields so they are safe to embed in index key names.
    record['url'] = quote(str(record['url']))
    record['jump_url'] = quote(str(record['jump_url']))
    return record


# Step 1: touch every row so the full table shows up in the binlog.
connection = pymysql.connect(host=mysql_settings['host'],
                             port=mysql_settings['port'],
                             user=mysql_settings['user'],
                             password=mysql_settings['password'],
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Lock web_link_data while every row is being touched.
        # (`analyze` is a reserved word in MySQL, so it must be backquoted.)
        cursor.execute("LOCK TABLES `analyze`.web_link_data WRITE")
        update_query = """
            UPDATE `analyze`.web_link_data
            SET mysql_to_redis_sign = UNIX_TIMESTAMP()
        """
        print(update_query)
        cursor.execute(update_query)
        connection.commit()
        print(f"Updated rows: {cursor.rowcount}")
        cursor.execute("UNLOCK TABLES")
finally:
    connection.close()

stream = None
try:
    r = redis.Redis(**redis_settings)
    # Step 2: flush the target Redis database (db 9 from redis_settings).
    r.flushdb()
    # Use a pipeline to batch writes.
    pipeline = r.pipeline()

    # Resume from the last saved binlog position, if one was recorded.
    try:
        with open(binlog_position_file, 'r') as f:
            saved_position = json.load(f)
        binlog_file = saved_position['binlog_file']
        binlog_position = saved_position['binlog_position']
    except FileNotFoundError:
        binlog_file = None
        binlog_position = None

    stream = BinLogStreamReader(connection_settings=mysql_settings,
                                server_id=100,
                                blocking=True,
                                resume_stream=True,
                                only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                log_file=binlog_file,
                                log_pos=binlog_position)

    # Step 3: replay binlog events into Redis.
    for binlogevent in stream:
        if binlogevent.schema == schema and binlogevent.table == table:
            for row in binlogevent.rows:
                if isinstance(binlogevent, WriteRowsEvent):
                    print('insert:', row)
                    record = to_record(row['values'])
                    record_id = record['id']
                    pipeline.hset(f"record:{record_id}", mapping=record)
                    # Add the record id to the index set of every non-empty field.
                    for field in INDEX_FIELDS:
                        if record.get(field):
                            pipeline.sadd(f"{field}:{record[field]}", record_id)
                    pipeline.execute()

                elif isinstance(binlogevent, DeleteRowsEvent):
                    print('delete:', row)
                    record_id = row['values']['UNKNOWN_COL0']
                    old_data = r.hgetall(f"record:{record_id}")
                    if old_data:
                        # Remove the record hash and its index entries.
                        pipeline.delete(f"record:{record_id}")
                        for field in INDEX_FIELDS:
                            old_value = old_data.get(field.encode())
                            if old_value:
                                pipeline.srem(f"{field}:{old_value.decode()}", record_id)
                        pipeline.execute()

                elif isinstance(binlogevent, UpdateRowsEvent):
                    print('update:', row)
                    before_values = row['before_values']
                    after_values = row['after_values']
                    record_id = before_values['UNKNOWN_COL0']
                    old_data = r.hgetall(f"record:{record_id}")
                    # Overwrite the record hash with the new values.
                    new_data = to_record(after_values)
                    pipeline.hset(f"record:{record_id}", mapping=new_data)
                    # Drop the old index entries and add the new ones.
                    for field in INDEX_FIELDS:
                        old_value = old_data.get(field.encode())
                        if old_value:
                            pipeline.srem(f"{field}:{old_value.decode()}", record_id)
                        if new_data[field]:
                            pipeline.sadd(f"{field}:{new_data[field]}", record_id)
                    pipeline.execute()

            # Persist the binlog position after each processed event so the
            # script can resume where it left off on the next run.
            with open(binlog_position_file, 'w') as f:
                json.dump({'binlog_file': stream.log_file,
                           'binlog_position': stream.log_pos}, f)
finally:
    if stream:
        stream.close()
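Once events have been replayed, each row lives in a record:<id> hash and every indexed field value owns a set of matching record ids, so lookups can be done directly in Redis. A small query sketch follows; the host and is_gather values are placeholders rather than values from the original data.

import redis

r = redis.Redis(host='xxx', port=6379, db=9, password='xxx', decode_responses=True)

# All record ids whose host equals the placeholder value.
ids = r.smembers('host:example.com')

# Intersect two field indexes to combine conditions (AND semantics).
gathered_ids = r.sinter('host:example.com', 'is_gather:1')

# Fetch the full records for the matching ids.
for record_id in ids:
    print(r.hgetall(f'record:{record_id}'))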