使用presto 进行跨库数据对比
需求:
- 在dolphinscheduler上通过sqoop脚本将源库mysql数据同步至doris ,现需要对比 doris 和 源库mysql 表数据是否一致 。
对比粒度:
- 明细+汇总
实现:
- 使用 information_schema.columns 获取相关的表所有字段 (如果是mysql和hive进行对比需要因DDL可能有差异 如 decimal(12,4) 类型,double ,datetime/timestamp(x) 需要根据字段类型做处理 )
- 是map将字段名和值进行组合
- 使用唯一键进行关联
- 使用transform函数将数据转为 {kay:[a.value,b.value]} 格式 (参考脚本2)
- 使用filter函数进行过滤 ,a.value<>b.value (参考脚本2)
- ...
知识点:
- map
- array
- transform
- json
- 官网function:https://trino.io/docs/current/functions/array.html
遗留问题:
- mysql有写字段是 json 类型的,在json 转 varchar 的时候 会出现异常 ,尝试使用 json_format(json null )/json_format(json CAST('{}' AS json)) / json_format(json coalesce(....)) 都无法解决,最终没有对比json类型的字段 使用这种 cast( json_format(json '{"":""}') as varchar)方式屏蔽json类型的字段 。
脚本1
1 SELECT 2 concat( 3 'select a.* from (' 4 ,concat(' ' ,array_join (TRANSFORM(table_catalog_schema ,_->concat('select id, map(array[' ,column_name_str ,'],split(' ,array_join(TRANSFORM(column_name_list ,_->concat('coalesce(cast(' ,CASE WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'decimal%' OR element_at(column_type_list,array_position(column_name_list,_)) ='double' THEN concat('REGEXP_REPLACE(cast(',_,' as varchar),','''([0]+$)''',',','''''',')') WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'timestam%' OR element_at(column_type_list,array_position(column_name_list,_)) LIKE 'datetime' THEN concat('substring( cast(',_,' as varchar) ,1,19)') WHEN element_at(column_type_list,array_position(column_name_list,_)) = 'json' THEN concat('json_format(json ','''{"":""}''',')') ELSE _ END ,' as varchar)',',',chr(39),'NULL',chr(39),')')),concat('||','''^''','||')) ,' ,''^''))' ,' from ',_,'.',table_name ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ' ,'date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY )' ,' and ' ,'date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,CASE WHEN array_position(table_catalog_schema,_)<> dcnt THEN ' union all ' ELSE ' ' END )),' ')) 5 ,') a left join (' 6 ,concat(' ',concat('select id, map(array[',column_name_str ,'],split(' ,array_join(TRANSFORM(column_name_list ,_->concat('coalesce(cast(' ,CASE WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'decimal%' OR element_at(column_type_list,array_position(column_name_list,_)) ='double' THEN concat('REGEXP_REPLACE(cast(',_,' as varchar),','''([0]+$)''',',','''''',')') WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'timestam%' OR element_at(column_type_list,array_position(column_name_list,_)) LIKE 'datetime' THEN concat('substring( cast(',_,' as varchar) ,1,19)') WHEN element_at(column_type_list,array_position(column_name_list,_)) = 'json' THEN concat('json_format(json ','''{"":""}''',')') ELSE _ END ,' as varchar)',',',chr(39),'NULL',chr(39),')')),concat('||','''^''','||')) ,' ,''^''))' ,' from doris.',doris_tabName ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ','date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY )',' and ','date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,' and msg_is_valid =1 ' )) 7 ,') b on a.id=b.id WHERE b.id IS NULL ' 8 ) detal_SQL -- 根据明细对比的SQL 9 ,concat( 10 'select a.*,b.*,a.cnt-b.cnt from (' 11 ,array_join (TRANSFORM(table_catalog_schema,_->concat('select ''',substr(_,strpos(_,'.')+1)||'.'||table_name,''' cate_log_schema, count(1) cnt from ',_,'.',table_name ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ','date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY )',' and ','date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,CASE WHEN array_position(table_catalog_schema,_)<> dcnt THEN ' group by 1 union all ' ELSE ' group by 1' END )),' ') 12 ,') a left join ( ' 13 ,concat('select concat(msg_source_db,''.'',msg_source_table) cate_log_schema',', count(1) cnt from doris.',doris_tabName ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ','date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY )',' and ','date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,' and msg_is_valid =1 group by 1' ) 14 ,') b on a.cate_log_schema=b.cate_log_schema ' 15 ) agg_Sql_Str -- 根据汇总数据对比的Sql 16 FROM ( 17 SELECT table_name 18 ,max(doris_tabName) doris_tabName 19 --,array_agg(DISTINCT doris_tabName) doris_tabName 20 ,array_agg(DISTINCT concat(table_catalog,'.',table_schema)) table_catalog_schema 21 ,count(DISTINCT concat(table_catalog,'.',table_schema)) dcnt 22 ,max(column_name_list) column_name_list 23 ,max(column_name_str) column_name_str 24 ,max(columnCnt) columnCnt 25 ,max(column_type_list) column_type_list 26 FROM ( 27 SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list FROM oms4_som01.information_schema.columns WHERE table_schema NOT IN ('information_schema','sys','pt','maxwell','db_oms4_som998','db_ecs_oms4_som_nike_prod') GROUP BY 1,2,3 28 UNION ALL SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list FROM oms4_som02.information_schema.columns WHERE table_schema NOT IN ('information_schema','sys','pt','maxwell','db_oms4_som999') GROUP BY 1,2,3 29 UNION ALL SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list FROM oms4_baseinfo.information_schema.columns WHERE table_schema NOT IN ('db_sec_compare_prod') GROUP BY 1,2,3 30 UNION ALL SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list FROM oms4_finance.information_schema.columns WHERE table_schema NOT IN ('db_sec_compare_prod') GROUP BY 1,2,3 31 )t INNER JOIN ( 32 SELECT * 33 FROM (VALUES -- (doris表名, 源库表明) 34 ('db_ods.oms4_db_base_info_bi_brand','bi_brand'), 35 ('db_ods.oms4_db_base_info_bi_sales_entity_brand_ref','bi_sales_entity_brand_ref'), 36 ('db_ods.oms4_db_base_info_bi_sys_goods_tax_rate','bi_sys_goods_tax_rate'), 37 ('db_ods.oms4_rf_refund','rf_refund'), 38 ('db_ods.oms4_db_oms4_som_trade_order_guide_info','trade_order_guide_info'), 39 ('db_ods.oms4_db_oms4_som_trade_order_item','trade_order_item'), 40 ('db_ods.oms4_db_oms4_som_trade_order_member_info','trade_order_member_info'), 41 ('db_ods.oms4_db_oms4_som_trade_order_payment_info','trade_order_payment_info'), 42 ('db_ods.oms4_db_oms4_som_trade_return_order','trade_return_order') 43 --('db_ods.oms4_db_oms4_som_trade_order_guide_info','trade_order_guide_info'), 44 --('db_ods.oms4_db_oms4_som_trade_order_delivery_info','trade_order_delivery_info'), 45 --('db_ods.oms4_db_oms4_som_so_order_timing_promise','so_order_timing_promise') 46 ) AS tab (doris_tabName,src_db_tabName) 47 ) s ON t.table_name=s.src_db_tabName 48 GROUP BY table_name 49 ) tt
脚本2
1 SELECT hiveTableName, 2 concat('select t.* from ( SELECT a.*,b.*, map_filter(transform_values(a.datas,(k,v)->b.datas[k]||array[v]),(k,v)->v[1]<>v[2]) diff_fields from (' 3 ,concat('select ''dmztc'' ds, ''',hiveTableName,''' src_tname , cast(id as varchar) id, map( array[', src_column_name_char,']' ,' , split(',format_column_name2,',','''^''',')) as datas FROM ',catalogs,SrcTableName ,' where ',concat('substring(CAST(',increnmentField,' AS varchar),1,10)'),'=',chr(39),CAST(current_date -INTERVAL '1' day AS varchar),chr(39)) 4 ,') a left join (' 5 ,concat('select ''hive'' ds,''',hiveTableName,''' hive_tname ,id,map( array[', src_column_name_char,']' ,' , split(',format_column_name2,',','''^''',')) as datas FROM hive.' ,hiveTableName , ' where pt= ',chr(39),cast( current_date -INTERVAL '1' DAY AS varchar),chr(39) ,'and ',concat('substring(CAST(',increnmentField,' AS varchar),1,10)'),'=',chr(39),CAST(current_date -INTERVAL '1' day AS varchar),chr(39)) 6 ,') b on a.id=b.id ) t order by CARDINALITY(diff_fields) desc ') 7 FROM ( 8 SELECT concat(table_catalog,'.',table_schema,'.') catalogs ,table_name SrcTableName, max(hiveTableName) hiveTableName ,max(increnmentField) increnmentField 9 ,array_agg(column_name) src_column_name_array 10 ,array_join(array_agg(concat(chr(39),column_name,chr(39))),',') src_column_name_char 11 ,ARRAY_join(array_agg(concat('coalesce(cast(' 12 ,CASE WHEN data_type LIKE 'decimal%' OR data_type ='double' THEN concat('REGEXP_REPLACE(cast(',column_name,' as varchar),','''([0]+$)''',',','''''',')') -- 处理 13 WHEN data_type LIKE 'timestam%' OR data_type LIKE 'datetime' THEN concat('substring( cast(',column_name,' as varchar) ,1,19)') ELSE column_name END 14 ,' as varchar)',',',chr(39),'NULL',chr(39),')')),concat('||','''^''','||')) format_column_name2 15 FROM ( 16 SELECT table_catalog ,table_schema ,table_name ,column_name ,data_type FROM dmztc.information_schema.columns i 17 UNION ALL 18 SELECT table_catalog ,table_schema ,table_name ,column_name ,data_type FROM dm_tencent_ad.information_schema.columns i 19 ) i 20 INNER JOIN ( 21 SELECT * FROM (VALUES -- (db_ods.或者db_working表明,源库表明,增量更新字段) 22 --('db_ods.dm_db_tencent_ad_prod_mp_adgroup_his_effect', 'mp_adgroup_his_effect', 'last_modify_time'), 23 --('db_ods.dm_db_tencent_ad_prod_tencent_account_hour_his_effect_industry', 'tencent_account_hour_his_effect_industry', 'last_modify_time'), 24 --('db_ods.dm_db_tencent_ad_prod_tencent_account_hour_his_effect_interaction', 'tencent_account_hour_his_effect_interaction', 'last_modify_time'), 25 --('db_ods.dm_db_tencent_ad_prod_tencent_ad_hour_his_effect_basic', 'tencent_ad_hour_his_effect_basic', 'last_modify_time'), 26 --('db_ods.dm_db_tencent_ad_prod_tencent_ad_hour_his_effect_industry', 'tencent_ad_hour_his_effect_industry', 'last_modify_time') 27 ('db_working.dm_db_tencent_ad_prod_mp_ad_his_effect_tmp', 'mp_ad_his_effect', 'last_modify_time'), 28 ('db_working.dm_db_tencent_ad_prod_mp_adertiser_his_effect_tmp', 'mp_adertiser_his_effect', 'last_modify_time'), 29 ('db_working.dm_db_tencent_ad_prod_tencent_account_his_effect_industry_tmp', 'tencent_account_his_effect_industry', 'last_modify_time'), 30 ('db_working.dm_db_tencent_ad_prod_tencent_account_his_effect_interaction_tmp', 'tencent_account_his_effect_interaction', 'last_modify_time'), 31 ('db_working.dm_db_tencent_ad_prod_tencent_ad_his_effect_basic_tmp', 'tencent_ad_his_effect_basic', 'last_modify_time'), 32 ('db_working.dm_db_tencent_ad_prod_tencent_ad_his_effect_industry_tmp', 'tencent_ad_his_effect_industry', 'last_modify_time'), 33 ('db_working.dm_db_tencent_ad_prod_tencent_ad_hour_his_effect_interaction_tmp', 'tencent_ad_hour_his_effect_interaction', 'last_modify_time'), 34 ('db_working.dm_db_tencent_ad_prod_tencent_advertising_his_effect_basic_tmp', 'tencent_advertising_his_effect_basic', 'last_modify_time'), 35 ('db_working.dm_db_tencent_ad_prod_tencent_advertising_hour_his_effect_industry_tmp', 'tencent_advertising_hour_his_effect_industry', 'last_modify_time'), 36 ('db_working.dm_db_tencent_ad_prod_tencent_advertising_hour_his_effect_interaction_tmp', 'tencent_advertising_hour_his_effect_interaction', 'last_modify_time'), 37 ('db_working.dm_db_tencent_ad_prod_tencent_campaign_hour_his_effect_basic_tmp', 'tencent_campaign_hour_his_effect_basic', 'last_modify_time'), 38 ('db_working.dm_db_tencent_ad_prod_tencent_campaign_hour_his_effect_industry_tmp', 'tencent_campaign_hour_his_effect_industry', 'last_modify_time') 39 40 ) AS config (hiveTableName,SrcTableName,increnmentField) 41 ) c ON i.table_name =c.SrcTableName 42 GROUP BY 1 ,2 43 )