【presto】测试各种优化建议对presto性能的提升

使用大表 join 小表对性能的提升


案例一:

原sql:

select
t3.repay_dt,
count(1),
count(case when t3.his_had_overdue=1 then a.loan_id end),
count(case when t3.his_had_overdue=1 then a.loan_id end)*1.0000/count(1) ,
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 and t3.his_overdue_days>1 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 and t3.his_overdue_days>3 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 and t3.his_overdue_days>5 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 then a.loan_id end) ) ,
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 and t3.his_overdue_days>10 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 then a.loan_id end) )
from riskn.test a
join riskn.test t2 on a.loan_id=t2.loan_id
and t2.etl_dt='2021-06-06'
join riskn.test2 t3 on t3.loan_id =t2.loan_id
and t3.etl_dt='2021-06-06' and t3.repay_per_num=1
and t3.repay_plan_type!='prepay'
and(( t3.repay_stat not in (30) and t3.source_system ='1' )or  t3.source_system='1')
where a.etl_dt='2021-06-06'
and a.biz_line='1'
and a.bank_id=1
and a.loan_dt>=date'2019-12-31'
and t3.repay_dt>=date'2021-05-31'
and t3.repay_dt<=date'2021-06-06'
and a.app_name='1'
group by 1
order by 1

资源使用情况:

在这里插入图片描述

  • 峰值内存:834MB
  • 用户内存:2.89G

执行计划:

在这里插入图片描述

优化建议:

使用大表去 join 小表
在这里插入图片描述

更改操作:

- from riskn.dws_loan_order_wide_001 a
- join riskn.dwd_order_cash_result_001 t2 on a.loan_id=t2.loan_id
 
 
+ from riskn.dwd_order_cash_result_001 t2
+ join riskn.dws_loan_order_wide_001 a on a.loan_id=t2.loan_id

优化后的sql:

select
t3.repay_dt ,
count(1) ,
count(case when t3.his_had_overdue=1 then a.loan_id end),
count(case when t3.his_had_overdue=1 then a.loan_id end)*1.0000/count(1),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 and t3.his_overdue_days>1 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 and t3.his_overdue_days>3 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 and t3.his_overdue_days>5 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 and t3.his_overdue_days>10 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 then a.loan_id end) ),
from riskn.test t2
join riskn.testa a on a.loan_id=t2.loan_id
and t2.etl_dt='2021-06-06'
join riskn.testb t3 on t3.loan_id =t2.loan_id
and t3.etl_dt='2021-06-06' and t3.repay_per_num=1
and t3.repay_plan_type!='1'
and(( t3.repay_stat not in (30) and t3.source_system ='1' )or  t3.source_system='1')
where a.etl_dt='2021-06-06'
and a.biz_line='1'
and a.bank_id=1
and a.loan_dt>=date'2019-12-31'
and t3.repay_dt>=date'2021-05-31'
and t3.repay_dt<=date'2021-06-06'
and a.app_name='1'
group by 1
order by 1

执行计划:

在这里插入图片描述

资源使用情况:

在这里插入图片描述

案例二:

在这里插入图片描述

内存使用了减少了近3倍

原sql:

select
json_extract_scalar(apply.extend_field, '$.1') as customer_type,
        try(count(distinct case when  t1.decision='1' then t1.apply_id end )*1.0000/count(distinct t1.apply_id)) innerrule_rate,
        try(count(distinct case when apply.credit_result ='pass' then apply.apply_id end )*1.0000/(count(distinct case when t2.decision is not null then t2.apply_id end )-count(distinct case when t2.decision='2' then t2.apply_id end ) ))model_rate,
        1-try(count(distinct case when t2.decision='2' then t2.apply_id end )*1.0000/count(distinct t2.apply_id)) outrule_rate,
try(count(distinct case when apply.step ='1' and apply.credit_result ='pass' then apply.apply_id end )*1.0000/
(count(distinct case when t2.decision is not null then t2.apply_id end )-count(distinct case when t2.decision='2' then t2.apply_id end ) ))model_rate,       
try(count(distinct case when apply.credit_result='pass' then apply.apply_id end )*1.0000/count(distinct apply.apply_id )) approve_rate,
        count(distinct apply.apply_id ) apply_cnt,count(distinct case when apply.credit_result='pass' then apply.apply_id end )approve_cnt--, count(apply.apply_id)
 
 
        from rsk.rreal_flink_bl_sgfq_risk_control_sgfq_credit_result_presto apply
 
         join rsk.rreal_flink_decision_engine_ruleset_run_record_presto t1
        on t1.apply_id=apply.apply_id
        and regexp_like(t1.ruleset_name,'rule' ) and t1.ruleset_name not like '%outer%'
        and t1.etl_dt>='2020-01-17' and  t1.step in ('1','2')
        and t1.ruleset_name not  like '%3%'
 
 
 
        left  join rsk.test t2
        on  apply.apply_id= t2.apply_id
        and regexp_like(t2.ruleset_name,'rule')
        and regexp_like(t2.ruleset_name,'outer')
        and t2.etl_dt>='2020-01-17' and t2.step  in ('3','2')
        and t2.ruleset_name  not  like '%1%'
        where
        apply.step in ('3','4')
        and apply.app_name='jsd'
        --and apply.etl_dt='2019-10-11'
        and apply.etl_dt='2021-06-07'
        and json_extract_scalar(apply.extend_field, '$.apply_type')!= '257'       
and format_datetime(from_unixtime(cast (substring(apply.create_time,1,10) as bigint)),'yyyy-MM-dd HH:mm:ss')>='2021-06-07 00:00:00'
 
        group by 1

结论

大表 jion 小表性能较之小表 join 大表要提升三倍。所以大表join优化项比较关键。

使用WITH


原SQL语句:

select distinct a.etl_dt,a.cost,de.hightype,de.type,a.desc as description,
 
case when a.cost>0 then 'test'
else '非投放渠道' end as pay_type
from
(select cost_dt  as etl_dt,
        sum(cost) as cost,
 
        desc 
from rdw.test
where  cost_dt>=date('2021-06-06')
and cost_dt<=date('2021-06-06')
and platform in ('4')
group by cost_dt,desc)a
left join
   (select distinct hightype,
            type,
            description,
            etl_dt
     from  rmk.hha
     where date(etl_dt)  between date('2021-06-06') and date('2021-06-06')
 
     and platform='APP'
     and app_subname in ('null','')
 
    ) as de
on trim(lower(cast(a.desc as varchar)))=trim(lower(cast(de.description as varchar)))
and date(de.etl_dt)=date(a.etl_dt)
group by a.etl_dt,a.cost,de.hightype,de.type,a.desc,case when a.cost>0 then '投放'
else '非投放' end

优化方向:

  • 使用with替换
  • join 字段避免出现函数
  • group by后面跟数字
  • 大表join小表

优化后:

with a as (select distinct date(cost_dt) as etl_dt,
        sum(cost) as cost,
        trim(lower(cast(desc as varchar))) as desc
from rdw.test
where  cost_dt>=date('2021-06-06')
and cost_dt<=date('2021-06-06')
and platform in ('4')
group by cost_dt,desc),
 
de as (select distinct hightype,
            type,
            trim(lower(cast(description as varchar))) as description,
            date(etl_dt) as etl_dt
     from  rmk.hha
     where date(etl_dt)  between date('2021-06-06') and date('2021-06-06')
     and platform='APP'
     and app_subname in ('null','')
)
 
select a.etl_dt,a.cost,de.hightype,de.type,a.desc as description,
case when a.cost>0 then '投放渠道'
else '非投放渠道' end as pay_type
from de right join a on a.desc = de.description
and de.etl_dt = a.etl_dt
group by 1,2,3,4,5,6

结论:

经过测试,对性能提升不大。

posted @ 2022-11-10 19:26  彬在俊  阅读(108)  评论(0编辑  收藏  举报