天晴如许

在网站数据采集和网站分析行业,让数据采集标准化和自动化;由海量日志基于hadoop分析到实时分析;提供Web Analytics工具发挥数据运营威力。

导航

SQL优化和技巧

[使用case when只进行一次全表扫描]

根据realurl进行排序,以下两个语句结果一致,但使用case when的由于只有一次全表扫描,因此更加快速:

SELECT url, pv, uv
FROM
(
  (
    SELECT /*+parallel(a 16)*/ url, pv, uv
    FROM cndss.cn_url_pv_sumdt0 PARTITION(p20101104)
    where url_id not in(11,62,103,25255782,25255781,24565843,24565842)
      and pv >= 10
      and (url like '/china.alibaba.com%'
       or url like '/upload.china.alibaba.com%'
       or url like '/redirect.china.alibaba.com%'
       or url like '/render.china.alibaba.com%')
  )
  UNION ALL  
  (
    SELECT /*+parallel(a 16)*/ replace(replace(replace(replace(b.url,'(.+)','*'),'\',''),'^',''),'$','') as url, sum(a.pv) as pv, sum(a.uv) as uv
    FROM cndss.cn_url_pv_sumdt0 PARTITION(p20101104) a, cnlog.url_dimt0 b
    where a.url_id = b.url_id
      and a.url_id in(11,62,103,25255782,25255781,24565843,24565842)
    group by replace(replace(replace(replace(b.url,'(.+)','*'),'\',''),'^',''),'$','')
  )
)
order by pv desc;

--下面是对应的优化后的sql

select real_url, sum(pv) as pv, sum(uv) as uv
from (SELECT /*+ordered use_hash(b a) parallel(a 16)*/
     CASE
       WHEN a.url_id in (11,62,103,25255782,25255781,24565843,24565842) THEN
       replace(replace(replace(replace(b.url,'(.+)','*'),'\',''),'^',''),'$','') 
       else
        a.url
     END AS real_url,
     pv,
     uv
      FROM cnlog.url_dimt0        b,
           cndss.cn_url_pv_sumdt0 PARTITION(p20101104) a
     where a.url_id = b.url_id
       and (a.url like '/china.alibaba.com%' or
           a.url like '/upload.china.alibaba.com%' or
           a.url like '/redirect.china.alibaba.com%' or
           a.url like '/render.china.alibaba.com%'))
group by real_url
having sum(pv) >= 10
order by 2 desc;

[分组统计、开窗汇总、百分比计算、字符串对齐的功能实现]

开窗汇总:

select *     
from    
(    
    select t.*,    
           row_number() over(partition by t.YYYYMMDD order by t.agent_count desc) as rw    
    FROM    
    (    
        SELECT /*+ PARALLEL(a,8) */ yyyymmdd, agent, count(session_id) as agent_count
        from cnlog.SESSION_AGENT_FATDT0
        WHERE yyyymmdd BETWEEN TO_DATE(20100801, 'yyyymmdd') AND TO_DATE(20100816, 'yyyymmdd') + 86399 / 86400
        group by yyyymmdd, agent
        order by yyyymmdd asc, count(session_id)
    ) t    
)    
where rw <= 10    
order by YYYYMMDD, rw;

分组得到oracle SQL的汇总结果:
    select g1,g2,sum(v1),count(v1) from xyq_tg group by grouping sets((g1,g2),null);

百分比计算:

    SELECT ROUND(0.1213*100,2) || '%' FROM DUAL;

字符串对齐:

    SELECT LPAD(ROUND(0.1213*100,2) || '%',7,' ') FROM DUAL;

分组统计、开窗汇总、百分比计算、字符串对齐的功能实现:

SELECT group_name, 
       COUNT(member_id) as member_count,
       LPAD(ROUND((COUNT(member_id)/AVG(all_mem_cnt))*100,2) || '%',7,' ') AS member_per, 
       SUM(pv) AS sum_pv,
       LPAD(ROUND((SUM(pv)/AVG(all_pv_cnt))*100,2) || '%',7,' ') AS pv_per
FROM
(
    SELECT
        CASE WHEN pv<=1          THEN 'a.0~1      pv'
        WHEN pv<=3    and pv>1   THEN 'b.1~3      pv'
        WHEN pv<=5    and pv>3   THEN 'c.3~5      pv'
        WHEN pv<=10   and pv>5   THEN 'd.5~10     pv'
        WHEN pv<=15   and pv>10  THEN 'e.10~15    pv'
        WHEN pv<=20   and pv>15  THEN 'f.15~20    pv'
        WHEN pv<=30   and pv>20  THEN 'g.20~30    pv'
        WHEN pv<=40   and pv>30  THEN 'h.30~40    pv'
        WHEN pv<=50   and pv>40  THEN 'i.40~50    pv'
        WHEN pv<=100  and pv>50  THEN 'j.50~100   pv'
        WHEN pv<=150  and pv>100 THEN 'k.100~150  pv'
        WHEN pv<=200  and pv>150 THEN 'l.150~200  pv'
        WHEN pv<=300  and pv>200 THEN 'm.200~300  pv'
        WHEN pv<=500  and pv>300 THEN 'n.300~500  pv'
        WHEN pv<=1000 and pv>500 THEN 'o.500~1000 pv'
        ELSE 'p.1000~    pv'     END  as group_name, 
        member_id, 
        pv,
        COUNT(member_id) OVER() AS all_mem_cnt, 
        SUM(pv) OVER() AS all_pv_cnt
    FROM
    (
        SELECT /*+parallel(a 16)*/ member_id, COUNT(1) AS pv
        FROM cnlog.path_fatdt0 PARTITION(p20101018) a
        WHERE PROFILE_ID = 20
        GROUP BY MEMBER_ID
        ORDER BY COUNT(1) DESC, member_id ASC
    )
)
GROUP BY group_name
ORDER BY group_name ASC;

[创建临时存储过程返回结果集]

执行匿名存储过程删除分区,清除历史数据:

DECLARE
  begin_time DATE := TO_DATE('20080124', 'yyyymmdd');
  end_time DATE := TO_DATE('20080130', 'yyyymmdd');
  cur_time DATE;
  
  v_string VARCHAR2(1024) := '';
  TYPE StrArray IS TABLE OF VARCHAR2(1024);
  patlist StrArray;
  dellist StrArray;
BEGIN
  patlist := StrArray(
    'analog_res_path_model',
    'analog_res_basic_model',
    'analog_tmp_querystring',
    'analog_tmp_cookieid',
    'analog_tmp_entrypage',
    'analog_tmp_refpage',
    'analog_tmp_listcookie',
    'analog_tmp_flatcookie');
  
  cur_time := begin_time;
  WHILE cur_time < end_time
  LOOP
    FOR i IN patlist.FIRST..patlist.LAST LOOP
      v_string := 'ALTER TABLE ' || patlist(i) || ' TRUNCATE PARTITION p' || TO_CHAR(TRUNC(cur_time),'yyyymmdd');
      DBMS_OUTPUT.PUT_LINE(v_string);
      EXECUTE IMMEDIATE v_string;
    END LOOP;
    
    cur_time := cur_time + 1;
  END LOOP;

  dellist := StrArray(
  'analog_tmp_cid',
  'analog_tmp_fid',
  'analog_tmp_oid',
  --'analog_tmp_roid',
  --'analog_tmp_mid',
  'analog_tmp_pid',
  --'analog_res_cg_section',
  --'analog_res_cg_subsection',
  --'analog_sys_anadate',
  'analog_res_other_topagent',
  'analog_res_page_topentrys',
  'analog_res_page_topexit',
  'analog_res_page_topgroups',
  'analog_res_page_toppages',
  'analog_res_page_topsingle',
  'analog_res_stickness_pvse',
  'analog_res_stickness_staytime',
  'analog_res_summary_countries',
  'analog_res_summary_daily',
  'analog_res_summary_hourly',
  'analog_res_visit_toprefer',
  'ANALOG_TMP_REFPAGE',
  'ANALOG_TMP_ENTRYPAGE',
  'analog_tmp_querystring',
  'ANALOG_TMP_COOKIEID',
  --'ANALOG_TMP_SIGNIN_COUNTRIES',
  'ANALOG_RES_BASIC_MODEL',
  --'ANALOG_RES_CG_PROFILE',
  'analog_res_querytrace');
  
  cur_time := begin_time;
  WHILE cur_time < end_time
  LOOP
    FOR i IN dellist.FIRST..dellist.LAST LOOP
      v_string := 'DELETE FROM ' || dellist(i) || ' WHERE report_time = ' || TO_CHAR(TRUNC(cur_time),'yyyymmdd');
      DBMS_OUTPUT.PUT_LINE(v_string);
    EXECUTE IMMEDIATE v_string;
    COMMIT;
    END LOOP;
    
    cur_time := cur_time + 1;
  END LOOP;  
END;
/

  

[使用hits进行指定关联]

使用hits进行指定关联:

SELECT  /*+ ordered PARALLEL(p,8) */ 
   q.QUERY_VALUE, count(1) as stopwords_count
FROM
  (  
    select /*+ PARALLEL(p,8) */ SESSION_ID, STEP
    from cnlog.path_fatdt0 p
    WHERE URL_ID = 46718752
    and yyyymmdd BETWEEN TO_DATE(20100818, 'yyyymmdd') AND TO_DATE(20100818, 'yyyymmdd') + 86399 / 86400
  )p, 
  CNLOG.ENTRY_LISTQUERY_FATDT0 q, 
  cnlog.query_dimt0 d
where q.yyyymmdd BETWEEN TO_DATE(20100818, 'yyyymmdd') AND TO_DATE(20100818, 'yyyymmdd') + 86399 / 86400
and   p.SESSION_ID = q.SESSION_ID
and   p.STEP = q.STEP
and   d.QUERY_NAME in ('keywords')
and   d.QUERY_NAME_ID = q.QUERY_NAME_ID
group by q.QUERY_VALUE
order by count(1) desc;

posted on 2010-11-16 13:53  天晴如许  阅读(420)  评论(0编辑  收藏  举报