4. 数仓分层 && 用户行为数仓搭建

整体架构

技术框架

1. 配置表主题

2. 配置表层级

3. 数据仓库分层

1. 原始数据层ODS层搭建

表的位置

使用DataHub Connector将DataHub中的数据推送到MaxCompute中

创建DataConnector

数据成功加载

2. DWD层搭建

日志格式分析

自定义UDTF(解析具体事件字段)

直接在FunctionStudio中开发

1.打开FunctionStudio界面

2.新建工程




3. 分析日志格式,导入依赖,编写自定义UDTF代码

导入依赖

// 导入依赖
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28.odps</version>
</dependency>
编写UDTF代码

// 编写自定义UDTF代码
String event = (String)args[0];
JSONArray jsonArray = JSONArray.parseArray(event);
for (int i = 0; i < jsonArray.size(); i++) {
    JSONObject jsonObject = jsonArray.getJSONObject(i);
    String ett = jsonObject.getString("ett");
    String eventName = jsonObject.getString("en");
    String eventJson = jsonObject.getString("kv");

    forward(Long.parseLong(ett), eventName, eventJson);
}
打包部署

3.DWD层建表,手动提交到生产环境,放到dwd这个文件夹中

1. 手动将 ODS 层数据导入 DWD 层

insert overwrite table dwd_start_log1 PARTITION (ds, hh, mm)
SELECT 
    -- 公共字段
    -- 通过Hive内置函数get_json_object逐个提取为字段
    GET_JSON_OBJECT(log_string, '$.cm.mid') mid,
    GET_JSON_OBJECT(log_string, '$.cm.uid') user_id,
    GET_JSON_OBJECT(log_string,'$.cm.vc') version_code,
    GET_JSON_OBJECT(log_string,'$.cm.vn') version_name,
    GET_JSON_OBJECT(log_string,'$.cm.l') lang,
    GET_JSON_OBJECT(log_string,'$.cm.sr') source,
    GET_JSON_OBJECT(log_string,'$.cm.os') os,
    GET_JSON_OBJECT(log_string,'$.cm.ar') area,
    GET_JSON_OBJECT(log_string,'$.cm.md') model,
    GET_JSON_OBJECT(log_string,'$.cm.ba') brand,
    GET_JSON_OBJECT(log_string,'$.cm.sv') sdk_version,
    GET_JSON_OBJECT(log_string,'$.cm.hw') height_width,
    GET_JSON_OBJECT(log_string,'$.cm.g') email,
    GET_JSON_OBJECT(log_string,'$.cm.hw') sv,
    GET_JSON_OBJECT(log_string,'$.cm.ln') ln,
    GET_JSON_OBJECT(log_string,'$.cm.la') la,
    -- 事件字段
    GET_JSON_OBJECT(event_view.event_json, '$.entry') entry,
    GET_JSON_OBJECT(event_view.event_json, '$.loading_time') loading_time,
    GET_JSON_OBJECT(event_view.event_json, '$.action') action,
    GET_JSON_OBJECT(event_view.event_json, '$.open_ad_type') open_ad_type,
    GET_JSON_OBJECT(event_view.event_json, '$.detail') detail,
    -- 分区时间
    event_view.event_time,
    -- 分区值
    ds,
    hh,
    mm
FROM ods_base_log1
    LATERAL VIEW FlatEventUDTF(GET_JSON_OBJECT(log_string, '$.et')) event_view as event_time, event_name, event_json
where ds = '20220518' and event_view.event_name='start';

2. 查询导入的数据

select * from dwd_start_log1 where ds='20220518';

3. ods层到dwd层写一个定时任务

1. 创建一个ODPS节点

insert overwrite table dwd_start_log1 PARTITION (ds, hh, mm)
SELECT 
    -- 公共字段
    -- 通过Hive内置函数get_json_object逐个提取为字段
    GET_JSON_OBJECT(log_string, '$.cm.mid') mid,
    GET_JSON_OBJECT(log_string, '$.cm.uid') user_id,
    GET_JSON_OBJECT(log_string,'$.cm.vc') version_code,
    GET_JSON_OBJECT(log_string,'$.cm.vn') version_name,
    GET_JSON_OBJECT(log_string,'$.cm.l') lang,
    GET_JSON_OBJECT(log_string,'$.cm.sr') source,
    GET_JSON_OBJECT(log_string,'$.cm.os') os,
    GET_JSON_OBJECT(log_string,'$.cm.ar') area,
    GET_JSON_OBJECT(log_string,'$.cm.md') model,
    GET_JSON_OBJECT(log_string,'$.cm.ba') brand,
    GET_JSON_OBJECT(log_string,'$.cm.sv') sdk_version,
    GET_JSON_OBJECT(log_string,'$.cm.hw') height_width,
    GET_JSON_OBJECT(log_string,'$.cm.g') email,
    GET_JSON_OBJECT(log_string,'$.cm.hw') sv,
    GET_JSON_OBJECT(log_string,'$.cm.ln') ln,
    GET_JSON_OBJECT(log_string,'$.cm.la') la,
    -- 事件字段
    GET_JSON_OBJECT(event_view.event_json, '$.entry') entry,
    GET_JSON_OBJECT(event_view.event_json, '$.loading_time') loading_time,
    GET_JSON_OBJECT(event_view.event_json, '$.action') action,
    GET_JSON_OBJECT(event_view.event_json, '$.open_ad_type') open_ad_type,
    GET_JSON_OBJECT(event_view.event_json, '$.detail') detail,
    -- 分区时间
    event_view.event_time,
    -- 分区值
    ds,
    hh,
    mm
FROM ods_base_log1
    LATERAL VIEW FlatEventUDTF(GET_JSON_OBJECT(log_string, '$.et')) event_view as event_time, event_name, event_json
where ds = '${bizdate}' and event_view.event_name='start';

4. DWS层

1. 手动将DWD层数据导入DWS层

新建表

导入数据

// 导入数据
INSERT OVERWRITE TABLE dws_uv_detail_d_sql PARTITION (ds, hh, mm)
SELECT
    mid,
    user_id,
    version_code,
    version_name,
    lang,
    source,
    os,
    area,
    model,
    brand,
    sdk_version,
    email,
    height_width,
    network,
    lng,
    lat,
    event_time,
    ds,
    hh,
    mm
FROM
(
    SELECT
        *,
        ROW_NUMBER() OVER(PARTITION BY mid ORDER BY event_time) as rn
    from dwd_start_log1
    where ds = '20220518'
) st where rn = 1;

// 查询数据
select * from dws_uv_detail_d_sql where ds = '20220518';

2. ods层到dwd层写一个定时任务

insert overwrite table dws_uv_detail_d partition(ds,hh,mm)
select 
 mid,
 user_id,
 version_code,
 version_name,
 lang,
 source,
 os,
 area, 
 model,
 brand,
 sdk_version,
 email,
 height_width,
 network,
 lng,
 lat,
 event_time,
 ds,
 hh,
 mm
from 
( 
 select 
 *,
 ROW_NUMBER() OVER(PARTITION BY mid ORDER BY event_time 
asc) rn 
 from dwd_start_log
 where ds = '${bizdate}' 
) st where rn = 1;

5. ADS层分析业务的指标

创建表

脚本编写

insert OVERWRITE table ads_uv_source_d PARTITION 
(ds='${bizdate}')
SELECT 
 source,
 COUNT(*) ct 
from dws_uv_detail_d
where ds='${bizdate}'
group by source;

创建虚拟节点,开始执行

运行


成功将数据导入ads层

posted @ 2022-05-19 23:05  jsqup  阅读(140)  评论(0编辑  收藏  举报