11.1 DWS层
目标:统计当日(dau)、当周、当月活动的每个设备明细
11.1.1 每日活跃设备明细
1)建表语句
drop table if exists dws_uv_detail_day;
create external table dws_uv_detail_day
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day';
2)数据导入
以用户单日访问为key进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_day
partition(dt='2020-10-14')
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang))lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat
from dwd_start_log
where dt='2020-10-14'
group by mid_id;
3)查询导入结果
select * from dws_uv_detail_day limit 1;
select count(*) from dws_uv_detail_day;
4)思考
不同渠道来源的每日活跃数统计怎么计算?
11.1.2 每周活跃设备明细
根据日用户访问明细,获得周用户访问明细。
1)建表语句
drop table if exists dws_uv_detail_wk;
create external table dws_uv_detail_wk(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`monday_date` string COMMENT '周一日期',
`sunday_date` string COMMENT '周日日期'
) COMMENT '活跃用户按周明细'
PARTITIONED BY (`wk_dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/';
2)数据导入
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_wk partition(wk_dt)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_add(next_day('2020-10-14','MO'),-7),
date_add(next_day('2020-10-14','MO'),-1),
concat(date_add( next_day('2020-10-14','MO'),-7), '_' , date_add(next_day('2020-10-14','MO'),-1)
)
from dws_uv_detail_day
where dt>=date_add(next_day('2020-10-14','MO'),-7) and dt<=date_add(next_day('2020-10-14','MO'),-1)
group by mid_id;
3)查询导入结果
select * from dws_uv_detail_wk limit 1;
select count(*) from dws_uv_detail_wk;
11.1.3 每月活跃设备明细
1)建表语句
drop table if exists dws_uv_detail_mn;
create external table dws_uv_detail_mn(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT '活跃用户按月明细'
PARTITIONED BY (`mn` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/';
2)数据导入
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_mn partition(mn)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_format('2020-10-14','yyyy-MM')
from dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('2020-10-14','yyyy-MM')
group by mid_id;
3)查询导入结果
select * from dws_uv_detail_mn limit 1;
select count(*) from dws_uv_detail_mn ;
11.1.4 DWS层加载数据脚本
1)创建脚本
[kgg@hadoop102 bin]$ vim dws_uv_log.sh
在脚本中编写如下内容
2)增加脚本执行权限
chmod 777 dws_uv_log.sh
3)脚本使用
dws_uv_log.sh 2019-02-11
4)查询结果
select count(*) from dws_uv_detail_day where dt='2019-02-11';
select count(*) from dws_uv_detail_wk;
select count(*) from dws_uv_detail_mn ;
5)脚本执行时间
企业开发中一般在每日凌晨30分~1点
11.2 ADS层
目标:当日、当周、当月活跃设备数
11.2.1 活跃设备数
1)建表语句
drop table if exists ads_uv_count;
create external table ads_uv_count(
`dt` string COMMENT '统计日期',
`day_count` bigint COMMENT '当日用户数量',
`wk_count` bigint COMMENT '当周用户数量',
`mn_count` bigint COMMENT '当月用户数量',
`is_weekend` string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
`is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
) COMMENT '活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/';
2)导入数据
insert into table ads_uv_count
select
'2020-10-14' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('2020-10-14','MO'),-1)='2020-10-14','Y','N') ,
if(last_day('2020-10-14')='2020-10-14','Y','N')
from
(
select
'2020-10-14' dt,
count(*) ct
from dws_uv_detail_day
where dt='2020-10-14'
)daycount join
(
select
'2020-10-14' dt,
count (*) ct
from dws_uv_detail_wk
where wk_dt=concat(date_add(next_day('2020-10-14','MO'),-7),'_' ,date_add(next_day('2020-10-14','MO'),-1) )
) wkcount on daycount.dt=wkcount.dt
join
(
select
'2020-10-14' dt,
count (*) ct
from dws_uv_detail_mn
where mn=date_format('2020-10-14','yyyy-MM')
)mncount on daycount.dt=mncount.dt;
3)查询导入结果
select * from ads_uv_count ;
11.2.2 ADS层加载数据脚本
1)创建脚本
[kgg@hadoop102 bin]$ vim ads_uv_log.sh
在脚本中编写如下内容
2)增加脚本执行权限
chmod 777 ads_uv_log.sh
3)脚本使用
ads_uv_log.sh 2019-02-11
4)脚本执行时间
企业开发中一般在每日凌晨30分~1点
5)查询导入结果
select * from ads_uv_count;
本文来自博客园,作者:大码王,转载请注明原文链接:https://www.cnblogs.com/huanghanyu/