基于Hive的数据分析测试流程概要
- 1. 环境搭建
根据测试的需要,我们需要将hive清洗后的数据传到本地的mysql中,这里我们选择使用sqoop进行数据的导入。
- 2. 数据导入
建表后,将目标的csv文件格式导入到hive中,这里我选择的是通过浏览器导入的,相应的建立表的sql语句如下。
create table sales_sample_20214044 (
`day_id` string comment '时间',
`sale_nbr` string comment '卖出方',
`buy_nbr` string comment '买入方',
`cnt` int comment '数量',
`round` int comment '金额')
row format delimited
fields terminated by ','
lines terminated by '\n';
- 3. 数据清洗
1)根据如下题目要求对数据进行清洗,对应的sql语句如下:
insert overwrite table sales_sample_20214044
select concat('2022-09-',day_id),sale_nbr,buy_nbr,cnt,round from sales_sample_20214044;
2)接下来根据题目具体分析
-- 1.统计每天各个机场的销售数量和销售金额。
-- 创建机场表存放数据
create table test_jichang
(
day_id string,
sale_nbr string,
cnt string,
round string
) row format delimited fields terminated by ',';
--向机场表插入数据
insert into table test_jichang
select day_id, sale_nbr, sum(cnt) as cnt, sum(round) as round
from test_sales_wash
group by sale_nbr, day_id
having sale_nbr like 'C%';
--2.统计每天各个代理商的销售数量和销售金额。
--创建代理商表存放数据
create table test_dailishang
(
day_id string,
sale_nbr string,
cnt string,
round string
) row format delimited fields terminated by ',';
--向代理商表插入数据
insert into table test_dailishang
select day_id, sale_nbr, sum(cnt) as cnt, sum(round) as round
from test_sales_wash
group by sale_nbr, day_id
having sale_nbr like 'O%';
--3.统计每天各个代理商的销售活跃度
-- 创建表存放每天代理商卖出的活跃度
create table test_dailishang_mc
(
day_id string,
sale_nbr string,
count int
) row format delimited fields terminated by ',';
--插入数据到代理商卖出活跃表
insert into table test_dailishang_mc
select day_id, sale_nbr, count(*) as count
from test_sales_wash
where sale_nbr like 'O%'
group by sale_nbr, day_id;
--创建表存放每天代理商买入的活跃度
create table test_dailishang_mr
(
day_id string,
sale_nbr string,
count int
) row format delimited fields terminated by ',';
--插入数据到代理商买入活跃表
insert into table test_dailishang_mr
select day_id, buy_nbr as sale_nbr, count(*) as count
from test_sales_wash
where buy_nbr like 'O%'
group by buy_nbr, day_id;
--创建表统计每天代理商的活跃度
create table test_dailishang_h
(
day_id string,
sale_nbr string,
sale_number int
) row format delimited fields terminated by ',';
--插入数据
insert into table test_dailishang_h
select test_dailishang_mc.day_id as day_id,
test_dailishang_mc.sale_nbr as sale_nbr,
test_dailishang_mc.count + test_dailishang_mr.count as sale_number
from test_dailishang_mc
join test_dailishang_mr on (test_dailishang_mc.sale_nbr = test_dailishang_mr.sale_nbr);
--查看结果
select *
from test_dailishang_h
where day_id = '2021-09-01'
limit 40;
--4.汇总统计 9 月 1 日到 9 月 15 日之间各个代理商的销售利润。
-- 创建表存放每天代理商卖出的数量和金额
create table test_daiout
(
day_id string,
sale_nbr string,
cnt int,
round int
) row format delimited fields terminated by ',';
--插入数据
insert into table test_daiout
select day_id, sale_nbr, sum(cnt) as cnt, sum(round) as round
from test_sales_wash
where sale_nbr like 'O%'
group by day_id, sale_nbr;
-- 创建表存放每天代理商买入的数量和金额
create table test_daiin
(
day_id string,
buy_nbr string,
cnt int,
round int
) row format delimited fields terminated by ',';
--插入数据
insert into table test_daiin
select day_id, buy_nbr, sum(cnt) as cnt, sum(round) as round
from test_sales_wash
where buy_nbr like 'O%'
group by day_id, buy_nbr;
--创建表存放每天代理商的销售利润
create table test_daili
(
day_id string,
sale_nbr string,
incnt int,
inround int,
outcnt int,
outround int,
lirun int
) row format delimited fields terminated by ',';
--插入数据
insert into table test_daili
select test_daiout.day_id as day_id,
test_daiout.sale_nbr as sale_nbr,
test_daiin.cnt as incnt,
test_daiin.round as inround,
test_daiout.cnt as outcnt,
test_daiout.round as outround,
test_daiout.round - test_daiin.round as lirun
from test_daiout
join test_daiin on (test_daiin.buy_nbr = test_daiout.sale_nbr);
- 4. 将数据导入mysql
对应的mysql的建表命令如下:
-- 创建数据库
create database test;
-- drop database test;
use test;
# create table name(
# id varchar(255),
# name varchar(255)
# )
# charset utf8 collate utf8_general_ci;
create table test_jichang
(
day_id varchar(255),
sale_nbr varchar(255),
cnt varchar(255),
round varchar(255)
);
create table test_dailishang
(
day_id varchar(255),
sale_nbr varchar(255),
cnt varchar(255),
round varchar(255)
);
create table test_dailishang_h
(
day_id varchar(255),
sale_nbr varchar(255),
Sale_number int
);
create table test_daili
(
day_id varchar(255),
sale_nbr varchar(255),
incnt int,
inround int,
outcnt int,
outround int,
lirun int
);
将hive中的数据导入到mysql的命令如下:
sqoop export \
--connect "jdbc:mysql://10.99.121.105:3306/test?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&useSSL=false" \
--username root \
--password sqlsql\
--table test_dailishang_h\
--num-mappers 1 \
--export-dir /user/hive/warehouse/test_dailishang_h \
--input-fields-terminated-by ","
- 5. 运用echars对数据进行展示
这里借鉴了一下同学们的springboot+vue的模板,我对vue的技术倒是有所学习,最近正在学习springboot,只知道它是一个框架的框架。对于echarts也是现学现用的。
- 6. 开发中的问题以及解决办法
在将清洗好的数据导入mysql时总是报错,首先是依赖的缺少,我从同学那里偷了一个jar包解决了,接着尝试发现无法连接本地的mysql,上网找了好多资料也没有发现,最后另辟蹊径,开启了mysql的远程访问权限,并将连接名换成了我的本地ip成功了。