Hive Notes
1. Importing the data
```sql
create table orders (
  order_id string,
  user_id string,
  eval_set string,
  order_number string,
  order_dow string,
  order_hour_of_day string,
  days_since_prior_order string
)
row format delimited fields terminated by ','
lines terminated by '\n';

load data local inpath '/home/badou/Documents/data/order_data/orders.csv' overwrite into table orders;
```
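A quick sanity check after the load (simple verification queries; note that if the CSV has a header row, it is loaded as data unless the table is created with `tblproperties('skip.header.line.count'='1')`):

```sql
-- verify the load landed as expected
select count(1) from orders;
select * from orders limit 5;
```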
2. How many orders does each user have?
```
hive> select user_id,count(1) as order_cnt from orders group by user_id order by order_cnt desc limit 10;
Total jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapred.reduce.tasks=<number>
Starting Job = job_202003192037_0003, Tracking URL = http://master:50030/jobdetails.jsp?jobid=job_202003192037_0003
Kill Command = /usr/local/src/hadoop-1.2.1/libexec/../bin/hadoop job -kill job_202003192037_0003
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2020-03-19 21:09:32,228 Stage-1 map = 0%, reduce = 0%
2020-03-19 21:09:44,551 Stage-1 map = 62%, reduce = 0%
2020-03-19 21:09:45,568 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 9.29 sec
2020-03-19 21:09:54,697 Stage-1 map = 100%, reduce = 33%, Cumulative CPU 9.29 sec
2020-03-19 21:09:57,727 Stage-1 map = 100%, reduce = 67%, Cumulative CPU 9.29 sec
2020-03-19 21:10:00,763 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 15.25 sec
MapReduce Total cumulative CPU time: 15 seconds 250 msec
Ended Job = job_202003192037_0003
Launching Job 2 out of 2
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapred.reduce.tasks=<number>
Starting Job = job_202003192037_0004, Tracking URL = http://master:50030/jobdetails.jsp?jobid=job_202003192037_0004
Kill Command = /usr/local/src/hadoop-1.2.1/libexec/../bin/hadoop job -kill job_202003192037_0004
Hadoop job information for Stage-2: number of mappers: 1; number of reducers: 1
2020-03-19 21:10:13,220 Stage-2 map = 0%, reduce = 0%
2020-03-19 21:10:23,341 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 5.42 sec
2020-03-19 21:10:32,465 Stage-2 map = 100%, reduce = 33%, Cumulative CPU 5.42 sec
2020-03-19 21:10:35,559 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 8.74 sec
MapReduce Total cumulative CPU time: 8 seconds 740 msec
Ended Job = job_202003192037_0004
MapReduce Jobs Launched:
Job 0: Map: 1  Reduce: 1  Cumulative CPU: 15.25 sec  HDFS Read: 108973054  HDFS Write: 5094362  SUCCESS
Job 1: Map: 1  Reduce: 1  Cumulative CPU: 8.74 sec   HDFS Read: 5094820    HDFS Write: 104      SUCCESS
Total MapReduce CPU Time Spent: 23 seconds 990 msec
OK
user_id order_cnt
106879  100
3377    100
183036  100
96577   100
194931  100
66482   100
109020  100
12166   100
139897  100
99805   100
Time taken: 74.499 seconds, Fetched: 10 row(s)
```

Note: the query compiles into two MapReduce jobs because of the global `order by`: Job 1 performs the `group by` aggregation, and Job 2 sorts the aggregated rows, which requires a single reducer.
3. On average, how many products does each user buy per order?
The orders table only records users and their orders; to count products per order we have to join priors or trains. trains is much smaller, but since it serves as the label data it contains only one order per user, so priors is the table to aggregate over. For debugging, a subset of priors can be used by adding a `limit`.

```sql
select ord.user_id, avg(pri.products_cnt) as avg_prod
from (select order_id, user_id from orders) ord
join (select order_id, count(1) as products_cnt
      from priors
      group by order_id) pri
on ord.order_id = pri.order_id
group by ord.user_id
limit 10;
```
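To debug on a subset as suggested, the `limit` can be pushed into the priors subquery so only part of the table feeds the aggregation (a sketch; the 10000 is an arbitrary debug size):

```sql
select ord.user_id, avg(pri.products_cnt) as avg_prod
from (select order_id, user_id from orders) ord
join (select order_id, count(1) as products_cnt
      from (select order_id from priors limit 10000) sub  -- debug subset of priors
      group by order_id) pri
on ord.order_id = pri.order_id
group by ord.user_id
limit 10;
```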
4. Distribution of each user's orders across the days of the week
```
hive> select
    >     user_id,
    >     sum(case order_dow when '0' then 1 else 0 end) as dow_0,
    >     sum(case order_dow when '1' then 1 else 0 end) as dow_1,
    >     sum(case order_dow when '2' then 1 else 0 end) as dow_2,
    >     sum(case order_dow when '3' then 1 else 0 end) as dow_3,
    >     sum(case order_dow when '4' then 1 else 0 end) as dow_4,
    >     sum(case order_dow when '5' then 1 else 0 end) as dow_5,
    >     sum(case order_dow when '6' then 1 else 0 end) as dow_6
    > from orders
    > group by user_id
    > limit 20;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1584680108277_0002, Tracking URL = http://master:8088/proxy/application_1584680108277_0002/
Kill Command = /usr/local/src/hadoop-2.6.1/bin/hadoop job -kill job_1584680108277_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2020-03-19 22:28:14,095 Stage-1 map = 0%, reduce = 0%
2020-03-19 22:28:44,411 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 19.47 sec
2020-03-19 22:28:59,770 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 22.56 sec
MapReduce Total cumulative CPU time: 22 seconds 560 msec
Ended Job = job_1584680108277_0002
MapReduce Jobs Launched:
Job 0: Map: 1  Reduce: 1  Cumulative CPU: 22.56 sec  HDFS Read: 108968864  HDFS Write: 414  SUCCESS
Total MapReduce CPU Time Spent: 22 seconds 560 msec
OK
user_id dow_0  dow_1  dow_2  dow_3  dow_4  dow_5  dow_6
1       0      3      2      2      4      0      0
10      1      0      1      2      0      2      0
100     1      1      0      2      0      2      0
1000    4      0      1      1      0      0      2
10000   15     12     10     7      9      9      11
100000  2      1      0      4      1      0      2
100001  4      15     17     13     6      9      3
100002  0      3      0      0      3      5      2
100003  0      0      0      0      0      3      1
100004  1      2      2      2      0      2      0
100005  3      5      1      2      6      1      1
100006  5      2      1      1      3      2      0
100007  0      0      1      1      2      3      0
100008  2      5      8      4      3      2      5
100009  4      3      1      0      0      1      0
10001   12     7      2      0      0      1      1
100010  3      2      0      1      1      2      3
100011  3      4      3      4      4      0      1
100012  0      23     2      1      0      0      0
100013  10     3      6      2      7      4      6
Time taken: 59.967 seconds, Fetched: 20 row(s)
```
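An equivalent way to write the pivot, using Hive's `if` instead of `case when` (my variant, same result; the remaining day columns are elided for brevity):

```sql
select
  user_id,
  sum(if(order_dow = '0', 1, 0)) as dow_0,
  sum(if(order_dow = '1', 1, 0)) as dow_1
  -- ...and likewise for dow_2 through dow_6
from orders
group by user_id
limit 20;
```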
5. Creating a managed (internal) table
```sql
-- create a managed (internal) table
create table if not exists inner_test (
  aisle_id string,
  aisle_name string
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/data/inner';
```
6. Creating an external table
```sql
create external table if not exists ext_test (
  aisle_id string,
  aisle_name string
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/data/ext';
```
Summary:
Dropping a managed (internal) table deletes both the metadata and the corresponding data on HDFS; dropping an external table deletes only the metadata, leaving the data on HDFS in place.
```
-- drop the managed (internal) table
hive> drop table inner_test;
OK
Time taken: 0.761 seconds
hive> desc inner_test;
FAILED: SemanticException [Error 10001]: Table not found inner_test

-- check the data on HDFS
[root@master simon]# hadoop fs -ls /data/inner
ls: `/data/inner': No such file or directory

-- drop the external table
hive> drop table ext_test;
OK
Time taken: 0.24 seconds

-- check the data on HDFS
[root@master simon]# hadoop fs -ls /data/ext
Found 1 items
-rw-r--r--   3 root supergroup       2603 2020-03-20 01:38 /data/ext/aisle.csv
```
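To check whether a table is managed or external before dropping it, `desc formatted` reports the table type:

```sql
desc formatted ext_test;
-- look for "Table Type: EXTERNAL_TABLE" (managed tables report MANAGED_TABLE)
```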
7. Creating a partitioned table
```sql
-- create a partitioned table
create table partition_test (
  order_id string,
  user_id string,
  eval_set string,
  order_number string,
  order_hour_of_day string,
  days_since_prior_order string
)
partitioned by (order_dow string)
row format delimited fields terminated by '\t';
```
8. Dynamically inserting into a partitioned table
```sql
-- dynamic insert into the partitioned table
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- insert into table partition_test
insert overwrite table partition_test partition (order_dow='1')
select order_id, user_id, eval_set, order_number,
       order_hour_of_day, days_since_prior_order
from orders
where order_dow='1';
```
Notes:
- Dynamic partitioning means you do not write a separate insert statement for each partition; the partition values are not fixed in advance but are taken from the data.
  `set hive.exec.dynamic.partition=true;` // enable dynamic partitioning
  `set hive.exec.dynamic.partition.mode=nonstrict;` // unrestricted (nonstrict) mode; if the mode is strict, at least one static partition is required and it must come first.
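Note that the insert in section 8 actually specifies a static partition value ('1'). A truly dynamic insert omits the value and takes it from the last column of the select list, roughly like this (a sketch under the same settings):

```sql
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table partition_test partition (order_dow)
select order_id, user_id, eval_set, order_number,
       order_hour_of_day, days_since_prior_order,
       order_dow   -- the partition column must come last in the select list
from orders;
```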
9. When querying a partitioned table, you must add a where condition on the partition column
```sql
select * from partition_test where order_dow='0' limit 10;
```
10. Viewing a table's partitions
```
hive> show partitions partition_test;
OK
order_dow=1
Time taken: 0.292 seconds, Fetched: 1 row(s)
```
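Partitions can also be managed explicitly; standard HiveQL, shown here as a sketch:

```sql
alter table partition_test add partition (order_dow='2');
alter table partition_test drop partition (order_dow='2');
```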
11. Hive optimization
1. Optimization 1: controlling the number of reducers
Priority of the settings that control the number of reducers:

1. `set mapreduce.job.reduces=<number>`
   `set mapreduce.job.reduces=5;`
   Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 15
2. `set hive.exec.reducers.max=<number>`
   `set hive.exec.reducers.max=10;`
3. `set hive.exec.reducers.bytes.per.reducer=20000;`

Note: 1 takes precedence over 2, and 2 takes precedence over 3.
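A worked sketch of how the precedence plays out (the numbers are illustrative, based on the ~108 MB HDFS Read figures seen above):

```sql
-- weakest: estimate reducers from input size / bytes per reducer
set hive.exec.reducers.bytes.per.reducer=20000;
-- ~108,000,000 bytes / 20,000 would estimate roughly 5400 reducers,
set hive.exec.reducers.max=10;
-- which this cap lowers to 10; finally an explicit count
set mapreduce.job.reduces=5;
-- overrides both and forces exactly 5 reducers.
```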
2. Optimization 2: a where condition can make the group by redundant
Map and reduce execute as one synchronized process, so filtering rows with where on the map side shrinks the data every later stage has to handle. See the illustration below.
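A minimal illustration of the point (my example, not from the original notes): when the where clause pins the grouping column to a single value, the group by no longer does anything useful.

```sql
-- the group by is redundant here: order_dow is already fixed by the where clause
select order_dow, count(1) from orders where order_dow='0' group by order_dow;
-- equivalent, and one aggregation key fewer:
select count(1) from orders where order_dow='0';
```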
3. Cases with only one reducer
No group by (a global aggregate): even with more reducers requested, the job runs with one.

```sql
set mapreduce.job.reduces=5;
-- still a single reducer: with no group by there are no keys to distribute
select count(1) from orders where order_dow='0';
```
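A related single-reducer case not listed above: a global `order by` also funnels everything through one reducer (that is what Job 2 in section 2 was doing). A hedged sketch contrasting it with `sort by`, which sorts only within each reducer:

```sql
set mapreduce.job.reduces=5;
-- total order, but one reducer regardless of the setting above:
select user_id, count(1) as cnt from orders group by user_id order by cnt desc limit 10;
-- per-reducer order only, so all 5 reducers can be used:
select user_id, count(1) as cnt from orders group by user_id sort by cnt desc limit 10;
```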
4. Data skew: hive.groupby.skewindata

The priors table used in this example:

```sql
create table priors (
  order_id string,
  product_id string,
  add_to_cart_order string,
  reordered string
)
row format delimited fields terminated by ','
lines terminated by '\n';
```
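To populate priors, a load mirroring the orders load in section 1 (the CSV path and filename here are an assumption based on the Instacart dataset layout, not from the original notes):

```sql
-- hypothetical path: adjust to wherever the priors CSV actually lives
load data local inpath '/home/badou/Documents/data/order_data/order_products__prior.csv'
overwrite into table priors;
```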
With `hive.groupby.skewindata=true`, Hive compiles the group by into two jobs: the first spreads map output randomly across reducers and pre-aggregates (so hot keys get split up), and the second aggregates the partial results by key. That is why the log below shows Total jobs = 2.

```
hive> set hive.groupby.skewindata=true;
hive> select add_to_cart_order,count(1) as cnt
    > from priors
    > group by add_to_cart_order
    > limit 10;
Total jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Defaulting to jobconf value of: 30
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1584680108277_0023, Tracking URL = http://master:8088/proxy/application_1584680108277_0023/
Kill Command = /usr/local/src/hadoop-2.6.1/bin/hadoop job -kill job_1584680108277_0023
Hadoop job information for Stage-1: number of mappers: 3; number of reducers: 30
2020-03-20 22:01:31,483 Stage-1 map = 0%, reduce = 0%
2020-03-20 22:01:57,933 Stage-1 map = 11%, reduce = 0%, Cumulative CPU 13.05 sec
2020-03-20 22:10:21,210 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 2.15 sec
2020-03-20 22:10:27,212 Stage-2 map = 100%, reduce = 2%, Cumulative CPU 3.19 sec
2020-03-20 22:10:36,357 Stage-2 map = 100%, reduce = 4%, Cumulative CPU 4.82 sec
2020-03-20 22:10:37,503 Stage-2 map = 100%, reduce = 8%, Cumulative CPU 6.68 sec
2020-03-20 22:10:44,817 Stage-2 map = 100%, reduce = 12%, Cumulative CPU 11.35 sec
2020-03-20 22:10:46,001 Stage-2 map = 100%, reduce = 16%, Cumulative CPU 13.63 sec
2020-03-20 22:10:49,626 Stage-2 map = 100%, reduce = 20%, Cumulative CPU 15.82 sec
2020-03-20 22:10:50,741 Stage-2 map = 100%, reduce = 21%, Cumulative CPU 17.57 sec
2020-03-20 22:10:53,130 Stage-2 map = 100%, reduce = 23%, Cumulative CPU 21.89 sec
2020-03-20 22:10:55,383 Stage-2 map = 100%, reduce = 24%, Cumulative CPU 24.25 sec
2020-03-20 22:10:58,752 Stage-2 map = 100%, reduce = 31%, Cumulative CPU 28.06 sec
2020-03-20 22:10:59,869 Stage-2 map = 100%, reduce = 33%, Cumulative CPU 29.03 sec
2020-03-20 22:11:04,454 Stage-2 map = 100%, reduce = 38%, Cumulative CPU 32.17 sec
2020-03-20 22:11:09,028 Stage-2 map = 100%, reduce = 40%, Cumulative CPU 35.79 sec
2020-03-20 22:11:12,517 Stage-2 map = 100%, reduce = 42%, Cumulative CPU 39.99 sec
2020-03-20 22:11:13,759 Stage-2 map = 100%, reduce = 43%, Cumulative CPU 40.96 sec
2020-03-20 22:11:16,052 Stage-2 map = 100%, reduce = 44%, Cumulative CPU 43.98 sec
2020-03-20 22:11:17,217 Stage-2 map = 100%, reduce = 46%, Cumulative CPU 46.55 sec
2020-03-20 22:11:20,840 Stage-2 map = 100%, reduce = 47%, Cumulative CPU 49.83 sec
2020-03-20 22:12:10,102 Stage-2 map = 100%, reduce = 49%, Cumulative CPU 51.48 sec
2020-03-20 22:12:11,228 Stage-2 map = 100%, reduce = 51%, Cumulative CPU 52.43 sec
2020-03-20 22:12:12,373 Stage-2 map = 100%, reduce = 53%, Cumulative CPU 53.35 sec
2020-03-20 22:12:13,482 Stage-2 map = 100%, reduce = 58%, Cumulative CPU 56.39 sec
2020-03-20 22:12:14,599 Stage-2 map = 100%, reduce = 60%, Cumulative CPU 58.19 sec
2020-03-20 22:12:22,516 Stage-2 map = 100%, reduce = 62%, Cumulative CPU 61.76 sec
2020-03-20 22:12:23,788 Stage-2 map = 100%, reduce = 64%, Cumulative CPU 64.85 sec
2020-03-20 22:12:24,935 Stage-2 map = 100%, reduce = 66%, Cumulative CPU 66.41 sec
2020-03-20 22:12:32,934 Stage-2 map = 100%, reduce = 67%, Cumulative CPU 74.37 sec
2020-03-20 22:13:02,201 Stage-2 map = 100%, reduce = 71%, Cumulative CPU 76.26 sec
2020-03-20 22:13:05,474 Stage-2 map = 100%, reduce = 84%, Cumulative CPU 82.34 sec
2020-03-20 22:13:18,612 Stage-2 map = 100%, reduce = 91%, Cumulative CPU 91.97 sec
2020-03-20 22:13:19,728 Stage-2 map = 100%, reduce = 93%, Cumulative CPU 94.49 sec
2020-03-20 22:13:21,860 Stage-2 map = 100%, reduce = 98%, Cumulative CPU 98.82 sec
2020-03-20 22:13:28,450 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 102.73 sec
MapReduce Total cumulative CPU time: 1 minutes 42 seconds 730 msec
Ended Job = job_1584680108277_0024
MapReduce Jobs Launched:
Job 0: Map: 3 Reduce: 30 Cumulative CPU: 178.2 sec HDFS Read: 577567986 HDFS Write: 11215 SUCCESS
Job 1: Map: 1 Reduce: 30 Cumulative CPU: 111.49 sec HDFS Read: 17810 HDFS Write: 1115 SUCCESS
Total MapReduce CPU Time Spent: 4 minutes 49 seconds 690 msec
OK
105 13
114 7
123 3
132 2
141 1
106 12
115 6
124 3
133 2
142 1
Time taken: 753.55 seconds, Fetched: 10 row(s)
```
A multi-table join example, attaching product_id (from trains) and reordered (from priors) to each order:

```sql
select
  ord.order_id   order_id,
  tra.product_id product_id,
  pri.reordered  reordered
from orders ord
join trains tra on ord.order_id = tra.order_id
join priors pri on ord.order_id = pri.order_id
limit 10;
```
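A possible follow-up optimization, assuming trains is small enough to fit in memory: a map-side join broadcasts the small table to the mappers and avoids a shuffle for that join. A sketch using the hint syntax of older Hive versions (newer versions can do this automatically with `set hive.auto.convert.join=true;`):

```sql
select /*+ MAPJOIN(tra) */
  ord.order_id   order_id,
  tra.product_id product_id,
  pri.reordered  reordered
from orders ord
join trains tra on ord.order_id = tra.order_id   -- trains is broadcast to the mappers
join priors pri on ord.order_id = pri.order_id   -- the priors join still shuffles
limit 10;
```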