Fork me on GitHub

Hive之Order,Sort,Cluster and Distribute By

  • 测试数据
create table sort_test(
  id int,
  name string
)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

[root@wadeyu ~]# cat sort_test.log
4679	aaa
4728	aaa
3040	aaa
4207	aaa
2231	aaa
1279	aaa
7954	aaa
582	aaa
7096	aaa
4878	aaa
9684	aaa
1540	aaa
4826	aaa
2543	aaa
2323	aaa
1420	aaa
5083	aaa
8965	aaa
1391	aaa
9719	aaa
9901	aaa
2393	aaa
6024	aaa
444	aaa
1574	aaa
8881	aaa
5739	aaa
8689	aaa
1614	aaa
9340	aaa
6726	aaa
109	aaa
6941	aaa
9562	aaa
9019	aaa
4945	aaa
2206	aaa
5910	aaa
8552	aaa
1795	aaa
2720	aaa
9007	aaa
8377	aaa
2179	aaa
3683	aaa
5869	aaa
5448	aaa
5223	aaa
5127	aaa
4616	aaa
2340	aaa
1268	aaa
4332	aaa
2989	aaa
19	aaa
7880	aaa
505	aaa
5975	aaa
5288	aaa
5682	aaa
376	aaa
7502	aaa
6448	aaa
3774	aaa
5541	aaa
9636	aaa
2037	aaa
246	aaa
6151	aaa
7837	aaa
1506	aaa
3749	aaa
9335	aaa
3973	aaa
5160	aaa
7929	aaa
834	aaa
3451	aaa
1766	aaa
6228	aaa
8961	aaa
8177	aaa
2340	aaa
4245	aaa
3226	aaa
2670	aaa
784	aaa
7699	aaa
2054	aaa
6006	aaa
4204	aaa
8905	aaa
6182	aaa
1271	aaa
5415	aaa
5164	aaa
4320	aaa
3736	aaa
2287	aaa
6559	aaa

  • Order By
    • Job中只会启动一个reduce做全局排序,数据量大时,耗时会很久
    • 在strict模式(hive.mapred.mode=strict)下,必须添加limit语句限制返回条数
# 语法格式
colOrder: ( ASC | DESC )
colNullOrder: (NULLS FIRST | NULLS LAST)           -- (Note: Available in Hive 2.1.0 and later)
orderBy: ORDER BY colName colOrder? colNullOrder? (',' colName colOrder? colNullOrder?)*
query: SELECT expression (',' expression)* FROM src orderBy

# 排序
select * from sort_test order by id desc;

+---------------+-----------------+--+
| sort_test.id  | sort_test.name  |
+---------------+-----------------+--+
| 9901          | aaa             |
| 9719          | aaa             |
| 9684          | aaa             |
| 9636          | aaa             |
| 9562          | aaa             |
| 9340          | aaa             |
| 9335          | aaa             |
| 9019          | aaa             |
| 9007          | aaa             |
| 8965          | aaa             |
| 8961          | aaa             |
| 8905          | aaa             |
| 8881          | aaa             |
| 8689          | aaa             |
| 8552          | aaa             |
| 8377          | aaa             |
| 8177          | aaa             |
| 7954          | aaa             |
| 7929          | aaa             |
| 7880          | aaa             |
| 7837          | aaa             |
| 7699          | aaa             |
| 7502          | aaa             |
| 7096          | aaa             |
| 6941          | aaa             |
| 6726          | aaa             |
| 6559          | aaa             |
| 6448          | aaa             |
| 6228          | aaa             |
| 6182          | aaa             |
| 6151          | aaa             |
| 6024          | aaa             |
| 6006          | aaa             |
| 5975          | aaa             |
| 5910          | aaa             |
| 5869          | aaa             |
| 5739          | aaa             |
| 5682          | aaa             |
| 5541          | aaa             |
| 5448          | aaa             |
| 5415          | aaa             |
| 5288          | aaa             |
| 5223          | aaa             |
| 5164          | aaa             |
| 5160          | aaa             |
| 5127          | aaa             |
| 5083          | aaa             |
| 4945          | aaa             |
| 4878          | aaa             |
| 4826          | aaa             |
| 4728          | aaa             |
| 4679          | aaa             |
| 4616          | aaa             |
| 4332          | aaa             |
| 4320          | aaa             |
| 4245          | aaa             |
| 4207          | aaa             |
| 4204          | aaa             |
| 3973          | aaa             |
| 3774          | aaa             |
| 3749          | aaa             |
| 3736          | aaa             |
| 3683          | aaa             |
| 3451          | aaa             |
| 3226          | aaa             |
| 3040          | aaa             |
| 2989          | aaa             |
| 2720          | aaa             |
| 2670          | aaa             |
| 2543          | aaa             |
| 2393          | aaa             |
| 2340          | aaa             |
| 2340          | aaa             |
| 2323          | aaa             |
| 2287          | aaa             |
| 2231          | aaa             |
| 2206          | aaa             |
| 2179          | aaa             |
| 2054          | aaa             |
| 2037          | aaa             |
| 1795          | aaa             |
| 1766          | aaa             |
| 1614          | aaa             |
| 1574          | aaa             |
| 1540          | aaa             |
| 1506          | aaa             |
| 1420          | aaa             |
| 1391          | aaa             |
| 1279          | aaa             |
| 1271          | aaa             |
| 1268          | aaa             |
| 834           | aaa             |
| 784           | aaa             |
| 582           | aaa             |
| 505           | aaa             |
| 444           | aaa             |
| 376           | aaa             |
| 246           | aaa             |
| 109           | aaa             |
| 19            | aaa             |
+---------------+-----------------+--+

  • Sort By
    • 排序前会根据排序字段分区,一个job启动多个reduce进行局部排序
    • 如果有limit语句,会再次启动一个job,取出每个局部排好序的前n条,再进行全局排序
    • 只保证局部有序,不保证全局有序
# Sort By语法
colOrder: ( ASC | DESC )
sortBy: SORT BY colName colOrder? (',' colName colOrder?)*
query: SELECT expression (',' expression)* FROM src sortBy

# 设置开启的reduce个数
set mapreduce.job.reduces=2;

0: jdbc:hive2://> set mapreduce.job.reduces;
+--------------------------+--+
|           set            |
+--------------------------+--+
| mapreduce.job.reduces=2  |
+--------------------------+--+

# 执行局部排序(未带limit)
0: jdbc:hive2://> select * from sort_test sort by id desc;
+---------------+-----------------+--+
| sort_test.id  | sort_test.name  |
+---------------+-----------------+--+
| 9901          | aaa             |
| 9684          | aaa             |
| 9340          | aaa             |
| 9019          | aaa             |
| 9007          | aaa             |
| 8965          | aaa             |
| 8961          | aaa             |
| 8689          | aaa             |
| 8552          | aaa             |
| 8177          | aaa             |
| 7837          | aaa             |
| 7699          | aaa             |
| 7502          | aaa             |
| 6559          | aaa             |
| 6448          | aaa             |
| 6228          | aaa             |
| 6024          | aaa             |
| 6006          | aaa             |
| 5975          | aaa             |
| 5910          | aaa             |
| 5869          | aaa             |
| 5739          | aaa             |
| 5682          | aaa             |
| 5541          | aaa             |
| 5448          | aaa             |
| 5415          | aaa             |
| 5288          | aaa             |
| 5164          | aaa             |
| 5160          | aaa             |
| 5083          | aaa             |
| 4878          | aaa             |
| 4826          | aaa             |
| 4679          | aaa             |
| 4616          | aaa             |
| 4245          | aaa             |
| 4207          | aaa             |
| 3736          | aaa             |
| 3451          | aaa             |
| 3226          | aaa             |
| 3040          | aaa             |
| 2989          | aaa             |
| 2720          | aaa             |
| 2670          | aaa             |
| 2340          | aaa             |
| 2231          | aaa             |
| 2206          | aaa             |
| 2054          | aaa             |
| 2037          | aaa             |
| 1766          | aaa             |
| 1614          | aaa             |
| 1540          | aaa             |
| 1506          | aaa             |
| 1420          | aaa             |
| 1268          | aaa             |
| 834           | aaa             |
| 784           | aaa             |
| 582           | aaa             |
| 444           | aaa             |
| 376           | aaa             |
| 246           | aaa             |
| 19            | aaa             |
| 9719          | aaa             |
| 9636          | aaa             |
| 9562          | aaa             |
| 9335          | aaa             |
| 8905          | aaa             |
| 8881          | aaa             |
| 8377          | aaa             |
| 7954          | aaa             |
| 7929          | aaa             |
| 7880          | aaa             |
| 7096          | aaa             |
| 6941          | aaa             |
| 6726          | aaa             |
| 6182          | aaa             |
| 6151          | aaa             |
| 5223          | aaa             |
| 5127          | aaa             |
| 4945          | aaa             |
| 4728          | aaa             |
| 4332          | aaa             |
| 4320          | aaa             |
| 4204          | aaa             |
| 3973          | aaa             |
| 3774          | aaa             |
| 3749          | aaa             |
| 3683          | aaa             |
| 2543          | aaa             |
| 2393          | aaa             |
| 2340          | aaa             |
| 2323          | aaa             |
| 2287          | aaa             |
| 2179          | aaa             |
| 1795          | aaa             |
| 1574          | aaa             |
| 1391          | aaa             |
| 1279          | aaa             |
| 1271          | aaa             |
| 505           | aaa             |
| 109           | aaa             |
+---------------+-----------------+--+

# 带limit排序(会额外再启动一个job进行全局排序)
0: jdbc:hive2://> select * from sort_test sort by id desc limit 300;
+---------------+-----------------+--+
| sort_test.id  | sort_test.name  |
+---------------+-----------------+--+
| 9901          | aaa             |
| 9719          | aaa             |
| 9684          | aaa             |
| 9636          | aaa             |
| 9562          | aaa             |
| 9340          | aaa             |
| 9335          | aaa             |
| 9019          | aaa             |
| 9007          | aaa             |
| 8965          | aaa             |
| 8961          | aaa             |
| 8905          | aaa             |
| 8881          | aaa             |
| 8689          | aaa             |
| 8552          | aaa             |
| 8377          | aaa             |
| 8177          | aaa             |
| 7954          | aaa             |
| 7929          | aaa             |
| 7880          | aaa             |
| 7837          | aaa             |
| 7699          | aaa             |
| 7502          | aaa             |
| 7096          | aaa             |
| 6941          | aaa             |
| 6726          | aaa             |
| 6559          | aaa             |
| 6448          | aaa             |
| 6228          | aaa             |
| 6182          | aaa             |
| 6151          | aaa             |
| 6024          | aaa             |
| 6006          | aaa             |
| 5975          | aaa             |
| 5910          | aaa             |
| 5869          | aaa             |
| 5739          | aaa             |
| 5682          | aaa             |
| 5541          | aaa             |
| 5448          | aaa             |
| 5415          | aaa             |
| 5288          | aaa             |
| 5223          | aaa             |
| 5164          | aaa             |
| 5160          | aaa             |
| 5127          | aaa             |
| 5083          | aaa             |
| 4945          | aaa             |
| 4878          | aaa             |
| 4826          | aaa             |
| 4728          | aaa             |
| 4679          | aaa             |
| 4616          | aaa             |
| 4332          | aaa             |
| 4320          | aaa             |
| 4245          | aaa             |
| 4207          | aaa             |
| 4204          | aaa             |
| 3973          | aaa             |
| 3774          | aaa             |
| 3749          | aaa             |
| 3736          | aaa             |
| 3683          | aaa             |
| 3451          | aaa             |
| 3226          | aaa             |
| 3040          | aaa             |
| 2989          | aaa             |
| 2720          | aaa             |
| 2670          | aaa             |
| 2543          | aaa             |
| 2393          | aaa             |
| 2340          | aaa             |
| 2340          | aaa             |
| 2323          | aaa             |
| 2287          | aaa             |
| 2231          | aaa             |
| 2206          | aaa             |
| 2179          | aaa             |
| 2054          | aaa             |
| 2037          | aaa             |
| 1795          | aaa             |
| 1766          | aaa             |
| 1614          | aaa             |
| 1574          | aaa             |
| 1540          | aaa             |
| 1506          | aaa             |
| 1420          | aaa             |
| 1391          | aaa             |
| 1279          | aaa             |
| 1271          | aaa             |
| 1268          | aaa             |
| 834           | aaa             |
| 784           | aaa             |
| 582           | aaa             |
| 505           | aaa             |
| 444           | aaa             |
| 376           | aaa             |
| 246           | aaa             |
| 109           | aaa             |
| 19            | aaa             |
+---------------+-----------------+--+

  • Order By 和 Sort By区别
    • Order By全局排序,Sort By局部排序
    • 取TopN时,Sort By 比 Order By效率更高
  • Distribute By
    • 查询语句对指定字段分组
    • 通常结合Sort By语句使用,比如同一个地区,不同商家排序,就需要用到这个
  • Cluster By
    • 分组且排序,等价于 Distribute By 和 Sort By 的结合
-- 使用示例
SELECT col1, col2 FROM t1 CLUSTER BY col1

SELECT col1, col2 FROM t1 DISTRIBUTE BY col1
 
SELECT col1, col2 FROM t1 DISTRIBUTE BY col1 SORT BY col1 ASC, col2 DESC

参考资料

【0】Hive wiki - LanguageManual SortBy

posted @ 2018-10-18 10:17  huan&ping  阅读(266)  评论(0编辑  收藏  举报