环境要求
java-1.8
maven-3.6.3
hadoop-2.x.x
下载安装包
[root@basecoalmine source]# cd /opt/source/ [root@basecoalmine source]# wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz [root@basecoalmine source]# wget https://github.com/twitter/hadoop-lzo/archive/master.zip
创建安装目录
[root@basecoalmine source]# mkdir /opt/software/lzo-2.10
解压master
[root@basecoalmine source]# unzip master.zip [root@basecoalmine source]# mv hadoop-lzo-master/ ../software/
修改配置
修改hadoop-lzo-master的pom文件中的hadoop版本,换成2.6.0。
[root@basecoalmine source]# vim /opt/software/hadoop-lzo-master/pom.xml
<hadoop.current.version>2.6.0</hadoop.current.version>
编译安装lzo-2.10
[root@basecoalmine source]# cd /opt/source/ [root@basecoalmine source]# tar -zxvf lzo-2.10.tar.gz [root@basecoalmine source]# cd lzo-2.10/ [root@basecoalmine lzo-2.10]# ./configure --prefix=/opt/software/lzo-2.10/ [root@basecoalmine lzo-2.10]# make [root@basecoalmine lzo-2.10]# make install
配置环境变量
[root@basecoalmine lzo-2.10]# vim /etc/profile export C_INCLUDE_PATH=/opt/software/lzo-2.10/include export LIBRARY_PATH=/opt/software/lzo-2.10/lib [root@basecoalmine lzo-2.10]# source /etc/profile
编译安装hadoop-lzo
[root@basecoalmine lzo-2.10]# cd /opt/software/hadoop-lzo-master/ [root@basecoalmine hadoop-lzo-master]# mvn package -Dmaven.test.skip=true
hadoop配置压缩
hadoop-lzo-master编译完成之后,在/opt/software/hadoop-lzo-master/target会生成一个jar包hadoop-lzo-0.4.21-SNAPSHOT.jar,将其拷贝到/opt/software/hadoop-2.6.0-cdh5.16.2/share/hadoop/common目录下。
[root@basecoalmine source]# cp /opt/software/hadoop-lzo-master/target/hadoop-lzo-0.4.21-SNAPSHOT.jar /opt/software/hadoop-2.6.0-cdh5.16.2/share/hadoop/common
core-site.xml增加配置支持LZO压缩
<property> <name>io.compression.codecs</name> <value> org.apache.hadoop.io.compress.GzipCodec, org.apache.hadoop.io.compress.DefaultCodec, org.apache.hadoop.io.compress.BZip2Codec, org.apache.hadoop.io.compress.SnappyCodec, com.hadoop.compression.lzo.LzoCodec, com.hadoop.compression.lzo.LzopCodec </value> </property> <property> <name>io.compression.codec.lzo.class</name> <value>com.hadoop.compression.lzo.LzoCodec</value> </property>
mapred-site.xml 添加如下配置
<property> <name>mapred.child.env</name> <value>LD_LIBRARY_PATH=/opt/software/lzo-2.10/lib</value> </property> <property> <name>mapreduce.map.output.compress</name> <value>true</value> </property> <property> <name>mapreduce.map.output.compress.codec</name> <value>com.hadoop.compression.lzo.LzoCodec</value> </property>
最后重启hadoop集群即可。
LZO压缩使用
lzo压缩后的数据是不支持切片的,所以要想支持切片,需要对数据添加索引。
(1)在Hive中创建lzo_test表。
create table lzo_test ( name string, num string )row format delimited fields terminated by ',' STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat" OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
(2)本地生成超128MB大文件
自己想办法生成一个大文件,使其压缩后的lzo文件能够达到128MB。
生成文件:test
压缩命令:# lzop test
压缩文件:test.lzo
(3)将压缩文件上传并加载进Hive表
LOAD DATA LOCAL INPATH '/opt/tmp/test.lzo' OVERWRITE INTO TABLE lzo_test;
(4)不对压缩文件添加索引的情况下查看切片情况
#查询测试 select count(1) from lzo_test; 控制台日志截取: Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 11.37 sec HDFS Read: 289332141 HDFS Write: 8 SUCCESS
(5)创建和旧表相同的新表,用于测试切片
-- 开启压缩 SET hive.exec.compress.output=true; SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec; create table lzo_test_split STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat" OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat" as select * from lzo_test;
(6)对压缩文件添加索引
# hadoop jar 存放在hadoop指定目录下的lzojar包 索引所在的类 HDFS中的lzo压缩文件 # hadoop jar /opt/app/hadoop/share/hadoop/common/hadoop-lzo-0.4.21-SNAPSHOT.jar com.hadoop.compression.lzo.LzoIndexer /user/hive/warehouse/lzo_test_split
(7)查看切片情况
select count(1) from lzo_test_split; 控制台日志: Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 9.37 sec HDFS Read: 190593490 HDFS Write: 8 SUCCESS