Hadoop压缩机制的了解
通过一定的算法对数据进行特殊编码,使数据占用的存储空间变小,这个过程称为压缩;反之,将压缩后的数据还原为原始数据的过程称为解压缩
不管哪种压缩工具都需要权衡时间和空间
在大数据领域内还要考虑压缩文件的可分割性:可分割的压缩文件(如bzip2)可以被切分成多个分片由多个Map任务并行处理,而不可分割的压缩文件(如gzip)只能由单个Map任务整体处理
Hadoop支持的压缩工具有:DEFLATE、gzip、bzip2以及Snappy
压缩与解压:CompressTest.java
/**
 * Small demo of Hadoop's compression codecs: compresses a local file with a
 * codec chosen by class name, or decompresses a file whose codec is inferred
 * from its file-name extension.
 */
public class CompressTest {

    /**
     * Entry point. As shipped it decompresses {@code block.txt.gz}; to compress
     * instead, comment out the {@code decompress} call and enable the
     * {@code compress} call below.
     *
     * <p>Codec class names that can be passed to {@code compress}:
     * <ul>
     *   <li>gzip    =&gt; org.apache.hadoop.io.compress.GzipCodec</li>
     *   <li>bzip2   =&gt; org.apache.hadoop.io.compress.BZip2Codec</li>
     *   <li>snappy  =&gt; org.apache.hadoop.io.compress.SnappyCodec</li>
     *   <li>DEFLATE =&gt; org.apache.hadoop.io.compress.DefaultCodec</li>
     * </ul>
     *
     * @param args unused
     * @throws IOException            if reading or writing a file fails
     * @throws ClassNotFoundException if the codec class name cannot be resolved
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        // To compress: enable the next line and comment out the decompress call.
        // compress("block.txt", "org.apache.hadoop.io.compress.GzipCodec");
        decompress(new File("block.txt.gz"));
    }

    /**
     * Compresses {@code fileName} with the codec named by {@code compressClassName}
     * and writes the result next to it, using the codec's default extension
     * (e.g. {@code block.txt} -&gt; {@code block.txt.gz}).
     *
     * @param fileName          path of the file to compress
     * @param compressClassName fully qualified name of a {@code CompressionCodec} implementation
     * @return the compressed output file
     * @throws ClassNotFoundException if {@code compressClassName} cannot be loaded
     * @throws IOException            if the input cannot be read or the output cannot be written
     */
    private static File compress(String fileName, String compressClassName) throws ClassNotFoundException, IOException {
        Class<?> codecClass = Class.forName(compressClassName);
        Configuration configuration = new Configuration();
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, configuration);

        File fileIn = new File(fileName);
        File fileOut = new File(fileName + codec.getDefaultExtension());

        // The input is opened first so that a missing source file fails before an
        // empty output file is created. FileOutputStream truncates an existing
        // file, so no explicit delete is needed beforehand.
        // try-with-resources closes every stream even when the copy fails;
        // closing the compression stream also flushes the codec's trailer.
        try (InputStream in = new FileInputStream(fileIn);
             OutputStream out = new FileOutputStream(fileOut);
             CompressionOutputStream cout = codec.createOutputStream(out)) {
            // close=false: the streams are closed by the try-with-resources block.
            IOUtils.copyBytes(in, cout, 4096, false);
        }
        return fileOut;
    }

    /**
     * Decompresses {@code file} into {@code <file name>-.txt} in the current
     * working directory. The codec is looked up from the file-name extension;
     * if none matches, a message is printed and nothing is written.
     *
     * @param file the compressed file to read
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    private static void decompress(File file) throws IOException {
        Configuration configuration = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(configuration);
        CompressionCodec codec = factory.getCodec(new Path(file.getName()));
        if (codec == null) {
            System.out.println("Can not find codec for file " + file);
            return;
        }

        File fileOut = new File(file.getName() + "-.txt");

        // The raw stream is declared separately so it is still closed if the
        // codec fails while creating its decompression stream.
        try (InputStream rawIn = new FileInputStream(file);
             InputStream in = codec.createInputStream(rawIn);
             OutputStream outputStream = new FileOutputStream(fileOut)) {
            // close=false: the streams are closed by the try-with-resources block.
            IOUtils.copyBytes(in, outputStream, 4096, false);
        }
    }
}
在WordCount.java主函数中增加压缩设置:
// Turn on compression for the job's output files.
// NOTE(review): `job` is defined outside this snippet — presumably the Job built in WordCount's main; confirm.
FileOutputFormat.setCompressOutput(job,true);
// Select Snappy as the codec used for the compressed output.
// NOTE(review): SnappyCodec may need native library support on the cluster depending on the Hadoop version — confirm.
FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);