MapReduce-二进制输入
Hadoop的MapReduce不只是可以处理文本信息,它还可以处理二进制格式的数据
1. 关于SequenceFileInputFormat类
Hadoop的顺序文件格式存储二进制的键/值对的序列。由于它们是可分割的(它们有同步点,所以reader可以从文件中的任意一点与记录边界进行同步,例如分片的起点),所以它们很符合MapReduce数据的格式要求,并且它们还支持压缩,可以使用一些序列化技术来存储任意类型。
如果要用顺序文件数据作为MapReduce的输入,应用SequenceFileInputFormat。键和值是由顺序文件决定,所以只需要保证map输入的类型匹配。
虽然从名称上看不出来,但SequenceFileInputFormat可以读MapFile(排序后的SequenceFile)和SequenceFile。如果在处理顺序文件时遇到目录,SequenceFileInputFormat类会认为自己正在读MapFile,使用的是其数据文件。
2. 关于SequenceFileAsTextInputFormat类
SequenceFileAsTextInputFormat是SequenceFileInputFormat的变体,它将顺序文件的键和值转换为Text对象。这个转换通过在键和值上调用toString方法实现。这个格式是顺序文件作为Streaming的合适的输入类型。
3. 关于SequenceFileAsBinaryInputFormat类
SequenceFileAsBinaryInputFormat是SequenceFileInputFormat的一种变体,它获取顺序文件的键和值作为二进制对象。它们被封装为BytesWritable对象,因而应用程序可以任意地解释这些字节数组。结合使用SequenceFile.Writer的appendRaw()方法或SequenceFileAsBinaryOutputFormat,它提供了在MapReduce中可以使用任意二进制数据类型的方法。
例子
将数据文件存为SequenceFile
package com.zhen.mapreduce.sequenceToText; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.VLongWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; /** * @author FengZhen * @date 2018年8月18日 * 输出为SequenceFile */ public class TextToSequence { public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(TextToSequence.class); job.setMapperClass(WCMapper.class); job.setReducerClass(WCReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VLongWritable.class); // 设置输出类 job.setOutputFormatClass(SequenceFileOutputFormat.class); /** * 设置sequecnfile的格式,对于sequencefile的输出格式,有多种组合方式, * 从下面的模式中选择一种,并将其余的注释掉 */ // 组合方式1:不压缩模式 SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.NONE); //组合方式2:record压缩模式,并指定采用的压缩方式 :默认、gzip压缩等 // SequenceFileOutputFormat.setOutputCompressionType(job, // CompressionType.RECORD); // SequenceFileOutputFormat.setOutputCompressorClass(job, // DefaultCodec.class); //组合方式3:block压缩模式,并指定采用的压缩方式 :默认、gzip压缩等 // SequenceFileOutputFormat.setOutputCompressionType(job, // CompressionType.BLOCK); // SequenceFileOutputFormat.setOutputCompressorClass(job, // DefaultCodec.class); FileInputFormat.addInputPaths(job, "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/origin"); SequenceFileOutputFormat.setOutputPath(job, new Path("hdfs://fz/user/hdfs/MapReduce/data/squenceFile/textToSequence/output")); System.exit(job.waitForCompletion(true)?0:1); } //map public static class WCMapper extends Mapper<LongWritable, 
Text, Text, VLongWritable> { public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] split = value.toString().split(""); for(String s : split){ context.write(new Text(s), new VLongWritable(1L)); } } } //reduce public static class WCReducer extends Reducer<Text, VLongWritable, Text, VLongWritable>{ @Override protected void reduce(Text key, Iterable<VLongWritable> v2s, Context context) throws IOException, InterruptedException { long sum=0; for(VLongWritable vl : v2s){ sum += vl.get(); } context.write(key, new VLongWritable(sum)); } } }
读取SequenceFile存为Text
package com.zhen.mapreduce.sequenceToText; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.VLongWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; /** * @author FengZhen * @date 2018年8月18日 * 输入为SequenceFile */ public class SequenceToText extends Configured implements Tool{ static class SequenceToTextMapper extends Mapper<Text, VLongWritable, Text, VLongWritable>{ @Override protected void map(Text key, VLongWritable value, Mapper<Text, VLongWritable, Text, VLongWritable>.Context context) throws IOException, InterruptedException { String contents = value.toString(); System.out.println(contents); context.write(key, value); } } static class SequenceToTextReducer extends Reducer<Text, VLongWritable, Text, VLongWritable>{ @Override protected void reduce(Text key, Iterable<VLongWritable> value, Reducer<Text, VLongWritable, Text, VLongWritable>.Context context) throws IOException, InterruptedException { long sum = 0; while (value.iterator().hasNext()) { sum += Integer.parseInt(value.iterator().next().toString()); } context.write(key, new VLongWritable(sum)); } } public int run(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJobName("SequenceToText"); job.setJarByClass(SequenceToText.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(SequenceToTextMapper.class); job.setReducerClass(SequenceToTextReducer.class); job.setMapOutputKeyClass(Text.class); 
job.setMapOutputValueClass(VLongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VLongWritable.class); SequenceFileInputFormat.setInputPaths(job, new Path(args[0])); TextOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; } public static void main(String[] args) throws Exception { String[] params = new String[]{"hdfs://fz/user/hdfs/MapReduce/data/squenceFile/textToSequence/output","hdfs://fz/user/hdfs/MapReduce/data/squenceFile/sequenceToText/output"}; int exitCode = ToolRunner.run(new SequenceToText(), params); System.out.println(exitCode); System.exit(exitCode); } }