MapReduce - Binary Input
Hadoop MapReduce is not limited to processing text; it can also handle data in binary formats.
1. The SequenceFileInputFormat class
Hadoop's sequence file format stores sequences of binary key/value pairs. Sequence files are splittable (they contain sync points, so a reader can synchronize with a record boundary from any position in the file, such as the start of a split), which makes them a good fit for MapReduce. They also support compression and can store arbitrary types via a choice of serialization frameworks.
To use sequence file data as MapReduce input, use SequenceFileInputFormat. The key and value types are determined by the sequence file itself, so you only need to make sure that the map input types match.
Although its name doesn't suggest it, SequenceFileInputFormat can read MapFiles (sorted SequenceFiles) as well as SequenceFiles. If it encounters a directory while processing sequence files, SequenceFileInputFormat assumes it is reading a MapFile and uses that MapFile's data file.
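To make the format concrete, here is a minimal sketch (the demo class and output path are assumptions for illustration, not part of the original example) that writes a block-compressed SequenceFile directly with the SequenceFile.Writer API, using the same (Text, VLongWritable) record types that the jobs later in this post read back with SequenceFileInputFormat.

package com.zhen.mapreduce.sequenceToText;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;

public class SequenceFileWriteDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical output path; any HDFS or local path works.
        Path path = new Path("hdfs://fz/user/hdfs/MapReduce/data/squenceFile/demo.seq");
        SequenceFile.Writer writer = null;
        try {
            // Keys and values can be any Writable types; BLOCK compression keeps the
            // file splittable because sync points are written between compressed blocks.
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(Text.class),
                    SequenceFile.Writer.valueClass(VLongWritable.class),
                    SequenceFile.Writer.compression(CompressionType.BLOCK));
            writer.append(new Text("a"), new VLongWritable(1L));
            writer.append(new Text("b"), new VLongWritable(2L));
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}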
2. The SequenceFileAsTextInputFormat class
SequenceFileAsTextInputFormat is a variant of SequenceFileInputFormat that converts the sequence file's keys and values to Text objects. The conversion is performed by calling toString() on the keys and values. This format makes sequence files suitable input for Streaming.
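A minimal mapper sketch for this format (the class name and driver lines are hypothetical): because the input format performs the toString() conversion, the mapper declares Text for both input key and value, regardless of which Writable types the file actually stores.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Minimal sketch: with SequenceFileAsTextInputFormat the mapper always sees (Text, Text),
// i.e. the toString() form of whatever key/value types the sequence file stores.
public class SeqAsTextMapper extends Mapper<Text, Text, Text, Text> {
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value); // pass the text form straight through
    }
}

// Driver configuration (hypothetical paths):
// job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
// SequenceFileAsTextInputFormat.addInputPath(job, new Path(args[0]));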
3. The SequenceFileAsBinaryInputFormat class
SequenceFileAsBinaryInputFormat is a variant of SequenceFileInputFormat that retrieves the sequence file's keys and values as opaque binary objects. They are wrapped in BytesWritable objects, so the application is free to interpret the underlying byte arrays as it pleases. Combined with a process that creates sequence files using SequenceFile.Writer's appendRaw() method or SequenceFileAsBinaryOutputFormat, this provides a way to use arbitrary binary data types in MapReduce.
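A rough sketch of what such a mapper can look like (the byte-level decoding shown is purely hypothetical): both key and value arrive as BytesWritable, and the application decodes the raw bytes itself.

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Minimal sketch: with SequenceFileAsBinaryInputFormat both key and value arrive as
// raw bytes wrapped in BytesWritable; interpreting them is entirely up to the application.
public class SeqAsBinaryMapper extends Mapper<BytesWritable, BytesWritable, Text, LongWritable> {
    @Override
    protected void map(BytesWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Hypothetical decoding: treat the key bytes as UTF-8 text and the first
        // eight value bytes as a big-endian long.
        String k = new String(key.getBytes(), 0, key.getLength(), "UTF-8");
        long v = ByteBuffer.wrap(value.getBytes(), 0, value.getLength()).getLong();
        context.write(new Text(k), new LongWritable(v));
    }
}

// Driver configuration:
// job.setInputFormatClass(SequenceFileAsBinaryInputFormat.class);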
Example
Writing a data file as a SequenceFile
package com.zhen.mapreduce.sequenceToText;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

/**
 * @author FengZhen
 * @date 2018-08-18
 * Writes its output as a SequenceFile
 */
public class TextToSequence {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TextToSequence.class);

        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VLongWritable.class);

        // Set the output format
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        /**
         * Configure the SequenceFile output. Several combinations are possible;
         * pick one of the options below and comment out the rest.
         */
        // Option 1: no compression
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.NONE);
        // Option 2: record compression, with a codec of your choice (default, gzip, ...)
        // SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.RECORD);
        // SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        // Option 3: block compression, with a codec of your choice (default, gzip, ...)
        // SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        // SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

        FileInputFormat.addInputPaths(job, "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/origin");
        SequenceFileOutputFormat.setOutputPath(job, new Path("hdfs://fz/user/hdfs/MapReduce/data/squenceFile/textToSequence/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // map
    public static class WCMapper extends Mapper<LongWritable, Text, Text, VLongWritable> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split("");
            for (String s : split) {
                context.write(new Text(s), new VLongWritable(1L));
            }
        }
    }

    // reduce
    public static class WCReducer extends Reducer<Text, VLongWritable, Text, VLongWritable> {
        @Override
        protected void reduce(Text key, Iterable<VLongWritable> v2s, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (VLongWritable vl : v2s) {
                sum += vl.get();
            }
            context.write(key, new VLongWritable(sum));
        }
    }
}
Reading a SequenceFile and writing it out as text
package com.zhen.mapreduce.sequenceToText;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @author FengZhen
 * @date 2018-08-18
 * Takes a SequenceFile as input
 */
public class SequenceToText extends Configured implements Tool {

    static class SequenceToTextMapper extends Mapper<Text, VLongWritable, Text, VLongWritable> {
        @Override
        protected void map(Text key, VLongWritable value,
                Mapper<Text, VLongWritable, Text, VLongWritable>.Context context)
                throws IOException, InterruptedException {
            String contents = value.toString();
            System.out.println(contents);
            context.write(key, value);
        }
    }

    static class SequenceToTextReducer extends Reducer<Text, VLongWritable, Text, VLongWritable> {
        @Override
        protected void reduce(Text key, Iterable<VLongWritable> values,
                Reducer<Text, VLongWritable, Text, VLongWritable>.Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (VLongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new VLongWritable(sum));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("SequenceToText");
        job.setJarByClass(SequenceToText.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(SequenceToTextMapper.class);
        job.setReducerClass(SequenceToTextReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(VLongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VLongWritable.class);

        SequenceFileInputFormat.setInputPaths(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        String[] params = new String[]{
            "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/textToSequence/output",
            "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/sequenceToText/output"
        };
        int exitCode = ToolRunner.run(new SequenceToText(), params);
        System.out.println(exitCode);
        System.exit(exitCode);
    }
}