春哥甲(Hadoop雷点记录)
“如果你也被春哥击倒,那么套上这个春哥甲吧!!!”
这篇博客主要记录在学习Hadoop中遇到的各种雷点坑点!
WordCountAndLen
在这一节,我们开始尝试使用自定义的类型:WordCountAndLen去实现我们原有的WordCount问题!
首先附上WordCountAndLen类的定义代码
1 /** 2 * Created by l1ngyi on 23-3-15. 3 */ 4 5 //import org .apache . hadoop . io . IntWritable ; 6 import java.io.DataInput; 7 import java.io.DataOutput; 8 import java.io.IOException; 9 //import org.apache.hadoop.io.Writable; 10 import org.apache.hadoop.io.*; 11 12 public class WordCountAndLen implements Writable { 13 private IntWritable count; 14 private IntWritable length; 15 16 public WordCountAndLen(){ 17 set(new IntWritable(),new IntWritable()); 18 19 } 20 public WordCountAndLen(IntWritable count,IntWritable length){ 21 set(count,length); 22 23 } 24 public WordCountAndLen(int count ,int length){ 25 set(new IntWritable(count),new IntWritable(length)); 26 } 27 public void set(IntWritable count,IntWritable length){ 28 this.count = count; 29 this.length = length; 30 } 31 public IntWritable getCOunt(){ 32 return count; 33 } 34 public IntWritable getLength(){ 35 return length; 36 } 37 @Override 38 public String toString(){ 39 String res=""; 40 res+=count.toString(); 41 res+=" "+length.toString(); 42 return res; 43 } 44 @Override 45 public void write(DataOutput out) throws IOException{ 46 count.write(out); 47 length.write(out); 48 } 49 @Override 50 public void readFields(DataInput in) throws IOException{ 51 count.readFields(in); 52 length.readFields(in); 53 } 54 }
我们可以看到,相比于原来的单纯记录单词出现的次数,这里增加了字符的长度。即该类型是封装了count length两个变量的。
注意!这里相比书上的示例代码,增加了对toString函数的重写(override)。
即:
/** Renders the value as "count length", with a single space between the two numbers. */
@Override
public String toString() {
    return count.toString() + " " + length.toString();
}
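如果我们不进行这个操作,输出是较为奇怪的:输出文件中的value是通过调用toString()得到的,如果没有重写,就会使用Object默认的toString(),打印出类似“WordCountAndLen@1b6d3586”这样的“类名@哈希值”。大家可以自己去试一试。(我也试了,确实奇怪,但是我没截图x)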
然后就来到了我们的WordCount类:
import org.apache.hadoop.conf.Configuration ; import org.apache . hadoop .fs.*; import org .apache . hadoop . io . IntWritable ; import org .apache . hadoop . io . Text ; import org . apache . hadoop . mapreduce . Job ; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; import java.io.InterruptedIOException; import java.util.StringTokenizer; /** * Created by l1ngyi on 23-3-15. */ public class WordCount { // 自定义 Mapper 类 public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> { // 定义数值为1的变量,用来在每分割出一个单词之后构造一个<单词,1>的键值对 private final static IntWritable one = new IntWritable(1); private Text word = new Text(); // map 函数的具体定义,从下面可看出处理的是 Text 类型的 value,key 值被忽略了 public void map(Object key, Text value, Context context) throws IOException, InterruptedException { // 此处的 value 是文档中的一行文本数据,将其转成字符串类型之后,利用字符串分割的方法将一行中的每个单词分割 StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); // 将结果写入 context context.write(word, one); } } } public static class MyReducer extends Reducer<Text, IntWritable, Text, WordCountAndLen> { // 从这可以看出 reduce 处理的输入数据是 <key,value-list> 类型的键值对 private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); WordCountAndLen res1 = new WordCountAndLen(sum,key.getLength()); context.write(key, res1); } } public static void main(String[] args) throws Exception { // 设置配置信息和文件输入输出路径 Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "word count-1"); job.setJarByClass(WordCount.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); 
//change job.setOutputValueClass(WordCountAndLen.class); job.setMapOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); FileSystem fs =FileSystem.get(conf); fs.delete(new Path(args[1]),true); boolean result =job.waitForCompletion(true); System.exit(result ? 0 : 1); } }
为什么这么写呢?因为我们在Hadoop的源码中可以找到对应的答案。在这个案例中,我们Map过程仍然需要切割语句,之后构造每个key -value,即word - 1。为什么是1?因为在map阶段,我们只是对任务进行切分与key value映射,本质上是一个拆分的过程。每一个词语被拆分开对应的出现频率都是1。至于合并、计数工作,我们在Reduce阶段会去做。因此,相比于WordCount例题,我们无须修改map函数,只需要在reduce函数中进行更改。
public static class MyReducer extends Reducer<Text, IntWritable, Text, WordCountAndLen> { // 从这可以看出 reduce 处理的输入数据是 <key,value-list> 类型的键值对 private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); WordCountAndLen res1 = new WordCountAndLen(sum,key.getLength()); context.write(key, res1); } }
在MyReducer的reduce函数中,我们需要得到每个单词的长度,以及它的出现频次。在上个案例中,我们已经计算出了出现频次(sum)。那么在该函数中,我们仅需要修改一下OutputValue的类型,并且构造一个该类型的变量写入输出流即可。
在WordCountAndLen类型中,我们定义了一个双int的构造方法
public WordCountAndLen(int count ,int length){ set(new IntWritable(count),new IntWritable(length)); }
因此,只需要传入sum,与key(此处是对应的单词)的长度即可。之后改写context.write函数的传参,就可以正常使用了。(注意:key.getLength()返回的是该单词UTF-8编码后的字节数,对纯英文单词来说与字符数相同;对中文等非ASCII单词则会大于字符数。)
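当然,我们需要修改main函数的一些参数,即设置OutputValue的类型。由于map阶段输出的value(IntWritable)与最终输出的value(WordCountAndLen)类型不同,还需要用setMapOutputValueClass单独指定map输出的value类型,否则map输出会默认沿用最终输出的类型,运行时报类型不匹配的错误。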
// The job's final output value type is the custom WordCountAndLen emitted by the reducer.
job.setOutputValueClass(WordCountAndLen.class);
// The mapper still emits IntWritable values, so the map output value type is declared separately.
job.setMapOutputValueClass(IntWritable.class);
如此,我们便可以成功运行了!!!
注意:记得启动伪分布服务,并配置Run/Debug Configurations。本案例中,需要传递两个参数。输入路径、输出路径。
请确保你的文件已经上传到伪分布环境上!并且正确配置了resources!否则程序有可能仅在本地执行。
这里附上我的调用输出:
多mapReduce任务的串联实践
import org.apache.hadoop.conf.Configuration ; import org.apache . hadoop .fs.*; import org .apache . hadoop . io . IntWritable ; import org .apache . hadoop . io . Text ; import org . apache . hadoop . mapreduce . Job ; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; import java.io.InterruptedIOException; import java.util.StringTokenizer; //First Word Count ///////////////////////// ///////////////////////// public class WordCount { // 自定义 Mapper 类 public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> { // 定义数值为1的变量,用来在每分割出一个单词之后构造一个<单词,1>的键值对 private final static IntWritable one = new IntWritable(1); private Text word = new Text(); // map 函数的具体定义,从下面可看出处理的是 Text 类型的 value,key 值被忽略了 public void map(Object key, Text value, Context context) throws IOException, InterruptedException { // 此处的 value 是文档中的一行文本数据,将其转成字符串类型之后,利用字符串分割的方法将一行中的每个单词分割 StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); // 将结果写入 context context.write(word, one); } } } ///////////////////////// ///////////////////////// // First Reduce 类 // /////////////////////// ///////////////////////// public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> { // 从这可以看出 reduce 处理的输入数据是 <key,value-list> 类型的键值对 private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); context.write(key, result); } } ///////////////////////// ///////////////////////// public static class MyMapper2 extends Mapper<Object,Text,Text,IntWritable>{ private final static IntWritable one = new IntWritable(1); private Text word = new Text("The num 
of words:"); public void map(Object key ,Text value ,Context context) throws IOException,InterruptedException { context.write(word,one); } } public static void main(String[] args) throws Exception { // 设置配置信息和文件输入输出路径 Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "word count-1"); job.setJarByClass(WordCount.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); FileSystem fs =FileSystem.get(conf); fs.delete(new Path(args[1]),true); job.waitForCompletion(true); Job job2 = Job.getInstance(conf,"word count-2"); job2.setJarByClass(WordCount.class); job2.setMapperClass(MyMapper2.class); job2.setReducerClass(MyReducer.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job2,new Path(args[1])); FileOutputFormat.setOutputPath(job2,new Path(args[2])); fs.delete(new Path(args[2]),true); boolean result =job2.waitForCompletion(true); System.exit(result ? 0 : 1); } }
这里照着书敲就可以啦!但是需要注意:
这个案例是需要三个参数的!因为main函数里用到了args[0]、args[1]、args[2]
所以在Run/Debug Configurations,你需要给出三个路径,以空格符区分。
0、1、2分别对应 输入文件,第一次mapreduce的输出,第二次mapreduce的输出。
附上运行成功的图片: