MapReduce_Score

TEST FILES

math.txt

zs 80
ls 90
ww 95
china.txt
zs 60
ls 65
ww 90

  1 package MapReduce;
  2 
  3 import java.io.IOException;
  4 import java.net.URI;
  5 import java.util.Iterator;
  6 import java.util.StringTokenizer;
  7  
  8 import org.apache.hadoop.conf.Configuration;
  9 import org.apache.hadoop.fs.FileSystem;
 10 import org.apache.hadoop.fs.Path;
 11 import org.apache.hadoop.io.IntWritable;
 12 import org.apache.hadoop.io.LongWritable;
 13 import org.apache.hadoop.io.Text;
 14 import org.apache.hadoop.mapreduce.Job;
 15 import org.apache.hadoop.mapreduce.Mapper;
 16 import org.apache.hadoop.mapreduce.Reducer;
 17 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 18 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 19 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 20 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 21 import org.apache.hadoop.util.GenericOptionsParser;
 22  
 23 public class Score {
 24     private static final String OUTPUT_PATH = "hdfs://h201:9000/user/hadoop/output";
 25     public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> { 
 26         // 实现map函数
 27         public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
 28             // 将输入的纯文本文件的数据转化成String
 29             String line = value.toString();
 30             // 将输入的数据首先按行进行分割
 31             StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n"); 
 32             // 分别对每一行进行处理
 33             while (tokenizerArticle.hasMoreElements()) {
 34                 // 每行按空格划分
 35                 //一个分隔的嵌套，先按行截，再在这里按空格对每一行的内容截取 
 36                 StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());
 37                 String strName = tokenizerLine.nextToken();// 学生姓名部分，是按顺序赋值的
 38                 String strScore = tokenizerLine.nextToken();// 成绩部分
 39  
 40                 Text name = new Text(strName);
 41                 int scoreInt = Integer.parseInt(strScore);
 42                 // 输出姓名和成绩
 43                 context.write(name, new IntWritable(scoreInt));
 44             }
 45         }
 46  
 47     }
 48  
 49     public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
 50         // 实现reduce函数
 51         public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
 52             int sum = 0;
 53             int count = 0;
 54  
 55             Iterator<IntWritable> iterator = values.iterator();
 56             while (iterator.hasNext()) {
 57                 sum += iterator.next().get();// 计算总分
 58                 count++;// 统计总的科目数
 59             }
 60  
 61             int average = (int) sum / count;// 计算平均成绩
 62             context.write(key, new IntWritable(average));
 63         }
 64  
 65     }
 66  
 67     public static void main(String[] args) throws Exception {
 68         Configuration conf = new Configuration();
 69         
 70         conf.set("mapred.jar","Score.jar");
 71         final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);//读路径信息
 72         fileSystem.delete(new Path(OUTPUT_PATH), true);//删除路径信息 输出路径不能存在
 73         String[] ioArgs = new String[] { "hdfs://h201:9000/user/hadoop/input", "hdfs://h201:9000/user/hadoop/output" };
 74         //"score_in", "score_out"指定输入输出目录，正常应该写成{ "/user/hadoop/score_in", "/user/hadoop/score_out" }
 75         String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
 76         // GenericOptionsParser做一个命令的解析，下面有，可解析出路径
 77         if (otherArgs.length != 2) {                        
 78             System.err.println("Usage: Score Average <in> <out>");
 79             System.exit(2);
 80         }
 81         //即如果找不到"score_in", "score_out"中的任何一个都会报错，因为数组中有"score_in", "score_out"两个成员，所以长度是2
 82  
 83         Job job = new Job(conf, "Score Average");
 84         job.setJarByClass(Score.class);
 85  
 86         // 设置Map、Combine和Reduce处理类
 87         job.setMapperClass(Map.class);
 88         job.setCombinerClass(Reduce.class);
 89         job.setReducerClass(Reduce.class);
 90  
 91         // 设置输出类型
 92         job.setOutputKeyClass(Text.class);
 93         job.setOutputValueClass(IntWritable.class);
 94  
 95         // 将输入的数据集分割成小数据块splites，提供一个RecordReder的实现
 96         job.setInputFormatClass(TextInputFormat.class);
 97         // 提供一个RecordWriter的实现，负责数据输出
 98         job.setOutputFormatClass(TextOutputFormat.class);
 99  
100         // 设置输入和输出目录
101         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
102         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
103         System.exit(job.waitForCompletion(true) ? 0 : 1);
104     }
105 }

********补充
迭代器（Iterator）

　　迭代器是一种设计模式，它是一个对象，它可以遍历并选择序列中的对象，而开发人员不需要了解该序列的底层结构。迭代器通常被称为“轻量级”对象，因为创建它的代价小。

　　Java中的Iterator功能比较简单，并且只能单向移动：

　　(1) 使用方法iterator()要求容器返回一个Iterator。第一次调用Iterator的next()方法时，它返回序列的第一个元素。注意：iterator()方法定义在java.lang.Iterable接口中，Collection接口继承了该接口。

　　(2) 使用next()获得序列中的下一个元素。

　　(3) 使用hasNext()检查序列中是否还有元素。

　　(4) 使用remove()将迭代器最近一次返回的元素删除。

　　Iterator是Java迭代器最简单的实现，为List设计的ListIterator具有更多的功能，它可以从两个方向遍历List，也可以向List中插入元素或从List中删除元素。

1.创建集合：
Collection c = new ArrayList<String>();
2.添加元素：
c.add("hehehe");
c.add("huhuhu");
c.add("wawawa");
3.获取集合的迭代器：
Iterator iterator = c.iterator();
4.进行遍历：
while(iterator.hasNext())//如果仍有元素可以迭代，则返回 true
{
System.out.println(iterator.next());//返回迭代的下一个元素。
}

************
org.apache.hadoop.util包 GenericOptionsParser类【原创】

GenericOptionsParser功能描述
　　GenericOptionsParser是hadoop框架中解析命令行参数的基本类。它能够辨别一些标准的命令行参数，能够使应用程序轻易地指定namenode，jobtracker，以及其他额外的配置资源。

parseGeneralOptions(options, conf, args)这个函数解析用户指定的参数，获取基本选项并根据需要修改配置。它首先指定每个通用选项的属性，然后解析选项和参数，将其转化为命令行对象（CommandLine），紧接着把设定好的命令行参数写入系统配置。
********************

posted @ 2018-03-25 18:56 蜘蛛侠0 阅读(201) 评论(0) 编辑收藏举报

刷新页面返回顶部

李杰然

MapReduce_Score

公告