03_MapReduce Framework Internals_3.8 Sorting with WritableComparable

1. Overview

    Sorting by key is Hadoop's default behavior: map output keys are always sorted, whether or not the job needs it.

 2. When sorting happens, and which algorithms are used

    Sorting happens on both sides of the shuffle: the map side sorts records by key inside the ring buffer (quicksort) before each spill and then merge-sorts the spill files; the reduce side merge-sorts the sorted segments it fetches before calling reduce().

 Flow diagram

3. Sort categories

    Commonly distinguished: partial sort (each reducer's output file is internally sorted), total sort (a single globally sorted output file), and secondary / multi-field sort (compareTo consults more than one field). The examples in section 5 cover a total sort and a partition + multi-field sort.

4. Implementation

    1. Note

            When a custom class is used as the key, it must implement the WritableComparable interface; otherwise the keys cannot be sorted.

    2. How

    Have the custom class implement the WritableComparable interface and override its compareTo method.
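
A minimal sketch of what that contract looks like (AgeKey is an illustrative name, not part of the examples in section 5):

  import java.io.{DataInput, DataOutput}
  import org.apache.hadoop.io.WritableComparable

  // A key with a single int field, sorted ascending by age
  class AgeKey extends WritableComparable[AgeKey] {
    var age: Int = _
    // Serialization
    override def write(out: DataOutput): Unit = out.writeInt(age)
    // Deserialization: read fields in the same order they were written
    override def readFields(in: DataInput): Unit = { age = in.readInt }
    // Sort order used during the shuffle
    override def compareTo(o: AgeKey): Int = Integer.compare(this.age, o.age)
  }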

 

5. Code examples

     1. Full (total) sort: with the default single reduce task, all keys go to one reducer, producing a single output file globally sorted by key (here, ascending age).

     

package GroupByPersonOrderbyAgePk {

  import java.io.{DataInput, DataOutput}
  import java.lang

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.io._
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper, Reducer}

  /*
  * Requirement
  *     Count records per person, sorted ascending by age
  *
  * Input
  *  曹操 20 10
  *  曹操 20 10
  *  曹操 20 10
  *  曹仁 29 9
  *  曹仁 29 9
  *  曹冲 8  2
  *  曹冲 8  2
  *  曹冲 8  2
  *  曹冲 8  2
  *
  * Output
  *  曹冲 # 8  # 2      4
  *  曹操 # 20 # 10    3
  *  曹仁 # 29 # 9      2
  *
  * */

  // Mapper class
  class PersonCountMapper extends Mapper[LongWritable, Text, Person, IntWritable] {
    var v = new IntWritable(1)

    override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, Person, IntWritable]#Context) = {
      //1. Split the input line on one or more spaces
      var line: Array[String] = value.toString.split(" +")
      var name: String = line(0)
      var age: String = line(1)
      var income: String = line(2)

      //2. Build the Person object
      var person = new Person(name, age.toInt, income.toInt)

      println(person)

      //3. Write to the ring buffer (map output)
      context.write(person, v)
    }
  }


  // Reducer class
  class PersonCountReducer extends Reducer[Person, IntWritable, Person, IntWritable] {

    var v = new IntWritable()

    override def reduce(key: Person, values: lang.Iterable[IntWritable], context: Reducer[Person, IntWritable, Person, IntWritable]#Context) = {
      var sum = 0
      //1. Sum the counts for this person
      values.forEach(n => sum += n.get)
      v.set(sum)

      //2. Emit the result
      context.write(key, v)
    }
  }

  // Driver
  object PersonCountDriver {
    def main(args: Array[String]): Unit = {
      //1. Get the configuration and the Job object
      var configuration = new Configuration
      var job: Job = Job.getInstance(configuration)

      //2. Register the jar containing this driver
      job.setJarByClass(this.getClass)

      job.setJobName("PersonCount mr")

      //3. Set the Mapper and Reducer classes
      job.setMapperClass(classOf[PersonCountMapper])
      job.setReducerClass(classOf[PersonCountReducer])

      //4. Set the map output key-value types
      job.setMapOutputKeyClass(classOf[Person])
      job.setMapOutputValueClass(classOf[IntWritable])

      //5. Set the final output key-value types
      job.setOutputKeyClass(classOf[Person])
      job.setOutputValueClass(classOf[IntWritable])

      //6. Set the input and output paths
      FileInputFormat.setInputPaths(job, new Path("src/main/data/input/1.txt"))
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

      //7. Submit the job and exit with 0 on success, 1 on failure
      val ok: Boolean = job.waitForCompletion(true)
      System.exit(if (ok) 0 else 1)

    }

  }

  // public interface WritableComparable<T> extends Writable, Comparable<T>
  class Person() extends WritableComparable[Person] {
    var name: String = _
    var age: Int = _
    var income: Int = _

    // Auxiliary constructor (the no-arg primary constructor is required for deserialization)
    def this(name: String, age: Int, income: Int) = {
      this() // invoke the primary constructor
      this.name = name
      this.age = age
      this.income = income
    }

    // Serialization
    override def write(out: DataOutput): Unit = {
      out.writeUTF(name)
      out.writeInt(age)
      out.writeInt(income)
    }

    // Deserialization (read fields in the same order they were written)
    override def readFields(in: DataInput): Unit = {
      name = in.readUTF
      age = in.readInt
      income = in.readInt
    }

    override def toString: String = {
      s"${name} # ${age} # ${income}"
    }

    // Sort ascending by age (Integer.compare avoids the overflow risk of `this.age - o.age`)
    override def compareTo(o: Person): Int = {
      Integer.compare(this.age, o.age)
    }
  }


}
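
Note on compareTo in this full-sort example: keys that compare as equal (compareTo returns 0) are merged into a single reduce group, so comparing by age alone would lump together different people who happen to share an age. The sample input avoids this because all ages are distinct; the second example below compares every field for exactly this reason.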

 

     2. Partitioning + sorting + multi-field sort: a custom Partitioner routes keys by name to one of three reduce tasks; within each partition, keys sort by age, then income.

 

package PartitionByNameGroupByPersonOrderbyAgeIncomePk {

  import java.io.{DataInput, DataOutput}
  import java.lang

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.io.{IntWritable, _}
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper, Partitioner, Reducer}

  /*
  * Requirement
  *     Partition by name; within each partition, count records per person, sorted by age and then income
  *
  * Input
  *   曹操 20 10
  *   曹操 20 10
  *   曹操 20 10
  *   曹操 20 10
  *   曹操 20 10
  *   曹操 20 10
  *   曹操 20 10
  *   曹仁 29 9
  *   曹仁 29 9
  *   曹冲 8 2
  *   曹冲 8 2
  *   曹冲 8 2
  *   曹冲 8 2
  *   曹植 8 5
  *   曹植 8 5
  *   刘备 30 100
  *   关羽 25 80
  *   张飞 10 2
  *   孙权 20 100
  *   周瑜 45 99
  *   曹洪 29 88
  *   曹洪 29 88
  *   曹洪 29 88
  *
  * Output (one file per partition)
  *   Person(曹冲, 8, 2)    4
  *   Person(曹植, 8, 5)    2
  *   Person(曹操, 20, 10)    7
  *   Person(曹仁, 29, 9)    2
  *   Person(曹洪, 29, 88)    3
  *
  *   Person(张飞, 10, 2)    1
  *   Person(关羽, 25, 80)    1
  *   Person(刘备, 30, 100)    1
  *
  *   Person(孙权, 20, 100)    1
  *   Person(周瑜, 45, 99)    1
  * */

  // Mapper class
  class PersonCountMapper extends Mapper[LongWritable, Text, Hero, IntWritable] {
    var v = new IntWritable(1)
    var hero = new Hero() // reused across map() calls; context.write serializes it immediately, so reuse is safe

    override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, Hero, IntWritable]#Context) = {

      //1. Split the input line
      var line: Array[String] = value.toString.split(" ")
      var name: String = line(0)
      var age: String = line(1)
      var income: String = line(2)

      //2. Populate the reusable Hero object
      hero.name = name
      hero.age = age.toInt
      hero.income = income.toInt

      //println(hero)

      //3. Write to the ring buffer (map output)
      context.write(hero, v)
    }
  }


  // Reducer class
  class PersonCountReducer extends Reducer[Hero, IntWritable, Hero, IntWritable] {

    var v = new IntWritable()

    override def reduce(key: Hero, values: lang.Iterable[IntWritable], context: Reducer[Hero, IntWritable, Hero, IntWritable]#Context) = {
      var sum = 0
      //1. Sum the counts for this hero
      values.forEach(n => sum += n.get)
      v.set(sum)

      //2. Emit the result
      context.write(key, v)
    }
  }

  // Driver
  object HeroCountDriver {
    def main(args: Array[String]): Unit = {
      //1. Get the configuration and the Job object
      var configuration = new Configuration
      var job: Job = Job.getInstance(configuration)

      //2. Register the jar containing this driver
      job.setJarByClass(this.getClass)

      job.setJobName("PersonCount mr")

      //3. Set the Mapper and Reducer classes
      job.setMapperClass(classOf[PersonCountMapper])
      job.setReducerClass(classOf[PersonCountReducer])

      //4. Set the map output key-value types
      job.setMapOutputKeyClass(classOf[Hero])
      job.setMapOutputValueClass(classOf[IntWritable])

      //5. Set the final output key-value types
      job.setOutputKeyClass(classOf[Hero])
      job.setOutputValueClass(classOf[IntWritable])

      //6. Set the input and output paths
      FileInputFormat.setInputPaths(job, new Path("src/main/data/input/1.txt"))
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

      //7. Set the number of reduce tasks and the partitioner (one reduce task per partition)
      job.setNumReduceTasks(3)
      job.setPartitionerClass(classOf[GcPartitioner])

      //8. Submit the job and exit with 0 on success, 1 on failure
      val ok: Boolean = job.waitForCompletion(true)
      System.exit(if (ok) 0 else 1)


    }

  }

  // Routes each map output key-value pair to a partition, based on the key's name
  class GcPartitioner extends Partitioner[Hero, IntWritable] {
    override def getPartition(key: Hero, value: IntWritable, numPartitions: Int): Int = {
      var one: List[String] = List("曹操", "曹仁", "曹冲", "曹植", "曹洪")
      var two: List[String] = List("张飞", "刘备", "关羽")
      var three: List[String] = List("孙权", "张昭", "周瑜")

      key.name match {
        case e: String if one.contains(e) => 0
        case e: String if two.contains(e) => 1
        case e: String if three.contains(e) => 2
        case _ => 0 // default partition for unlisted names; without it a MatchError would fail the task
      }


    }
  }

  // public interface WritableComparable<T> extends Writable, Comparable<T>
  class Hero() extends WritableComparable[Hero] {
    var name: String = _
    var age: Int = _
    var income: Int = _

    // Auxiliary constructor (the no-arg primary constructor is required for deserialization)
    def this(name: String, age: Int, income: Int) = {
      this() // invoke the primary constructor
      this.name = name
      this.age = age
      this.income = income
    }

    // Serialization
    override def write(out: DataOutput): Unit = {
      out.writeUTF(name)
      out.writeInt(age)
      out.writeInt(income)
    }

    // Deserialization (read fields in the same order they were written)
    override def readFields(in: DataInput): Unit = {
      name = in.readUTF
      age = in.readInt
      income = in.readInt
    }

    // Multi-field sort: by age, then income; name breaks any remaining tie so that
    // distinct people never compare as equal (equal keys are merged into one reduce group)
    override def compareTo(o: Hero): Int = {
      var r = Integer.compare(this.age, o.age)
      if (r == 0) r = Integer.compare(this.income, o.income)
      if (r == 0) r = this.name.compareTo(o.name)
      r
    }

    override def toString = s"Person($name, $age, $income)"
  }


}
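
With three reduce tasks and the partitioner above, the job writes one file per partition (part-r-00000 through part-r-00002), which is why the expected output shows three groups. Keep setNumReduceTasks in step with the partitioner: a returned partition index outside [0, numReduceTasks) makes the task fail.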

 6. Caveats (type check on the custom key class)

  1. Note

     A custom class used as the key must implement WritableComparable and its compareTo method.

     When the map task creates the ring buffer, it checks whether the key type implements the WritableComparable interface.

  2. Error when the key class does not implement WritableComparable

     Unable to initialize MapOutputCollector org.apache.hadoop.mapred.MapTask$MapOutputBuffer

  3. Source logic

    /****************** Check that the MapOutputKeyClass is a subclass of WritableComparable **************************/
    // Sorting by key is Hadoop's default behavior,
    // so before map output is collected, the output key type is checked for WritableComparable.
    // Otherwise: Unable to initialize MapOutputCollector org.apache.hadoop.mapred.MapTask$MapOutputBuffer
    //     Caused by: java.io.IOException: Initialization of all the collectors failed.
    //     Error in last collector was:java.lang.ClassCastException: class GroupByPoneNumPk.Person

    JobConf class
    // Get the comparator for map output keys
    public RawComparator getOutputKeyComparator() {
      Class<? extends RawComparator> theClass = getClass(
        JobContext.KEY_COMPARATOR, null, RawComparator.class);
      // A comparator registered via job.setSortComparatorClass takes precedence
      if (theClass != null)
        return ReflectionUtils.newInstance(theClass, this);
      // Otherwise the class set via job.setMapOutputKeyClass(classOf[Person]) must be a subclass of WritableComparable
      return WritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class), this);
    }
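
For illustration, a minimal sketch of a key class that triggers this error when set as the map output key class (PlainKey is a hypothetical name):

  import java.io.{DataInput, DataOutput}
  import org.apache.hadoop.io.Writable

  // Implements Writable only -- no compareTo, so the asSubclass(WritableComparable.class)
  // call above throws ClassCastException and MapOutputBuffer fails to initialize
  class PlainKey extends Writable {
    var name: String = _
    override def write(out: DataOutput): Unit = out.writeUTF(name)
    override def readFields(in: DataInput): Unit = { name = in.readUTF }
  }

  // job.setMapOutputKeyClass(classOf[PlainKey])  // fails at runtime with the error quoted above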

 
