02_Hadoop Serialization_2.2 Custom Bean Object Implementing the Serialization Interface (Writable)
Code example
package GroupByPoneNumPk {

  import java.io.{DataInput, DataOutput}
  import java.lang

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.io.{LongWritable, Text, Writable}
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper, Reducer}

  // Mapper class
  // Each Mapper instance processes one input split
  class GroupByPoneNumMapper extends Mapper[LongWritable, Text, Text, FlowBean] {

    val text = new Text

    // map is called once per input record (line)
    override def map(key: LongWritable, value: Text,
                     context: Mapper[LongWritable, Text, Text, FlowBean]#Context): Unit = {
      // Sample record:
      // 1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200

      // 1. Split the record on whitespace
      val line: Array[String] = value.toString.split(" +")
      println("map input : " + line.mkString("-"))
      val phone = line(1)
      val upflow = line.reverse(2)   // third field from the end: upstream traffic
      val downflow = line.reverse(1) // second field from the end: downstream traffic

      // 2. Build the FlowBean (the total is computed in the reducer, so pass 0 here)
      val flowBean = new FlowBean(upflow.toInt, downflow.toInt, 0)

      // 3. Write to the circular (shuffle) buffer, keyed by phone number
      text.set(phone)
      context.write(text, flowBean)
      println(flowBean)
    }
  }

  // Reducer class
  // The Reducer runs only after all Mapper instances have finished.
  // The Mapper output types must match the Reducer input types.
  class GroupByPoneNumReducer extends Reducer[Text, FlowBean, Text, FlowBean] {

    // reduce is called once per key
    override def reduce(key: Text, values: lang.Iterable[FlowBean],
                        context: Reducer[Text, FlowBean, Text, FlowBean]#Context): Unit = {
      println("entering reduce ...")

      // 1. Sum upflow and downflow over all records for this phone number
      var sumUpflow = 0
      var sumDownflow = 0
      values.forEach(bean => {
        sumUpflow += bean.upflow
        sumDownflow += bean.downflow
      })

      // 2. Compute the total flow
      val flowBean = new FlowBean(sumUpflow, sumDownflow, sumUpflow + sumDownflow)

      // 3. Write out the result
      context.write(key, flowBean)
      println("reduce output : " + flowBean)
    }
  }

  // Driver
  object Driver {
    def main(args: Array[String]): Unit = {
      // 1. Load the configuration and create the Job
      val configuration = new Configuration
      val job: Job = Job.getInstance(configuration)

      // 2. Register this driver's jar
      job.setJarByClass(this.getClass)
      job.setJobName("scala mr")

      // 3. Register the Mapper and Reducer classes
      job.setMapperClass(classOf[GroupByPoneNumMapper])
      job.setReducerClass(classOf[GroupByPoneNumReducer])

      // 4. Declare the Mapper output key/value types
      job.setMapOutputKeyClass(classOf[Text])
      job.setMapOutputValueClass(classOf[FlowBean])

      // 5. Declare the final output key/value types
      job.setOutputKeyClass(classOf[Text])
      job.setOutputValueClass(classOf[FlowBean])

      // 6. Set the input and output paths
      FileInputFormat.setInputPaths(job, new Path("src/main/data/input/phone_data.txt"))
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

      // 7. Submit the job and exit with its status
      val ok: Boolean = job.waitForCompletion(true)
      System.exit(if (ok) 0 else 1)
    }
  }

  // Field layout of a record:
  // 1363157985066          id
  // 13726230503            phone number
  // 00-FD-07-A4-72-B8:CMCC MAC address
  // 120.196.100.82         network IP
  // i02.c.aliimg.com       domain name
  // 24
  // 27
  // 2481                   upstream traffic
  // 24681                  downstream traffic
  // 200                    network status code

  class FlowBean() extends Writable {

    var upflow = 0
    var downflow = 0
    var sumflow = 0

    // Auxiliary constructor; the no-arg primary constructor must remain,
    // because Hadoop instantiates the bean via reflection during deserialization.
    def this(upflow: Int, downflow: Int, sumflow: Int) {
      this()
      this.upflow = upflow
      this.downflow = downflow
      this.sumflow = sumflow
    }

    // Serialize: write the fields to the output stream
    override def write(out: DataOutput): Unit = {
      out.writeInt(upflow)
      out.writeInt(downflow)
      out.writeInt(sumflow)
    }

    // Deserialize: read the fields back in exactly the order write emitted them
    override def readFields(in: DataInput): Unit = {
      upflow = in.readInt
      downflow = in.readInt
      sumflow = in.readInt
    }

    // toString controls how the bean is rendered in the text output
    override def toString: String = s"$upflow \t $downflow \t $sumflow"
  }
}
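Two points worth noting about the Writable contract that FlowBean relies on: Hadoop creates the bean reflectively during deserialization, so the no-argument constructor is mandatory, and readFields must read fields in exactly the same order that write wrote them. With the default TextOutputFormat, each reducer output line is the key, a tab, and the value's toString, so for the sample record above the output line would look roughly like:

13726230503	2481	24681	27162

(2481 upstream + 24681 downstream = 27162 total.)

If the bean is ever used as a map output key (for example, to sort results by total flow), Writable alone is not enough, because the shuffle's sort phase must be able to compare keys. Below is a minimal sketch, not part of the original code: SortFlowBean is a hypothetical variant of FlowBean that implements Hadoop's WritableComparable so it can serve as a key.

import java.io.{DataInput, DataOutput}
import org.apache.hadoop.io.WritableComparable

// Hypothetical key variant of FlowBean (illustration only).
// WritableComparable extends Writable with Comparable, which the
// shuffle's sort phase requires of all keys.
class SortFlowBean() extends WritableComparable[SortFlowBean] {

  var upflow = 0
  var downflow = 0
  var sumflow = 0

  // Serialize the fields in a fixed order
  override def write(out: DataOutput): Unit = {
    out.writeInt(upflow)
    out.writeInt(downflow)
    out.writeInt(sumflow)
  }

  // Deserialize in the same order as write
  override def readFields(in: DataInput): Unit = {
    upflow = in.readInt
    downflow = in.readInt
    sumflow = in.readInt
  }

  // Sort keys by total flow, descending
  override def compareTo(that: SortFlowBean): Int =
    java.lang.Integer.compare(that.sumflow, this.sumflow)

  override def toString: String = s"$upflow\t$downflow\t$sumflow"
}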
Category: MapReduce