02_Hadoop Serialization_2.2 Custom Bean Object Implementing the Serialization Interface (Writable)
Code example
package GroupByPoneNumPk {

  import java.io.{DataInput, DataOutput}
  import java.lang

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.io.{LongWritable, Text, Writable}
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper, Reducer}

  // Mapper class
  // Each Mapper instance processes one input split
  class GroupByPoneNumMapper extends Mapper[LongWritable, Text, Text, FlowBean] {

    val text = new Text

    // map is called once per input record (line)
    override def map(key: LongWritable, value: Text,
                     context: Mapper[LongWritable, Text, Text, FlowBean]#Context): Unit = {
      // Sample record:
      // 1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200

      // 1. Split the record on whitespace
      val line: Array[String] = value.toString.split(" +")
      println("map input : " + line.mkString("-"))
      val phone = line(1)
      val upflow = line.reverse(2)   // third field from the end: upstream traffic
      val downflow = line.reverse(1) // second field from the end: downstream traffic

      // 2. Build the FlowBean (the total is computed in the reducer, so pass 0 here)
      val flowBean = new FlowBean(upflow.toInt, downflow.toInt, 0)

      // 3. Write to the circular (shuffle) buffer, keyed by phone number
      text.set(phone)
      context.write(text, flowBean)
      println(flowBean)
    }
  }

  // Reducer class
  // The Reducer runs only after all Mapper instances have finished.
  // The Mapper output types must match the Reducer input types.
  class GroupByPoneNumReducer extends Reducer[Text, FlowBean, Text, FlowBean] {

    // reduce is called once per key
    override def reduce(key: Text, values: lang.Iterable[FlowBean],
                        context: Reducer[Text, FlowBean, Text, FlowBean]#Context): Unit = {
      println("entering reduce ...")

      // 1. Sum upflow and downflow over all records for this phone number
      var sumUpflow = 0
      var sumDownflow = 0
      values.forEach(bean => {
        sumUpflow += bean.upflow
        sumDownflow += bean.downflow
      })

      // 2. Compute the total flow
      val flowBean = new FlowBean(sumUpflow, sumDownflow, sumUpflow + sumDownflow)

      // 3. Write out the result
      context.write(key, flowBean)
      println("reduce output : " + flowBean)
    }
  }

  // Driver
  object Driver {
    def main(args: Array[String]): Unit = {
      // 1. Load the configuration and create the Job
      val configuration = new Configuration
      val job: Job = Job.getInstance(configuration)

      // 2. Register this driver's jar
      job.setJarByClass(this.getClass)
      job.setJobName("scala mr")

      // 3. Register the Mapper and Reducer classes
      job.setMapperClass(classOf[GroupByPoneNumMapper])
      job.setReducerClass(classOf[GroupByPoneNumReducer])

      // 4. Declare the Mapper output key/value types
      job.setMapOutputKeyClass(classOf[Text])
      job.setMapOutputValueClass(classOf[FlowBean])

      // 5. Declare the final output key/value types
      job.setOutputKeyClass(classOf[Text])
      job.setOutputValueClass(classOf[FlowBean])

      // 6. Set the input and output paths
      FileInputFormat.setInputPaths(job, new Path("src/main/data/input/phone_data.txt"))
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

      // 7. Submit the job and exit with its status
      val ok: Boolean = job.waitForCompletion(true)
      System.exit(if (ok) 0 else 1)
    }
  }

  // Field layout of a record:
  // 1363157985066          id
  // 13726230503            phone number
  // 00-FD-07-A4-72-B8:CMCC MAC address
  // 120.196.100.82         network IP
  // i02.c.aliimg.com       domain name
  // 24
  // 27
  // 2481                   upstream traffic
  // 24681                  downstream traffic
  // 200                    network status code

  class FlowBean() extends Writable {

    var upflow = 0
    var downflow = 0
    var sumflow = 0

    // Auxiliary constructor; the no-arg primary constructor must remain,
    // because Hadoop instantiates the bean via reflection during deserialization.
    def this(upflow: Int, downflow: Int, sumflow: Int) {
      this()
      this.upflow = upflow
      this.downflow = downflow
      this.sumflow = sumflow
    }

    // Serialize: write the fields to the output stream
    override def write(out: DataOutput): Unit = {
      out.writeInt(upflow)
      out.writeInt(downflow)
      out.writeInt(sumflow)
    }

    // Deserialize: read the fields back in exactly the order write emitted them
    override def readFields(in: DataInput): Unit = {
      upflow = in.readInt
      downflow = in.readInt
      sumflow = in.readInt
    }

    // toString controls how the bean is rendered in the text output
    override def toString: String = s"$upflow \t $downflow \t $sumflow"
  }
}
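Two points worth noting about the Writable contract that FlowBean relies on: Hadoop creates the bean reflectively during deserialization, so the no-argument constructor is mandatory, and readFields must read fields in exactly the same order that write wrote them. With the default TextOutputFormat, each reducer output line is the key, a tab, and the value's toString, so for the sample record above the output line would look roughly like:

13726230503	2481	24681	27162

(2481 upstream + 24681 downstream = 27162 total.)

If the bean is ever used as a map output key (for example, to sort results by total flow), Writable alone is not enough, because the shuffle's sort phase must be able to compare keys. Below is a minimal sketch, not part of the original code: SortFlowBean is a hypothetical variant of FlowBean that implements Hadoop's WritableComparable so it can serve as a key.

import java.io.{DataInput, DataOutput}
import org.apache.hadoop.io.WritableComparable

// Hypothetical key variant of FlowBean (illustration only).
// WritableComparable extends Writable with Comparable, which the
// shuffle's sort phase requires of all keys.
class SortFlowBean() extends WritableComparable[SortFlowBean] {

  var upflow = 0
  var downflow = 0
  var sumflow = 0

  // Serialize the fields in a fixed order
  override def write(out: DataOutput): Unit = {
    out.writeInt(upflow)
    out.writeInt(downflow)
    out.writeInt(sumflow)
  }

  // Deserialize in the same order as write
  override def readFields(in: DataInput): Unit = {
    upflow = in.readInt
    downflow = in.readInt
    sumflow = in.readInt
  }

  // Sort keys by total flow, descending
  override def compareTo(that: SortFlowBean): Int =
    java.lang.Integer.compare(that.sumflow, this.sumflow)

  override def toString: String = s"$upflow\t$downflow\t$sumflow"
}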
Category: MapReduce