03_MapReduce Framework Principles_3.12 Join Application
1. Requirement Description
Order table (t_order): id = order id, pid = product id, amount = quantity

id      pid   amount
1001    01    1
1002    02    2
1003    03    3
1004    01    4
1005    02    5
1006    03    6
Product table (t_product): pid = product id, pname = product name

pid   pname
01    小米
02    华为
03    oppo
Requirement: attach the product name to each order record, i.e. produce the result of:

select id, pname, amount
from t_order
left outer join t_product on t_order.pid = t_product.pid

Expected output:

id      pname   amount
1001    小米    1
1002    华为    2
1003    oppo    3
1004    小米    4
1005    华为    5
1006    oppo    6
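Both implementations below read these tables as whitespace-delimited text files named t_order.txt and t_product.txt (the mappers split each line with split(" +")). A file layout consistent with the code, assumed here for illustration:

t_order.txt:
1001 01 1
1002 02 2
1003 03 3
1004 01 4
1005 02 5
1006 03 6

t_product.txt:
01 小米
02 华为
03 oppo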
Code Implementation
1. Reduce Join
package ReduceJoinPk {

import java.lang
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, Mapper, Reducer}
import scala.collection.mutable.ListBuffer

/*
 * Reduce Join
 * Drawbacks:
 * 1. The merge happens entirely in the Reduce phase, so the reducers carry most of
 *    the load while the map nodes stay underutilized.
 * 2. The reduce side is prone to data skew.
 *    Example (partitioned by product id):
 *      01 -> 10,000 records
 *      02 -> 10 records
 *    The ReduceTask handling "02" finishes quickly and then waits for the one handling "01".
 * Remedy: join on the map side (see Map Join below).
 */

// Mapper: tag each record with the name of the file it came from,
// and emit pid as the join key.
class RJoinMapper extends Mapper[LongWritable, Text, Text, Text] {
  private val outkey = new Text()
  private val outvalue = new Text()

  override def map(key: LongWritable, value: Text,
                   context: Mapper[LongWritable, Text, Text, Text]#Context): Unit = {
    // Get the input split assigned to this map task
    val fileSplit: FileSplit = context.getInputSplit.asInstanceOf[FileSplit]
    // Name of the file the split belongs to: t_order.txt or t_product.txt
    val fileName = fileSplit.getPath.getName
    val arr = value.toString.split(" +")
    if (fileName == "t_order.txt") {
      outkey.set(arr(1))                                   // pid
      outvalue.set(fileName + "#" + arr(0) + "#" + arr(2)) // file#id#amount
    } else {
      outkey.set(arr(0))                                   // pid
      outvalue.set(fileName + "#" + arr(1))                // file#pname
    }
    context.write(outkey, outvalue)
  }
}

// Reducer: for each pid, pick out the product name and join it onto every order record.
class RJoinReducer extends Reducer[Text, Text, Text, Text] {
  private val outkey = new Text()
  private val outvalue = new Text()

  override def reduce(key: Text, values: lang.Iterable[Text],
                      context: Reducer[Text, Text, Text, Text]#Context): Unit = {
    var pname = ""
    val list: ListBuffer[String] = new ListBuffer[String]
    values.forEach(e => {
      val arr = e.toString.split("#")
      if (arr.size == 2) pname = arr(1)        // record from t_product.txt
      else list.addOne(arr(1) + "#" + arr(2))  // record from t_order.txt: id#amount
    })
    // The joined record is carried entirely in the key; the value stays empty
    for (e <- list) {
      val arr = e.split("#")
      outkey.set(arr(0) + "#" + pname + "#" + arr(1)) // id#pname#amount
      context.write(outkey, outvalue)
    }
  }
}

// Driver
object RJoinDriver {
  def main(args: Array[String]): Unit = {
    // 1. Load the configuration (core-default.xml, core-site.xml) and get the job object
    val configuration = new Configuration
    val job: Job = Job.getInstance(configuration)
    // 2. Register the jar containing this driver
    job.setJarByClass(this.getClass)
    job.setJobName("Reduce Join")
    // 3. Register the Mapper and Reducer classes
    job.setMapperClass(classOf[RJoinMapper])
    job.setReducerClass(classOf[RJoinReducer])
    // 4. Set the mapper output key/value types
    job.setMapOutputKeyClass(classOf[Text])
    job.setMapOutputValueClass(classOf[Text])
    // 5. Set the final output key/value types
    job.setOutputKeyClass(classOf[Text])
    job.setOutputValueClass(classOf[Text])
    // 6. Set input and output paths
    FileInputFormat.setInputPaths(job, "src/main/data/input/t_*.txt")
    FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))
    // 7. Submit the job
    val success: Boolean = job.waitForCompletion(false)
    System.exit(if (success) 0 else 1)
  }
}

}
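To make the tag-and-merge concrete, here is a minimal standalone sketch (no Hadoop required) of what RJoinReducer does for the key pid = 01 after the shuffle; the object name ReduceJoinSketch and the hard-coded values are illustrative only:

object ReduceJoinSketch {
  def main(args: Array[String]): Unit = {
    // Values the reducer would receive for key "01", tagged by RJoinMapper:
    val values = Seq("t_product.txt#小米", "t_order.txt#1001#1", "t_order.txt#1004#4")
    var pname = ""
    val orders = scala.collection.mutable.ListBuffer[String]()
    for (v <- values) {
      val arr = v.split("#")
      if (arr.length == 2) pname = arr(1)     // product record: take the name
      else orders += (arr(1) + "#" + arr(2))  // order record: keep id#amount
    }
    // Emit one joined record per order: prints 1001#小米#1 and 1004#小米#4
    for (o <- orders) {
      val arr = o.split("#")
      println(arr(0) + "#" + pname + "#" + arr(1))
    }
  }
}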
2. Map Join
package MapJoinPk {

import java.io.{BufferedReader, InputStreamReader}
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, Mapper}
import scala.collection.mutable

/*
 * Map Join
 * When to use:
 *   A small table joined to a large table: load the small table into memory
 *   and perform the join on the map side.
 * Steps:
 *   1. In the driver, cache the small table:
 *        job.addCacheFile(new URI("src/main/data/input/t_product.txt"))
 *   2. Set the number of ReduceTasks to 0 (no reduce phase is needed):
 *        job.setNumReduceTasks(0)
 *   3. Read the cached file in the Mapper's setup() method.
 */

// Mapper
class MJoinMapper extends Mapper[LongWritable, Text, Text, Text] {
  // In-memory copy of the product dimension table: pid -> pname
  private val map: mutable.Map[String, String] = mutable.Map[String, String]()
  private val outkey = new Text()
  private val outvalue = new Text()

  // Called once at the beginning of the task, before any map() call:
  // read the cached file (the product dimension table) into the map.
  override def setup(context: Mapper[LongWritable, Text, Text, Text]#Context): Unit = {
    // 1. Get the path of the cached file
    val cacheFiles: Array[URI] = context.getCacheFiles
    val path = new Path(cacheFiles(0))
    // 2. Get a FileSystem object and open an input stream
    val fileSystem = FileSystem.get(context.getConfiguration)
    val inputStream: FSDataInputStream = fileSystem.open(path)
    val reader: BufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    // 3. Read the data line by line: "pid pname"
    var str: String = reader.readLine
    while (str != null) {
      val strings = str.split(" +")
      map += (strings(0) -> strings(1))
      str = reader.readLine
    }
    reader.close()
  }

  override def map(key: LongWritable, value: Text,
                   context: Mapper[LongWritable, Text, Text, Text]#Context): Unit = {
    val strings = value.toString.split(" +")          // e.g. "1001 01 1"
    outkey.set(strings(0))                            // id
    outvalue.set(map(strings(1)) + "\t" + strings(2)) // pname \t amount
    context.write(outkey, outvalue)
  }
}

// No Reducer class: the join is completed on the map side.

// Driver
object MJoinDriver {
  def main(args: Array[String]): Unit = {
    // 1. Load the configuration (core-default.xml, core-site.xml) and get the job object
    val configuration = new Configuration
    val job: Job = Job.getInstance(configuration)
    // 2. Register the jar containing this driver
    job.setJarByClass(this.getClass)
    job.setJobName("Map Join")
    // 3. Register the Mapper class
    job.setMapperClass(classOf[MJoinMapper])
    // 4. Set the mapper output key/value types
    job.setMapOutputKeyClass(classOf[Text])
    job.setMapOutputValueClass(classOf[Text])
    // 5. Set the final output key/value types
    job.setOutputKeyClass(classOf[Text])
    job.setOutputValueClass(classOf[Text])
    // 6. Set input and output paths (only the large table goes through the InputFormat)
    FileInputFormat.setInputPaths(job, "src/main/data/input/t_order.txt")
    FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))
    // 7. Cache the small table (product dimension table)
    job.addCacheFile(new URI("src/main/data/input/t_product.txt"))
    // The join finishes on the map side; no reduce phase is needed
    job.setNumReduceTasks(0)
    // 8. Submit the job
    val success: Boolean = job.waitForCompletion(false)
    System.exit(if (success) 0 else 1)
  }
}

}
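One caveat worth noting: job.addCacheFile with a relative local path, as above, only works when the job runs in local mode. On a real cluster every map task must be able to read the cached file, so it is normally uploaded to HDFS first and referenced by an HDFS URI. A sketch of what step 7 in MJoinDriver would become; the host, port, and path are placeholders:

// Hypothetical cluster variant of step 7 (HDFS location is a placeholder):
job.addCacheFile(new URI("hdfs://namenode:8020/data/dim/t_product.txt"))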
Category: MapReduce