03_MapReduce Framework Principles_3.12 Join Applications

1. Requirement Description

Order data table:

t_order table
    id      order id
    pid     product id
    amount  product quantity

id     pid    amount
1001   01     1
1002   02     2
1003   03     3
1004   01     4
1005   02     5
1006   03     6

Product information table:

t_product table
    pid     product id
    pname   product name
    
pid    pname
01     小米
02     华为
03     oppo

Requirement:

select
    id,
    pname,
    amount
from t_order
left outer join t_product
on t_order.pid = t_product.pid

id     pname    amount
1001   小米     1
1002   华为     2
1003   oppo     3
1004   小米     4
1005   华为     5
1006   oppo     6
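To pin down what both jobs below should compute, here is a minimal plain-Scala sketch of the same left outer join, hardcoding the sample rows above (the object name JoinSketch is illustrative only, not part of the original post):

object JoinSketch {
  def main(args: Array[String]): Unit = {
    // sample rows copied from the two tables above
    val orders = Seq(("1001", "01", 1), ("1002", "02", 2), ("1003", "03", 3),
                     ("1004", "01", 4), ("1005", "02", 5), ("1006", "03", 6))
    val products = Map("01" -> "小米", "02" -> "华为", "03" -> "oppo")

    // left outer join on pid: an unmatched pid falls back to "NULL"
    for ((id, pid, amount) <- orders)
      println(s"$id\t${products.getOrElse(pid, "NULL")}\t$amount")
  }
}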

2. Code Implementation

1.  Reduce Join

package ReduceJoinPk {

  import java.lang

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.io.{LongWritable, Text}
  import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper, Reducer}

  import scala.collection.mutable.ListBuffer
  
  /*
  *
  *   Reduce Join
  *   Drawbacks
  *       1. The merge happens in the Reduce phase, putting heavy pressure on Reduce
  *          while the Map nodes do very little computation, so resource utilization is low
  *       2. The Reduce nodes are highly prone to data skew
  *          Example
  *             partition by product
  *                01 -> 10,000 records
  *                02 -> 10 records
  *             after ReduceTask02 finishes, it has to sit idle waiting for ReduceTask01
  *   Solution
  *       merge on the Map side (see Map Join below)
  *
  * */

  // Mapper class
  class RJoinMapper extends Mapper[LongWritable, Text, Text, Text] {
    private val outkey = new Text()
    private val outvalue = new Text()

    override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, Text, Text]#Context) = {
      // get the input split assigned to this map task
      val fileSplit: FileSplit = context.getInputSplit.asInstanceOf[FileSplit]
      // get the name of the file the split belongs to:
      // t_order.txt or t_product.txt
      val fileName = fileSplit.getPath.getName
      val arr = value.toString.split(" +")
      if (fileName == "t_order.txt") {
        // order record: key = pid, value = fileName#id#amount
        outkey.set(arr(1))
        outvalue.set(fileName + "#" + arr(0) + "#" + arr(2))
      } else {
        // product record: key = pid, value = fileName#pname
        outkey.set(arr(0))
        outvalue.set(fileName + "#" + arr(1))
      }
      context.write(outkey, outvalue)
    }
  }

  // Reducer class
  class RJoinReducer extends Reducer[Text, Text, Text, Text] {
    private val outkey = new Text()
    private val outvalue = new Text()


    override def reduce(key: Text, values: lang.Iterable[Text], context: Reducer[Text, Text, Text, Text]#Context) = {
      // pname for this pid, plus a buffer for the order records (id#amount)
      var pname = ""
      val list: ListBuffer[String] = new ListBuffer[String]

      values.forEach(e => {
        val arr = e.toString.split("#")
        if (arr.size == 2)      // product record: fileName#pname
          pname = arr(1)
        else                    // order record: fileName#id#amount
          list.addOne(arr(1) + "#" + arr(2))
      })

      // emit one joined record per buffered order: id#pname#amount
      // (outvalue stays empty: the whole record is carried in the key)
      for (e <- list) {
        val arr = e.split("#")
        outkey.set(arr(0) + "#" + pname + "#" + arr(1))
        context.write(outkey, outvalue)
      }
    }
  }

  // Driver
  object RJoinDriver {
    def main(args: Array[String]): Unit = {
      //1. Read the configuration and get the Job object
      //   Configuration reads core-default.xml and core-site.xml
      val configuration = new Configuration
      val job: Job = Job.getInstance(configuration)

      //2. Register the jar of this Driver
      job.setJarByClass(this.getClass)

      job.setJobName("Reduce Join")

      //3. Register the Mapper and Reducer classes
      job.setMapperClass(classOf[RJoinMapper])
      job.setReducerClass(classOf[RJoinReducer])

      //4. Set the Mapper output key-value types
      job.setMapOutputKeyClass(classOf[Text])
      job.setMapOutputValueClass(classOf[Text])

      //5. Set the final output key-value types
      job.setOutputKeyClass(classOf[Text])
      job.setOutputValueClass(classOf[Text])

      //6. Set the input and output paths
      FileInputFormat.setInputPaths(job, "src/main/data/input/t_*.txt")
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

      //7. Submit the job
      val success: Boolean = job.waitForCompletion(false)
      System.exit(if (success) 0 else 1)

    }


  }
}
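To make the reducer's input concrete, here is a minimal plain-Scala sketch (not part of the job; the valuesFor01 list is hand-built from the mapper's tagging scheme above) of what RJoinReducer receives for key 01:

object ReducerViewSketch {
  def main(args: Array[String]): Unit = {
    // the shuffled value list for key "01": one product record plus all matching orders
    val valuesFor01 = Seq(
      "t_product.txt#小米",  // product record: fileName#pname
      "t_order.txt#1001#1",  // order record:   fileName#id#amount
      "t_order.txt#1004#4")

    var pname = ""
    val orders = scala.collection.mutable.ListBuffer[String]()
    valuesFor01.foreach { e =>
      val arr = e.split("#")
      if (arr.length == 2) pname = arr(1) else orders += (arr(1) + "#" + arr(2))
    }

    // prints 1001#小米#1 and 1004#小米#4
    orders.foreach { e =>
      val Array(id, amount) = e.split("#")
      println(s"$id#$pname#$amount")
    }
  }
}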

2.  Map Join

package MapJoinPk {

  import java.io.{BufferedReader, InputStreamReader}
  import java.net.URI

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
  import org.apache.hadoop.io.{LongWritable, Text}
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper}

  import scala.collection.mutable


  /*
  *
  *   Map Join
  *   Use case
  *       1. Joining a small table with a large table
  *          (read the small table into memory and join on the Map side)
  *   Steps
  *       1. In the driver class, cache the small table file
  *           job.addCacheFile(new URI("src/main/data/input/t_product.txt"))
  *       2. Set the number of ReduceTasks to 0 (no Reduce phase is needed)
  *           job.setNumReduceTasks(0)
  *       3. In the Mapper class, read the cached file
  *
  * */

  // Mapper class
  class MJoinMapper extends Mapper[LongWritable, Text, Text, Text] {
    private val map: mutable.Map[String, String] = mutable.Map[String, String]()

    // Called once at the beginning of the task, before any map() call:
    // read the cached file (the product dimension table) into memory
    override def setup(context: Mapper[LongWritable, Text, Text, Text]#Context) = {
      // get the path of the cached file
      val cacheFiles: Array[URI] = context.getCacheFiles
      val path = new Path(cacheFiles(0))

      // get the file system object and open an input stream
      val fileSystem = FileSystem.get(context.getConfiguration)
      val inputStream: FSDataInputStream = fileSystem.open(path)
      val reader: BufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))

      // read the file line by line into the in-memory map: pid -> pname
      var str: String = reader.readLine
      while (str != null) {
        val strings = str.split(" +")
        map += (strings(0) -> strings(1))
        str = reader.readLine
      }
      reader.close()
    }


    private val outkey = new Text()
    private val outvalue = new Text()

    override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, Text, Text]#Context) = {
      // order record: 1001   01     1
      val strings = value.toString.split(" +")

      outkey.set(strings(0))
      // left-outer-join semantics: fall back to "NULL" if the pid has no product record
      // (the original used map.get(...).get, which throws on an unmatched pid)
      outvalue.set(map.getOrElse(strings(1), "NULL") + "\t" + strings(2))

      context.write(outkey, outvalue)
    }

  }

  // No Reducer class: a map-side join needs no Reduce phase

  // Driver
  object MJoinDriver {
    def main(args: Array[String]): Unit = {
      //1. Read the configuration and get the Job object
      //   Configuration reads core-default.xml and core-site.xml
      val configuration = new Configuration
      val job: Job = Job.getInstance(configuration)

      //2. Register the jar of this Driver
      job.setJarByClass(this.getClass)

      job.setJobName("Map Join")

      //3. Register the Mapper class (no Reducer)
      job.setMapperClass(classOf[MJoinMapper])

      //4. Set the Mapper output key-value types
      job.setMapOutputKeyClass(classOf[Text])
      job.setMapOutputValueClass(classOf[Text])

      //5. Set the final output key-value types
      job.setOutputKeyClass(classOf[Text])
      job.setOutputValueClass(classOf[Text])

      //6. Set the input and output paths
      FileInputFormat.setInputPaths(job, "src/main/data/input/t_order.txt")
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

      //7. Cache the product dimension table (the small table)
      job.addCacheFile(new URI("src/main/data/input/t_product.txt"))

      // records are written out directly after the map-side merge: no Reduce phase
      job.setNumReduceTasks(0)

      //8. Submit the job
      val success: Boolean = job.waitForCompletion(false)
      System.exit(if (success) 0 else 1)

    }


  }

}
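Both drivers write to src/main/data/output, and FileOutputFormat fails if that directory already exists. A small helper sketch for local re-runs (an addition, not part of the original drivers; OutputCleaner is an illustrative name):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OutputCleaner {
  // delete the output directory recursively if it exists, so the job can be re-run
  def clean(configuration: Configuration, dir: String): Unit = {
    val fs = FileSystem.get(configuration)
    val out = new Path(dir)
    if (fs.exists(out)) fs.delete(out, true)
  }
}

// usage in either driver, before submitting the job:
//   OutputCleaner.clean(configuration, "src/main/data/output")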

 
