03_MapReduce Framework Principles_3.13 ETL (Map-Side Data Filtering)

package MapFilterPk {
  
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.io.{LongWritable, NullWritable, Text}
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{Job, Mapper}

  /*
   * Requirement
   *   1. Filter out the Shu (蜀) and Wei (魏) personnel, keeping only the Wu (吴) personnel
   */

  // Mapper class: performs the map-side filter, emitting a name only if it is on the whitelist
  class MFilterMapper extends Mapper[LongWritable, Text, Text, NullWritable] {

    private val outkey = new Text()

    override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, Text, NullWritable]#Context): Unit = {
      // Each input line holds names separated by one or more spaces
      val names = value.toString.split(" +")

      //val wei: List[String] = List("曹操", "曹仁", "曹植")   // Wei personnel (filtered out)
      //val shu: List[String] = List("张飞", "刘备", "关羽")   // Shu personnel (filtered out)
      val wu: List[String] = List("孙权", "张昭", "周瑜")       // Wu personnel (kept)

      // Write out only the Wu names; everything else is dropped on the map side
      names.foreach(e => if (wu.contains(e)) {
        outkey.set(e)
        context.write(outkey, NullWritable.get())
      })
    }

  }

  // No Reducer class is needed: the filtering is done entirely on the map side

  // Driver
  object MFilterDriver {
    def main(args: Array[String]): Unit = {
      //1. Get the configuration and the Job object
      //   Configuration reads the config files core-default.xml and core-site.xml
      val configuration = new Configuration
      val job: Job = Job.getInstance(configuration)

      //2. Register the jar of this driver program
      job.setJarByClass(this.getClass)

      job.setJobName("Map Filter")

      //3. Set the Mapper class (no Reducer is registered)
      job.setMapperClass(classOf[MFilterMapper])

      //4. Set the Mapper output key-value types (optional here; they match the final output types)
      //      job.setMapOutputKeyClass(classOf[Text])
      //      job.setMapOutputValueClass(classOf[NullWritable])

      //5. Set the final output key-value types
      job.setOutputKeyClass(classOf[Text])
      job.setOutputValueClass(classOf[NullWritable])

      //6. Set the input and output paths
      FileInputFormat.setInputPaths(job, "src/main/data/input/1.txt")
      FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))


      //7. The map side writes its filtered output directly; no Reduce phase is needed
      job.setNumReduceTasks(0)


      //8. Submit the job and exit with 0 on success, 1 on failure
      val success: Boolean = job.waitForCompletion(false)
      System.exit(if (success) 0 else 1)

    }


  }

}
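
Because the job sets setNumReduceTasks(0), the mapper's output is written straight to part-m-xxxxx files in the output directory: there is no shuffle and no Reduce phase, which is exactly what a map-side ETL filter needs. As a variation on the example above (not part of the original code), the whitelist can also be passed in through the Hadoop Configuration instead of being hard-coded in the Mapper, so the same job can filter different groups without recompiling. The sketch below assumes a made-up property name etl.filter.names; the rest uses only the standard Mapper setup()/map() hooks and Configuration.get().

  import org.apache.hadoop.io.{LongWritable, NullWritable, Text}
  import org.apache.hadoop.mapreduce.Mapper

  // Sketch only: the same map-side filter, but the whitelist comes from the job Configuration.
  // "etl.filter.names" is a made-up property name, not a built-in Hadoop setting.
  class ConfigurableFilterMapper extends Mapper[LongWritable, Text, Text, NullWritable] {

    private val outkey = new Text()
    private var allowed: Set[String] = Set.empty

    // setup() runs once per map task, before any call to map()
    override def setup(context: Mapper[LongWritable, Text, Text, NullWritable]#Context): Unit = {
      allowed = context.getConfiguration
        .get("etl.filter.names", "")   // comma-separated names, e.g. "孙权,张昭,周瑜"
        .split(",")
        .map(_.trim)
        .filter(_.nonEmpty)
        .toSet
    }

    override def map(key: LongWritable, value: Text,
                     context: Mapper[LongWritable, Text, Text, NullWritable]#Context): Unit = {
      value.toString.split(" +").foreach { name =>
        if (allowed.contains(name)) {
          outkey.set(name)
          context.write(outkey, NullWritable.get())
        }
      }
    }
  }

In the driver, the whitelist would be set with configuration.set("etl.filter.names", "孙权,张昭,周瑜") before Job.getInstance(configuration) (or afterwards via job.getConfiguration), because Job.getInstance copies the configuration it is given.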