03_MapReduce框架原理_3.13 ETL(Map端过滤数据)
package MapFilterPk {

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
  import org.apache.hadoop.io.{IntWritable, LongWritable, NullWritable, Text}
  import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
  import org.apache.hadoop.mapreduce.{InputSplit, Job, Mapper, Reducer}

  /*
   * Requirement
   *   1. Filter out the people belonging to the Shu and Wei kingdoms,
   *      i.e. only the names in the remaining (third) list are emitted.
   */

  /**
   * Map-only ETL filter.
   *
   * Each input line is split on runs of spaces; every token that appears in
   * the keep-list is written out as a key with a `NullWritable` value. Tokens
   * not in the keep-list are dropped.
   */
  class MFilterMapper extends Mapper[LongWritable, Text, Text, NullWritable] {

    // Reused across records so that no new Text is allocated per output.
    private val outkey = new Text()

    // Names that are filtered OUT (kept here for reference only):
    //   group one: "曹操", "曹仁", "曹植"
    //   group two: "张飞", "刘备", "关羽"
    // Names that pass the filter. Built once per mapper instance rather than
    // once per record; a Set gives constant-time membership checks.
    private val keptNames: Set[String] = Set("孙权", "张昭", "周瑜")

    /**
     * Emits every whitespace-separated token of `value` that is in the keep-list.
     *
     * @param key     input key supplied by the input format (not used here)
     * @param value   one line of input text
     * @param context Hadoop context used to write the output records
     */
    override def map(
        key: LongWritable,
        value: Text,
        context: Mapper[LongWritable, Text, Text, NullWritable]#Context
    ): Unit = {
      val tokens = value.toString.split(" +")
      tokens.foreach { token =>
        if (keptNames.contains(token)) {
          outkey.set(token)
          context.write(outkey, NullWritable.get)
        }
      }
    }
  }

  // No Reducer class: this is a map-only job.

  /** Driver that configures and submits the map-only filter job. */
  object MFilterDriver {

    // Paths used when no command-line arguments are supplied.
    private val DefaultInputPath = "src/main/data/input/1.txt"
    private val DefaultOutputPath = "src/main/data/output"

    /**
     * Configures the job, runs it, and exits the JVM with status 0 on success
     * or 1 on failure.
     *
     * @param args optional: `args(0)` overrides the input path and `args(1)`
     *             overrides the output path; the defaults above are used for
     *             whichever argument is absent
     */
    def main(args: Array[String]): Unit = {
      // 1. Create the configuration and obtain the Job object.
      val configuration = new Configuration
      val job: Job = Job.getInstance(configuration)

      // 2. Register the jar that contains this driver.
      job.setJarByClass(this.getClass)
      // NOTE(review): "Map Join" looks copy-pasted from another example; this
      // job filters rather than joins. The name is kept unchanged — confirm
      // before renaming, since external tooling may look the job up by name.
      job.setJobName("Map Join")

      // 3. Register the Mapper (there is no Reducer).
      job.setMapperClass(classOf[MFilterMapper])

      // 4. Map output key/value classes are deliberately not set separately;
      //    only the final output classes below are configured.
      // job.setMapOutputKeyClass(classOf[Text])
      // job.setMapOutputValueClass(classOf[NullWritable])

      // 5. Final output key/value classes.
      job.setOutputKeyClass(classOf[Text])
      job.setOutputValueClass(classOf[NullWritable])

      // 6. Input and output paths.
      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      val outputPath = args.drop(1).headOption.getOrElse(DefaultOutputPath)
      FileInputFormat.setInputPaths(job, inputPath)
      FileOutputFormat.setOutputPath(job, new Path(outputPath))

      // 7. The map side produces the final output directly; no reduce phase.
      job.setNumReduceTasks(0)

      // 8. Submit the job and wait for it to finish (argument: verbose = false).
      val succeeded: Boolean = job.waitForCompletion(false)
      System.exit(if (succeeded) 0 else 1)
    }
  }
}
分类:
Mapreduce
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?