1. Requirements
1. From the historical user behavior logs, keep the records whose HTTP response status code is greater than or equal to 400.
2. Also count how many records are valid and how many are invalid.
3. Save the processed result to HDFS.
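Both implementations below assume space-separated log lines with at least 15 fields, where field index 8 (0-based) holds the HTTP response status code, as in a typical Apache-style access log; lines that do not match this layout are counted as invalid.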
2. Code: using Spark's built-in accumulator
```scala
package sparkcorerddexample

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object A1LogFilter {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("demo02").setMaster("local[3]")
    val sc: SparkContext = new SparkContext(sparkConf)

    // Read the raw behavior log from HDFS.
    val rdd = sc.textFile("hdfs://node1:9000/opt/data/a.log")

    // Built-in long accumulators: one counts invalid records, one counts valid ones.
    val invalidAcc = sc.longAccumulator
    val validAcc = sc.longAccumulator

    val value: RDD[String] = rdd.filter((line: String) => {
      val fields = line.split(" ")
      if (fields.length < 15) {
        // Malformed line: too few fields to parse.
        invalidAcc.add(1L)
        false
      } else if (fields(8).toInt >= 400) {
        // Field 8 holds the status code; keep error responses (>= 400).
        validAcc.add(1L)
        true
      } else {
        invalidAcc.add(1L)
        false
      }
    })

    // saveAsTextFile is an action; it triggers the job, so the
    // accumulator values are populated by the time we print them.
    value.saveAsTextFile("hdfs://node1:9000/clean")
    println(s"valid: ${validAcc.value}, invalid: ${invalidAcc.value}")

    sc.stop()
  }
}
```
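A couple of general Spark notes (not from the original post): updates to an accumulator made inside a transformation such as `filter` may be applied more than once if a task or stage is re-executed; Spark only guarantees exactly-once accumulator updates inside actions. It also helps to pass a name when creating the accumulator so it appears in the Spark UI, e.g.:

```scala
// Hypothetical named variants of the two counters above;
// named accumulators are shown per stage in the Spark UI.
val invalidAcc = sc.longAccumulator("invalid records")
val validAcc   = sc.longAccumulator("valid records")
```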
3. Code 2: using a custom accumulator
```scala
package sparkcorerddexample

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object A1LogFilter {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("demo02").setMaster("local[3]")
    val sc: SparkContext = new SparkContext(sparkConf)

    val rdd = sc.textFile("hdfs://node1:9000/opt/data/a.log")

    // A custom accumulator that counts "valid" / "novalid" records in one map.
    // It must be registered with the SparkContext before it is used in a task,
    // otherwise Spark refuses to ship it to the executors.
    val accumutor = new A1Accumutor()
    sc.register(accumutor, "logCounter")

    val value: RDD[String] = rdd.filter((line: String) => {
      val fields = line.split(" ")
      if (fields.length < 15) {
        // Malformed line: too few fields to parse.
        accumutor.add("novalid")
        false
      } else if (fields(8).toInt >= 400) {
        // Field 8 holds the status code; keep error responses (>= 400).
        accumutor.add("valid")
        true
      } else {
        accumutor.add("novalid")
        false
      }
    })

    value.saveAsTextFile("hdfs://node1:9000/clean")

    // Prints the accumulated map of counts.
    println(accumutor.value)

    sc.stop()
  }
}
```
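The definition of the custom accumulator class was lost when the post was extracted (the original listing has a gap where it stood). Below is a minimal reconstruction, assuming `A1Accumutor` extends Spark's `AccumulatorV2[String, mutable.Map[String, Long]]` and counts how many times each key is added; this is a sketch of the likely shape, not the author's original code.

```scala
package sparkcorerddexample

import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

// Sketch of the missing custom accumulator: accumulates a map from
// key ("valid" / "novalid") to the number of times it was added.
class A1Accumutor extends AccumulatorV2[String, mutable.Map[String, Long]] {
  private val counts = mutable.Map[String, Long]()

  // True when no updates have been recorded yet.
  override def isZero: Boolean = counts.isEmpty

  // Spark copies the accumulator once per task.
  override def copy(): AccumulatorV2[String, mutable.Map[String, Long]] = {
    val acc = new A1Accumutor()
    acc.counts ++= counts
    acc
  }

  override def reset(): Unit = counts.clear()

  // Called on the executors for every record.
  override def add(v: String): Unit =
    counts(v) = counts.getOrElse(v, 0L) + 1L

  // Called on the driver to combine per-task results.
  override def merge(other: AccumulatorV2[String, mutable.Map[String, Long]]): Unit =
    other.value.foreach { case (k, n) => counts(k) = counts.getOrElse(k, 0L) + n }

  override def value: mutable.Map[String, Long] = counts
}
```

With this definition, `println(accumutor.value)` prints the whole map at once, e.g. `Map(valid -> 123, novalid -> 45)` (the numbers depend on the input data).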