一个 Spark Streaming 的黑名单过滤小例子
> nc -lk 9999
20190912,sz
20190913,lin
package com.lin.spark.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Spark Streaming example: filter a socket text stream against a blacklist.
 *
 * Each input line is expected to look like `date,city` (e.g. `20190912,sz`).
 * Lines whose city is on the blacklist are dropped; all other well-formed
 * lines are printed unchanged.
 *
 * Created by Administrator on 2019/6/4.
 */
object TransformApp {

  /**
   * Extracts the blacklist join key from a raw input line.
   *
   * The key is the second comma-separated field with surrounding whitespace
   * removed, so `"20190912, sz"` yields the same key as `"20190912,sz"`.
   *
   * @param line raw line as received from the socket
   * @return `Some(key)` when the line has a non-empty second field,
   *         `None` for lines such as `""`, `"foo"` or `"20190912,"`
   */
  private[streaming] def cityOf(line: String): Option[String] =
    line.split(",").lift(1).map(_.trim).filter(_.nonEmpty)

  /**
   * Starts the streaming job and blocks until it is terminated.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SqlNetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(3))

    // Build the blacklist as a pair RDD (city -> true) so it can be joined
    // against every micro-batch.
    val black = List("sz", "gz")
    val blacksRDD = ssc.sparkContext.parallelize(black).map(city => (city, true))

    val lines = ssc.socketTextStream("node1", 9999)

    /*
     * Data flow for the sample input
     *   20190912,sz
     *   20190913,lin
     *
     * key by city:
     *   (sz,  "20190912,sz")
     *   (lin, "20190913,lin")
     * leftOuterJoin with the blacklist:
     *   (sz,  ("20190912,sz",  Some(true)))
     *   (lin, ("20190913,lin", None))
     * filter (keep rows that are not flagged):
     *   (lin, ("20190913,lin", None))
     * map back to the original line:
     *   "20190913,lin"
     */
    val clicklog = lines
      // Lines without a usable city field are dropped here; indexing the split
      // result directly would throw and fail the batch on malformed input.
      .flatMap(line => cityOf(line).map(city => (city, line)).toList)
      .transform { rdd =>
        rdd.leftOuterJoin(blacksRDD)
          .filter { case (_, (_, blacklisted)) => !blacklisted.getOrElse(false) }
          .map { case (_, (line, _)) => line }
      }

    clicklog.print()

    ssc.start()
    ssc.awaitTermination()
  }
}