A small Spark Streaming blacklist-filtering example

 

Start a netcat server and feed it two test records:

> nc -lk 9999

20190912,sz
20190913,lin
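
With sz on the blacklist, only the second record should make it through. Once the job below is running, the batch that picks up these two lines prints something like this (standard DStream print() layout; the timestamp is a placeholder):

-------------------------------------------
Time: <batch time> ms
-------------------------------------------
20190913,lin

The job itself: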
package com.lin.spark.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by Administrator on 2019/6/4.
  */
object TransformApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("TransformApp")
    val ssc = new StreamingContext(conf,Seconds(3))

    // Build the blacklist as an RDD of (name, true) pairs so it can be joined against
    val black = List("sz", "gz")
    val blacksRDD = ssc.sparkContext.parallelize(black).map(name => (name, true))

    val lines = ssc.socketTextStream("node1",9999)
    /***
      * How the data flows, using the two test records:
      *   20190912,sz
      *   20190913,lin
      * ----> map: key each line by its second field
      *   (sz, "20190912,sz")
      *   (lin, "20190913,lin")
      * ----> leftOuterJoin against blacksRDD
      *   (sz, ("20190912,sz", Some(true)))
      *   (lin, ("20190913,lin", None))
      * ----> filter: drop records whose blacklist flag resolves to true
      *   (lin, ("20190913,lin", None))
      * ----> map: keep only the original line
      *   20190913,lin
      */

    val clicklog = lines.map(x => (x.split(",")(1), x)).transform(rdd => {
      // Join each batch against the blacklist, drop matches, and keep the original line
      rdd.leftOuterJoin(blacksRDD)
        .filter(x => !x._2._2.getOrElse(false))
        .map(x => x._2._1)
    })

    clicklog.print()
    ssc.start()
    ssc.awaitTermination()

  }

}
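
For a blacklist this small and static, the per-batch leftOuterJoin (which shuffles on every batch) can also be replaced by a plain filter over a broadcast Set. A minimal sketch, assuming the same ssc and lines as above; blacklistBC and clicklog2 are names made up for the illustration, not from the original code:

    // Ship the blacklist to every executor once instead of joining per batch
    val blacklistBC = ssc.sparkContext.broadcast(Set("sz", "gz"))

    // Keep only the lines whose second field is not on the blacklist
    val clicklog2 = lines.filter { line =>
      val fields = line.split(",")
      fields.length > 1 && !blacklistBC.value.contains(fields(1))
    }

    clicklog2.print()

The transform + leftOuterJoin version above is still the better fit when the blacklist itself lives in a large RDD or has to be recomputed for each batch.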

 
