Start a listener on ke01: nc -lk 8888

map: visits every element in the data stream and produces exactly one new element per input.

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    val streamValue = stream.map(x => {
      // an if without an else evaluates to Unit () for lines containing "a":
      // map must emit exactly one element per input, so () is emitted
      if (!x.contains("a")) {
        x
      }
    })
    streamValue.print()
    env.execute()
  }
}

[root@ke01 bigdata]# nc -lk 8888
b
c
b
a
a


Result:
11> b
12> c
1> b
2> ()
3> ()
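
The () lines show that map must emit exactly one element per input: when the if has no else branch, Scala supplies Unit, which is then printed as (). A minimal sketch that keeps the output type String instead (the "FILTERED" marker value and the object name are mine, not from the original):

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object MapTypedOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    // always return a String so the result stays DataStream[String];
    // "FILTERED" is a hypothetical placeholder value
    stream.map(x => if (!x.contains("a")) x else "FILTERED").print()
    env.execute()
  }
}

To actually drop elements, flatMap (shown next) is the right tool, since it may emit zero elements.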

 

flatMap: visits every element in the data stream and produces N elements per input, where N = 0, 1, 2, ...

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    val value = stream.flatMap( x => x.split(","))
    value.print()
    env.execute()
  }
}

a,c
a,d,e

Result:
3> a
3> c
4> a
4> d
4> e

 

Using flatMap to replace filter

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

import scala.collection.mutable.ListBuffer
object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    val value = stream.flatMap(x => {
      val rest = new ListBuffer[String]
      // emit the line only if it does not contain "a"; emitting zero elements drops it
      if (!x.contains("a")) {
        rest += x
      }
      rest.iterator
    })
    value.print()
    env.execute()
  }
}

abc
qwe
Result:
4> qwe

 

keyBy: partitions the stream by the specified field; records with the same key value are guaranteed to land in the same partition. Internally the partitioning is hash-based (HashPartitioner).

There are three ways to specify the key field:

1. By field index
2. With an anonymous function
3. By implementing the KeySelector interface


package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    // specify the key field by index:
    //   .keyBy(0)
    // specify the key field with an anonymous function:
    //   .keyBy(x => x._1)
    // specifying the key field via the KeySelector interface is shown in the next example
    stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).print()
    env.execute()
  }
}

Result:
8> (a,1)
8> (a,1)
3> (b,1)
3> (b,1)
6> (c,1)
8> (a,1)


keyBy via the KeySelector interface:
package com.text.transformation
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    stream.flatMap(_.split(" ")).map((_, 1)).keyBy(new KeySelector[(String, Int), String]{
      override def getKey(value: (String, Int)): String = {
        value._1
      }
    }).print()
    env.execute()
  }
}


Result:
8> (a,1)
3> (b,1)
8> (a,1)
3> (b,1)


reduce: usually used together with keyBy

package com.text.transformation
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    stream.flatMap(_.split(" ")).map((_, 1)).keyBy(new KeySelector[(String, Int), String]{
      override def getKey(value: (String, Int)): String = {
        value._1
      }
    }).reduce((x, y) => (x._1, x._2 + y._2)).print()
    env.execute()
  }
}

Result:
8> (a,1)
8> (a,2)
8> (a,3)

 

split: divides one stream into two or more streams based on a condition

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // even numbers go to one stream ("first"), odd numbers to another ("second")
    val stream = env.generateSequence(1, 100)
    val splitStream = stream.split(info => {
      info % 2 match {
        case 0 => List("first")
        case 1 => List("second")
      }
    })
    // select one or more named streams from the SplitStream
    splitStream.select("first").print()
    env.execute()
  }
}

Result:

10> 10
6> 6
12> 12
6> 18
8> 8
6> 30
4> 4
2> 2
4> 16


Exercise: read data from Kafka and compute, in real time, the traffic volume at each checkpoint

-- Implement a Kafka producer that reads the checkpoint data and writes it to Kafka

package com.text.source

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.flink.streaming.api.scala._
import scala.io.Source

object FlinkKafkaProduct {
  def main(args: Array[String]): Unit = {
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "ke02:9092,ke03:9092,ke04:9092")
    prop.setProperty("key.serializer", classOf[StringSerializer].getName)
    prop.setProperty("value.serializer", classOf[StringSerializer].getName)

    // create a Kafka producer
    val producer = new KafkaProducer[String, String](prop)

    // getLines() returns a one-shot iterator; materialize it so the outer loop can replay it
    val lines = Source.fromFile("D:\\code\\scala\\test\\test07\\data\\carFlow_all_column_test.txt").getLines().toList

    for (i <- 1 to 100) {
      for (elem <- lines) {
        val splits = elem.split(",")
        val monitorId = splits(0).replace("'", "")
        val carId = splits(2).replace("'", "")
        val timestamp = splits(4).replace("'", "")
        val speed = splits(6)
        val stringBuilder = new StringBuilder
        val info = stringBuilder.append(monitorId + "\t").append(carId + "\t").append(timestamp + "\t").append(speed)
        producer.send(new ProducerRecord[String, String]("flink-kafka", i+"", info.toString()))
        Thread.sleep(500)
      }
    }
    producer.close()
  }
}



-- Streaming count of the traffic volume at each checkpoint
package com.text.transformation
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._
object Demo1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "ke02:9092,ke03:9092,ke04:9092")
    properties.setProperty("group.id", "flink-kafka-001")
    properties.setProperty("key.deserializer", classOf[StringSerializer].getName)
    properties.setProperty("value.deserializer", classOf[StringSerializer].getName)
    val stream = env.addSource(new FlinkKafkaConsumer[String]("flink-kafka", new SimpleStringSchema(), properties))

    stream.map(data => {
      val splits = data.split("\t")
      (splits(0), 1)
    }).keyBy(_._1).sum(1).print()
    env.execute()
  }
}

 

Aggregations are a family of rolling aggregation operators on a KeyedStream:

keyedStream.sum(0)
keyedStream.sum("key")
keyedStream.min(0)
keyedStream.min("key")
keyedStream.max(0)
keyedStream.max("key")
keyedStream.minBy(0)
keyedStream.minBy("key")
keyedStream.maxBy(0)
keyedStream.maxBy("key")

 

Demo 02: in real time, track the first car to pass each checkpoint

package com.text.transformation
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._
import java.text.SimpleDateFormat

object Demo1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "ke02:9092,ke03:9092,ke04:9092")
    properties.setProperty("group.id", "flink-kafka-001")
    properties.setProperty("key.deserializer", classOf[StringSerializer].getName)
    properties.setProperty("value.deserializer", classOf[StringSerializer].getName)
    val stream = env.addSource(new FlinkKafkaConsumer[String]("flink-kafka", new SimpleStringSchema(), properties))

    stream.map(data => {
      val splits = data.split("\t")
      val eventTime = splits(2)
      val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val date = format.parse(eventTime)
      (splits(0), date.getTime)
    }).keyBy(_._1).min(1).print()
    env.execute()
  }
}


Result:
11> (310999021105,1408514970000)
7> (310999012504,1408514970000)
2> (310999008906,1408514970000)
1> (310999008805,1408514973000)
10> (310999007204,1408514970000)


union: merges two or more data streams into a new data stream that contains all elements of the merged streams

Note: the element types of all the streams must be the same

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object MapOperator {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream1 = env.fromCollection(List(("a", 1), ("b", 2)))
    val stream2 = env.fromCollection(List(("a", 3), ("d", 4)))
    val value = stream1.union(stream2)
    value.print()
    env.execute()
  }
}

Result:
11> (b,2)
8> (a,3)
10> (a,1)
9> (d,4)

 

connect (a "pseudo merge"): combines two data streams while preserving each stream's element type, and lets the two streams share state

package com.text.transformation
import org.apache.flink.streaming.api.functions.co.CoMapFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object Demo1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val ds1 = env.socketTextStream("ke01", 8888)
    val ds2 = env.socketTextStream("ke01", 9999)
    val wcStream1 = ds1.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1)
    val wcStream2 = ds2.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1)
    val restStream: ConnectedStreams[(String, Int), (String, Int)] =
      wcStream2.connect(wcStream1)
    // ConnectedStreams has no print method, so it must first be converted
    // with map (CoMapFunction) or flatMap (CoFlatMapFunction)
    restStream.map(new CoMapFunction[(String, Int), (String, Int), (String, Int)] {
      override def map1(value: (String, Int)): (String, Int) = {
        (value._1 + ": first", value._2 + 100)
      }
      override def map2(value: (String, Int)): (String, Int) = {
        (value._2 + ":second", value._2 * 100)
      }
    }).print()
    env.execute()
  }
}

Result:

Input on 8888: ke ke ke

7> (ke: first,101)
7> (ke: first,102)

7> (ke: first,103)

Input on 9999: ke ke ke

7> (9:second,100)
7> (10:second,200)
7> (11:second,300)



 

CoMap and CoFlatMap are not names of concrete operators but of two families of operations:
any map over a ConnectedStreams is called a CoMap,
and any flatMap over a ConnectedStreams is called a CoFlatMap.

 

// CoMap, first implementation:

restStream.map(new CoMapFunction[(String, Int), (String, Int), (String, Int)] {
  // computation on the first stream
  override def map1(value: (String, Int)): (String, Int) = {
    (value._1 + ":first", value._2 + 100)
  }
  // computation on the second stream
  override def map2(value: (String, Int)): (String, Int) = {
    (value._1 + ":second", value._2 * 100)
  }
}).print()


// CoMap, second implementation:


restStream.map(
  // computation on the first stream
  x => { (x._1 + ":first", x._2 + 100) },
  // computation on the second stream
  y => { (y._1 + ":second", y._2 * 100) }
).print()




CoFlatMap, first implementation:

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

object Demo1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val ds1 = env.socketTextStream("ke01", 8888)
    val ds2 = env.socketTextStream("ke01", 9999)
    val wcStream1 = ds1.flatMap(_.split(" "))
    val wcStream2 = ds2.flatMap(_.split(" "))
    val restStream: ConnectedStreams[(String), (String)] =
      wcStream2.connect(wcStream1)
    restStream.flatMap(
      (x, c:Collector[String])=>{
        x.split(" ").foreach(w =>{
          c.collect(w)
        })
      },
      (y, c:Collector[String])=>{
        y.split(" ").foreach(d => {
          c.collect(d)
        })
      }
    ).print()
    env.execute()
  }
}

 

CoFlatMap, second implementation:

ds1.connect(ds2).flatMap(
  // computation on the first stream
  x => {
    x.split(" ")
  },
  // computation on the second stream
  y => {
    y.split(" ")
  }).print()

 

CoFlatMap, third implementation:

ds1.connect(ds2).flatMap(new CoFlatMapFunction[String, String, (String, Int)] {
  // computation on the first stream
  override def flatMap1(value: String, out: Collector[(String, Int)]): Unit = {
    val words = value.split(" ")
    words.foreach(x => {
      out.collect((x, 1))
    })
  }
  // computation on the second stream
  override def flatMap2(value: String, out: Collector[(String, Int)]): Unit = {
    val words = value.split(" ")
    words.foreach(x => {
      out.collect((x, 1))
    })
  }
}).print()

 

Demo 03: a configuration file stores license plate numbers and the owners' real names; use the plate
numbers in the data stream to look up the owners' names in real time (note: the file may change at any time)

package com.text.transformation
import org.apache.flink.api.java.io.TextInputFormat
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.co.CoMapFunction
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import scala.collection.mutable

object Demo2 {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val filePath = "D:\\code\\scala\\test\\test07\\data\\text.txt"
    val textStream = env.readFile(new TextInputFormat(new Path(filePath)), filePath, FileProcessingMode.PROCESS_CONTINUOUSLY, 10)
    val dataStream = env.socketTextStream("ke01", 8888)
    dataStream.connect(textStream).map(new CoMapFunction[String, String, String] {
      // in-memory plate -> owner mapping, refreshed whenever the file stream delivers lines
      private val hashMap = new mutable.HashMap[String, String]()
      override def map1(value: String): String = {
        hashMap.getOrElse(value, "not found name")
      }
      override def map2(value: String): String = {
        val splits = value.split(" ")
        hashMap.put(splits(0), splits(1))
        value + " loaded....."
      }
    }).print()
    env.execute()
  }
}


110 Police loaded.....
120 Ambulance loaded.....
119 Fire loaded.....

Result:
Input aa  -> not found name
Input 110 -> Police

 

Side outputs: stream jobs often need to split a stream according to conditions. Splitting with filter forces every record to be evaluated once per branch, duplicating work unnecessarily; a side output does the split in a single pass.
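
For contrast, a filter-based split would look like the sketch below (the object name is mine): each record is evaluated once per branch, which is exactly the duplication a side output avoids. The side-output demo that follows does the same split in a single pass.

package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object FilterSplit {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    // the same records flow through both filters, so the predicate work is done twice
    // (assumes numeric input; a non-numeric line would throw in toLong)
    stream.filter(v => v.toLong > 100).print("mainStream")
    stream.filter(v => v.toLong <= 100).print("sideStream")
    env.execute()
  }
}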

package com.text.transformation

import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala.{OutputTag, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

object Demo3 {

  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    val gtTag = new OutputTag[String]("gt")
    val processStream = stream.process(new ProcessFunction[String, String] {
      override def processElement(value: String, ctx: ProcessFunction[String, String]#Context, out: Collector[String]): Unit = {
        try {
          val longVar = value.toLong
          if (longVar > 100) {
            // values greater than 100 stay in the main stream
            out.collect(value)
          } else {
            // everything else goes to the side output
            ctx.output(gtTag, value)
          }
        } catch {
          case e: Exception =>
            // non-numeric input also goes to the side output
            ctx.output(gtTag, value)
        }
      }
    })
    val sideStream = processStream.getSideOutput(gtTag)
    sideStream.print("sideStream")
    processStream.print("mainStream")
    env.execute()
  }

}


Result:
sideStream:4> 50
sideStream:5> 100
mainStream:6> 120
mainStream:7> 130


Iterate


The iterate operator adds iteration support to a data stream.
An iteration consists of two parts: the iteration body and the termination condition.
Elements that do not satisfy the termination condition are fed back into the stream for the next iteration;
elements that satisfy it are forwarded downstream.


package com.text.transformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object Demo4 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val initStream = env.socketTextStream("ke01", 8888)
    val stream = initStream.map(_.toLong)
    stream.iterate(
      info => {
        val infoIterate = info.map(x => {
          println(x)
          if (x > 0) x - 1 else x
        })
        // values > 0 are fed back into the stream for the next iteration;
        // values <= 0 are forwarded downstream
        (infoIterate.filter(_ > 0), infoIterate.filter(_ <= 0))
      }
    ).print()
    env.execute()
  }
}

 

Function classes and rich function classes

When using a Flink operator, you can pass in either an anonymous function or a function-class object. For example:
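
A minimal sketch of the two styles, using filter (the object name is mine; both variants keep non-empty lines):

package com.text.transformation
import org.apache.flink.api.common.functions.FilterFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._

object FunctionStyles {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("ke01", 8888)
    // 1) anonymous function
    stream.filter(x => x.nonEmpty).print()
    // 2) function-class object
    stream.filter(new FilterFunction[String] {
      override def filter(value: String): Boolean = value.nonEmpty
    }).print()
    env.execute()
  }
}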


Compared with an ordinary function class, a rich function class can access the runtime context (Context), has lifecycle methods, and can manage state, so it supports more complex functionality.

Ordinary function class    Rich function class
MapFunction                RichMapFunction
FlatMapFunction            RichFlatMapFunction
FilterFunction             RichFilterFunction

  • Filter out vehicle records with speed above 100 using an ordinary function class
    package com.text.transformation
    import org.apache.flink.api.common.functions.FilterFunction
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    object Demo5 {
      def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        val stream = env.readTextFile("D:\\code\\scala\\test\\test07\\data\\carFlow_all_column_test.txt")
        stream.filter(new FilterFunction[String] {
          override def filter(value: String): Boolean = {
            if (value != null && !"".equals(value)) {
              val speed = value.split(",")(6).replace("'", "").toLong
              // keep only records with speed <= 100
              speed <= 100
            } else {
              false
            }
          }
        }).print()
        env.execute()
      }
    }

    Result:
    '310999003001', '3109990030010220140820141230292','00000000','','2014-08-20 14:09:35','0',255,'SN',  0.00,'4','','310999','310999003001','02','','','2','','','2014-08-20 14:12:30','2014-08-20 14:16:13',0,0,'2014-08-21 18:50:05','','',' '
    '310999003102', '3109990031020220140820141230266','粤BT96V3','','2014-08-20 14:09:35','0',21,'NS', 0.00,'2','','310999','310999003102','02','','','2','','','2014-08-20 14:12:30','2014-08-20 14:16:13',0,0,'2014-08-21 18:50:05','','',' '

  • Use a rich function class to translate plate numbers into owners' real names; the mapping table is stored in Redis
    
    

    <dependency>
      <groupId>redis.clients</groupId>
      <artifactId>jedis</artifactId>
      <version>${redis.version}</version>
    </dependency>

    
    
    
    
    package com.text.transformation
    import org.apache.flink.api.common.functions.RichMapFunction
    import org.apache.flink.configuration.Configuration
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    import org.apache.flink.streaming.api.scala._
    import redis.clients.jedis.Jedis
    object Demo6 {
      def main(args: Array[String]): Unit = {
    
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        val stream = env.socketTextStream("192.168.75.91", 8888)
        stream.map(new RichMapFunction[String, String] {
          private var jedis: Jedis = _
          // open() is called once per parallel task instance when it starts,
          // before any element is processed; create the Redis connection here
          override def open(parameters: Configuration): Unit = {
            // getRuntimeContext (provided by AbstractRichFunction) exposes Flink's runtime context
            val taskName = getRuntimeContext.getTaskName
            val subTasks = getRuntimeContext.getTaskNameWithSubtasks
            println("=========open======" + "taskName:" + taskName + "subTasks:" + subTasks)
            jedis = new Jedis("192.168.75.91", 6390)
            jedis.auth("aa123456")
            jedis.select(3)
          }
    
          // map() is called once for every element processed
          override def map(value: String): String = {
            val name = jedis.get(value)
            if (name == null) {
              "not found name"
            } else {
              name
            }
          }
    
          // close() is called after element processing ends; close the Redis connection here
          override def close(): Unit = {
            jedis.close()
          }
        }).setParallelism(2).print()
        env.execute()
      }
    
    }
    
    
    Result:
    =========open======taskName:MapsubTasks:Map (2/2)
    =========open======taskName:MapsubTasks:Map (1/2)
    5> not found name
    10> 1
    6> 1

     

 

Low-level API (the ProcessFunction API)

-- Operators such as map, filter, and flatMap are high-level wrappers built on top of this layer

-- The lower the API level, the more powerful it is and the more information it exposes, e.g. element state, event time, and timers

package com.text.transformation

import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

object Demo7 {

    case class CarInfo(carId: String, speed: Long)
    def main(args: Array[String]): Unit = {
      val env = StreamExecutionEnvironment.getExecutionEnvironment
      val stream = env.socketTextStream("192.168.75.91", 8888)
      stream.map(data => {
        val split = data.split(" ")
        val carId = split(0)
        val speed = split(1).toLong
        CarInfo(carId, speed)
      }).keyBy(_.carId)
        // process on a KeyedStream takes a KeyedProcessFunction
        // process on a DataStream takes a ProcessFunction
        .process(new KeyedProcessFunction[String, CarInfo, String] {
        override def processElement(value: CarInfo, ctx: KeyedProcessFunction[String, CarInfo, String]#Context, out: Collector[String]): Unit = {
          val currentTime = ctx.timerService().currentProcessingTime()
          if (value.speed > 100) {
            // register a processing-time timer to fire 2 seconds from now
            // (the timestamp is processing time, so an event-time timer would never fire here)
            val timerTime = currentTime + 2 * 1000
            ctx.timerService().registerProcessingTimeTimer(timerTime)
          }
        }

        override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[String, CarInfo, String]#OnTimerContext, out: Collector[String]): Unit = {
          val warnMsg = "warn... time:" + timestamp + " carID:" + ctx.getCurrentKey
          out.collect(warnMsg)
        }
      }).print()
      env.execute()
    }
}
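
The demo above exercises the timer service; per-key state is another capability of this layer. A hedged sketch (Demo8, the speed-jump rule, and the state name are mine, not from the original):

package com.text.transformation

import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

object Demo8 {
  case class CarInfo(carId: String, speed: Long)
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream = env.socketTextStream("192.168.75.91", 8888)
    stream.map(data => {
      val split = data.split(" ")
      CarInfo(split(0), split(1).toLong)
    }).keyBy(_.carId)
      .process(new KeyedProcessFunction[String, CarInfo, String] {
        // one ValueState per key (carId), holding the previous speed
        private var lastSpeed: ValueState[Long] = _
        override def open(parameters: Configuration): Unit = {
          lastSpeed = getRuntimeContext.getState(
            new ValueStateDescriptor[Long]("lastSpeed", classOf[Long]))
        }
        override def processElement(value: CarInfo, ctx: KeyedProcessFunction[String, CarInfo, String]#Context, out: Collector[String]): Unit = {
          // value() of an unset Long state unboxes to 0 in Scala
          val prev = lastSpeed.value()
          if (prev != 0 && value.speed > prev * 2) {
            out.collect("carId:" + value.carId + " speed jumped from " + prev + " to " + value.speed)
          }
          lastSpeed.update(value.speed)
        }
      }).print()
    env.execute()
  }
}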

 
