|NO.Z.00050|——————————|BigDataEnd|——|Hadoop & Real-Time Data Warehouse.V30|——|Project.v30|Requirement 3: Data Processing & Incremental Ad Statistics.V4|——|Programming Implementation|
1. Programming implementation: every 5 seconds, count the ad clicks received in the last hour (incremental statistics)
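The listing below windows the stream with Time.seconds(20) sliding every Time.seconds(10), presumably so that results show up quickly during testing. For the requirement as stated (the last hour, refreshed every 5 seconds), the windowing step would be the following sketch, with everything else unchanged:

filtered
  .assignAscendingTimestamps(_.timestamp)
  .keyBy(_.productId)
  .timeWindow(Time.hours(1), Time.seconds(5))
  .aggregate(new AdAggFunc, new AdWindowFunc())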
package dw.dws
import modes.{AdClick, CountByProductAd}
import myutils.SourceKafka
import java.sql.Date
import java.text.SimpleDateFormat
import com.alibaba.fastjson.JSON.parseObject
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector
object AdEventLog {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    env.setParallelism(1)
    // Use event time so windows are driven by the timestamps in the log records
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the raw event log from Kafka (the data is collected into Kafka by Flume)
    val kafkaSource: FlinkKafkaConsumer[String] = new SourceKafka().getKafkaSource("eventlog")
    val eventLogStream: DataStream[String] = env.addSource(kafkaSource)

    /**
     * Fields to extract from each record:
     *   area       - user's region (from "attr")
     *   uid        - user id (from "attr")
     *   product_id - id of the advertised product (from the "ad" event)
     *   time       - event timestamp in milliseconds
     */
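    // An illustrative record shape that the parser below expects
    // (field names come from the parsing code; the values are made up):
    // {
    //   "attr": {"area": "shanghai", "uid": "u_10001", ...},
    //   "yanqi_event": [
    //     {"name": "ad", "json": {"product_id": "p_330", ...}, "time": 1609743683186},
    //     ...
    //   ]
    // }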
    val mapEventStream: DataStream[AdClick] = eventLogStream.map(x => {
      val jsonObj: JSONObject = parseObject(x)
      // "attr" holds the common attributes of the record
      val attr: String = jsonObj.get("attr").toString
      val attrJson: JSONObject = parseObject(attr)
      val area: String = attrJson.get("area").toString
      val uid: String = attrJson.get("uid").toString
      // "yanqi_event" is an array of events; keep only the "ad" event
      val eventData: String = jsonObj.get("yanqi_event").toString
      val datas: JSONArray = JSON.parseArray(eventData)
      val list = new java.util.ArrayList[String]()
      datas.forEach(e => list.add(e.toString))
      var productId: String = null
      var timestamp: Long = 0L
      list.forEach(e => {
        val eJson: JSONObject = parseObject(e)
        if (eJson.get("name").toString.equals("ad")) {
          val jsonData: String = eJson.get("json").toString
          val jsonDatas = parseObject(jsonData)
          productId = jsonDatas.get("product_id").toString
          // Keep the raw millisecond timestamp: Flink's event-time API expects milliseconds
          timestamp = eJson.get("time").toString.toLong
        }
      })
      AdClick(area, uid, productId, timestamp)
    })
    // Drop records that did not contain an ad click
    val filtered: DataStream[AdClick] = mapEventStream.filter(x => x.productId != null)

    val result: DataStream[CountByProductAd] = filtered
      .assignAscendingTimestamps(x => x.timestamp)
      .keyBy(_.productId)
      // Sliding event-time window: 20s size, 10s slide (see the note under the heading for the 1h/5s values)
      .timeWindow(Time.seconds(20), Time.seconds(10))
      .aggregate(new AdAggFunc, new AdWindowFunc())

    result.print()
    env.execute()
  }
  class AdAggFunc() extends AggregateFunction[AdClick, Long, Long] {
    override def createAccumulator(): Long = 0L
    override def add(ad: AdClick, acc: Long): Long = acc + 1
    override def getResult(acc: Long): Long = acc
    override def merge(acc1: Long, acc2: Long): Long = acc1 + acc2
  }
  class AdWindowFunc() extends WindowFunction[Long, CountByProductAd, String, TimeWindow] {
    private def formatTs(ts: Long) = {
      val df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      df.format(new Date(ts))
    }
    override def apply(key: String,
                       window: TimeWindow,
                       input: Iterable[Long],
                       out: Collector[CountByProductAd]): Unit = {
      // window.getEnd is already in milliseconds, matching the millisecond event timestamps
      out.collect(CountByProductAd(formatTs(window.getEnd), key, input.iterator.next()))
    }
  }
}
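AdClick and CountByProductAd (package modes) and the SourceKafka helper (package myutils) are shared project classes that this listing imports but does not define. Below is a minimal sketch of what they would need to look like for the code above to compile, with field names inferred from how they are used here; the Kafka broker address and consumer group id are placeholder assumptions, not values taken from the project:

// package modes
case class AdClick(area: String, uid: String, productId: String, timestamp: Long)
case class CountByProductAd(windowEndTime: String, productId: String, count: Long)

// package myutils
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

class SourceKafka {
  // Build a Kafka consumer for the given topic; broker list and group id below are placeholders
  def getKafkaSource(topicName: String): FlinkKafkaConsumer[String] = {
    val props = new Properties()
    props.setProperty("bootstrap.servers", "localhost:9092") // placeholder broker address
    props.setProperty("group.id", "ad_click_group")          // placeholder consumer group
    new FlinkKafkaConsumer[String](topicName, new SimpleStringSchema(), props)
  }
}

With those in place, result.print() emits one CountByProductAd(windowEndTime, productId, count) record per product each time a window fires, i.e. every 10 seconds with the window settings used above.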