Flink SQL Experiments
1. SOURCE
- Mock Data
package com.shydow.utils;

import com.alibaba.fastjson.JSON;
import lombok.Data;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Mock click-log generator: sends random click events as JSON to the Kafka topic "click".
 *
 * @author Shydow
 * @date 2022-01-11
 */
public class ClickLogGenerator {

    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "node:9092");
        props.put("acks", "all");
        props.put("retries", 2);
        props.put("retries.backoff.ms", 20);
        props.put("buffer.memory", 33554432);
        props.put("batch.size", 16384);
        props.put("linger.ms", 25);
        props.put("max.request.size", 163840);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);

        Random random = new Random();
        for (int i = 0; i < 100; i++) {
            // Build a click log from the random dimension pools below
            ClickLog clickLog = new ClickLog();
            clickLog.setChannelID(channelID[random.nextInt(channelID.length)]);
            clickLog.setCategoryID(categoryID[random.nextInt(categoryID.length)]);
            clickLog.setProduceID(produceID[random.nextInt(produceID.length)]);
            clickLog.setUserID(userID[random.nextInt(userID.length)]);
            clickLog.setCountry(contrys[random.nextInt(contrys.length)]);
            clickLog.setProvince(provinces[random.nextInt(provinces.length)]);
            clickLog.setCity(citys[random.nextInt(citys.length)]);
            clickLog.setNetwork(networks[random.nextInt(networks.length)]);
            clickLog.setSource(sources[random.nextInt(sources.length)]);
            clickLog.setBrowserType(browser[random.nextInt(browser.length)]);
            Long[] times = usetimelog.get(random.nextInt(usetimelog.size()));
            clickLog.setEntryTime(times[0]);
            clickLog.setLeaveTime(times[1]);

            // Wrap the click log in a Message envelope
            Message msg = new Message();
            msg.setMessage(clickLog);
            msg.setCount(1);
            msg.setTimeStamp(System.currentTimeMillis());
            String msgJson = JSON.toJSONString(msg);

            // Send the record to Kafka
            ProducerRecord<String, String> record = new ProducerRecord<>("click", msgJson);
            kafkaProducer.send(record);
            System.out.println("Message sent to Kafka: " + msgJson);
            Thread.sleep(500);
        }
        kafkaProducer.close();
    }

    private static Long[] channelID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};   // channel IDs
    private static Long[] categoryID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};  // product category IDs
    private static Long[] produceID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};   // product IDs
    private static Long[] userID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};      // user IDs

    // Region: country / province / city pools
    private static String[] contrys = new String[]{"china"};
    private static String[] provinces = new String[]{"HeNan", "HeBei"};
    private static String[] citys = new String[]{"ShiJiaZhuang", "ZhengZhou", "LuoYang"};

    // Network types
    private static String[] networks = new String[]{"电信", "移动", "联通"};

    // Traffic sources
    private static String[] sources = new String[]{"直接输入", "百度跳转", "360搜索跳转", "必应跳转"};

    // Browsers
    private static String[] browser = new String[]{"火狐", "qq浏览器", "360浏览器", "谷歌浏览器"};

    // Entry/leave time pairs
    private static List<Long[]> usetimelog = producetimes();

    // Generate 100 random entry/leave time pairs
    public static List<Long[]> producetimes() {
        List<Long[]> usetimelog = new ArrayList<Long[]>();
        for (int i = 0; i < 100; i++) {
            Long[] timesarray = gettimes("2020-01-01 24:60:60:000");
            usetimelog.add(timesarray);
        }
        return usetimelog;
    }

    private static Long[] gettimes(String time) {
        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss:SSS");
        try {
            Date date = dateFormat.parse(time);
            long timetemp = date.getTime();
            Random random = new Random();
            int randomint = random.nextInt(10);
            long starttime = timetemp - randomint * 3600 * 1000;
            long endtime = starttime + randomint * 3600 * 1000;
            return new Long[]{starttime, endtime};
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return new Long[]{0L, 0L};
    }

    @Data
    public static class Message {
        // how many events this message carries
        private int count;
        // message timestamp (epoch milliseconds)
        private long timeStamp;
        // message payload
        private ClickLog message;
    }

    /**
     * Click-stream log record
     */
    @Data
    public static class ClickLog {
        // channel ID
        private long channelID;
        // product category ID
        private long categoryID;
        // product ID
        private long produceID;
        // user ID
        private long userID;
        // country
        private String country;
        // province
        private String province;
        // city
        private String city;
        // network type
        private String network;
        // traffic source
        private String source;
        // browser type
        private String browserType;
        // site entry time
        private Long entryTime;
        // site leave time
        private long leaveTime;
    }
}
- Kafka Source
-- Processing-time version
create table click_log (
    `count`     INT,
    `message`   ROW<`browserType` STRING, `categoryID` INT, `channelID` INT, `city` STRING, `country` STRING,
                    `entryTime` BIGINT, `leaveTime` BIGINT, `network` STRING, `produceID` INT,
                    `province` STRING, `source` STRING, `userID` INT>,
    `timeStamp` BIGINT,
    `proctime`  AS PROCTIME()   -- extra computed column holding the processing time
) with (
    'connector' = 'kafka',
    'topic' = 'click',
    'properties.bootstrap.servers' = 'node:9092',
    'properties.group.id' = 'test',
    'scan.startup.mode' = 'latest-offset',
    'format' = 'json',
    'json.fail-on-missing-field' = 'true',
    'json.ignore-parse-errors' = 'false',
    'value.fields-include' = 'ALL'
);

-- Event-time version: derive the event time from timeStamp
create table click_log (
    `count`     INT,
    `message`   ROW<`browserType` STRING, `categoryID` INT, `channelID` INT, `city` STRING, `country` STRING,
                    `entryTime` BIGINT, `leaveTime` BIGINT, `network` STRING, `produceID` INT,
                    `province` STRING, `source` STRING, `userID` INT>,
    `timeStamp` BIGINT,
    `ts` AS TO_TIMESTAMP_LTZ(`timeStamp`, 3),           -- convert epoch milliseconds (BIGINT) to TIMESTAMP_LTZ
    WATERMARK FOR `ts` AS `ts` - INTERVAL '3' SECOND    -- watermark with a 3-second delay
) with (
    'connector' = 'kafka',
    'topic' = 'click',
    'properties.bootstrap.servers' = 'node:9092',
    'properties.group.id' = 'test',
    'scan.startup.mode' = 'latest-offset',
    'format' = 'json',
    'json.fail-on-missing-field' = 'true',
    'json.ignore-parse-errors' = 'false',
    'value.fields-include' = 'ALL'
);
- Tumbling and Sliding Windows
-- PV and UV per browser over 5-minute tumbling windows
select
    message.browserType,
    TUMBLE_START(ts, INTERVAL '5' MINUTE)   as window_start,
    TUMBLE_END(ts, INTERVAL '5' MINUTE)     as window_end,
    TUMBLE_ROWTIME(ts, INTERVAL '5' MINUTE) as window_rowtime,
    count(1) as pv,
    count(distinct message.userID) as uv
from click_log
group by
    message.browserType,
    TUMBLE(ts, INTERVAL '5' MINUTE);

-- PV and UV over the last 5 minutes, emitted every 1 minute (sliding / hopping window)
select
    message.browserType,
    HOP_START(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE)   as window_start,
    HOP_END(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE)     as window_end,
    HOP_ROWTIME(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE) as window_rowtime,
    count(1) as pv,
    count(distinct message.userID) as uv
from click_log
group by
    message.browserType,
    HOP(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE);
- CDC Source
CREATE TABLE mysql_users (
    userId   STRING PRIMARY KEY NOT ENFORCED,
    userName STRING
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'node',
    'port' = '3306',
    'username' = 'root',
    'password' = '123456',
    'server-time-zone' = 'Asia/Shanghai',
    'debezium.snapshot.mode' = 'initial',
    'database-name' = 'aucc',
    'table-name' = 'dim_user'
);
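A CDC table like this is typically used as a dimension table for the click stream. Below is a minimal sketch of such an enrichment, not taken from the original setup: it assumes click_log and mysql_users are registered in the same TableEnvironment (called tabEnv here) and accepts that a regular stream join keeps both sides in state. userID is an INT on the click side while userId is a STRING key on the MySQL side, hence the cast.

// Illustrative sketch: enrich click events with userName from the CDC dimension table.
// Assumes `tabEnv` already contains the click_log and mysql_users tables defined above.
tabEnv.executeSql(
        "select message.userID as userID, u.userName, message.browserType " +
        "from click_log " +
        "join mysql_users u on cast(message.userID as string) = u.userId"
).print();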
2. Reading and Writing Hive with Flink
- Register the Hive Catalog
HiveCatalog hiver = new HiveCatalog(Constants.catalogName, Constants.defaultDatabase, Constants.hiveConfDir);
tabEnv.registerCatalog("hiver", hiver);
tabEnv.useCatalog("hiver");
// Allow per-query OPTIONS hints (dynamic table options)
tabEnv.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");
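The snippet assumes an existing tabEnv plus a Constants class holding the catalog name, default database, and hive-conf directory. A minimal sketch of how that environment could be created follows; the class name, checkpoint interval, and the /opt/hive/conf path are placeholders rather than values from the original setup.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

public class HiveCatalogBootstrap {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpointing is needed later for streaming partition commits into Hive
        env.enableCheckpointing(60_000);

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
        StreamTableEnvironment tabEnv = StreamTableEnvironment.create(env, settings);

        // Placeholder values standing in for Constants.catalogName / defaultDatabase / hiveConfDir
        HiveCatalog hiver = new HiveCatalog("hiver", "default", "/opt/hive/conf");
        tabEnv.registerCatalog("hiver", hiver);
        tabEnv.useCatalog("hiver");
        tabEnv.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");
    }
}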
- Read Hive Data
tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tabEnv.executeSql("select * from t_region_test").print();
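Because table.dynamic-table-options.enabled was switched on when registering the catalog, OPTIONS hints can be attached per query. A sketch of one likely use, a streaming read that keeps monitoring the Hive table for new data; the hint keys come from the Flink Hive connector and the monitor interval is only an example value.

// Sketch: streaming read of a Hive table via a dynamic table options (OPTIONS) hint
tabEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
tabEnv.executeSql(
        "select * from t_region_test " +
        "/*+ OPTIONS('streaming-source.enable'='true', 'streaming-source.monitor-interval'='1 min') */"
).print();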
- Streaming Write to Hive
// Create the Hive sink table (switch to the Hive dialect for Hive DDL)
tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tabEnv.executeSql("drop table if exists t_kafka_sink");
tabEnv.executeSql("CREATE TABLE t_kafka_sink(\n" +
        "  `flowID` STRING,\n" +
        "  `userID` INT,\n" +
        "  `browserType` STRING,\n" +
        "  `categoryID` INT,\n" +
        "  `channelID` INT,\n" +
        "  `city` STRING,\n" +
        "  `country` STRING\n" +
        ") PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (\n" +
        "  'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',\n" +
        "  'sink.partition-commit.trigger'='partition-time',\n" +
        "  'sink.partition-commit.delay'='1 h',\n" +
        "  'sink.partition-commit.policy.kind'='metastore,success-file'\n" +
        ")");

// Create the Kafka source table (back to the default dialect)
tabEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
tabEnv.executeSql("drop table if exists click_log");
tabEnv.executeSql("create table click_log(\n" +
        "  `count` INT,\n" +
        "  `message` ROW<`browserType` string, `categoryID` INT, `channelID` INT, `city` string, `country` string, `entryTime` BIGINT, `leaveTime` BIGINT, `network` string, `produceID` INT, `province` string, `source` string, `userID` INT, `flowID` STRING>,\n" +
        "  `timeStamp` BIGINT,\n" +
        "  `proctime` as PROCTIME() -- extra column holding the processing time\n" +
        ") with (\n" +
        "  'connector' = 'kafka',\n" +
        "  'topic' = 'click',\n" +
        "  'properties.bootstrap.servers' = 'node:9092',\n" +
        "  'properties.group.id' = 'test',\n" +
        "  'scan.startup.mode' = 'latest-offset',\n" +
        "  'format' = 'json',\n" +
        "  'json.fail-on-missing-field' = 'true',\n" +
        "  'json.ignore-parse-errors' = 'false',\n" +
        "  'value.fields-include' = 'ALL'\n" +
        ")");

// Stream from Kafka into the partitioned Hive table, deriving dt/hr from processing time
tabEnv.executeSql("insert into t_kafka_sink\n" +
        "select\n" +
        "  message.flowID,\n" +
        "  message.userID,\n" +
        "  message.browserType,\n" +
        "  message.categoryID,\n" +
        "  message.channelID,\n" +
        "  message.city,\n" +
        "  message.country,\n" +
        "  DATE_FORMAT(proctime, 'yyyy-MM-dd'),\n" +
        "  DATE_FORMAT(proctime, 'HH')\n" +
        "from click_log");
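Two caveats apply to this job. Streaming writes only commit Hive partitions when a checkpoint completes, so checkpointing must be enabled (as in the environment sketch earlier). And the 'partition-time' commit trigger relies on watermarks, while dt/hr above are derived from PROCTIME(), so committed partitions may stall. A sketch of switching the sink to a process-time trigger, assuming those semantics are acceptable for this pipeline:

// Sketch: with dt/hr based on PROCTIME(), a process-time commit trigger avoids waiting for
// watermarks that this pipeline never produces; it takes effect the next time the insert job is planned.
tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tabEnv.executeSql("ALTER TABLE t_kafka_sink SET TBLPROPERTIES (" +
        "'sink.partition-commit.trigger'='process-time')");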