Experimenting with Flink SQL

1. SOURCE

  • Mock Data
    package com.shydow.utils;
    
    import com.alibaba.fastjson.JSON;
    import com.alibaba.fastjson.JSONObject;
    import lombok.Data;
    import org.apache.kafka.clients.producer.KafkaProducer;
    import org.apache.kafka.clients.producer.ProducerConfig;
    import org.apache.kafka.clients.producer.ProducerRecord;
    import java.text.DateFormat;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.*;
    
    /**
     * @author Shydow
     * @date 2022-01-11
     */
    
    public class ClickLogGenerator {
        public static void main(String[] args) throws InterruptedException {
            Properties props = new Properties();
            props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "node:9092");
            props.put("acks", "all");
            props.put("retries", 2);
            props.put("retries.backoff.ms", 20);
            props.put("buffer.memory", 33554432);
            props.put("batch.size", 16384);
            props.put("linger.ms", 25);
            props.put("max.request.size", 163840);
            props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
            props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
            KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);
    
            Random random = new Random();
            for (int i = 0; i < 100; i++) {
            // build one click log with randomly chosen field values
            ClickLog clickLog = new ClickLog();
            clickLog.setFlowID(UUID.randomUUID().toString()); // flowID is used by the Hive sink example in section 2
            clickLog.setChannelID(channelID[random.nextInt(channelID.length)]);
                clickLog.setCategoryID(categoryID[random.nextInt(categoryID.length)]);
                clickLog.setProduceID(produceID[random.nextInt(produceID.length)]);
                clickLog.setUserID(userID[random.nextInt(userID.length)]);
                clickLog.setCountry(contrys[random.nextInt(contrys.length)]);
                clickLog.setProvince(provinces[random.nextInt(provinces.length)]);
                clickLog.setCity(citys[random.nextInt(citys.length)]);
                clickLog.setNetwork(networks[random.nextInt(networks.length)]);
                clickLog.setSource(sources[random.nextInt(sources.length)]);
                clickLog.setBrowserType(browser[random.nextInt(browser.length)]);
                Long[] times = usetimelog.get(random.nextInt(usetimelog.size()));
                clickLog.setEntryTime(times[0]);
                clickLog.setLeaveTime(times[1]);
            // wrap the log in a Message envelope
            Message msg = new Message();
                msg.setMessage(clickLog);
                msg.setCount(1);
                msg.setTimeStamp(System.currentTimeMillis());
                String msgJson = JSON.toJSONString(msg);
            // send the record to Kafka
                ProducerRecord<String, String> record = new ProducerRecord<>("click", msgJson);
                kafkaProducer.send(record);
    
                System.out.println("消息已发送到Kafka:" + msgJson);
                Thread.sleep(500);
            }
            kafkaProducer.close();
        }
    
    private static Long[] channelID = new Long[]{1L,2L,3L,4L,5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L,16L,17L,18L,19L,20L}; // channel ids
    private static Long[] categoryID = new Long[]{1L,2L,3L,4L,5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L,16L,17L,18L,19L,20L}; // product category ids
    private static Long[] produceID = new Long[]{1L,2L,3L,4L,5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L,16L,17L,18L,19L,20L}; // product ids
    private static Long[] userID = new Long[]{1L,2L,3L,4L,5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L,16L,17L,18L,19L,20L}; // user ids
    
    /**
     * Regions
     */
    private static String[] contrys = new String[]{"china"}; // countries
    private static String[] provinces = new String[]{"HeNan","HeBei"}; // provinces
    private static String[] citys = new String[]{"ShiJiaZhuang","ZhengZhou", "LuoYang"}; // cities
    
    /**
     * Network providers
     */
    private static String[] networks = new String[]{"China Telecom","China Mobile","China Unicom"};
    
    /**
     * Traffic sources
     */
    private static String[] sources = new String[]{"direct","Baidu","360 Search","Bing"};
    
    /**
     * Browsers
     */
    private static String[] browser = new String[]{"Firefox","QQ Browser","360 Browser","Chrome"};
    
    /**
     * entry/leave time pairs
     */
    private static List<Long[]> usetimelog = producetimes();

    // pre-generate 100 (entryTime, leaveTime) pairs
    public static List<Long[]> producetimes() {
        List<Long[]> usetimelog = new ArrayList<Long[]>();
        for (int i = 0; i < 100; i++) {
            Long[] timesarray = gettimes("2020-01-01 23:59:59:000");
            usetimelog.add(timesarray);
        }
        return usetimelog;
    }
    
    private static Long[] gettimes(String time) {
        // HH (0-23) is required; the 12-hour pattern hh cannot represent hour 23
        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS");
        try {
            Date date = dateFormat.parse(time);
            long timetemp = date.getTime();
            Random random = new Random();
            int randomint = random.nextInt(10);
            // entry is 0-9 hours before the base time; leave is the base time itself
            long starttime = timetemp - randomint * 3600 * 1000;
            long endtime = starttime + randomint * 3600 * 1000;
            return new Long[]{starttime, endtime};
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return new Long[]{0L, 0L};
    }
    
    @Data
    public static class Message {
        // number of events in this message
        private int count;
        // message timestamp
        private long timeStamp;
        // message body
        private ClickLog message;
    }
    
    /**
     * Clickstream log
     */
    @Data
    public static class ClickLog {
        // flow (click) id
        private String flowID;
        // channel id
        private long channelID;
        // product category id
        private long categoryID;
        // product id
        private long produceID;
        // user id
        private long userID;
        // country
        private String country;
        // province
        private String province;
        // city
        private String city;
        // network provider
        private String network;
        // traffic source
        private String source;
        // browser type
        private String browserType;
        // time the user entered the site
        private long entryTime;
        // time the user left the site
        private long leaveTime;
    }
    }
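
    For reference, a message produced by this generator looks roughly like the following (values are illustrative; fastjson orders fields alphabetically). The nested message object is what the Kafka source DDLs below map with a ROW type:

    {
      "count": 1,
      "timeStamp": 1641886200000,
      "message": {
        "browserType": "Firefox",
        "categoryID": 3,
        "channelID": 12,
        "city": "ZhengZhou",
        "country": "china",
        "entryTime": 1577871599000,
        "flowID": "6f9c2a1e-58d2-4c0b-9a7f-3e1d2b4c5a6d",
        "leaveTime": 1577885999000,
        "network": "China Mobile",
        "produceID": 7,
        "province": "HeNan",
        "source": "Baidu",
        "userID": 15
      }
    }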
      
  • Kafka Source
    -- variant 1: processing time
    create table click_log(
        `count` INT,
        `message` ROW<`browserType` STRING, `categoryID` INT, `channelID` INT, `city` STRING, `country` STRING, `entryTime` BIGINT, `leaveTime` BIGINT, `network` STRING, `produceID` INT, `province` STRING, `source` STRING, `userID` INT>,
        `timeStamp` BIGINT,
        `proctime` as PROCTIME()  -- extra column holding the processing time
    ) with (
        'connector' = 'kafka', 
        'topic' = 'click',
        'properties.bootstrap.servers' = 'node:9092', 
        'properties.group.id' = 'test', 
        'scan.startup.mode' = 'latest-offset', 
        'format' = 'json', 
        'json.fail-on-missing-field' = 'true',
        'json.ignore-parse-errors' = 'false',
        'value.fields-include' = 'ALL'
    );
    
    -- variant 2: event time derived from timeStamp (alternative definition of the same table)
    create table click_log(
        `count` INT,
        `message` ROW<`browserType` STRING, `categoryID` INT, `channelID` INT, `city` STRING, `country` STRING, `entryTime` BIGINT, `leaveTime` BIGINT, `network` STRING, `produceID` INT, `province` STRING, `source` STRING, `userID` INT>,
        `timeStamp` BIGINT,
        `ts` AS TO_TIMESTAMP_LTZ(`timeStamp`, 3),         -- convert epoch millis (BIGINT) to TIMESTAMP_LTZ
        WATERMARK FOR `ts` AS `ts` - INTERVAL '3' SECOND  -- watermark with 3s of allowed lateness
    ) with (
        'connector' = 'kafka', 
        'topic' = 'click',
        'properties.bootstrap.servers' = 'node:9092', 
        'properties.group.id' = 'test', 
        'scan.startup.mode' = 'latest-offset', 
        'format' = 'json', 
        'json.fail-on-missing-field' = 'true',
        'json.ignore-parse-errors' = 'false',
        'value.fields-include' = 'ALL'
    );
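
    The two DDLs above are alternatives for the same table name, so only one can be registered at a time. The processing-time variant pairs with windows over proctime; a minimal sketch (the 10-second window size is an arbitrary choice):

    select 
        message.browserType,
        TUMBLE_START(proctime, INTERVAL '10' SECOND) as window_start,
        count(1) as pv
    from 
        click_log
    group by 
        message.browserType, TUMBLE(proctime, INTERVAL '10' SECOND);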

     

  • Tumbling and Sliding Windows
    -- pv and uv per browser over 5-minute tumbling windows
    select 
        message.browserType,
        TUMBLE_START(ts, INTERVAL '5' MINUTE) as window_start, 
        TUMBLE_END(ts, INTERVAL '5' MINUTE) as window_end,
        TUMBLE_ROWTIME(ts, INTERVAL '5' MINUTE) as window_rowtime,
        count(1) as pv, 
        count(distinct message.userID) as uv
    from 
        click_log
    group by 
        message.browserType, TUMBLE(ts, INTERVAL '5' MINUTE);
    
    -- pv and uv over the past 5 minutes, emitted every 1 minute (sliding window)
    select 
        message.browserType,
        HOP_START(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE) AS window_start,
        HOP_END(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE) as window_end,
        HOP_ROWTIME(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE) as window_rowtime,
        count(1) as pv,
        count(distinct message.userID) as uv
    from 
        click_log
    group by 
        message.browserType, HOP(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE);
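
    The *_ROWTIME columns return a rowtime attribute, so a windowed result can feed a second-level ("cascading") window. A sketch rolling the 5-minute counts up to hourly totals (pv can be summed across windows; the distinct uv cannot):

    select
        browserType,
        TUMBLE_START(window_rowtime, INTERVAL '1' HOUR) as hour_start,
        sum(pv) as hourly_pv
    from (
        select
            message.browserType as browserType,
            TUMBLE_ROWTIME(ts, INTERVAL '5' MINUTE) as window_rowtime,
            count(1) as pv
        from click_log
        group by message.browserType, TUMBLE(ts, INTERVAL '5' MINUTE)
    ) t
    group by browserType, TUMBLE(window_rowtime, INTERVAL '1' HOUR);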

     

  • CDC Source
    CREATE TABLE mysql_users (
        userId STRING PRIMARY KEY NOT ENFORCED,
        userName STRING
    ) WITH (
        'connector'= 'mysql-cdc',
        'hostname'= 'node',
        'port'= '3306',
        'username'= 'root',
        'password'= '123456',
        'server-time-zone'= 'Asia/Shanghai',
        'debezium.snapshot.mode' = 'initial',
        'database-name'= 'aucc',
        'table-name'= 'dim_user'
    );
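
    A CDC table like this is typically used to enrich a stream. A minimal sketch joining the click stream against the dimension table; this is a regular (stateful) join that emits updates as dim_user changes, and the CAST is needed because userID is numeric while userId is a STRING:

    select
        c.message.userID,
        u.userName,
        c.message.browserType
    from click_log c
    join mysql_users u on cast(c.message.userID as string) = u.userId;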

     

2. Reading and Writing Hive with Flink

  • Register a Hive Catalog
    HiveCatalog hiver = new HiveCatalog(Constants.catalogName, Constants.defaultDatabase, Constants.hiveConfDir);
    tabEnv.registerCatalog(Constants.catalogName, hiver);
    tabEnv.useCatalog(Constants.catalogName);
    tabEnv.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");  // allow /*+ OPTIONS(...) */ hints
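
    The tabEnv used throughout is a StreamTableEnvironment; a minimal sketch of its creation, which the snippets here assume:

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    StreamTableEnvironment tabEnv = StreamTableEnvironment.create(env);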

     

  • Read Hive Data
    tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
    tabEnv.executeSql("select * from t_region_test").print();

     

  • Streaming Write to Hive
    // create the Hive sink table
    tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);   // switch to the Hive dialect to create a Hive table
    tabEnv.executeSql("drop table if exists t_kafka_sink");
    tabEnv.executeSql("CREATE TABLE t_kafka_sink(\n" +
                    "  `flowID` STRING,\n" +
                    "  `userID` INT,\n" +
                    "  `browserType` STRING, \n" +
                    "  `categoryID` INT, \n" +
                    "  `channelID` INT, \n" +
                    "  `city` STRING, \n" +
                    "  `country` STRING\n" +
                    ") PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (\n" +
                    "  'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',\n" +
                    "  'sink.partition-commit.trigger'='partition-time',\n" +
                    "  'sink.partition-commit.delay'='1 h',\n" +
                    "  'sink.partition-commit.policy.kind'='metastore,success-file'\n" +
                    ")");
    
    // create the Kafka source table
    tabEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
    tabEnv.executeSql("drop table if exists click_log");
    tabEnv.executeSql("create table click_log(\n" +
            "    `count` INT,\n" +
            "    `message` ROW<`browserType` string, `categoryID` INT, `channelID` INT, `city` string, `country` string, `entryTime` BIGINT, `leaveTime` BIGINT, `network` string, `produceID` INT, `province` string, `source` string, `userID` INT, `flowID` STRING>,\n" +
            "    `timeStamp` BIGINT,\n" +
            "    `proctime` as PROCTIME()  -- 声明额外的列存储处理时间\n" +
            ") with (\n" +
            "    'connector' = 'kafka', \n" +
            "    'topic' = 'click',\n" +
            "    'properties.bootstrap.servers' = 'node:9092', \n" +
            "    'properties.group.id' = 'test', \n" +
            "    'scan.startup.mode' = 'latest-offset', \n" +
            "    'format' = 'json', \n" +
            "    'json.fail-on-missing-field' = 'true',\n" +
            "    'json.ignore-parse-errors' = 'false',\n" +
            "    'value.fields-include' = 'ALL'\n" +
            ")");
    
    tabEnv.executeSql("insert into t_kafka_sink\n" +
            "select \n" +
            "    message.flowID,\n" +
            "    message.userID,\n" +
            "    message.browserType,\n" +
            "    message.categoryID,\n" +
            "    message.channelID,\n" +
            "    message.city,\n" +
            "    message.country,\n" +
            "    DATE_FORMAT(proctime, 'yyyy-MM-dd'), \n" +
            "    DATE_FORMAT(proctime, 'HH')\n" +
            "from click_log");

     

 

posted @ 2022-01-11 15:45  Shydow