Flink SQL Experiments
1. SOURCE
- Mock Data
package com.shydow.utils;

import com.alibaba.fastjson.JSON;
import lombok.Data;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Mock click-log generator: sends random click events as JSON to the Kafka topic "click".
 *
 * @author Shydow
 * @date 2022-01-11
 */
public class ClickLogGenerator {

    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "node:9092");
        props.put("acks", "all");
        props.put("retries", 2);
        props.put("retries.backoff.ms", 20);
        props.put("buffer.memory", 33554432);
        props.put("batch.size", 16384);
        props.put("linger.ms", 25);
        props.put("max.request.size", 163840);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);

        Random random = new Random();
        for (int i = 0; i < 100; i++) {
            // Build a click log from the random dimension pools below
            ClickLog clickLog = new ClickLog();
            clickLog.setChannelID(channelID[random.nextInt(channelID.length)]);
            clickLog.setCategoryID(categoryID[random.nextInt(categoryID.length)]);
            clickLog.setProduceID(produceID[random.nextInt(produceID.length)]);
            clickLog.setUserID(userID[random.nextInt(userID.length)]);
            clickLog.setCountry(contrys[random.nextInt(contrys.length)]);
            clickLog.setProvince(provinces[random.nextInt(provinces.length)]);
            clickLog.setCity(citys[random.nextInt(citys.length)]);
            clickLog.setNetwork(networks[random.nextInt(networks.length)]);
            clickLog.setSource(sources[random.nextInt(sources.length)]);
            clickLog.setBrowserType(browser[random.nextInt(browser.length)]);
            Long[] times = usetimelog.get(random.nextInt(usetimelog.size()));
            clickLog.setEntryTime(times[0]);
            clickLog.setLeaveTime(times[1]);

            // Wrap the click log in a Message envelope
            Message msg = new Message();
            msg.setMessage(clickLog);
            msg.setCount(1);
            msg.setTimeStamp(System.currentTimeMillis());
            String msgJson = JSON.toJSONString(msg);

            // Send the record to Kafka
            ProducerRecord<String, String> record = new ProducerRecord<>("click", msgJson);
            kafkaProducer.send(record);
            System.out.println("Message sent to Kafka: " + msgJson);
            Thread.sleep(500);
        }
        kafkaProducer.close();
    }

    private static Long[] channelID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};   // channel IDs
    private static Long[] categoryID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};  // product category IDs
    private static Long[] produceID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};   // product IDs
    private static Long[] userID = new Long[]{1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L};      // user IDs

    // Region: country / province / city pools
    private static String[] contrys = new String[]{"china"};
    private static String[] provinces = new String[]{"HeNan", "HeBei"};
    private static String[] citys = new String[]{"ShiJiaZhuang", "ZhengZhou", "LuoYang"};

    // Network types
    private static String[] networks = new String[]{"电信", "移动", "联通"};

    // Traffic sources
    private static String[] sources = new String[]{"直接输入", "百度跳转", "360搜索跳转", "必应跳转"};

    // Browsers
    private static String[] browser = new String[]{"火狐", "qq浏览器", "360浏览器", "谷歌浏览器"};

    // Entry/leave time pairs
    private static List<Long[]> usetimelog = producetimes();

    // Generate 100 random entry/leave time pairs
    public static List<Long[]> producetimes() {
        List<Long[]> usetimelog = new ArrayList<Long[]>();
        for (int i = 0; i < 100; i++) {
            Long[] timesarray = gettimes("2020-01-01 24:60:60:000");
            usetimelog.add(timesarray);
        }
        return usetimelog;
    }

    private static Long[] gettimes(String time) {
        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss:SSS");
        try {
            Date date = dateFormat.parse(time);
            long timetemp = date.getTime();
            Random random = new Random();
            int randomint = random.nextInt(10);
            long starttime = timetemp - randomint * 3600 * 1000;
            long endtime = starttime + randomint * 3600 * 1000;
            return new Long[]{starttime, endtime};
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return new Long[]{0L, 0L};
    }

    @Data
    public static class Message {
        // how many events this message carries
        private int count;
        // message timestamp (epoch milliseconds)
        private long timeStamp;
        // message payload
        private ClickLog message;
    }

    /**
     * Click-stream log record
     */
    @Data
    public static class ClickLog {
        // channel ID
        private long channelID;
        // product category ID
        private long categoryID;
        // product ID
        private long produceID;
        // user ID
        private long userID;
        // country
        private String country;
        // province
        private String province;
        // city
        private String city;
        // network type
        private String network;
        // traffic source
        private String source;
        // browser type
        private String browserType;
        // site entry time
        private Long entryTime;
        // site leave time
        private long leaveTime;
    }
}
- Kafka Source
-- Processing-time version
create table click_log (
    `count`     INT,
    `message`   ROW<`browserType` STRING, `categoryID` INT, `channelID` INT, `city` STRING, `country` STRING,
                    `entryTime` BIGINT, `leaveTime` BIGINT, `network` STRING, `produceID` INT,
                    `province` STRING, `source` STRING, `userID` INT>,
    `timeStamp` BIGINT,
    `proctime`  AS PROCTIME()   -- extra computed column holding the processing time
) with (
    'connector' = 'kafka',
    'topic' = 'click',
    'properties.bootstrap.servers' = 'node:9092',
    'properties.group.id' = 'test',
    'scan.startup.mode' = 'latest-offset',
    'format' = 'json',
    'json.fail-on-missing-field' = 'true',
    'json.ignore-parse-errors' = 'false',
    'value.fields-include' = 'ALL'
);

-- Event-time version: derive the event time from timeStamp
create table click_log (
    `count`     INT,
    `message`   ROW<`browserType` STRING, `categoryID` INT, `channelID` INT, `city` STRING, `country` STRING,
                    `entryTime` BIGINT, `leaveTime` BIGINT, `network` STRING, `produceID` INT,
                    `province` STRING, `source` STRING, `userID` INT>,
    `timeStamp` BIGINT,
    `ts` AS TO_TIMESTAMP_LTZ(`timeStamp`, 3),           -- convert epoch milliseconds (BIGINT) to TIMESTAMP_LTZ
    WATERMARK FOR `ts` AS `ts` - INTERVAL '3' SECOND    -- watermark with a 3-second delay
) with (
    'connector' = 'kafka',
    'topic' = 'click',
    'properties.bootstrap.servers' = 'node:9092',
    'properties.group.id' = 'test',
    'scan.startup.mode' = 'latest-offset',
    'format' = 'json',
    'json.fail-on-missing-field' = 'true',
    'json.ignore-parse-errors' = 'false',
    'value.fields-include' = 'ALL'
);
- Tumbling and Sliding Windows
-- PV and UV per browser over 5-minute tumbling windows
select
    message.browserType,
    TUMBLE_START(ts, INTERVAL '5' MINUTE)   as window_start,
    TUMBLE_END(ts, INTERVAL '5' MINUTE)     as window_end,
    TUMBLE_ROWTIME(ts, INTERVAL '5' MINUTE) as window_rowtime,
    count(1) as pv,
    count(distinct message.userID) as uv
from click_log
group by
    message.browserType,
    TUMBLE(ts, INTERVAL '5' MINUTE);

-- PV and UV over the last 5 minutes, emitted every 1 minute (sliding / hopping window)
select
    message.browserType,
    HOP_START(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE)   as window_start,
    HOP_END(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE)     as window_end,
    HOP_ROWTIME(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE) as window_rowtime,
    count(1) as pv,
    count(distinct message.userID) as uv
from click_log
group by
    message.browserType,
    HOP(ts, INTERVAL '1' MINUTE, INTERVAL '5' MINUTE);
- CDC Source
CREATE TABLE mysql_users (
    userId   STRING PRIMARY KEY NOT ENFORCED,
    userName STRING
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'node',
    'port' = '3306',
    'username' = 'root',
    'password' = '123456',
    'server-time-zone' = 'Asia/Shanghai',
    'debezium.snapshot.mode' = 'initial',
    'database-name' = 'aucc',
    'table-name' = 'dim_user'
);
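A CDC table like this is typically used as a dimension table for the click stream. Below is a minimal sketch of such an enrichment, not taken from the original setup: it assumes click_log and mysql_users are registered in the same TableEnvironment (called tabEnv here) and accepts that a regular stream join keeps both sides in state. userID is an INT on the click side while userId is a STRING key on the MySQL side, hence the cast.

// Illustrative sketch: enrich click events with userName from the CDC dimension table.
// Assumes `tabEnv` already contains the click_log and mysql_users tables defined above.
tabEnv.executeSql(
        "select message.userID as userID, u.userName, message.browserType " +
        "from click_log " +
        "join mysql_users u on cast(message.userID as string) = u.userId"
).print();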
2. Reading and Writing Hive with Flink
- Register the Hive Catalog
HiveCatalog hiver = new HiveCatalog(Constants.catalogName, Constants.defaultDatabase, Constants.hiveConfDir);
tabEnv.registerCatalog("hiver", hiver);
tabEnv.useCatalog("hiver");
// Allow per-query OPTIONS hints (dynamic table options)
tabEnv.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");
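The snippet assumes an existing tabEnv plus a Constants class holding the catalog name, default database, and hive-conf directory. A minimal sketch of how that environment could be created follows; the class name, checkpoint interval, and the /opt/hive/conf path are placeholders rather than values from the original setup.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

public class HiveCatalogBootstrap {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpointing is needed later for streaming partition commits into Hive
        env.enableCheckpointing(60_000);

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
        StreamTableEnvironment tabEnv = StreamTableEnvironment.create(env, settings);

        // Placeholder values standing in for Constants.catalogName / defaultDatabase / hiveConfDir
        HiveCatalog hiver = new HiveCatalog("hiver", "default", "/opt/hive/conf");
        tabEnv.registerCatalog("hiver", hiver);
        tabEnv.useCatalog("hiver");
        tabEnv.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");
    }
}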
- Read Hive Data
tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tabEnv.executeSql("select * from t_region_test").print();
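Because table.dynamic-table-options.enabled was switched on when registering the catalog, OPTIONS hints can be attached per query. A sketch of one likely use, a streaming read that keeps monitoring the Hive table for new data; the hint keys come from the Flink Hive connector and the monitor interval is only an example value.

// Sketch: streaming read of a Hive table via a dynamic table options (OPTIONS) hint
tabEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
tabEnv.executeSql(
        "select * from t_region_test " +
        "/*+ OPTIONS('streaming-source.enable'='true', 'streaming-source.monitor-interval'='1 min') */"
).print();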
- Streaming Write to Hive
// Create the Hive sink table (switch to the Hive dialect for Hive DDL)
tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tabEnv.executeSql("drop table if exists t_kafka_sink");
tabEnv.executeSql("CREATE TABLE t_kafka_sink(\n" +
        "  `flowID` STRING,\n" +
        "  `userID` INT,\n" +
        "  `browserType` STRING,\n" +
        "  `categoryID` INT,\n" +
        "  `channelID` INT,\n" +
        "  `city` STRING,\n" +
        "  `country` STRING\n" +
        ") PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (\n" +
        "  'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',\n" +
        "  'sink.partition-commit.trigger'='partition-time',\n" +
        "  'sink.partition-commit.delay'='1 h',\n" +
        "  'sink.partition-commit.policy.kind'='metastore,success-file'\n" +
        ")");

// Create the Kafka source table (back to the default dialect)
tabEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
tabEnv.executeSql("drop table if exists click_log");
tabEnv.executeSql("create table click_log(\n" +
        "  `count` INT,\n" +
        "  `message` ROW<`browserType` string, `categoryID` INT, `channelID` INT, `city` string, `country` string, `entryTime` BIGINT, `leaveTime` BIGINT, `network` string, `produceID` INT, `province` string, `source` string, `userID` INT, `flowID` STRING>,\n" +
        "  `timeStamp` BIGINT,\n" +
        "  `proctime` as PROCTIME() -- extra column holding the processing time\n" +
        ") with (\n" +
        "  'connector' = 'kafka',\n" +
        "  'topic' = 'click',\n" +
        "  'properties.bootstrap.servers' = 'node:9092',\n" +
        "  'properties.group.id' = 'test',\n" +
        "  'scan.startup.mode' = 'latest-offset',\n" +
        "  'format' = 'json',\n" +
        "  'json.fail-on-missing-field' = 'true',\n" +
        "  'json.ignore-parse-errors' = 'false',\n" +
        "  'value.fields-include' = 'ALL'\n" +
        ")");

// Stream from Kafka into the partitioned Hive table, deriving dt/hr from processing time
tabEnv.executeSql("insert into t_kafka_sink\n" +
        "select\n" +
        "  message.flowID,\n" +
        "  message.userID,\n" +
        "  message.browserType,\n" +
        "  message.categoryID,\n" +
        "  message.channelID,\n" +
        "  message.city,\n" +
        "  message.country,\n" +
        "  DATE_FORMAT(proctime, 'yyyy-MM-dd'),\n" +
        "  DATE_FORMAT(proctime, 'HH')\n" +
        "from click_log");
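Two caveats apply to this job. Streaming writes only commit Hive partitions when a checkpoint completes, so checkpointing must be enabled (as in the environment sketch earlier). And the 'partition-time' commit trigger relies on watermarks, while dt/hr above are derived from PROCTIME(), so committed partitions may stall. A sketch of switching the sink to a process-time trigger, assuming those semantics are acceptable for this pipeline:

// Sketch: with dt/hr based on PROCTIME(), a process-time commit trigger avoids waiting for
// watermarks that this pipeline never produces; it takes effect the next time the insert job is planned.
tabEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tabEnv.executeSql("ALTER TABLE t_kafka_sink SET TBLPROPERTIES (" +
        "'sink.partition-commit.trigger'='process-time')");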