flink ETL数据处理
Flink ETL 实现数据清洗
一:需求(针对算法产生的日志数据进行清洗拆分)
1. 算法产生的日志数据是嵌套json格式,需要拆分
2.针对算法中的国家字段进行大区转换
3.最后把不同类型的日志数据分别进行储存
二:整体架构
这里演示处理从rabbitmq来的数据 进行数据处理 然后发送到rabbitmq
自定义RedisSource(Flink没有内置的Redis Source连接器)
package com.yw.source;

import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.exceptions.JedisConnectionException;

import java.util.HashMap;
import java.util.Map;

/**
 * Custom Redis source — Flink ships no built-in Redis source connector.
 * <p>
 * Periodically reads the country-to-area mapping from the Redis hash
 * {@code areas} and emits it downstream as a {@code HashMap<country, area>}.
 * <p>
 * Expected data, initialized in Redis beforehand:
 * <pre>
 * hset areas AREA_IN IN
 * hset areas AREA_US US
 * hset areas AREA_CT TW,HK
 * hset areas AREA_AR PK,KW,SA
 * </pre>
 *
 * @Auther: YW
 * @Date: 2019/6/15 10:23
 * @Description:
 */
public class MyRedisSource implements SourceFunction<HashMap<String, String>> {

    private static final Logger LOG = LoggerFactory.getLogger(MyRedisSource.class);

    private static final String REDIS_HOST = "localhost";
    // BUG FIX: was 6397 — a typo; the Redis default port (used by the hset
    // initialization above) is 6379.
    private static final int REDIS_PORT = 6379;
    // Refresh interval: re-read the mapping from Redis once per minute.
    private static final long SLEEP_MILLIS = 60000L;

    // volatile: cancel() is invoked from a different thread than run().
    private volatile boolean isRunning = true;
    private Jedis jedis = null;

    /**
     * Polls Redis in a loop until cancelled, emitting a fresh snapshot of
     * the country -> area mapping every {@link #SLEEP_MILLIS} milliseconds.
     */
    @Override
    public void run(SourceContext<HashMap<String, String>> ctx) throws Exception {
        this.jedis = new Jedis(REDIS_HOST, REDIS_PORT);
        while (isRunning) {
            try {
                // BUG FIX: build a NEW map every cycle instead of clearing and
                // re-emitting one shared instance. A chained downstream operator
                // keeps a reference to the emitted map, so clearing it here
                // would wipe the mapping the operator is still using.
                HashMap<String, String> countryToArea = new HashMap<>();
                // Hash layout: field = area id (e.g. AREA_AR),
                // value = comma-separated country codes (e.g. "PK,KW,SA").
                Map<String, String> areas = jedis.hgetAll("areas");
                for (Map.Entry<String, String> entry : areas.entrySet()) {
                    String area = entry.getKey();
                    for (String country : entry.getValue().split(",")) {
                        // Invert the relation: country code -> area id.
                        countryToArea.put(country, area);
                    }
                }
                if (!countryToArea.isEmpty()) {
                    ctx.collect(countryToArea);
                } else {
                    LOG.warn("获取数据为空!");
                }
                Thread.sleep(SLEEP_MILLIS);
            } catch (JedisConnectionException e) {
                // BUG FIX: log the exception itself, not e.getCause() (which
                // may be null and drops the stack trace either way).
                LOG.error("redis连接异常 重新连接", e);
                // Close the broken connection before replacing it, so the
                // old socket is not leaked on reconnect.
                try {
                    jedis.close();
                } catch (Exception ignored) {
                    // best effort — the connection is already broken
                }
                jedis = new Jedis(REDIS_HOST, REDIS_PORT);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop promptly; the sleep is
                // typically interrupted when the job is cancelled.
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                LOG.error("redis Source其他异常", e);
            }
        }
    }

    /**
     * Stops the polling loop and releases the Redis connection.
     */
    @Override
    public void cancel() {
        isRunning = false;
        // BUG FIX: the original used `while (jedis != null)` which never
        // terminates because jedis is never set to null — close() once.
        if (jedis != null) {
            jedis.close();
        }
    }
}
DataClean数据处理
package com.yw;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.rabbitmq.client.AMQP;
import com.yw.source.MyRedisSource;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction;
import org.apache.flink.streaming.connectors.rabbitmq.RMQSink;
import org.apache.flink.streaming.connectors.rabbitmq.RMQSinkPublishOptions;
import org.apache.flink.streaming.connectors.rabbitmq.RMQSource;
import org.apache.flink.streaming.connectors.rabbitmq.common.RMQConnectionConfig;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;

import java.util.HashMap;

/**
 * ETL job: consumes nested-JSON log records from RabbitMQ, enriches each
 * inner record with the area looked up from the country code (mapping
 * streamed in from Redis via {@link MyRedisSource}), flattens the nested
 * array into individual records, and publishes them back to RabbitMQ.
 *
 * @Auther: YW
 * @Date: 2019/6/15 10:09
 * @Description:
 */
public class DataClean {

    // Input queue consumed from RabbitMQ (must match the producer's queue).
    public final static String QUEUE_NAME = "two.aa.in";

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint once per minute with exactly-once semantics.
        env.enableCheckpointing(60000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        // At least 30s (half the interval) between the end of one checkpoint
        // and the start of the next.
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
        // Abort a checkpoint that takes longer than 10s.
        env.getCheckpointConfig().setCheckpointTimeout(10000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        // Keep externalized checkpoints around after cancellation so the job
        // can be restored manually.
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        final RMQConnectionConfig rmqConf = new RMQConnectionConfig.Builder()
                .setHost("127.0.0.1")
                .setPort(5672)
                .setVirtualHost("/")
                .setUserName("guest")
                .setPassword("guest")
                .build();

        // RabbitMQ source. Parallelism 1 — NOTE(review): with
        // usesCorrelationId=false the RMQSource only gives exactly-once when
        // it is non-parallel; confirm this is why parallelism is pinned.
        // Sample input record:
        // {"dt":"2019-06-10","countryCode":"US","data":[{"type":"s1","score":0.3,"level":"A"},
        //  {"type":"s2","score":0.1,"level":"B"},{"type":"s3","score":0.2,"level":"C"}]}
        DataStream<String> data1 = env.addSource(new RMQSource<String>(rmqConf, QUEUE_NAME, false, new SimpleStringSchema())).setParallelism(1);

        // Slowly-changing country -> area mapping streamed from Redis.
        DataStreamSource<HashMap<String, String>> mapData = env.addSource(new MyRedisSource());

        // connect() joins the two streams so the enrichment map and the log
        // records are processed by the same operator instance.
        DataStream<String> streamOperator = data1.connect(mapData).flatMap(new CoFlatMapFunction<String, HashMap<String, String>, String>() {

            // Latest country -> area mapping received from Redis.
            // NOTE(review): plain field, not Flink managed state — it is lost
            // on failure/restore, and records arriving before the first Redis
            // emission get area=null. Confirm this is acceptable.
            private HashMap<String, String> allMap = new HashMap<String, String>();

            // flatMap1: handles the RabbitMQ log records — splits the nested
            // "data" array into one output record per element, stamping each
            // with the parent's dt and the resolved area.
            @Override
            public void flatMap1(String value, Collector<String> out) throws Exception {
                JSONObject jsonObject = JSONObject.parseObject(value);
                String countryCode = jsonObject.getString("countryCode");
                String dt = jsonObject.getString("dt");
                // Country code -> area (e.g. US -> AREA_US); null if unknown.
                String area = allMap.get(countryCode);
                JSONArray jsonArray = jsonObject.getJSONArray("data");
                for (int i = 0; i < jsonArray.size(); i++) {
                    JSONObject jsonObject1 = jsonArray.getJSONObject(i);
                    jsonObject1.put("area", area);
                    jsonObject1.put("dt", dt);
                    out.collect(jsonObject1.toJSONString());
                }
            }

            // flatMap2: handles the Redis stream — simply swaps in the newest
            // mapping snapshot; emits nothing.
            @Override
            public void flatMap2(HashMap<String, String> value, Collector<String> out) throws Exception {
                this.allMap = value;
            }
        });

        // Publish enriched records back to RabbitMQ via a fixed exchange and
        // routing key (both constant regardless of the record content).
        streamOperator.addSink(new RMQSink<String>(rmqConf, new SimpleStringSchema(), new RMQSinkPublishOptions<String>() {
            @Override
            public String computeRoutingKey(String s) {
                return "CC";
            }

            @Override
            public AMQP.BasicProperties computeProperties(String s) {
                // null -> default message properties
                return null;
            }

            @Override
            public String computeExchange(String s) {
                return "test.flink.output";
            }
        }));

        // Debug aid: echo the raw input records to stdout.
        data1.print();

        env.execute("etl");
    }
}
rabbitmq 模拟数据
package com.yw;

import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

/**
 * Test-data producer: publishes randomly generated nested-JSON log records
 * to the RabbitMQ queue consumed by DataClean.
 *
 * @Auther: YW
 * @Date: 2019/6/5 14:57
 * @Description:
 */
public class RabbitMQProducerUtil {

    // Must match DataClean.QUEUE_NAME.
    public final static String QUEUE_NAME = "two.aa.in";

    // Shared RNG for all the random field generators below.
    private static final Random RANDOM = new Random();

    public static void main(String[] args) throws Exception {
        // Create the connection factory and configure the broker endpoint.
        ConnectionFactory factory = new ConnectionFactory();
        factory.setHost("127.0.0.1");
        factory.setUsername("guest");
        factory.setPassword("guest");
        factory.setVirtualHost("/");
        factory.setPort(5672);

        // Open a connection and a channel.
        Connection connection = factory.newConnection();
        Channel channel = connection.createChannel();

        // Declare the queue (left commented out, as in the original — the
        // queue is assumed to exist already).
        // channel.queueDeclare(QUEUE_NAME, false, false, false, null);

        // Publish 20 records. BUG FIX: the original built ONE message outside
        // the loop (so all records were identical) and appended the loop index
        // after the closing brace ("...}0"), which made every message invalid
        // JSON. Build a fresh, well-formed message per iteration instead.
        for (int i = 0; i < 20; i++) {
            String message = buildMessage();
            channel.basicPublish("", QUEUE_NAME, null, message.getBytes("UTF-8"));
            System.out.println("Producer Send +'" + message);
        }

        // Close the channel and the connection.
        channel.close();
        connection.close();
    }

    /**
     * Builds one log record in the nested format DataClean expects:
     * {"dt":"...","countryCode":"XX","data":[{"type":"sN","score":0.N,"level":"X"} x3]}
     * <p>
     * BUG FIX: the original concatenation was missing the {@code "data":[}
     * array opener and the comma between the "score" and "level" fields,
     * producing JSON the downstream job could not parse.
     *
     * @return a well-formed JSON record as a String
     */
    private static String buildMessage() {
        StringBuilder sb = new StringBuilder();
        sb.append("{\"dt\":\"").append(getCurrentTime())
          .append("\",\"countryCode\":\"").append(getCountryCode())
          .append("\",\"data\":[");
        for (int i = 0; i < 3; i++) {
            if (i > 0) {
                sb.append(',');
            }
            sb.append("{\"type\":\"").append(getType())
              .append("\",\"score\":").append(getScore())
              .append(",\"level\":\"").append(getLevel())
              .append("\"}");
        }
        sb.append("]}");
        return sb.toString();
    }

    /** @return the current time formatted as yyyy-MM-dd HH:mm:ss */
    public static String getCurrentTime() {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return sdf.format(new Date());
    }

    /** @return a random country code from the fixed test set */
    public static String getCountryCode() {
        return pick(new String[]{"US", "TN", "HK", "PK", "KW", "SA", "IN"});
    }

    /** @return a random log type s1..s5 */
    public static String getType() {
        return pick(new String[]{"s1", "s2", "s3", "s4", "s5"});
    }

    /** @return a random score 0.1..0.5 (as a String, emitted as a JSON number) */
    public static String getScore() {
        return pick(new String[]{"0.1", "0.2", "0.3", "0.4", "0.5"});
    }

    /** @return a random level A..E */
    public static String getLevel() {
        return pick(new String[]{"A", "B", "C", "D", "E"});
    }

    // Uniformly picks one element of the given array.
    private static String pick(String[] options) {
        return options[RANDOM.nextInt(options.length)];
    }
}
redis 初始化数据
* hset areas AREA_IN IN
* hset areas AREA_US US
* hset areas AREA_CT TW,HK
* hset areas AREA_AR PK,KW,SA
------------最后运行DataClean------------
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 25岁的心里话
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现