Three Flume Interceptor Examples

This post collects three Flume interceptor examples from my recent work with Flume.
Case 1: Filtering out non-JSON data

Flume tails a log file and ships it to Kafka. The business only needs the JSON records in the log sent to Kafka, so non-JSON records are filtered out directly.

1. pom.xml
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>2.3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
        <version>1.9.0</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.62</version>
    </dependency>
</dependencies>
2. JSONUtil: check whether a record is JSON
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;

/**
 * Checks whether the given string is valid JSON.
 */
public class JSONUtil {
    public static boolean isJSON(String log) {
        boolean flag = false;
        try {
            JSONObject.parseObject(log);
            flag = true;
        } catch (JSONException e) {
            // not valid JSON, keep flag = false
        }
        return flag;
    }
}
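To make the filter's behavior concrete, here is a small sanity-check sketch for JSONUtil. The class name JSONUtilDemo and the sample strings are hypothetical, for illustration only.

public class JSONUtilDemo {
    public static void main(String[] args) {
        // A well-formed JSON record should pass.
        System.out.println(JSONUtil.isJSON("{\"mid\":\"m1\",\"ts\":1643505600000}")); // true
        // Plain text and truncated JSON are rejected.
        System.out.println(JSONUtil.isJSON("2022-01-30 09:51:00 INFO app started"));  // false
        System.out.println(JSONUtil.isJSON("{\"mid\":\"m1\","));                      // false
    }
}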
3. The interceptor
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

/**
 * Custom interceptor:
 * 1. implement Interceptor
 * 2. implement its four methods (initialize, intercept, intercept(List), close)
 */
public class ETLInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    /**
     * Drop events whose body is not valid JSON.
     */
    @Override
    public Event intercept(Event event) {
        byte[] body = event.getBody();
        String log = new String(body, StandardCharsets.UTF_8);
        // Keep the event only if the body parses as JSON
        boolean flag = JSONUtil.isJSON(log);
        return flag ? event : null;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        // Use an iterator so that events rejected above can be removed in place
        Iterator<Event> iterator = list.iterator();
        while (iterator.hasNext()) {
            Event next = iterator.next();
            if (intercept(next) == null) {
                iterator.remove();
            }
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new ETLInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
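As a quick check of the batch method, the following sketch (the class name ETLInterceptorDemo and the sample bodies are made up) builds two in-memory events with Flume's EventBuilder and runs them through intercept(List); only the JSON event should survive.

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class ETLInterceptorDemo {
    public static void main(String[] args) {
        ETLInterceptor interceptor = new ETLInterceptor();

        List<Event> batch = new ArrayList<>();
        batch.add(EventBuilder.withBody("{\"mid\":\"m1\",\"ts\":1643505600000}", StandardCharsets.UTF_8));
        batch.add(EventBuilder.withBody("not a json line", StandardCharsets.UTF_8));

        List<Event> kept = interceptor.intercept(batch);
        // Expect 1: only the JSON record is passed on to the channel
        System.out.println(kept.size());
    }
}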
4. Package the project and upload the jar-with-dependencies jar to the server (Flume's lib directory)
-rw-rw-r--. 1 hui hui 662626 Jan 30 09:51 collect.demo0125-1.0-SNAPSHOT-jar-with-dependencies.jar
[hui@hadoop201 lib]$ pwd
/opt/module/flume/lib
5. Write the Flume configuration file
[hui@hadoop201 job]$ cat file_to_kafka.conf
# Name the components
a1.sources = r1
a1.channels = c1

# Source
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.*
a1.sources.r1.positionFile = /opt/module/flume/taildir_position.json
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.wdh01.flume.interceptor.ETLInterceptor$Builder

# Channel
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop201:9092,hadoop202:9092
a1.channels.c1.kafka.topic = topic_log
# By default the Kafka channel serializes records as Flume Events; false writes the raw log string to Kafka instead
a1.channels.c1.parseAsFlumeEvent = false

# Bind the source to the channel (no sink is needed with a Kafka channel)
a1.sources.r1.channels = c1
6. Flume start/stop script
[hui@hadoop201 ~]$ cat bin/f1.sh
#!/bin/sh
case $1 in
"start"){
        for i in hadoop201 hadoop202
        do
                echo " -------- starting collection Flume on $i --------"
                ssh $i "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f /opt/module/flume/job/file_to_kafka.conf >/dev/null 2>&1 &"
        done
};;
"stop"){
        for i in hadoop201 hadoop202
        do
                echo " -------- stopping collection Flume on $i --------"
                ssh $i "ps -ef | grep file_to_kafka.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
        done
};;
esac
Case 2: Timestamp interceptor for log data (Kafka to HDFS)

Flume reads the JSON data that Case 1 wrote to Kafka and delivers it to the target path on HDFS. Because of transmission delay, events generated just before midnight can arrive after midnight and land in the wrong day's directory ("midnight drift"), so the event time inside the JSON record has to be taken into account.
1. The interceptor's pom file is the same as in Case 1
2. The interceptor
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

/**
 * Custom timestamp interceptor.
 */
public class TimeStampInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        // Replace the header timestamp with the event time from the log record,
        // so the HDFS sink names directories/files by the real event time.
        byte[] body = event.getBody();
        String log = new String(body, StandardCharsets.UTF_8);
        JSONObject jsonObject = JSONObject.parseObject(log);
        String timeStamp = jsonObject.getString("ts");

        Map<String, String> headers = event.getHeaders();
        headers.put("timestamp", timeStamp); // note: the header key must be exactly "timestamp"
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
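To illustrate why writing the event time into the "timestamp" header avoids midnight drift, here is a small standalone sketch (the class name and ts values are made up): an event generated at 23:59:59 that only reaches the HDFS sink a few seconds after midnight is still routed to the previous day's %Y-%m-%d directory, because the sink resolves the path from the header timestamp rather than from the arrival time.

import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

public class MidnightDriftDemo {
    public static void main(String[] args) {
        DateTimeFormatter day = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneId.of("Asia/Shanghai"));

        long eventTs   = 1643558399000L; // 2022-01-30 23:59:59, the "ts" carried in the log record
        long arrivalTs = 1643558402000L; // 2022-01-31 00:00:02, when the event reaches the HDFS sink

        // Without the interceptor the sink would effectively use the arrival time -> wrong day
        System.out.println("arrival-based dir: " + day.format(Instant.ofEpochMilli(arrivalTs)));
        // With headers.put("timestamp", ts) the sink uses the event time -> correct day
        System.out.println("header-based dir : " + day.format(Instant.ofEpochMilli(eventTs)));
    }
}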
3. Package and upload, same as Case 1
4. Flume configuration file
[hui@hadoop203 job]$ cat kafka_to_hdfs.conf
## Components
a1.sources = r1
a1.channels = c1
a1.sinks = k1

## source1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
# Send a batch once 5000 records are buffered
a1.sources.r1.batchSize = 5000
# ...or once 2000 ms have passed
a1.sources.r1.batchDurationMillis = 2000
# Kafka connection
a1.sources.r1.kafka.bootstrap.servers = hadoop201:9092,hadoop202:9092,hadoop203:9092
a1.sources.r1.kafka.topics = topic_log
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.wdh01.flume.interceptor.TimeStampInterceptor$Builder

## channel1
a1.channels.c1.type = file
# Checkpoint directory
a1.channels.c1.checkpointDir = /opt/module/flume/checkpoint/behavior1
# Data (on-disk) directory
a1.channels.c1.dataDirs = /opt/module/flume/data/behavior1/
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.capacity = 1000000
a1.channels.c1.keep-alive = 6

## sink1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
## Write gzip-compressed output files
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = gzip

## Wiring
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
5. Start/stop script
[hui@hadoop201 ~]$ cat bin/f2.sh
#!/bin/bash
case $1 in
"start")
        echo " -------- starting log-data Flume on hadoop203 --------"
        ssh hadoop203 "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf -f /opt/module/flume/job/kafka_to_hdfs.conf >/dev/null 2>&1 &"
;;
"stop")
        echo " -------- stopping log-data Flume on hadoop203 --------"
        ssh hadoop203 "ps -ef | grep kafka_to_hdfs.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill"
;;
esac
Case 3: Timestamp interceptor for business data (Maxwell to Kafka to HDFS)

Scenario: Maxwell monitors the MySQL binlog and pushes business-data changes to Kafka in real time; Flume then delivers this data to HDFS. Here, too, a timestamp interceptor is needed to avoid midnight drift.
1. The pom file is the same as in Case 1
2. The interceptor
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

/**
 * Timestamp interceptor for business data.
 */
public class TimeStampInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        String log = new String(event.getBody(), StandardCharsets.UTF_8);

        JSONObject jsonObject = JSONObject.parseObject(log);
        Long ts = jsonObject.getLong("ts");

        // Maxwell emits "ts" in seconds, but the Flume HDFS sink expects milliseconds
        String timeMills = String.valueOf(ts * 1000);
        headers.put("timestamp", timeMills);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
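For reference, here is a minimal sketch of the seconds-to-milliseconds conversion on a made-up Maxwell-style change record (the class name and field values are illustrative only).

import com.alibaba.fastjson.JSONObject;

public class MaxwellTsDemo {
    public static void main(String[] args) {
        // Hypothetical Maxwell change record: "ts" is in epoch seconds
        String log = "{\"database\":\"gmall\",\"table\":\"order_info\",\"type\":\"insert\","
                   + "\"ts\":1643558399,\"data\":{\"id\":1001}}";

        Long ts = JSONObject.parseObject(log).getLong("ts");
        // 1643558399 s -> 1643558399000 ms, the unit the HDFS sink expects in the "timestamp" header
        System.out.println(String.valueOf(ts * 1000));
    }
}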
3. Package and upload, same as Case 1
4. Flume configuration file
[hui@hadoop203 job]$ less kafka_to_hdfs_db.conf
## Components
a1.sources = r1
a1.channels = c1
a1.sinks = k1

## source1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.batchSize = 5000
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.kafka.bootstrap.servers = hadoop201:9092,hadoop202:9092
a1.sources.r1.kafka.topics = cart_info,comment_info,coupon_use,favor_info,order_detail_activity,order_detail_coupon,order_detail,order_info,order_refund_info,order_status_log,payment_info,refund_payment,user_info
a1.sources.r1.kafka.consumer.group.id = flume
a1.sources.r1.setTopicHeader = true
a1.sources.r1.topicHeader = topic
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.wdh01.flume.interceptor.db.TimeStampInterceptorNew$Builder

## channel1
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /opt/module/flume/checkpoint/behavior2
a1.channels.c1.dataDirs = /opt/module/flume/data/behavior2
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.capacity = 1123456
a1.channels.c1.keep-alive = 6

## sink1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/db/%{topic}_inc/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = db
a1.sinks.k1.hdfs.round = false
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = gzip

## Wiring
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
5. Flume start/stop script
[hui@hadoop201 ~]$ cat bin/f3.sh
#!/bin/bash
case $1 in
"start")
        echo " -------- starting business-data Flume on hadoop203 --------"
        ssh hadoop203 "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf -f /opt/module/flume/job/kafka_to_hdfs_db.conf >/dev/null 2>&1 &"
;;
"stop")
        echo " -------- stopping business-data Flume on hadoop203 --------"
        ssh hadoop203 "ps -ef | grep kafka_to_hdfs_db.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill"
;;
esac