|NO.Z.00045|——————————|BigDataEnd|——|Hadoop&Flume.V08|——|Flume.v08|Flume.v1.9 Case Study.v06|
1. Collecting Data from Monitored Log Files to HDFS and the Local File System
### --- Collecting data from monitored log files to HDFS and the local file system
~~~ # Business requirement:
~~~ Monitor log files and upload the collected data to both HDFS and the local file system.
### --- Requirement analysis:
~~~ Multiple cascaded Agents are required (see the topology sketch after this list).
~~~ source: taildir
~~~ channel: memory
~~~ final sinks: hdfs and file_roll, respectively
~~~ # Taildir Source
~~~ A new source added in Flume 1.7.0, roughly equivalent to spooldir source + exec source.
~~~ It can monitor multiple directories and use regular expressions to match file names in those directories for real-time collection.
~~~ It tails a batch of files in real time and records the latest consumed position of each file, so no data is lost after the agent process restarts.
~~~ It is currently not supported on Windows. It does not touch the files it tracks in any way:
~~~ it neither renames nor deletes nor otherwise modifies them.
~~~ It does not support reading binary files; it reads text files line by line.
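~~~ # Pipeline topology
~~~ A sketch of the three-agent flow described above (hostnames and ports are the ones used in the configurations below):
/tmp/root/.*log -> taildir source (agent a1, replicating selector)
    -> memory channel c1 -> avro sink (linux123:9091) -> agent a2 -> hdfs sink -> hdfs://linux121:9000/flume2/...
    -> memory channel c2 -> avro sink (linux123:9092) -> agent a3 -> file_roll sink -> /root/flume/output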

2. Implementation Steps:
### --- Create the first configuration file, flume-taildir-avro.conf, which contains:
~~~ 1 taildir source
~~~ 2 memory channels
~~~ 2 avro sinks
[root@linux123 ~]# vim $FLUME_HOME/conf/flume-taildir-avro.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2
# Replicate the data flow to all channels (replicating is also the default selector type)
a1.sources.r1.selector.type = replicating
# source
a1.sources.r1.type = taildir
# File that records the latest consumed position of each tailed file
a1.sources.r1.positionFile = /root/flume/taildir_position.json
a1.sources.r1.filegroups = f1
# Note: .*log is a regular expression; writing *.log here would be wrong
a1.sources.r1.filegroups.f1 = /tmp/root/.*log
# sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = linux123
a1.sinks.k1.port = 9091
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = linux123
a1.sinks.k2.port = 9092
# channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 500
a1.channels.c2.type = memory
a1.channels.c2.capacity = 10000
a1.channels.c2.transactionCapacity = 500
# Bind the source and sink to the channel
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
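~~~ # A quick sanity check before starting the agents (assuming Hive keeps its default log directory, /tmp/<user>, here /tmp/root): confirm that files matching the .*log pattern exist in the monitored directory.
[root@linux123 ~]# ls -l /tmp/root/ | grep log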
### --- Create the second configuration file, flume-avro-hdfs.conf, which contains:
~~~ 1 avro source
~~~ 1 memory channel
~~~ 1 hdfs sink
[root@linux123 ~]# vim $FLUME_HOME/conf/flume-avro-hdfs.conf
# Name the components on this agent
a2.sources = r1
a2.sinks = k1
a2.channels = c1
# Describe/configure the source
a2.sources.r1.type = avro
a2.sources.r1.bind = linux123
a2.sources.r1.port = 9091
# Describe the channel
a2.channels.c1.type = memory
a2.channels.c1.capacity = 10000
a2.channels.c1.transactionCapacity = 500
# Describe the sink
a2.sinks.k1.type = hdfs
a2.sinks.k1.hdfs.path = hdfs://linux121:9000/flume2/%Y%m%d/%H
# Prefix for the files uploaded to HDFS
a2.sinks.k1.hdfs.filePrefix = flume2-
# Use the local timestamp to resolve the escape sequences in hdfs.path
a2.sinks.k1.hdfs.useLocalTimeStamp = true
# Flush to HDFS once every 500 events
a2.sinks.k1.hdfs.batchSize = 500
# File type; DataStream writes plain text (compressed output is also supported)
a2.sinks.k1.hdfs.fileType = DataStream
# Roll a new file every 60 seconds
a2.sinks.k1.hdfs.rollInterval = 60
# rollSize/rollCount of 0 disable size- and count-based rolling, so files roll purely by time
a2.sinks.k1.hdfs.rollSize = 0
a2.sinks.k1.hdfs.rollCount = 0
a2.sinks.k1.hdfs.minBlockReplicas = 1
# Bind the source and sink to the channel
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
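~~~ # Once a2 is running, the files landing in HDFS for the current hour can be listed like this (a sketch; the path pattern mirrors the %Y%m%d/%H escapes in hdfs.path):
[root@linux123 ~]# hdfs dfs -ls /flume2/$(date +%Y%m%d)/$(date +%H)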
### --- Create the third configuration file, flume-avro-file.conf, which contains:
~~~ 1 avro source
~~~ 1 memory channel
~~~ 1 file_roll sink
[root@linux123 ~]# vim $FLUME_HOME/conf/flume-avro-file.conf
# Name the components on this agent
a3.sources = r1
a3.sinks = k1
a3.channels = c2
# Describe/configure the source
a3.sources.r1.type = avro
a3.sources.r1.bind = linux123
a3.sources.r1.port = 9092
# Describe the sink
a3.sinks.k1.type = file_roll
# The directory must be created in advance
a3.sinks.k1.sink.directory = /root/flume/output
# Describe the channel
a3.channels.c2.type = memory
a3.channels.c2.capacity = 10000
a3.channels.c2.transactionCapacity = 500
# Bind the source and sink to the channel
a3.sources.r1.channels = c2
a3.sinks.k1.channel = c2
### --- Start the 3 Agents (downstream agents a3 and a2 first, so their avro sources are listening before a1 starts sending)
[root@linux123 ~]# mkdir -p /root/flume/output
[root@linux123 ~]# $FLUME_HOME/bin/flume-ng agent --name a3 \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/flume-avro-file.conf \
-Dflume.root.logger=INFO,console &
[root@linux123 ~]# $FLUME_HOME/bin/flume-ng agent --name a2 \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/flume-avro-hdfs.conf \
-Dflume.root.logger=INFO,console &
[root@linux123 ~]# $FLUME_HOME/bin/flume-ng agent --name a1 \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/flume-taildir-avro.conf \
-Dflume.root.logger=INFO,console &
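~~~ # After a3 and a2 come up, you can verify that the two avro sources are listening before a1 starts shipping events (a sketch; assumes net-tools is installed):
[root@linux123 ~]# netstat -ntlp | grep -E ':(9091|9092)'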
### --- Run a hive command to generate log entries
[root@linux123 ~]# hive -e "show databases;"
# The position file that was generated
[root@linux123 ~]# cat flume/taildir_position.json
[{"inode":135825763,"pos":111033,"file":"/tmp/root/hive.log"}]
# Local files
[root@linux123 ~]# ls flume/output/
1630135382504-1 1630135382504-12 1630135382504-15 1630135382504-18 1630135382504-20 1630135382504-4 1630135382504-7
1630135382504-10 1630135382504-13 1630135382504-16 1630135382504-19 1630135382504-21 1630135382504-5 1630135382504-8
1630135382504-11 1630135382504-14 1630135382504-17 1630135382504-2 1630135382504-3 1630135382504-6 1630135382504-9
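~~~ # The many small files are expected: the file_roll sink rolls to a new file every 30 seconds by default (sink.rollInterval = 30), even when no events arrive. A sketch to watch the count grow:
[root@linux123 ~]# ls /root/flume/output | wc -l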
# HDFS files
[root@linux123 ~]# hdfs dfs -ls /flume2/20210828/15
Found 2 items
-rw-r--r-- 3 root supergroup 102205 2021-08-28 15:30 /flume2/20210828/15/flume2-.1630135773740
-rw-r--r-- 3 root supergroup 8828 2021-08-28 15:32 /flume2/20210828/15/flume2-.1630135865690
### --- Check the HDFS files, the local files, and the consumed-position file
~~~ # Comparison of the 3 sources for monitoring log files
~~~ exec source: suitable for monitoring a single file that is appended to in real time, but cannot guarantee zero data loss;
~~~ spooldir source: guarantees zero data loss and can resume from a checkpoint, but has higher latency and cannot monitor in real time;
~~~ taildir source: can resume from the recorded position, guarantees zero data loss, and monitors in real time.
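~~~ # For contrast, a minimal exec-source sketch (the agent name a9 and channel wiring are made up for illustration): exec simply runs a command such as tail -F and keeps no position file, which is why events can be lost if the agent dies mid-stream.
a9.sources = r1
a9.channels = c1
a9.sources.r1.type = exec
a9.sources.r1.command = tail -F /tmp/root/hive.log
a9.sources.r1.channels = c1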
I strove with none, for none was worth my strife. Nature I loved and, next to Nature, Art: I warm'd both hands before the fire of life. It sinks, and I am ready to depart.
—— Walter Savage Landor