Flume应用

Flume配置说明

1.部署结构图

2.Agent配置

##########################################
# agent config
###########################################
#*****************agent section**********************
agent.sources = source_pad source_adx source_pad_clean source_db_data
agent.channels = channel_pad channel_adx channel_pad_clean channel_adx_clean channel_db_data
agent.sinks = sink_pad sink_adx sink_pad_clean sink_pad_clean1 sink_adx_clean sink_adx_clean1 sink_db_data sink_db_data1
 
#*****************source section**********************
#pad source section
agent.sources.source_pad.deletePolicy=immediate
agent.sources.source_pad.type = spooldir
agent.sources.source_pad.channels = channel_pad
agent.sources.source_pad.spoolDir=/wls/data/pad
agent.sources.source_pad.batchSize=500
agent.sources.source_pad.bufferMaxLineLength=20000
#adx source section
agent.sources.source_adx.deletePolicy=immediate
agent.sources.source_adx.type = spooldir
agent.sources.source_adx.channels = channel_adx channel_adx_clean
agent.sources.source_adx.spoolDir=/wls/data/adx
agent.sources.source_adx.batchSize=500
agent.sources.source_adx.bufferMaxLineLength=10000
agent.sources.source_adx.inputCharset=UTF-8
agent.sources.source_adx.decodeErrorPolicy=IGNORE
agent.sources.source_adx.selector.type=replicating
 
#pad clean source section
agent.sources.source_pad_clean.type=org.apache.flume.source.kafka.KafkaSource
agent.sources.source_pad_clean.kafka.bootstrap.servers=collector3:9092,collector2:9092,collector1:9092
agent.sources.source_pad_clean.kafka.topics=pad_report_data_clean
agent.sources.source_pad_clean.channels=channel_pad_clean
 
#db data source section
agent.sources.source_db_data.type=org.apache.flume.source.kafka.KafkaSource
agent.sources.source_db_data.kafka.bootstrap.servers=collector3:9092,collector2:9092,collector1:9092
agent.sources.source_db_data.kafka.topics=db_data
agent.sources.source_db_data.channels=channel_db_data
 
#*****************sink section**********************
#pad sink section
agent.sinks.sink_pad.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.sink_pad.kafka.bootstrap.servers=collector3:9092,collector2:9092,collector1:9092
agent.sinks.sink_pad.kafka.flumeBatchSize=500
agent.sinks.sink_pad.kafka.producer.acks=1
agent.sinks.sink_pad.kafka.producer.type=async
agent.sinks.sink_pad.kafka.topic=pad_report_data
agent.sinks.sink_pad.kafka.producer.compression.type = snappy
agent.sinks.sink_pad.kafka.producer.linger.ms=50
agent.sinks.sink_pad.channel = channel_pad
 
#adx sink section
agent.sinks.sink_adx.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.sink_adx.kafka.bootstrap.servers=collector1:9092,collector2:9092,collector3:9092
agent.sinks.sink_adx.kafka.flumeBatchSize=500
agent.sinks.sink_adx.kafka.producer.acks=1
agent.sinks.sink_adx.kafka.producer.type=async
agent.sinks.sink_adx.kafka.topic=adx_report_data
agent.sinks.sink_adx.kafka.producer.compression.type = snappy
agent.sinks.sink_adx.kafka.producer.linger.ms=50
agent.sinks.sink_adx.channel = channel_adx
 
#pad clean sink
agent.sinks.sink_pad_clean.type=avro
agent.sinks.sink_pad_clean.hostname=30.16.94.72
agent.sinks.sink_pad_clean.port=44444
agent.sinks.sink_pad_clean.threads=10
agent.sinks.sink_pad_clean.channel=channel_pad_clean
 
agent.sinks.sink_pad_clean1.type=avro
agent.sinks.sink_pad_clean1.hostname=30.16.94.75
agent.sinks.sink_pad_clean1.port=44444
agent.sinks.sink_pad_clean1.threads=10
agent.sinks.sink_pad_clean1.channel=channel_pad_clean
 
#adx clean sink
agent.sinks.sink_adx_clean.type=avro
agent.sinks.sink_adx_clean.hostname=30.16.94.72
agent.sinks.sink_adx_clean.port=44445
agent.sinks.sink_adx_clean.threads=10
agent.sinks.sink_adx_clean.channel=channel_adx_clean
 
agent.sinks.sink_adx_clean1.type=avro
agent.sinks.sink_adx_clean1.hostname=30.16.94.75
agent.sinks.sink_adx_clean1.port=44445
agent.sinks.sink_adx_clean1.threads=10
agent.sinks.sink_adx_clean1.channel=channel_adx_clean
 
#db data sink
agent.sinks.sink_db_data.type=avro
agent.sinks.sink_db_data.hostname=30.16.94.72
agent.sinks.sink_db_data.port=44446
agent.sinks.sink_db_data.threads=10
agent.sinks.sink_db_data.channel=channel_db_data
 
agent.sinks.sink_db_data1.type=avro
agent.sinks.sink_db_data1.hostname=30.16.94.75
agent.sinks.sink_db_data1.port=44446
agent.sinks.sink_db_data1.threads=10
agent.sinks.sink_db_data1.channel=channel_db_data
 
#*****************sink group**************************
# NOTE: agent.sinkgroups must be declared exactly once, listing all groups
# on a single line. Repeated "agent.sinkgroups=" assignments override each
# other, so the original three separate assignments left only the last
# group (gdb) active.
agent.sinkgroups = gpad gadx gdb

#pad clean sinkgroup
agent.sinkgroups.gpad.sinks=sink_pad_clean sink_pad_clean1
agent.sinkgroups.gpad.processor.type = load_balance
agent.sinkgroups.gpad.processor.selector = round_robin
agent.sinkgroups.gpad.processor.backoff = true


#adx clean sinkgroup
agent.sinkgroups.gadx.sinks=sink_adx_clean sink_adx_clean1
agent.sinkgroups.gadx.processor.type = load_balance
agent.sinkgroups.gadx.processor.selector = round_robin
agent.sinkgroups.gadx.processor.backoff = true


#db data sinkgroup
agent.sinkgroups.gdb.sinks=sink_db_data sink_db_data1
agent.sinkgroups.gdb.processor.type = load_balance
agent.sinkgroups.gdb.processor.selector = round_robin
agent.sinkgroups.gdb.processor.backoff = true
 
#*****************channel section**********************
#pad channel section
agent.channels.channel_pad.type = memory
agent.channels.channel_pad.capacity = 20000
agent.channels.channel_pad.keep-alive=60
agent.channels.channel_pad.transactionCapacity=2000
#adx channel section
agent.channels.channel_adx.type = memory
agent.channels.channel_adx.capacity = 20000
agent.channels.channel_adx.keep-alive=60
agent.channels.channel_adx.transactionCapacity=2000
#pad clean channel section
agent.channels.channel_pad_clean.type = memory
agent.channels.channel_pad_clean.capacity = 20000
agent.channels.channel_pad_clean.keep-alive=60
agent.channels.channel_pad_clean.transactionCapacity=2000
#adx clean channel section
agent.channels.channel_adx_clean.type = memory
agent.channels.channel_adx_clean.capacity = 20000
agent.channels.channel_adx_clean.keep-alive=60
agent.channels.channel_adx_clean.transactionCapacity=2000
#db data channel section
agent.channels.channel_db_data.type = memory
agent.channels.channel_db_data.capacity = 10000
agent.channels.channel_db_data.keep-alive=60
agent.channels.channel_db_data.transactionCapacity=2000
 
############interceptor########
agent.sources.source_pad.interceptors = i1
agent.sources.source_pad.interceptors.i1.type=org.apache.flume.sink.solr.morphline.UUIDInterceptor$Builder
agent.sources.source_pad.interceptors.i1.preserveExisting = false
agent.sources.source_pad.interceptors.i1.headerName =key
 
agent.sources.source_adx.interceptors=i2
agent.sources.source_adx.interceptors.i2.type=org.apache.flume.interceptor.Md5ConvertInterceptor$Builder
 
agent.sources.source_pad_clean.interceptors=i3
agent.sources.source_pad_clean.interceptors.i3.type=org.apache.flume.interceptor.DmpTimestampInterceptor$Builder
 
agent.sources.source_db_data.interceptors=i4
agent.sources.source_db_data.interceptors.i4.type=org.apache.flume.interceptor.DbDataInterceptor$Builder

数据流图:

3.Collector配置

##########################################
# collector config
###########################################
#*****************agent section**********************
collector.sources=source_pad source_adx source_db
collector.channels=channel_pad channel_adx channel_db
collector.sinks=sink_pad sink_adx sink_db
#*****************source section**********************
#pad source section
collector.sources.source_pad.type = avro
collector.sources.source_pad.channels = channel_pad
collector.sources.source_pad.bind=0.0.0.0
collector.sources.source_pad.port=44444
#adx source section
collector.sources.source_adx.type = avro
collector.sources.source_adx.channels = channel_adx
collector.sources.source_adx.bind=0.0.0.0
collector.sources.source_adx.port=44445
#db source section
collector.sources.source_db.type = avro
collector.sources.source_db.channels = channel_db
collector.sources.source_db.bind=0.0.0.0
collector.sources.source_db.port=44446
 
#pad clean sink
collector.sinks.sink_pad.type=hdfs
collector.sinks.sink_pad.hdfs.path=hdfs://dmp/data/logs/pad/%{day}
collector.sinks.sink_pad.hdfs.rollInterval=86400
collector.sinks.sink_pad.hdfs.rollSize=0
collector.sinks.sink_pad.hdfs.idleTimeout=172800
collector.sinks.sink_pad.hdfs.callTimeout=60000
collector.sinks.sink_pad.hdfs.writeFormat=Text
collector.sinks.sink_pad.hdfs.filePrefix=pad.master1
collector.sinks.sink_pad.hdfs.rollCount=9900000000
collector.sinks.sink_pad.hdfs.batchSize=3000
collector.sinks.sink_pad.hdfs.fileType=DataStream
collector.sinks.sink_pad.channel=channel_pad
 
#adx clean sink
collector.sinks.sink_adx.type=hdfs
collector.sinks.sink_adx.hdfs.path=hdfs://dmp/data/logs/%{datatype}/%{day}
collector.sinks.sink_adx.hdfs.rollInterval=0
collector.sinks.sink_adx.hdfs.rollSize=0
collector.sinks.sink_adx.hdfs.idleTimeout=172800
collector.sinks.sink_adx.hdfs.writeFormat=Text
collector.sinks.sink_adx.hdfs.callTimeout=60000
collector.sinks.sink_adx.hdfs.filePrefix=%{datatype}.master1
collector.sinks.sink_adx.hdfs.rollCount=9900000000
collector.sinks.sink_adx.hdfs.batchSize=3000
collector.sinks.sink_adx.hdfs.fileType=DataStream
collector.sinks.sink_adx.channel=channel_adx
 
#db clean sink
collector.sinks.sink_db.type=hdfs
collector.sinks.sink_db.hdfs.path=hdfs://dmp/data/logs/%{datatype}
collector.sinks.sink_db.hdfs.rollInterval=0
collector.sinks.sink_db.hdfs.rollSize=0
collector.sinks.sink_db.hdfs.idleTimeout=172800
collector.sinks.sink_db.hdfs.writeFormat=Text
collector.sinks.sink_db.hdfs.callTimeout=60000
collector.sinks.sink_db.hdfs.filePrefix=%{datatype}.master1
collector.sinks.sink_db.hdfs.rollCount=9900000000
collector.sinks.sink_db.hdfs.batchSize=1000
collector.sinks.sink_db.hdfs.fileType=DataStream
collector.sinks.sink_db.channel=channel_db
 
#*****************channel section**********************
#pad channel section
collector.channels.channel_pad.type = memory
collector.channels.channel_pad.capacity = 60000
collector.channels.channel_pad.keep-alive=60
collector.channels.channel_pad.transactionCapacity=10000
#adx channel section
collector.channels.channel_adx.type = memory
collector.channels.channel_adx.capacity = 60000
collector.channels.channel_adx.keep-alive=60
collector.channels.channel_adx.transactionCapacity=6000
#db channel section
collector.channels.channel_db.type = memory
collector.channels.channel_db.capacity = 30000
collector.channels.channel_db.keep-alive=60
collector.channels.channel_db.transactionCapacity=3000

数据流图:

4.Interceptor

在 event 处理链前置拦截器,可以根据数据为 event 添加一些 header,例如 type、partition、date,用于数据归类和分区,还可以做一些简单的过滤工作(flume 自带的时间戳解析过慢,不建议使用 timestamp 及相关表达式)。

实现 Flume 中的 Interceptor 接口,并新增一个实现 Interceptor.Builder 的静态内部类 Builder,然后实现相关方法。

/**
 * Example Flume interceptor skeleton: passes every event through unchanged.
 * A custom interceptor implements Interceptor and exposes a nested Builder
 * (implementing Interceptor.Builder) that Flume instantiates from the
 * "type" property of the source's interceptor configuration.
 */
public class DmpTimestampInterceptor implements Interceptor {

    @Override
    public void initialize() {
        // no setup required for this example
    }

    @Override
    public Event intercept(Event e) {
        // per-event hook; returns the event unmodified
        return e;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        // batch hook; returns the batch unmodified
        return events;
    }

    @Override
    public void close() {
        // no resources to release
    }

    /** Factory Flume uses to create the interceptor from configuration. */
    public static class Builder implements Interceptor.Builder {

        @Override
        public Interceptor build() {
            return new DmpTimestampInterceptor();
        }

        @Override
        public void configure(Context context) {
            // this example consumes no configuration properties
        }
    }
}

将 interceptor 的 type 配置为该内部类 Builder 的全限定类名即可。

5.其他配置

1.no-reload-conf 参数用于设置是否自动重新加载配置文件。

2.sinkgroups.<group>.processor.type 可设置为 failover(失效切换)或 load_balance(负载均衡)机制。

3.flume 组件启动顺序:channels——>sinks——>sources;关闭顺序:sources——>sinks——>channels。

4.sink 可以配置压缩选项。

5.hdfs.idleTimeout 用于设置文件长期未操作时自动关闭释放。

6.spool-source:监控目录下只能放静态文件,否则读取到正在编辑的文件时会异常终止。

posted @ 2017-09-24 00:15  秋水无声  阅读(197)  评论(0编辑  收藏  举报