Spark Streaming
Hadoop
- Distributed file system / storage: HDFS
- Distributed computing: MapReduce, Spark (Spark SQL / Spark Streaming)
- Resource management and scheduling: YARN
Install Java: jdk1.8.0_144
Version 1.8 or above, downloaded from the official site
- Add environment variables:
export JAVA_HOME=/app/jdk1.8.0_144
export PATH=$JAVA_HOME/bin:$PATH
- Apply the configuration:
source profile
- Test:
java -version
Install Flume: apache-flume-1.6.0-cdh5.7.0-bin, downloaded from the CDH5 archive
- Add environment variables:
export FLUME_HOME=/app/apache-flume-1.6.0-cdh5.7.0-bin
export PATH=$FLUME_HOME/bin:$PATH
- Apply the configuration:
source profile
- Configuration files (in $FLUME_HOME/conf/):
cp flume-env.sh.template flume-env.sh
- In flume-env.sh, set:
export JAVA_HOME=/app/jdk1.8.0_144
- Test:
flume-ng version
The key to using Flume is the configuration file:
- configure the Source
- configure the Channel
- configure the Sink
- wire the three components together
- a1: agent name
- r1: source name
- k1: sink name
- c1: channel name
- --name: name of the agent
- --conf: Flume configuration directory
- --conf-file: the agent configuration file to run
Setup option 1
- netcat source + memory channel + logger sink
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
- Run:
flume-ng agent --name a1 --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/example.conf -Dflume.root.logger=INFO,console
- Test:
Use telnet: telnet localhost 44444
Setup option 2
- exec source + memory channel + logger sink
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /spark/data.log
a1.sources.r1.shell = /bin/sh -c
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
- Run:
flume-ng agent --name a1 --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/exec-memory-logger.conf -Dflume.root.logger=INFO,console
Setup option 3 (two agents chained via avro)
- exec source + memory channel + avro sink
- avro source + memory channel + logger sink
exec-memory-avro.conf
exec-memory-avro.sources = exec-source
exec-memory-avro.sinks = avro-sink
exec-memory-avro.channels = memory-channel
exec-memory-avro.sources.exec-source.type = exec
exec-memory-avro.sources.exec-source.command = tail -F /spark/data.log
exec-memory-avro.sources.exec-source.shell = /bin/sh -c
exec-memory-avro.sinks.avro-sink.type = avro
exec-memory-avro.sinks.avro-sink.hostname = localhost
exec-memory-avro.sinks.avro-sink.port = 44444
exec-memory-avro.channels.memory-channel.type = memory
exec-memory-avro.sources.exec-source.channels = memory-channel
exec-memory-avro.sinks.avro-sink.channel = memory-channel
avro-memory-logger.conf
avro-memory-logger.sources = avro-source
avro-memory-logger.sinks = logger-sink
avro-memory-logger.channels = memory-channel
avro-memory-logger.sources.avro-source.type = avro
avro-memory-logger.sources.avro-source.bind = localhost
avro-memory-logger.sources.avro-source.port= 44444
avro-memory-logger.sinks.logger-sink.type =logger
avro-memory-logger.channels.memory-channel.type = memory
avro-memory-logger.sources.avro-source.channels = memory-channel
avro-memory-logger.sinks.logger-sink.channel = memory-channel
- Run (start the avro-memory-logger agent first so its avro source is listening, then start exec-memory-avro):
avro-memory-logger.conf
flume-ng agent --name avro-memory-logger --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/avro-memory-logger.conf -Dflume.root.logger=INFO,console
exec-memory-avro.conf
flume-ng agent --name exec-memory-avro --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/exec-memory-avro.conf -Dflume.root.logger=INFO,console
Kafka architecture
- producer: produces messages
- consumer: consumes messages
- broker: stores the messages (the "basket")
- topic: a topic, i.e. a label/category for messages
Install ZooKeeper
Version: zookeeper-3.4.5-cdh5.7.0.tar.gz, downloaded from the CDH5 archive
- Add environment variables:
export ZK_HOME=/app/zookeeper-3.4.5-cdh5.7.0
export PATH=$ZK_HOME/bin:$PATH
- Apply the configuration:
source profile
- Configuration files (in $ZK_HOME/conf/):
cp zoo_sample.cfg zoo.cfg
- In zoo.cfg, set:
dataDir = /app/tmp/zk
- Test:
./zkServer.sh start
Install Kafka: kafka_2.11-0.9.0.0 (download)
- Add environment variables:
export KAFKA_HOME=/app/kafka_2.11-0.9.0.0
export PATH=$KAFKA_HOME/bin:$PATH
- Apply the configuration:
source profile
- Configuration files (in $KAFKA_HOME/config/):
server.properties Setting 1) broker.id = 0
2) listeners=PLAINTEXT://:9092
3) host.name=localhost
4) log.dirs = /app/tmp/kafka-logs
5) zookeeper.connect = localhost:2181
- Start Kafka:
- Start ZooKeeper first: zkServer.sh start
kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
- Test:
jps -m
- Create a topic:
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
- List topics:
kafka-topics.sh --list --zookeeper localhost:2181
- Show topic details:
kafka-topics.sh --describe --zookeeper localhost:2181
- Send messages:
kafka-console-producer.sh --broker-list localhost:9092 --topic test
- Consume messages:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic test --from-beginning
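The console scripts above are thin wrappers around the producer/consumer client APIs. A minimal producer sketch for the same test topic, assuming the kafka-clients library matching the 0.9 broker is on the classpath and the broker runs on localhost:9092 (names are illustrative only):

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object SimpleProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // brokers the producer connects to
    props.put("bootstrap.servers", "localhost:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // the producer publishes a message to a topic; brokers store it; consumers read it
    producer.send(new ProducerRecord[String, String]("test", "hello kafka"))
    producer.close()
  }
}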
- Configure a single-node, multi-broker Kafka setup:
cp server.properties server-1.properties
cp server.properties server-2.properties
- Configuration files (in $KAFKA_HOME/config/):
server-1.properties 1) broker.id = 1
2) listeners=PLAINTEXT://:9093
3) host.name=localhost
4) log.dirs = /app/tmp/kafka-logs-1
5) zookeeper.connect = localhost:2181
server-2.properties 1) broker.id = 2
2) listeners=PLAINTEXT://:9094
3) host.name=localhost
4) log.dirs = /app/tmp/kafka-logs-2
5) zookeeper.connect = localhost:2181
- Start the additional brokers:
kafka-server-start.sh -daemon $KAFKA_HOME/config/server-1.properties &
...
kafka-server-start.sh -daemon $KAFKA_HOME/config/server-2.properties &
...
- Test:
jps -m
- Create a topic:
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic my-replicated-topic
- Send messages:
kafka-console-producer.sh --broker-list localhost:9093,localhost:9094 --topic my-replicated-topic
- Consume messages:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic my-replicated-topic
Integrating Flume and Kafka
- exec source + memory channel + avro sink
- avro source + memory channel + kafka sink
exec-memory-avro.conf
exec-memory-avro.sources = exec-source
exec-memory-avro.sinks = avro-sink
exec-memory-avro.channels = memory-channel
exec-memory-avro.sources.exec-source.type = exec
exec-memory-avro.sources.exec-source.command = tail -F /app/data.log
exec-memory-avro.sources.exec-source.shell = /bin/sh -c
exec-memory-avro.sinks.avro-sink.type = avro
exec-memory-avro.sinks.avro-sink.hostname = localhost
exec-memory-avro.sinks.avro-sink.port = 44444
exec-memory-avro.channels.memory-channel.type = memory
exec-memory-avro.sources.exec-source.channels = memory-channel
exec-memory-avro.sinks.avro-sink.channel = memory-channel
avro-memory-kafka.conf
avro-memory-kafka.sources = avro-source
avro-memory-kafka.sinks = kafka-sink
avro-memory-kafka.channels = memory-channel
avro-memory-kafka.sources.avro-source.type = avro
avro-memory-kafka.sources.avro-source.bind = localhost
avro-memory-kafka.sources.avro-source.port= 44444
avro-memory-kafka.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
avro-memory-kafka.sinks.kafka-sink.brokerList = localhost:9092
avro-memory-kafka.sinks.kafka-sink.topic = test
avro-memory-kafka.sinks.kafka-sink.batchSize = 5
avro-memory-kafka.sinks.kafka-sink.requiredAcks = 1
avro-memory-kafka.channels.memory-channel.type = memory
avro-memory-kafka.sources.avro-source.channels = memory-channel
avro-memory-kafka.sinks.kafka-sink.channel = memory-channel
- Run: start Kafka
Start ZooKeeper first: zkServer.sh start
kafka-server-start.sh $KAFKA_HOME/config/server.properties
- Run (start avro-memory-kafka first, then exec-memory-avro):
avro-memory-kafka.conf
flume-ng agent --name avro-memory-kafka --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/avro-memory-kafka.conf -Dflume.root.logger=INFO,console
exec-memory-avro.conf
flume-ng agent --name exec-memory-avro --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/exec-memory-avro.conf -Dflume.root.logger=INFO,console
- Test:
jps -m
- Create the topic:
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
- List topics:
kafka-topics.sh --list --zookeeper localhost:2181
- Show topic details:
kafka-topics.sh --describe --zookeeper localhost:2181
- Send messages:
kafka-console-producer.sh --broker-list localhost:9092 --topic test
- Consume:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic test --from-beginning
Install Scala 2.11.8 (download):
wget https://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.tgz
- Add environment variables:
export SCALA_HOME=/app/scala-2.11.8
export PATH=$SCALA_HOME/bin:$PATH
- Apply the configuration:
source profile
- Test:
scala
Install Maven 3.3.9 (download)
- Add environment variables:
export MAVEN_HOME=/app/apache-maven-3.3.9
export PATH=$MAVEN_HOME/bin:$PATH
- Apply the configuration:
source profile
- Test:
mvn -version
- Configure: /app/apache-maven-3.3.9/conf/settings.xml, set the local repository:
<localRepository>/app/tmp/maven_repos/</localRepository>
Install Hadoop: hadoop-2.6.0-cdh5.7.0 (download)
wget http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.6.0-cdh5.7.0.tar.gz
- Add environment variables:
export HADOOP_HOME=/app/hadoop-2.6.0-cdh5.7.0
export PATH=$HADOOP_HOME/bin:$PATH
- Apply the configuration:
source profile
- Configure passwordless SSH:
ssh-keygen -t rsa
cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys
- Extract the archive:
tar -zxvf hadoop-2.6.0-cdh5.7.0.tar.gz
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/app/jdk1.8.0_171
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:8020</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/app/tmp</value>
</property>
</configuration>
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/slaves
localhost
./hdfs namenode -format
cd /app/hadoop-2.6.0-cdh5.7.0/sbin
./start-dfs.sh
Visit www.samor.site:50070 (the HDFS NameNode web UI)
cd /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
vim yarn-site.xml
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
cd /app/hadoop-2.6.0-cdh5.7.0/sbin
./start-yarn.sh
Visit www.samor.site:8088 (the YARN ResourceManager web UI)
./hadoop fs -ls /
./hadoop fs -mkdir /data
./hadoop fs -ls /data
./hadoop fs -put mr-jobhistory-daemon.sh /data/
./hadoop fs -ls /data
./hadoop fs -text /data/mr-jobhistory-daemon.sh
cd /app/hadoop-2.6.0-cdh5.7.0/share/hadoop/mapreduce
hadoop jar hadoop-mapreduce-examples-2.6.0-cdh5.7.0.jar pi 2 3
Download HBase: hbase-1.2.0-cdh5.7.0
wget http://archive.cloudera.com/cdh5/cdh/5/hbase-1.2.0-cdh5.7.0.tar.gz
- Add environment variables:
export HBASE_HOME=/app/hbase-1.2.0-cdh5.7.0
export PATH=$HBASE_HOME/bin:$PATH
- Apply the configuration:
source profile
- Configure:
cd /app/hbase-1.2.0-cdh5.7.0/conf
vim hbase-env.sh
export JAVA_HOME=/app/jdk1.8.0_171
export HBASE_MANAGES_ZK=false
vim hbase-site.xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://localhost:8020/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>localhost:2181</value>
</property>
</configuration>
vim regionservers
localhost
- Start:
cd /app/hadoop-2.6.0-cdh5.7.0/sbin
./start-dfs.sh
cd /app/hbase-1.2.0-cdh5.7.0/bin
./start-hbase.sh
./hbase shell
- HBase shell commands:
List tables: list
Create a table: create 'my_hbase','info'
Describe a table: desc 'my_hbase'
Scan rows: scan 'my_hbase'
Rowkey design
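A minimal sketch of writing a row from code with a designed rowkey, assuming the HBase 1.2 client API, the my_hbase table created above, and ZooKeeper on localhost:2181 (the composite rowkey format below is only an illustration of rowkey design):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object HBasePutExample {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "localhost:2181")
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("my_hbase"))
    // example rowkey design: a composite key such as <date>_<word> keeps related rows adjacent
    val put = new Put(Bytes.toBytes("20180101_spark"))
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes("1"))
    table.put(put)
    table.close()
    connection.close()
  }
}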
Build Spark 2.2.0 from source (download):
wget https://archive.apache.org/dist/spark/spark-2.2.0/spark-2.2.0.tgz
- Extract:
tar -zxvf spark-2.2.0.tgz
- Modify the build script:
cd /app/spark-2.2.0/dev
vim make-distribution.sh
Comment out the auto-detection of VERSION, SCALA_VERSION, SPARK_HADOOP_VERSION and SPARK_HIVE, and hard-code your own versions instead:
#VERSION=$("$MVN" help:evaluate -Dexpression=project.version $@ 2>/dev/null | grep -v "INFO" | tail -n 1)
#SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version $@ 2>/dev/null\
# | grep -v "INFO"\
# | tail -n 1)
#SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\
# | grep -v "INFO"\
# | tail -n 1)
#SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\
# | grep -v "INFO"\
# | fgrep --count "<id>hive</id>";\
# # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\
# # because we use "set -o pipefail"
# echo -n)
Append the following right after the commented-out block:
VERSION=2.2.0
SCALA_VERSION=2.11.8
SPARK_HADOOP_VERSION=2.6.0-cdh5.7.0
SPARK_HIVE=1
In pom.xml (around line 121), change the Hadoop version property to:
<hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
You may hit errors during the build, for example:
[ERROR] Failed to execute goal on project spark-launcher_2.11: Could not resolve dependencies for project org.apache.spark:spark-launcher_2.11:jar:2.2.0: Failure to find org.apache.hadoop:hadoop-client:jar:2.6.0-cdh5.7.0 in https://repo1.maven.org/maven2 was cached in the local repository, resolution will not be reattempted until the update interval of central has elapsed or updates are forced -> [Help 1]
This happens because the default repositories are the Apache ones, while the Hadoop version we specified is a CDH build; the Cloudera repository has to be added to the pom.xml in the Spark source directory.
Add:
vim /app/spark-2.2.0/pom.xml
Around line 223 (in vim, :set nu to show line numbers, or search with /repository), inside <repositories>:
<repository>
<id>cloudera</id>
<name>cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
</repository>
- Build:
./dev/make-distribution.sh --name 2.6.0-cdh5.7.0 --tgz -Phadoop-2.6 -Dhadoop.version=2.6.0-cdh5.7.0 -Phive -Phive-thriftserver
Note:
The build takes roughly half an hour and needs a reasonably powerful machine.
Install Spark: spark-2.2.0-bin-2.6.0-cdh5.7.0.tgz (the tarball produced by the build above)
- Add environment variables:
export SPARK_HOME=/app/spark-2.2.0-bin-2.6.0-cdh5.7.0
export PATH=$SPARK_HOME/bin:$PATH
- Apply the configuration:
source profile
- Test:
./spark-shell --version
./spark-shell --master local[2]
Spark Streaming processing a file system
Open IntelliJ IDEA, Create New Project, and choose Scala (IDEA).
Add Maven support to the project:
right-click the project > Add Framework Support > Maven
Add the dependencies to pom.xml:
<dependencies>
<!--spark-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<!--jackson scala module-->
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.11</artifactId>
<version>2.6.5</version>
</dependency>
<!--lz4-->
<dependency>
<groupId>net.jpountz.lz4</groupId>
<artifactId>lz4</artifactId>
<version>1.3.0</version>
</dependency>
<!--flume log4j appender-->
<dependency>
<groupId>org.apache.flume.flume-ng-clients</groupId>
<artifactId>flume-ng-log4jappender</artifactId>
<version>1.6.0</version>
</dependency>
</dependencies>
Quick test that the project runs (a minimal main; the object name is arbitrary):
object Test {
  def main(args: Array[String]): Unit = {
    println("hello")
  }
}
Use Spark Streaming to process data from a file system (local/HDFS):
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
object project {
def main(args: Array[String]): Unit = {
// set the master and the application name
val sparkConf = new SparkConf().setMaster("local[2]").setAppName("Hello")
// 5-second batch interval
val ssc = new StreamingContext(sparkConf, Seconds(5))
// monitor the directory for new files
val lines = ssc.textFileStream("file:///Users/guo/Desktop")
// split on spaces, map each word to (word, 1), and sum the counts per word
val result = lines.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_)
// print the result
result.print()
ssc.start()
ssc.awaitTermination()
}
}
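Note: textFileStream only picks up files created in (or atomically moved into) the monitored directory after the application starts; appending to an existing file is not detected.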
Spark Streaming integrated with Flume & Kafka as a general-purpose stream-processing foundation
Create log4j.properties
Local (console) appender:
log4j.rootLogger=INFO,stdout,flume
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} - %m%n
Flume appender configuration:
log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
log4j.appender.flume.Hostname = 0.0.0.0
log4j.appender.flume.Port = 41414
log4j.appender.flume.UnsafeMode = true
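The Log4jAppender class above is provided by the flume-ng-log4jappender dependency already added to pom.xml, so that artifact must be on the application's classpath.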
Create a test class Logeneration.java:
import org.apache.log4j.Logger;

public class Logeneration {
private static Logger logger = Logger.getLogger(Logeneration.class.getName());
public static void main(String[] args) throws Exception{
int index = 0;
while(true) {
Thread.sleep(1000);
logger.info("value is :" + index++);
}
}
}
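Run Logeneration with this log4j.properties on the classpath; each INFO line it writes is also forwarded to the Flume avro source listening on port 41414, configured below.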
Flume collecting log4j data
log4j-flume.conf
log4j-flume.sources = avro-source
log4j-flume.sinks = log-sink
log4j-flume.channels = logger-channel
log4j-flume.sources.avro-source.type = avro
log4j-flume.sources.avro-source.bind = 0.0.0.0
log4j-flume.sources.avro-source.port = 41414
log4j-flume.channels.logger-channel.type = memory
log4j-flume.sinks.log-sink.type = logger
log4j-flume.sources.avro-source.channels =logger-channel
log4j-flume.sinks.log-sink.channel = logger-channel
Run:
flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/log4j-flume.conf -name log4j-flume -Dflume.root.logger=INFO,console
Check whether port 41414 is already in use, and kill the occupying process if necessary:
netstat -an | grep 41414
kill -9 <pid>
Kafka connected to Flume to collect log4j data
Run: start Kafka
Start ZooKeeper first: zkServer.sh start
kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
Test:
jps -m
Create the topic:
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic mytopic
List topics:
kafka-topics.sh --list --zookeeper localhost:2181
Show topic details:
kafka-topics.sh --describe --zookeeper localhost:2181
Send messages:
kafka-console-producer.sh --broker-list localhost:9092 --topic mytopic
Consume:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic mytopic --from-beginning
flume-kafka.conf
log4j-flume.sources = avro-source
log4j-flume.channels = logger-channel
log4j-flume.sinks = kafka-sink
log4j-flume.sources.avro-source.type = avro
log4j-flume.sources.avro-source.bind = 0.0.0.0
log4j-flume.sources.avro-source.port = 41414
log4j-flume.channels.logger-channel.type = memory
log4j-flume.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
log4j-flume.sinks.kafka-sink.topic = mytopic
log4j-flume.sinks.kafka-sink.brokerList = localhost:9092
log4j-flume.sinks.kafka-sink.batchSize = 10
log4j-flume.sinks.kafka-sink.requiredAcks = 1
log4j-flume.sources.avro-source.channels =logger-channel
log4j-flume.sinks.kafka-sink.channel = logger-channel
Run:
flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/flume-kafka.conf -name log4j-flume -Dflume.root.logger=INFO,console
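The Spark Streaming side of this pipeline is not shown above. A minimal sketch of consuming mytopic, assuming the spark-streaming-kafka-0-8_2.11 (2.2.0) artifact is added to pom.xml and a broker on localhost:9092 (names are illustrative only):

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object KafkaStreamingApp {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaStreamingApp")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    // read mytopic directly from the broker; each record value is a log line from the Flume Kafka sink
    val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set("mytopic"))
    messages.map(_._2).count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}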
Flume collecting data from MySQL
Configuration:
- Create the database
- Create the local status file: touch /app/tmp/sql-source.status
- Prepare the jar packages:
flume-ng-sql-source
http://book2s.com/java/jar/f/flume-ng-sql-source/download-flume-ng-sql-source-1.4.1.html
wget http://central.maven.org/maven2/org/keedio/flume/flume-ng-sources/flume-ng-sql-source/1.4.1/flume-ng-sql-source-1.4.1.jar
mysql-connector-java
wget http://www.java2s.com/Code/JarDownload/mysql/mysql-connector-java-5.1.16.jar.zip
flume-mysql-log.conf
flume-mysql.channels = log-channel
flume-mysql.sinks = log-sink
flume-mysql.sources = sql-source
flume-mysql.sources.sql-source.type = org.keedio.flume.source.SQLSource
flume-mysql.sources.sql-source.hibernate.connection.url = jdbc:mysql://47.95.214.178:3306/bigdata
flume-mysql.sources.sql-source.hibernate.connection.user = www
flume-mysql.sources.sql-source.hibernate.connection.password = Weizhong2018!@#
flume-mysql.sources.sql-source.hibernate.connection.autocommit = true
flume-mysql.sources.sql-source.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect
flume-mysql.sources.sql-source.hibernate.connection.driver_class = com.mysql.jdbc.Driver
flume-mysql.sources.sql-source.run.query.delay= 1
flume-mysql.sources.sql-source.status.file.path = /app/tmp
flume-mysql.sources.sql-source.status.file.name = sql-source.status
flume-mysql.sources.sql-source.run.query.delay=20000
flume-mysql.sources.sql-source.start.from = 0
flume-mysql.sources.sql-source.custom.query = select `id`, `name` from test
flume-mysql.sources.sql-source.batch.size = 1000
flume-mysql.sources.sql-source.max.rows = 1000
flume-mysql.sources.sql-source.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
flume-mysql.sources.sql-source.hibernate.c3p0.min_size=1
flume-mysql.sources.sql-source.hibernate.c3p0.max_size=10
flume-mysql.channels.log-channel.type = memory
flume-mysql.sinks.log-sink.type = logger
flume-mysql.sinks.log-sink.channel = log-channel
flume-mysql.sources.sql-source.channels = log-channel
Run:
flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/flume-mysql-log.conf --name flume-mysql -Dflume.root.logger=INFO,console
mysql-flume-kafka.conf
flume-mysql.sources = sql-source
flume-mysql.channels = log-channel
flume-mysql.sinks = kafka-sink
flume-mysql.sources.sql-source.type = org.keedio.flume.source.SQLSource
flume-mysql.sources.sql-source.hibernate.connection.url = jdbc:mysql://47.95.214.178:3306/bigdata
flume-mysql.sources.sql-source.hibernate.connection.user = www
flume-mysql.sources.sql-source.hibernate.connection.password = Weizhong2018!@#
flume-mysql.sources.sql-source.hibernate.connection.autocommit = true
flume-mysql.sources.sql-source.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect
flume-mysql.sources.sql-source.hibernate.connection.driver_class = com.mysql.jdbc.Driver
flume-mysql.sources.sql-source.run.query.delay= 1
flume-mysql.sources.sql-source.status.file.path = /app/tmp
flume-mysql.sources.sql-source.status.file.name = sql-source.status
flume-mysql.sources.sql-source.run.query.delay=20000
flume-mysql.sources.sql-source.start.from = 0
flume-mysql.sources.sql-source.custom.query = select `id`, `name` from test where type = 2
flume-mysql.sources.sql-source.batch.size = 1000
flume-mysql.sources.sql-source.max.rows = 1000
flume-mysql.sources.sql-source.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
flume-mysql.sources.sql-source.hibernate.c3p0.min_size=1
flume-mysql.sources.sql-source.hibernate.c3p0.max_size=10
flume-mysql.channels.log-channel.type = memory
flume-mysql.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
flume-mysql.sinks.kafka-sink.topic = mytopic
flume-mysql.sinks.kafka-sink.brokerList = localhost:9092
flume-mysql.sinks.kafka-sink.batchSize = 10
flume-mysql.sinks.kafka-sink.requiredAcks = 1
flume-mysql.sinks.kafka-sink.channel = log-channel
flume-mysql.sources.sql-source.channels = log-channel
Run:
flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/mysql-flume-kafka.conf --name flume-mysql -Dflume.root.logger=INFO,console
A variant of the same configuration, reading all columns of the staffs table from the newdb database:
flume-mysql.sources = sql-source
flume-mysql.channels = log-channel
flume-mysql.sinks = kafka-sink
flume-mysql.sources.sql-source.type = org.keedio.flume.source.SQLSource
flume-mysql.sources.sql-source.hibernate.connection.url = jdbc:mysql://47.95.214.178:3306/newdb
flume-mysql.sources.sql-source.hibernate.connection.user = www
flume-mysql.sources.sql-source.hibernate.connection.password = Weizhong2018!@#
flume-mysql.sources.sql-source.hibernate.connection.autocommit = true
flume-mysql.sources.sql-source.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect
flume-mysql.sources.sql-source.hibernate.connection.driver_class = com.mysql.jdbc.Driver
flume-mysql.sources.sql-source.run.query.delay= 1
flume-mysql.sources.sql-source.status.file.path = /app/tmp
flume-mysql.sources.sql-source.status.file.name = sql-source.status
flume-mysql.sources.sql-source.run.query.delay=20000
flume-mysql.sources.sql-source.start.from = 0
flume-mysql.sources.sql-source.custom.query = select * from staffs
flume-mysql.sources.sql-source.batch.size = 1000
flume-mysql.sources.sql-source.max.rows = 1000
flume-mysql.sources.sql-source.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
flume-mysql.sources.sql-source.hibernate.c3p0.min_size=1
flume-mysql.sources.sql-source.hibernate.c3p0.max_size=10
flume-mysql.channels.log-channel.type = memory
flume-mysql.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
flume-mysql.sinks.kafka-sink.topic = mytopic
flume-mysql.sinks.kafka-sink.brokerList = localhost:9092
flume-mysql.sinks.kafka-sink.batchSize = 10
flume-mysql.sinks.kafka-sink.requiredAcks = 1
flume-mysql.sinks.kafka-sink.channel = log-channel
flume-mysql.sources.sql-source.channels = log-channel