Spark Streaming

Hadoop

  1. Distributed file system: HDFS
  2. Distributed storage: HBase
  3. Distributed computing: MapReduce, Spark (Spark SQL / Spark Streaming)
  4. Resource management and scheduling: YARN

Install Java: jdk1.8.0_144

Version 1.8 or later; download from the official site.

  • Add environment variables:
export JAVA_HOME=/app/jdk1.8.0_144
export PATH=$JAVA_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Test:
java -version

Install Flume: apache-flume-1.6.0-cdh5.7.0-bin (download from the CDH5 repository)

  • Add environment variables:
export FLUME_HOME=/app/apache-flume-1.6.0-cdh5.7.0-bin
export PATH=$FLUME_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Configuration files under $FLUME_HOME/conf/:
cp flume-env.sh.template flume-env.sh
  1. In flume-env.sh, set:
export JAVA_HOME=/app/jdk1.8.0_144
  • Test:
flume-ng version

The key to using Flume is the configuration file:

  1. Configure the Source
  2. Configure the Channel
  3. Configure the Sink
  4. Wire the three components together

Names used in the examples below:
  1. a1: agent name
  2. r1: source name
  3. k1: sink name
  4. c1: channel name

flume-ng command-line options:
  1. --name: name of the agent
  2. --conf: Flume configuration directory
  3. --conf-file: the agent configuration file you wrote

Pipeline option 1: netcat source + memory channel + logger sink

a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
  • Run:
flume-ng agent --name a1 --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/example.conf  -Dflume.root.logger=INFO,console
  • Test with telnet:
telnet localhost 44444

Pipeline option 2

  1. exec source + memory channel + logger sink
a1.sources = r1
a1.sinks = k1
a1.channels = c1

a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /spark/data.log
a1.sources.r1.shell = /bin/sh -c

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
  • Run:
flume-ng agent --name a1 --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/exec-memory-logger.conf  -Dflume.root.logger=INFO,console

Pipeline option 3

  1. exec source + memory channel + avro sink
  2. avro source + memory channel + logger sink
exec-memory-avro.conf

exec-memory-avro.sources = exec-source
exec-memory-avro.sinks = avro-sink
exec-memory-avro.channels = memory-channel

exec-memory-avro.sources.exec-source.type = exec
exec-memory-avro.sources.exec-source.command = tail -F /spark/data.log
exec-memory-avro.sources.exec-source.shell = /bin/sh -c

exec-memory-avro.sinks.avro-sink.type = avro
exec-memory-avro.sinks.avro-sink.hostname = localhost
exec-memory-avro.sinks.avro-sink.port = 44444

exec-memory-avro.channels.memory-channel.type = memory

exec-memory-avro.sources.exec-source.channels = memory-channel
exec-memory-avro.sinks.avro-sink.channel = memory-channel


avro-memory-logger.conf

avro-memory-logger.sources = avro-source
avro-memory-logger.sinks = logger-sink
avro-memory-logger.channels = memory-channel

avro-memory-logger.sources.avro-source.type = avro
avro-memory-logger.sources.avro-source.bind = localhost
avro-memory-logger.sources.avro-source.port= 44444

avro-memory-logger.sinks.logger-sink.type =logger

avro-memory-logger.channels.memory-channel.type = memory
avro-memory-logger.sources.avro-source.channels = memory-channel
avro-memory-logger.sinks.logger-sink.channel = memory-channel
  • Run (start avro-memory-logger first so its avro source is listening, then start exec-memory-avro):

avro-memory-logger.conf

flume-ng agent --name avro-memory-logger --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/avro-memory-logger.conf -Dflume.root.logger=INFO,console

exec-memory-avro.conf

flume-ng agent --name exec-memory-avro  --conf $FLUME_HOME/conf  --conf-file $FLUME_HOME/conf/exec-memory-avro.conf -Dflume.root.logger=INFO,console

Kafka architecture

  1. producer: produces (writes) messages
  2. consumer: consumes (reads) messages
  3. broker: the server that holds the messages (the "basket")
  4. topic: a named category (label) that messages are published to (see the producer sketch below)
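
To make these roles concrete, here is a minimal, hypothetical Scala producer sketch using the kafka-clients API (the class name SimpleProducer is made up for illustration; it assumes the kafka-clients dependency is on the classpath, a broker on localhost:9092, and the test topic created in the Kafka section below):

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Minimal sketch: publish a few string messages to a topic
object SimpleProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")  // broker address (assumed)
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    (1 to 5).foreach { i =>
      // "test" is the topic created with kafka-topics.sh below
      producer.send(new ProducerRecord[String, String]("test", s"message-$i"))
    }
    producer.close()
  }
}

The console producer and consumer commands later in this section do the same thing from the command line.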

Install ZooKeeper

Version: zookeeper-3.4.5-cdh5.7.0.tar.gz (download from CDH5)

  • Add environment variables:
export ZK_HOME=/app/zookeeper-3.4.5-cdh5.7.0
export PATH=$ZK_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Configuration files under $ZK_HOME/conf/:
cp zoo_sample.cfg zoo.cfg
  • In zoo.cfg, set:
dataDir = /app/tmp/zk
  • Test:
./zkServer.sh start

Install Kafka: kafka_2.11-0.9.0.0 (download)

  • Add environment variables:
export KAFKA_HOME=/app/kafka_2.11-0.9.0.0
export PATH=$KAFKA_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • In $KAFKA_HOME/config/server.properties, set:
    1) broker.id = 0
    2) listeners=PLAINTEXT://:9092
    3) host.name=localhost
    4) log.dirs = /app/tmp/kafka-logs
    5) zookeeper.connect = localhost:2181
  • Start Kafka:
  1. Start ZooKeeper first: zkServer.sh start
  2. Start the broker:
kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
  • Test:
jps -m
  1. Create a topic:
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
  2. List topics:
kafka-topics.sh --list --zookeeper localhost:2181
  3. Describe topics:
kafka-topics.sh --describe --zookeeper localhost:2181
  4. Send messages:
kafka-console-producer.sh --broker-list localhost:9092 --topic test
  5. Consume messages:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic test --from-beginning
  • Configure Kafka with multiple brokers on a single node:
cp server.properties server-1.properties
cp server.properties server-2.properties
  • Configuration files under $KAFKA_HOME/config/:
server-1.properties  1) broker.id = 1
                     2) listeners=PLAINTEXT://:9093
                     3) host.name=localhost
                     4) log.dirs = /app/tmp/kafka-logs-1
                     5) zookeeper.connect = localhost:2181
server-2.properties  1) broker.id = 2
                     2) listeners=PLAINTEXT://:9094
                     3) host.name=localhost
                     4) log.dirs = /app/tmp/kafka-logs-2
                     5) zookeeper.connect = localhost:2181
  • Start the additional brokers:
kafka-server-start.sh -daemon $KAFKA_HOME/config/server-1.properties &
...
kafka-server-start.sh -daemon $KAFKA_HOME/config/server-2.properties &
...
  • Test:
jps -m
  • Create a topic (replication factor 2 across the two new brokers):
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic my-replicated-topic
  • Send messages:
kafka-console-producer.sh --broker-list localhost:9093,localhost:9094 --topic my-replicated-topic
  • Consume messages:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic my-replicated-topic

Integrating Flume and Kafka

  • exec source + memory channel + avro sink
  • avro source + memory channel + kafka sink

exec-memory-avro.conf

exec-memory-avro.sources = exec-source
exec-memory-avro.sinks = avro-sink
exec-memory-avro.channels = memory-channel

exec-memory-avro.sources.exec-source.type = exec
exec-memory-avro.sources.exec-source.command = tail -F /app/data.log
exec-memory-avro.sources.exec-source.shell = /bin/sh -c

exec-memory-avro.sinks.avro-sink.type = avro
exec-memory-avro.sinks.avro-sink.hostname = localhost
exec-memory-avro.sinks.avro-sink.port = 44444

exec-memory-avro.channels.memory-channel.type = memory

exec-memory-avro.sources.exec-source.channels = memory-channel
exec-memory-avro.sinks.avro-sink.channel = memory-channel


avro-memory-kafka.conf

avro-memory-kafka.sources = avro-source
avro-memory-kafka.sinks = kafka-sink
avro-memory-kafka.channels = memory-channel

avro-memory-kafka.sources.avro-source.type = avro
avro-memory-kafka.sources.avro-source.bind = localhost
avro-memory-kafka.sources.avro-source.port= 44444

avro-memory-kafka.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
avro-memory-kafka.sinks.kafka-sink.brokerList = localhost:9092
avro-memory-kafka.sinks.kafka-sink.topic = test
avro-memory-kafka.sinks.kafka-sink.batchSize = 5
avro-memory-kafka.sinks.kafka-sink.requiredAcks = 1

avro-memory-kafka.channels.memory-channel.type = memory
avro-memory-kafka.sources.avro-source.channels = memory-channel
avro-memory-kafka.sinks.kafka-sink.channel = memory-channel
  • Run: start Kafka first
Start ZooKeeper: zkServer.sh start
kafka-server-start.sh $KAFKA_HOME/config/server.properties
  • Run the Flume agents

avro-memory-kafka.conf

flume-ng agent --name avro-memory-kafka --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/avro-memory-kafka.conf -Dflume.root.logger=INFO,console

exec-memory-avro.conf

flume-ng agent --name exec-memory-avro  --conf $FLUME_HOME/conf  --conf-file $FLUME_HOME/conf/exec-memory-avro.conf -Dflume.root.logger=INFO,console
  • Test:
jps -m
  • Create the topic:
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
  • List topics:
kafka-topics.sh --list --zookeeper localhost:2181
  • Describe topics:
kafka-topics.sh --describe --zookeeper localhost:2181
  • Send messages:
kafka-console-producer.sh --broker-list localhost:9092 --topic test
  • Consume:
kafka-console-consumer.sh --zookeeper localhost:2181 --topic test --from-beginning

Install Scala 2.11.8 (download)

wget https://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.tgz
  • Add environment variables:
export SCALA_HOME=/app/scala-2.11.8
export PATH=$SCALA_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Test:
scala

Install Maven 3.3.9 (download)

  • Add environment variables:
export MAVEN_HOME=/app/apache-maven-3.3.9
export PATH=$MAVEN_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Test:
mvn -v
  • Configure the local repository in /app/apache-maven-3.3.9/conf/settings.xml:
vim /app/apache-maven-3.3.9/conf/settings.xml
<localRepository>/app/tmp/maven_repos/</localRepository>

Install Hadoop: hadoop-2.6.0-cdh5.7.0 (download)

wget http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.6.0-cdh5.7.0.tar.gz
  • Add environment variables:
export HADOOP_HOME=/app/hadoop-2.6.0-cdh5.7.0
export PATH=$HADOOP_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Set up passwordless SSH:
ssh-keygen -t rsa
cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys
  • Extract and configure:
tar -zxvf hadoop-2.6.0-cdh5.7.0.tar.gz
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/app/jdk1.8.0_171
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/core-site.xml
      <configuration>
        <property>
          <name>fs.defaultFS</name>
          <value>hdfs://localhost:8020</value>
        </property>
        <property>
          <name>hadoop.tmp.dir</name>
          <value>/app/tmp</value>
        </property>
      </configuration>
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/hdfs-site.xml
      <configuration>
        <property>
          <name>dfs.replication</name>
          <value>1</value>
        </property>
      </configuration>
vim /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop/slaves
      localhost
./hdfs namenode -format
cd /app/hadoop-2.6.0-cdh5.7.0/sbin
./start-dfs.sh

Visit www.samor.site:50070 (NameNode web UI)

cd /app/hadoop-2.6.0-cdh5.7.0/etc/hadoop
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
    <configuration>
      <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
      </property>
    </configuration>
vim yarn-site.xml
    <configuration>
    <!-- Site specific YARN configuration properties -->
      <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
      </property>
    </configuration>
cd /app/hadoop-2.6.0-cdh5.7.0/sbin
./start-yarn.sh

Visit www.samor.site:8088 (YARN ResourceManager web UI)

./hadoop fs -ls /
./hadoop fs -mkdir /data
./hadoop fs -ls /data
./hadoop fs -put mr-jobhistory-daemon.sh /data/
./hadoop fs -ls /data
./hadoop fs -text /data/mr-jobhistory-daemon.sh

cd /app/hadoop-2.6.0-cdh5.7.0/share/hadoop/mapreduce
hadoop jar hadoop-mapreduce-examples-2.6.0-cdh5.7.0.jar pi 2 3

Install HBase: hbase-1.2.0-cdh5.7.0 (download)

wget http://archive.cloudera.com/cdh5/cdh/5/hbase-1.2.0-cdh5.7.0.tar.gz
  • Add environment variables:
export HBASE_HOME=/app/hbase-1.2.0-cdh5.7.0
export PATH=$HBASE_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Configure:
cd  /app/hbase-1.2.0-cdh5.7.0/conf
    vim hbase-env.sh
    export JAVA_HOME=/app/jdk1.8.0_171
    export HBASE_MANAGES_ZK=false

    vim hbase-site.xml
     <configuration>
       <property>
         <name>hbase.rootdir</name>
         <value>hdfs://localhost:8020/hbase</value>
       </property>
       <property>
         <name>hbase.cluster.distributed</name>
         <value>true</value>
       </property>
       <property>
         <name>hbase.zookeeper.quorum</name>
         <value>localhost:2181</value>
       </property>
     </configuration>
    vim regionservers
    localhost
  • Start:
cd /app/hadoop-2.6.0-cdh5.7.0/sbin
./start-dfs.sh
cd /app/hbase-1.2.0-cdh5.7.0/bin
./start-hbase.sh

./hbase shell
  • HBase shell commands:
List tables:       list
Create a table:    create 'my_hbase','info'
Describe a table:  desc 'my_hbase'
Scan records:      scan 'my_hbase'
Rowkey design (see the client sketch below)
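
The notes only show shell commands; as a sketch of how a client application could read and write the same table (and of what "rowkey design" refers to), here is a minimal, hypothetical example using the hbase-client API. The table name my_hbase and column family info match the shell commands above; the rowkey scheme is purely illustrative:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

// Minimal sketch: write one row to my_hbase, then scan the table
object HBaseClientExample {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "localhost")  // ZooKeeper from the setup above
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("my_hbase"))

    // illustrative rowkey: entity id plus a reversed timestamp so recent rows sort first
    val rowkey = "user_001_" + (Long.MaxValue - System.currentTimeMillis())
    val put = new Put(Bytes.toBytes(rowkey))
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("alice"))
    table.put(put)

    val scanner = table.getScanner(new Scan())
    for (result <- scanner.asScala) {
      println(Bytes.toString(result.getRow))
    }
    scanner.close()
    table.close()
    connection.close()
  }
}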

Download the Spark source: spark-2.2.0

wget https://archive.apache.org/dist/spark/spark-2.2.0/spark-2.2.0.tgz
  • Extract:
tar -zxvf  spark-2.2.0.tgz
  • Modify the build script:
cd /app/spark-2.2.0/dev
vim make-distribution.sh

Comment out the lines that compute VERSION, SCALA_VERSION, SPARK_HADOOP_VERSION, and SPARK_HIVE, and hard-code your own versions instead:

#VERSION=$("$MVN" help:evaluate -Dexpression=project.version $@ 2>/dev/null | grep -v "INFO" | tail -n 1)
#SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version $@ 2>/dev/null\
#    | grep -v "INFO"\
#    | tail -n 1)
#SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\
#    | grep -v "INFO"\
#    | tail -n 1)
#SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\
#    | grep -v "INFO"\
#    | fgrep --count "<id>hive</id>";\
#    # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\
#    # because we use "set -o pipefail"
#    echo -n)
Paste the following right after the commented-out block:

VERSION=2.2.0
SCALA_VERSION=2.11.8
SPARK_HADOOP_VERSION=2.6.0-cdh5.7.0
SPARK_HIVE=1

In pom.xml (line 121), change the Hadoop version to:

<hadoop.version>2.6.0-cdh5.7.0</hadoop.version>

You will likely hit problems during the build, for example:
[ERROR] Failed to execute goal on project spark-launcher_2.11: Could not resolve dependencies for project org.apache.spark:spark-launcher_2.11:jar:2.2.0: Failure to find org.apache.hadoop:hadoop-client:jar:2.6.0-cdh5.7.0 in https://repo1.maven.org/maven2 was cached in the local repository, resolution will not be reattempted until the update interval of central has elapsed or updates are forced -> [Help 1]

This happens because Maven uses the Apache repository by default, while our Hadoop version is a CDH build, so the CDH repository must be added. Open the pom.xml in the Spark source directory and add the Cloudera repository:

vim /app/spark-2.2.0/pom.xml
Around line 223 (in vim, :set nu to show line numbers, or search with /repository), add:
        <repository>
          <id>cloudera</id>
          <name>cloudera Repository</name>
          <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
        </repository>
  • Build:
./dev/make-distribution.sh --name 2.6.0-cdh5.7.0  --tgz -Phadoop-2.6 -Dhadoop.version=2.6.0-cdh5.7.0 -Phive -Phive-thriftserver

Note:
The build takes roughly half an hour and needs a reasonably powerful machine.


Install Spark: spark-2.2.0-bin-2.6.0-cdh5.7.0.tgz

  • Add environment variables:
export SPARK_HOME=/app/spark-2.2.0-bin-2.6.0-cdh5.7.0
export PATH=$SPARK_HOME/bin:$PATH
  • Apply the configuration:
source profile
  • Test:
./spark-shell --version
./spark-shell --master local[2]

Spark: processing file-system data

Open IntelliJ IDEA, create a new project, and choose Scala.

Add Maven support to the project:
Right-click the project > Add Framework Support > Maven

Add the dependencies to pom.xml:

<dependencies>
        <!--spark-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!--scala-->
        <dependency>
            <groupId>com.fasterxml.jackson.module</groupId>
            <artifactId>jackson-module-scala_2.11</artifactId>
            <version>2.6.5</version>
        </dependency>
        <!--lz4-->
        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>
        <!--flume log4j appender-->
        <dependency>
            <groupId>org.apache.flume.flume-ng-clients</groupId>
            <artifactId>flume-ng-log4jappender</artifactId>
            <version>1.6.0</version>
        </dependency>
    </dependencies>

Test that the Spark project runs

def main(args: Array[String]): Unit = {
    println("hello")
}

Using Spark Streaming to process data from a file system (local/HDFS). Note that textFileStream only picks up files created in the monitored directory after the streaming context has started.

import org.apache.spark._
import org.apache.spark.streaming._ 
import org.apache.spark.streaming.StreamingContext._

object project {

    def main(args: Array[String]): Unit = {
        // set the master and application name
        val sparkConf = new SparkConf().setMaster("local[2]").setAppName("Hello")
        // batch interval of 5 seconds
        val ssc = new StreamingContext(sparkConf, Seconds(5))
        // monitor the directory for newly created files
        val lines = ssc.textFileStream("file:///Users/guo/Desktop")
        // split each line on spaces and count the occurrences of each word
        val result = lines.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_)
        // print the result
        result.print()
        ssc.start()
        ssc.awaitTermination()
    }
}

Integrating Spark Streaming with Flume & Kafka as a general stream-processing foundation

Create log4j.properties

Console (stdout) appender:

log4j.rootLogger=INFO,stdout,flume
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} - %m%n

Flume appender configuration:

log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
log4j.appender.flume.Hostname = 0.0.0.0
log4j.appender.flume.Port = 41414
log4j.appender.flume.UnsafeMode = true

Create a test class, Logeneration.java:

import org.apache.log4j.Logger;

public class Logeneration {

    private static Logger logger = Logger.getLogger(Logeneration.class.getName());

    public static void main(String[] args) throws Exception{

        int index = 0;
        while(true) {
            Thread.sleep(1000);
            logger.info("value is :" + index++);
        }
    }
}

Flume collecting the log4j data

log4j-flume.conf

log4j-flume.sources = avro-source
log4j-flume.sinks = log-sink
log4j-flume.channels = logger-channel

log4j-flume.sources.avro-source.type = avro
log4j-flume.sources.avro-source.bind = 0.0.0.0
log4j-flume.sources.avro-source.port = 41414

log4j-flume.channels.logger-channel.type = memory

log4j-flume.sinks.log-sink.type = logger

log4j-flume.sources.avro-source.channels =logger-channel
log4j-flume.sinks.log-sink.channel = logger-channel

Run

flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/log4j-flume.conf -name log4j-flume -Dflume.root.logger=INFO,console

If the agent fails to bind, check whether port 41414 is already in use and free it:

netstat -an | grep 41414
kill -9 <pid>

Connecting Kafka to Flume to collect the log4j data

Run

Start Kafka:
Start ZooKeeper first: zkServer.sh start
    kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties

Test:
    jps -m

Create a topic:
    kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic mytopic

List topics:
    kafka-topics.sh --list --zookeeper localhost:2181

Describe topics:
    kafka-topics.sh --describe --zookeeper localhost:2181

Send messages:
    kafka-console-producer.sh --broker-list localhost:9092 --topic mytopic

Consume:
    kafka-console-consumer.sh --zookeeper localhost:2181 --topic mytopic --from-beginning

flume-kafka.conf

log4j-flume.sources = avro-source
log4j-flume.channels = logger-channel
log4j-flume.sinks = kafka-sink

log4j-flume.sources.avro-source.type = avro
log4j-flume.sources.avro-source.bind = 0.0.0.0
log4j-flume.sources.avro-source.port = 41414

log4j-flume.channels.logger-channel.type = memory

log4j-flume.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
log4j-flume.sinks.kafka-sink.topic = mytopic
log4j-flume.sinks.kafka-sink.brokerList = localhost:9092
log4j-flume.sinks.kafka-sink.batchSize = 10
log4j-flume.sinks.kafka-sink.requiredAcks = 1

log4j-flume.sources.avro-source.channels =logger-channel
log4j-flume.sinks.kafka-sink.channel = logger-channel

Run

flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/flume-kafka.conf -name log4j-flume -Dflume.root.logger=INFO,console
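
These notes do not show the Spark Streaming side of the pipeline; a minimal word-count sketch might look like the following. It assumes the spark-streaming-kafka-0-8_2.11 dependency (version 2.2.0) has been added to pom.xml and reads the mytopic topic that flume-kafka.conf writes to:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

// Minimal sketch: consume the Kafka topic fed by Flume and count words per 5-second batch
object KafkaStreamingApp {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaStreamingApp")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")
    val topics = Set("mytopic")
    // direct stream: one RDD partition per Kafka partition, offsets tracked by Spark
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics)

    // each record is a (key, value) pair; the value is the log line shipped by Flume
    val counts = messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

With the log4j appender, the Flume agent, and this application all running, each line logged by Logeneration should show up in the batch output.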

Flume collecting data from MySQL

Setup:

  1. Create the source database and table (a sketch follows the jar downloads below)
  2. Create a local status file: touch /app/tmp/sql-source.status
  3. Download the required jars and place them in $FLUME_HOME/lib

flume-ng-sql-source

http://book2s.com/java/jar/f/flume-ng-sql-source/download-flume-ng-sql-source-1.4.1.html
wget http://central.maven.org/maven2/org/keedio/flume/flume-ng-sources/flume-ng-sql-source/1.4.1/flume-ng-sql-source-1.4.1.jar

mysql-connector-java
wget http://www.java2s.com/Code/JarDownload/mysql/mysql-connector-java-5.1.16.jar.zip
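
For step 1 above, any small table works; here is a hypothetical sketch (Scala over JDBC, assuming the MySQL connector jar above is on the classpath) that creates and populates the test table queried by the configuration below:

import java.sql.DriverManager

// Minimal sketch: create and populate the `test` table (columns id, name) used by flume-mysql-log.conf
object CreateTestTable {
  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver")
    // connection details taken from the Flume configuration below
    val conn = DriverManager.getConnection(
      "jdbc:mysql://47.95.214.178:3306/bigdata", "www", "Weizhong2018!@#")
    val stmt = conn.createStatement()
    stmt.executeUpdate("CREATE TABLE IF NOT EXISTS test (id INT PRIMARY KEY AUTO_INCREMENT, name VARCHAR(50))")
    stmt.executeUpdate("INSERT INTO test (name) VALUES ('alice'), ('bob')")
    stmt.close()
    conn.close()
  }
}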

flume-mysql-log.conf

flume-mysql.channels = log-channel
flume-mysql.sinks = log-sink
flume-mysql.sources = sql-source

flume-mysql.sources.sql-source.type = org.keedio.flume.source.SQLSource
flume-mysql.sources.sql-source.hibernate.connection.url = jdbc:mysql://47.95.214.178:3306/bigdata
flume-mysql.sources.sql-source.hibernate.connection.user = www
flume-mysql.sources.sql-source.hibernate.connection.password = Weizhong2018!@#
flume-mysql.sources.sql-source.hibernate.connection.autocommit = true
flume-mysql.sources.sql-source.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect

flume-mysql.sources.sql-source.hibernate.connection.driver_class = com.mysql.jdbc.Driver
flume-mysql.sources.sql-source.run.query.delay= 1


flume-mysql.sources.sql-source.status.file.path = /app/tmp
flume-mysql.sources.sql-source.status.file.name = sql-source.status

flume-mysql.sources.sql-source.run.query.delay=20000
flume-mysql.sources.sql-source.start.from = 0
flume-mysql.sources.sql-source.custom.query = select `id`, `name` from test 
flume-mysql.sources.sql-source.batch.size = 1000
flume-mysql.sources.sql-source.max.rows = 1000
flume-mysql.sources.sql-source.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
flume-mysql.sources.sql-source.hibernate.c3p0.min_size=1
flume-mysql.sources.sql-source.hibernate.c3p0.max_size=10

flume-mysql.channels.log-channel.type = memory

flume-mysql.sinks.log-sink.type = logger


flume-mysql.sinks.log-sink.channel = log-channel
flume-mysql.sources.sql-source.channels = log-channel

Run

flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/flume-mysql-log.conf --name flume-mysql -Dflume.root.logger=INFO,console

mysql-flume-kafka.conf

flume-mysql.sources = sql-source
flume-mysql.channels = log-channel
flume-mysql.sinks = kafka-sink

flume-mysql.sources.sql-source.type = org.keedio.flume.source.SQLSource
flume-mysql.sources.sql-source.hibernate.connection.url = jdbc:mysql://47.95.214.178:3306/bigdata
flume-mysql.sources.sql-source.hibernate.connection.user = www
flume-mysql.sources.sql-source.hibernate.connection.password = Weizhong2018!@#
flume-mysql.sources.sql-source.hibernate.connection.autocommit = true
flume-mysql.sources.sql-source.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect

flume-mysql.sources.sql-source.hibernate.connection.driver_class = com.mysql.jdbc.Driver
flume-mysql.sources.sql-source.run.query.delay= 1


flume-mysql.sources.sql-source.status.file.path = /app/tmp
flume-mysql.sources.sql-source.status.file.name = sql-source.status

flume-mysql.sources.sql-source.run.query.delay=20000
flume-mysql.sources.sql-source.start.from = 0
flume-mysql.sources.sql-source.custom.query =  select `id`, `name` from test where type = 2
flume-mysql.sources.sql-source.batch.size = 1000
flume-mysql.sources.sql-source.max.rows = 1000
flume-mysql.sources.sql-source.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
flume-mysql.sources.sql-source.hibernate.c3p0.min_size=1
flume-mysql.sources.sql-source.hibernate.c3p0.max_size=10

flume-mysql.channels.log-channel.type = memory

flume-mysql.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
flume-mysql.sinks.kafka-sink.topic = mytopic
flume-mysql.sinks.kafka-sink.brokerList = localhost:9092
flume-mysql.sinks.kafka-sink.batchSize = 10
flume-mysql.sinks.kafka-sink.requiredAcks = 1

flume-mysql.sinks.kafka-sink.channel = log-channel
flume-mysql.sources.sql-source.channels = log-channel

Run

flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/mysql-flume-kafka.conf --name flume-mysql -Dflume.root.logger=INFO,console

A variant of the same pipeline, reading the staffs table from the newdb database:

flume-mysql.sources = sql-source
flume-mysql.channels = log-channel
flume-mysql.sinks = kafka-sink

flume-mysql.sources.sql-source.type = org.keedio.flume.source.SQLSource
flume-mysql.sources.sql-source.hibernate.connection.url = jdbc:mysql://47.95.214.178:3306/newdb
flume-mysql.sources.sql-source.hibernate.connection.user = www
flume-mysql.sources.sql-source.hibernate.connection.password = Weizhong2018!@#
flume-mysql.sources.sql-source.hibernate.connection.autocommit = true
flume-mysql.sources.sql-source.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect

flume-mysql.sources.sql-source.hibernate.connection.driver_class = com.mysql.jdbc.Driver
flume-mysql.sources.sql-source.run.query.delay= 1


flume-mysql.sources.sql-source.status.file.path = /app/tmp
flume-mysql.sources.sql-source.status.file.name = sql-source.status

flume-mysql.sources.sql-source.run.query.delay=20000
flume-mysql.sources.sql-source.start.from = 0
flume-mysql.sources.sql-source.custom.query =  select * from staffs
flume-mysql.sources.sql-source.batch.size = 1000
flume-mysql.sources.sql-source.max.rows = 1000
flume-mysql.sources.sql-source.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
flume-mysql.sources.sql-source.hibernate.c3p0.min_size=1
flume-mysql.sources.sql-source.hibernate.c3p0.max_size=10

flume-mysql.channels.log-channel.type = memory

flume-mysql.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
flume-mysql.sinks.kafka-sink.topic = mytopic
flume-mysql.sinks.kafka-sink.brokerList = localhost:9092
flume-mysql.sinks.kafka-sink.batchSize = 10
flume-mysql.sinks.kafka-sink.requiredAcks = 1

flume-mysql.sinks.kafka-sink.channel = log-channel
flume-mysql.sources.sql-source.channels = log-channel