Disable the firewall
systemctl stop firewalld.service
systemctl disable firewalld.service
Disable SELinux
vim /etc/selinux/config
# change SELINUX to disabled
SELINUX=disabled
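The config-file change only takes effect after a reboot; a quick sketch to apply it immediately and confirm both services, assuming a systemd-based CentOS/RHEL host:
# apply without rebooting (Permissive mode)
setenforce 0
# verify: should print Permissive, or Disabled after a reboot
getenforce
# confirm the firewall is stopped
systemctl status firewalld.service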
Extract all archives into their target directories
Configure environment variables
export FLINK_HOME=/opt/flink-1.10.2
export FLUME_HOME=/opt/flume-1.7.0
export KAFKA_HOME=/opt/kafka-2.0.0
export SPARK_HOME=/usr/local/src/spark-2.1.1
export HIVE_HOME=/opt/hive-2.3.4
export ZOOKEEPER_HOME=/opt/zookeeper-3.6.3
export JAVA_HOME=/usr/java/jdk1.8.0_101
export HADOOP_HOME=/opt/hadoop-2.7.1
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$ZOOKEEPER_HOME/bin:$HIVE_HOME/bin:$SPARK_HOME/bin:$KAFKA_HOME/bin:$FLUME_HOME/bin:$FLINK_HOME/bin
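These exports must live in a login profile to persist; a sketch assuming they were appended to /etc/profile:
# reload in the current shell
source /etc/profile
# spot-check that the paths resolve
java -version
hadoop version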
Passwordless SSH login
Edit /etc/hosts on every node and add the IP-to-hostname mappings for master, slave1, and slave2 (example entries in the sketch below)
ssh-keygen -t rsa
ssh-copy-id master
ssh-copy-id slave1
ssh-copy-id slave2
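Example /etc/hosts entries and a verification step; the IP addresses here are placeholders for illustration, substitute your own:
# /etc/hosts (identical on every node; example addresses)
192.168.1.101 master
192.168.1.102 slave1
192.168.1.103 slave2
# should print the remote hostname without prompting for a password
ssh slave1 hostname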
Fully distributed setup
Edit the Hadoop configuration files
hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_101
slaves
master
slave1
slave2
core-site.xml
<!-- Address of the HDFS NameNode -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://master:8020</value>
</property>
<!-- Storage directory for files Hadoop generates at runtime; note the tmp directory must be created manually -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/data/tmp</value>
</property>
hdfs-site.xml
<!-- Set the HDFS replication factor; the default of 3 if unset is usually enough, so this block can stay commented out
<property>
<name>dfs.replication</name>
<value>2</value>
</property> -->
<!-- NameNode data directory -->
<property>
<name>dfs.namenode.name.dir</name>
<value>/opt/data/name</value>
</property>
<!-- DataNode data directory -->
<property>
<name>dfs.datanode.data.dir</name>
<value>/opt/data/data</value>
</property>
<!-- SecondaryNameNode HTTP address -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>slave1:50090</value>
</property>
<!-- Disable HDFS permission checking -->
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
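None of the directories referenced above (/opt/data/tmp from core-site.xml, and the name/data paths here) are created automatically; a minimal sketch creating them, run from master:
mkdir -p /opt/data/tmp /opt/data/name /opt/data/data
ssh slave1 "mkdir -p /opt/data/tmp /opt/data/data"
ssh slave2 "mkdir -p /opt/data/tmp /opt/data/data"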
mapred-site.xml
<!-- Run MapReduce on YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Address of the YARN ResourceManager -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<!-- Enable log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- Log aggregation (JobHistory) server address -->
<property>
<name>yarn.log.server.url</name>
<value>http://master:19888/jobhistory/logs</value>
</property>
<!-- Retain aggregated logs for 7 days -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<!-- Whether to run a thread that checks each task's physical memory usage and kills tasks exceeding their allocation; default is true -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<!-- Whether to run a thread that checks each task's virtual memory usage and kills tasks exceeding their allocation; default is true -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
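Every node needs the same configuration; a sketch distributing the whole Hadoop directory to the slaves, assuming it only exists on master so far:
scp -r /opt/hadoop-2.7.1 slave1:/opt/
scp -r /opt/hadoop-2.7.1 slave2:/opt/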
Format the NameNode
hdfs namenode -format
Start the cluster
# start-all.sh starts both HDFS and YARN; equivalently, run the two scripts separately
start-dfs.sh
start-yarn.sh
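To verify the daemons started, run jps on each node; with the configuration above, roughly these processes are expected:
jps
# master: NameNode, DataNode, ResourceManager, NodeManager
# slave1: DataNode, NodeManager, SecondaryNameNode
# slave2: DataNode, NodeManager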
hive
Copy the MySQL connector driver into Hive's lib directory
mv mysql-connector-java-5.1.39.jar $HIVE_HOME/lib
Rename hive-env.sh
mv hive-env.sh.template hive-env.sh
Edit hive-env.sh
# set HADOOP_HOME
export HADOOP_HOME=/opt/hadoop-2.7.1
# set HIVE_CONF_DIR
export HIVE_CONF_DIR=/opt/hive-2.3.4/conf
Create hive-site.xml
vim hive-site.xml
Add the metastore configuration (a sketch follows)
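The contents are not reproduced in the original; below is a minimal hive-site.xml sketch for a MySQL-backed metastore, assuming MySQL runs on master with a database, user, and password all named hive (adjust the URL and credentials to your environment):
<configuration>
    <!-- JDBC connection to the metastore database -->
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>hive</value>
    </property>
</configuration>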
Initialize the Hive metastore
schematool -dbType mysql -initSchema
# alternative: embedded Derby metastore
schematool -dbType derby -initSchema
scala
Extract Scala
tar zxvf scala-2.11.11.tgz -C /usr/local/src/
Configure environment variables
vim /root/.bash_profile
# add the following
export SCALA_HOME=/usr/local/src/scala-2.11.11
export PATH=$PATH:$SCALA_HOME/bin
Reload the environment
source /root/.bash_profile
Start the Scala REPL
scala
# exit the REPL
:quit
spark
Go to the conf directory under Spark and strip the template suffixes
mv spark-env.sh.template spark-env.sh
mv spark-defaults.conf.template spark-defaults.conf
mv slaves.template slaves
Edit the configuration files
slaves
master
slave1
slave2
spark-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_101
export SPARK_HOME=/usr/local/src/spark-2.1.1
export SCALA_HOME=/usr/local/src/scala-2.11.11
export HADOOP_CONF_DIR=/opt/hadoop-2.7.1/etc/hadoop
export SPARK_LOCAL_DIRS=/opt/spark
SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=master:2181,slave1:2181,slave2:2181 -Dspark.deploy.zookeeper.dir=/my-spark"
SPARK_EXECUTOR_MEMORY=1G
SPARK_EXECUTOR_CORES=2
SPARK_WORKER_CORES=2
SPARK_MASTER_HOST=master
SPARK_MASTER_PORT=7077
# optional: change the web UI ports
SPARK_MASTER_WEBUI_PORT=8081
SPARK_WORKER_WEBUI_PORT=8082
Start Spark
./sbin/start-all.sh
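A quick check that the standalone cluster is up, given the slaves file and ports configured above:
jps
# master: Master and Worker
# slave1/slave2: Worker
# master web UI: http://master:8081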
Start the Spark shell
./bin/spark-shell --master spark://192.168.6.161:7077 --total-executor-cores 1 --executor-memory 1g
Run a jar
# cluster deploy mode (driver runs inside the cluster; typical for production)
spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode cluster \
./examples/jars/spark-examples_2.11-2.1.1.jar
# client deploy mode (driver runs locally; convenient for development and testing)
spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode client \
./examples/jars/spark-examples_2.11-2.1.1.jar
Go to Flume's conf directory and edit the configuration
Rename and edit the file
mv flume-env.sh.template flume-env.sh
Add the following
export JAVA_HOME=/usr/java/jdk1.8.0_101
Flume configuration
The four common Flume source types
# 1. Spooling Directory Source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /path/to/spooldir  # e.g. /home/soft
# 2. NetCat Source
a1.sources.r1.type = netcat
a1.sources.r1.bind = host
a1.sources.r1.port = 7777
# 3.Avro Source
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444
# 4.HTTP Source
a1.sources.r1.type = http
a1.sources.r1.port = 6666
Configure ./conf/flume-kafka.conf
Reading files with the taildir source
a1.sources=r1
a1.channels=c1
a1.sources.r1.type=TAILDIR
a1.sources.r1.positionFile=/opt/flume/file/log.json
a1.sources.r1.filegroups=f1
a1.sources.r1.filegroups.f1=/opt/flume/file/app.+
a1.sources.r1.channels=c1
# configure the channel
# note: the type name KafkaChannel is capitalized
a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=master:9092,slave1:9092,slave2:9092
a1.channels.c1.kafka.topic=mytopic
a1.channels.c1.parseAsFlumeEvent=false
a1.channels.c1.kafka.consumer.group.id=mytopics
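To verify events flow through the Kafka channel, append to a file matched by the filegroup and attach a console consumer (same command as in the kafka section below); the app.log name is just an example matching the app.+ pattern:
echo "hello taildir" >> /opt/flume/file/app.log
kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --topic mytopic --from-beginning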
Reading from a netcat port
#Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Describe/configure the source
#a1.sources.r1.type = spooldir
#a1.sources.r1.spoolDir=/home/hadoop/flume_kafka
a1.sources.r1.type = netcat
a1.sources.r1.bind = 192.168.138.161
a1.sources.r1.port = 7777
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = mytopic
a1.sinks.k1.kafka.bootstrap.servers = master:9092,slave1:9092,slave2:9092
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.producer.linger.ms = 1
a1.sinks.k1.kafka.producer.compression.type = snappy
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume
./bin/flume-ng agent -c ./conf -f ./conf/flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console
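A quick end-to-end test of the netcat agent, assuming the bind address above: type lines into nc in one terminal and watch them arrive in the topic from another:
# terminal 1: send data to the source
nc 192.168.138.161 7777
# terminal 2: consume from the sink's topic
kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --topic mytopic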
Flink configuration
Edit flink-conf.yaml under conf/
(all of these keys already exist in the file; just uncomment them and adjust for your cluster)
jobmanager.rpc.address: master
high-availability: zookeeper
high-availability.storageDir: hdfs://master:8020/flink
high-availability.zookeeper.quorum: master:2181,slave1:2181,slave2:2181
Edit slaves
master
slave1
slave2
Edit masters
master:8081
slave1:8081
Start Flink in YARN session mode
yarn-session.sh -n 2 -jm 1024 -tm 1096 -s 1
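Jobs submitted with flink run while the session is up attach to it automatically; a smoke test using the batch WordCount example shipped in the Flink distribution:
flink run ./examples/batch/WordCount.jar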
kafka
Go to Kafka's config directory
Edit server.properties
vim server.properties
Changes (some of these keys already exist in the file; edit them in place)
port=9092
log.dirs=/opt/data/kafka/tmp
zookeeper.connect=master:2181,slave1:2181,slave2:2181
delete.topic.enable=true
Distribute the installation
scp -r kafka-2.0.0 slave1:/opt/
scp -r kafka-2.0.0 slave2:/opt/
Create a symlink (on each node, inside /opt)
ln -s kafka-2.0.0 kafka
Set broker.id in server.properties to 0, 1, and 2 on the three machines respectively
Start the cluster
From the Kafka directory, run on each broker:
./bin/kafka-server-start.sh -daemon ./config/server.properties
Stop the cluster
./bin/kafka-server-stop.sh
# create a topic
kafka-topics.sh --create --partitions 3 --replication-factor 2 --topic mytopic --zookeeper master:2181,slave1:2181,slave2:2181
# start a console producer
kafka-console-producer.sh --broker-list master:9092,slave1:9092,slave2:9092 --topic mytopic
# start a console consumer
# on 1.0.1 and earlier (ZooKeeper-based)
kafka-console-consumer.sh --zookeeper master:2181,slave1:2181,slave2:2181 --topic mytopic --from-beginning
# on 2.0.0
kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --topic mytopic --from-beginning
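To confirm the topic layout (partitions, replicas, leaders), describe it:
kafka-topics.sh --describe --zookeeper master:2181,slave1:2181,slave2:2181 --topic mytopic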
flink on yarn
flink run -m yarn-cluster -c retailers.Calculation <path-to-application-jar>
Start nc
nc master 7777