【大数据】hadoop3.0worker集群+flink+zeppelin+kafka+zookeeper安装部署

零、环境

0.1软件版本

hadoop 3.3.0

java 1.8.241

flink-1.12.3

zeppelin-0.9.0-bin-all

kafka 1.1.1(详见kafka集群部署)

0.2硬件

192.168.0.24  8c32G500SSD hadoop-master

192.168.0.25 8c32G500SSD  hadoop-client-1 

192.168.0.27 8c32G500SSD  hadoop-client-2

0.3 架构方式

workers集群

一、部署初始化

1.0 各个服务器免密

# ssh-keygen -t rsa
# ssh-copy-id hadoop-master
# ssh-copy-id hadoop-client-1
# ssh-copy-id hadoop-client-2
# 保证这三个文件每个服务器一致

.ssh/
total 16
-rw-r--r--. 1 root root 400 May 7 18:46 authorized_keys
-rw-------. 1 root root 1675 May 7 18:32 id_rsa
-rw-r--r--. 1 root root 400 May 7 18:32 id_rsa.pub

1.1 java安装

rpm -ivh jdk-8u241-linux-x64.rpm

1.2 修改环境变量

cat /etc/profile 
export JAVA_HOME=/usr/java/jdk1.8.0_241-amd64/
export FLINK_HOME=/export/servers/flink-1.12.3/
export CLASSPATH=$JAVA_HOME/lib
export HADOOP_HOME=/export/servers/hadoop-3.3.0
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export ZEPPELIN_HOME=/export/servers/zeppelin-0.9.0-bin-all
export KAFKA_HOME=/export/servers/kafka

export ZK_HOME=/export/servers/zookeeper/
export PATH=$PATH:$JAVA_HOME/bin:$ZK_HOME/bin:

export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$FLINK_HOME/bin:$ZEPPELIN_HOME/bin:$ZK_HOME/bin:$KAFKA_HOME/bin

  

1.3 检查主机名访问

# cat /etc/hosts

192.168.0.24   hadoop-master

192.168.0.25   hadoop-client-1 

192.168.0.27   hadoop-client-2

1.4下载相关软件

java 需手动上传

单独安装:rpm -ivh jdk-8u241-linux-x64.rpm

# hadoop
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz
# flink
wget https://mirror-hk.koddos.net/apache/flink/flink-1.12.3/flink-1.12.3-bin-scala_2.11.tgz
# zeppelin
wget https://mirror-hk.koddos.net/apache/zeppelin/zeppelin-0.9.0/zeppelin-0.9.0-bin-all.tgz

1.5配置时间服务器等

## 安装
yum install -y ntp

## 启动定时任务
crontab -e

## 随后在输入界面键入
*/1 * * * * /usr/sbin/ntpdate ntp4.aliyun.com;

# nc安装
yum install -y nc

# 文件夹规划
mkdir -p /export/servers    # 安装目录
mkdir -p /export/softwares  # 软件包存放目录
mkdir -p /export/scripts    # 启动脚本目录 

 

二、配置hadoop

2.1 解压

tar -xf hadoop-3.3.0.tar.gz -C /export/servers/
tar -xf flink-1.12.3-bin-scala_2.11.tgz -C /export/servers/
tar -xf zeppelin-0.9.0-bin-all.tgz -C /export/servers/

ll /export/servers/

 

2.2核对环境变量

cat  /etc/profile
export JAVA_HOME=/usr/java/jdk1.8.0_241-amd64/
export FLINK_HOME=/export/servers/flink-1.12.3/
export CLASSPATH=$JAVA_HOME/lib
export HADOOP_HOME=/export/servers/hadoop-3.3.0
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export ZEPPELIN_HOME=/export/servers/zeppelin-0.9.0-bin-all
export KAFKA_HOME=/export/servers/kafka

export ZK_HOME=/export/servers/zookeeper/
export PATH=$PATH:$JAVA_HOME/bin:$ZK_HOME/bin:

export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$FLINK_HOME/bin:$ZEPPELIN_HOME/bin:$ZK_HOME/bin:$KAFKA_HOME/bin

2.3配置hadoop

cd /export/servers/hadoop-3.3.0/
vim etc/hadoop/core-site.xml

<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hadoop-master:8020</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/export/servers/hadoop-3.3.0/hadoopDatas/tempDatas</value>
</property>
<!-- 缓冲区大小,实际工作中根据服务器性能动态调整 -->
<property>
<name>io.file.buffer.size</name>
<value>4096</value>
</property>
<!-- 开启hdfs的垃圾桶机制,删除掉的数据可以从垃圾桶中回收,单位分钟 -->
<property>
<name>fs.trash.interval</name>
<value>10080</value>
</property>
</configuration>


# vim sbin/start-dfs.sh

HDFS_DATANODE_USER=root
HDFS_DATANODE_SECURE_USER=hdfs
HDFS_NAMENODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root

 # vim sbin/stop-dfs.sh

HDFS_DATANODE_USER=root
HDFS_DATANODE_SECURE_USER=hdfs
HDFS_NAMENODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root

# vim sbin/start-yarn.sh

YARN_RESOURCEMANAGER_USER=root
HDFS_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root

# vim sbin/stop-yarn.sh

YARN_RESOURCEMANAGER_USER=root
HDFS_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root

 

# vim etc/hadoop/capacity-scheduler.xml

 

<property>
<name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
<value>0.5</value>
<description>
Maximum percent of resources in the cluster which can be used to run
application masters i.e. controls number of concurrent running
applications.
</description>
</property>

 

 

# vim etc/hadoop/hdfs-site.xml

<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop-master:50090</value>
</property>

<property>
<name>dfs.namenode.http-address</name>
<value>hadoop-master:50070</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///export/servers/hadoop-3.3.0/hadoopDatas/namenodeDatas,file:///export/servers/hadoop-3.3.0/hadoopDatas/namenodeDatas2</value>
</property>
<!-- 定义dataNode数据存储的节点位置,实际工作中,一般先确定磁盘的挂载目录,然后多个目录用,进行分割 -->

<property>
<name>dfs.datanode.data.dir</name>
<value>file:///export/servers/hadoop-3.3.0/hadoopDatas/datanodeDatas,file:///export/servers/hadoop-3.3.0/hadoopDatas/datanodeDatas2</value>
</property>

<property>
<name>dfs.namenode.edits.dir</name>
<value>file:///export/servers/hadoop-3.3.0/hadoopDatas/nn/edits</value>
</property>

<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>file:///export/servers/hadoop-3.3.0/hadoopDatas/snn/name</value>
</property>

<property>
<name>dfs.namenode.checkpoint.edits.dir</name>
<value>file:///export/servers/hadoop-3.3.0/hadoopDatas/dfs/snn/edits</value>
</property>

<property>
<name>dfs.replication</name>
<value>3</value>
</property>


<property>
<name>dfs.permissions</name>
<value>false</value>
</property>

<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>

 

# vim etc/hadoop/hadoop-env.sh

export JAVA_HOME=/usr/java/jdk1.8.0_241-amd64/
export HADOOP_SSH_OPTS="-p 32539"

 

# vim etc/hadoop/yarn-site.xml

<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop-master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>32</value>
<description>该节点上Yarn可使用的CPU个数</description>
</property>

<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
<description>单任务可申请的最小虚拟CPU个数</description>
</property>

<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>4</value>
<description>单任务可申请的最大虚拟CPU个数</description>
</property>


<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>52000</value>
<description>该节点上Yarn可使用的物理内存</description>
</property>

<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>2048</value>
<description>单任务可申请的最小物理内存</description>
</property>

<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>640000</value>
<description>单任务可申请的最大物理内存</description>
</property>

# vim etc/hadoop/mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>

# vim etc/hadoop/workers
hadoop-client-1
hadoop-client-2

# source /etc/profile

 2.4启动并验证hadoop

# 主服务器
hdfs namenode -format
sbin/start-dfs.sh
sbin/start-yarn.sh

# 主

# jps
13650 Jps
11308 SecondaryNameNode
11645 ResourceManager
11006 NameNode

# 两台worker(hadoop-client-1、hadoop-client-2)分别执行jps

# jps
7561 DataNode
7740 NodeManager
9487 Jps

# jps
24664 Jps
22748 DataNode
22925 NodeManager

 

 

2.5配置flink集群和zeppelin

# flink
# vim conf/flink-conf.yaml

jobmanager.rpc.address: 192.168.0.24
jobmanager.rpc.port: 6123
jobmanager.memory.process.size: 14g
taskmanager.memory.process.size: 16g
taskmanager.numberOfTaskSlots: 16
parallelism.default: 2
jobmanager.execution.failover-strategy: region
rest.port: 8081
taskmanager.memory.network.fraction: 0.15
taskmanager.memory.network.min: 128mb
taskmanager.memory.network.max: 2gb

rest.bind-port: 50100-50200



# zeppelin
# cd /export/servers/zeppelin-0.9.0-bin-all
# vim conf/zeppelin-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_241-amd64/
export USE_HADOOP=true
export ZEPPELIN_ADDR=192.168.0.24
export ZEPPELIN_PORT=8082
export ZEPPELIN_LOCAL_IP=192.168.0.24
export ZEPPELIN_JAVA_OPTS="-Dspark.executor.memory=8g -Dspark.cores.max=8"
export ZEPPELIN_MEM="-Xms1024m -Xmx4096m -XX:MaxMetaspaceSize=512m"
export HADOOP_CONF_DIR=/export/servers/hadoop-3.3.0/etc/hadoop
export ZEPPELIN_INTERPRETER_OUTPUT_LIMIT=2500000

# vim conf/shiro.ini
admin = password1, admin

 

2.6启动并验证flink集群和zeppelin

# 启动命令
bin/yarn-session.sh -tm 2048 -s 4 -d

bin/zeppelin-daemon.sh start

 三、使用yarn模式调试

http://<ip>:8082/#/interpreter

 

 flink.conf

%flink.conf
flink.execution.mode yarn

heartbeat.timeout 180000

flink.execution.packages org.apache.flink:flink-connector-jdbc_2.11:1.12.0,mysql:mysql-connector-java:8.0.16,org.apache.flink:flink-sql-connector-kafka_2.11:1.12.0,org.apache.flink:flink-sql-connector-elasticsearch7_2.12:1.12.1

table.exec.source.cdc-events-duplicate true
taskmanager.memory.task.off-heap.size 512MB
table.exec.mini-batch.enabled true
table.exec.mini-batch.allow-latency 5000
table.exec.mini-batch.size 50000

flink.jm.memory 2048
flink.tm.memory 4096
flink.tm.slot 2

flink.yarn.appName t_apibet_report

书写%flink.ssql

http://<ip>:8081调试

复杂表还是要拆表,拿yarn直接跑

 

四、实用命令

# jps
24642 Kafka
2818 RemoteInterpreterServer
1988 YarnSessionClusterEntrypoint
24452 YarnTaskExecutorRunner
23688 ZeppelinServer
21448 Jps
10952 NameNode
3530 YarnSessionClusterEntrypoint
20365 RemoteInterpreterServer
18961 NodeManager
11155 DataNode
28247 RemoteInterpreterServer
3864 YarnTaskExecutorRunner
32155 RemoteInterpreterServer
30491 RemoteInterpreterServer
13404 YarnTaskExecutorRunner
15964 YarnTaskExecutorRunner
21084 YarnSessionClusterEntrypoint
414 YarnSessionClusterEntrypoint
18783 ResourceManager
24040 QuorumPeerMain
26152 YarnTaskExecutorRunner
29160 YarnSessionClusterEntrypoint
14825 YarnTaskExecutorRunner
25322 CanalLauncher
24746 RemoteInterpreterServer
31214 YarnSessionClusterEntrypoint
25839 YarnSessionClusterEntrypoint
25136 CanalAdminApplication
1267 RemoteInterpreterServer
31604 YarnTaskExecutorRunner
11515 SecondaryNameNode


# yarn app -list

# yarn app -kill <Application-Id>

 

posted @ 2021-05-14 22:39  shuyang  阅读(617)  评论(0)  编辑  收藏  举报