Hadoop3.x高可用集群,HDFS、Yarn集群
集群环境规划
将整个 ha 搭建完成后,集群将形成以下模样
hadoop101 | hadoop102 | hadoop103 |
---|---|---|
NameNode | NameNode | NameNode |
JournalNode | JournalNode | JournalNode |
DataNode | DataNode | DataNode |
Zookeeper | Zookeeper | Zookeeper |
ZKFC | ZKFC | ZKFC |
ResourceManager | ResourceManager | ResourceManager |
NodeManager | NodeManager | NodeManager |
1.配置 core-site.xml
<configuration>
<!-- 把多个 NameNode 的地址组装成一个集群 mycluster -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!-- 指定 hadoop 运行时产生文件的存储目录 -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/ha/hadoop-3.3.1/data</value>
</property>
<!--webUI展示时的用户-->
<property>
<name>hadoop.http.staticuser.user</name>
<value>hadoop</value>
</property>
<!-- 指定 zkfc 要连接的 zkServer 地址 -->
<property>
<name>ha.zookeeper.quorum</name>
<value>hadoop101:2181,hadoop102:2181,hadoop103:2181</value>
</property>
<!-- NN 连接 JN 重试次数,默认是 10 次 -->
<property>
<name>ipc.client.connect.max.retries</name>
<value>20</value>
</property>
<!-- 重试时间间隔,默认 1s -->
<property>
<name>ipc.client.connect.retry.interval</name>
<value>5000</value>
</property>
</configuration>
2.配置 hdfs-site.xml
<configuration>
<!-- NameNode 数据存储目录 -->
<property>
<name>dfs.namenode.name.dir</name>
<value>file://${hadoop.tmp.dir}/name</value>
</property>
<!-- DataNode 数据存储目录 -->
<property>
<name>dfs.datanode.data.dir</name>
<value>file://${hadoop.tmp.dir}/data</value>
</property>
<!-- JournalNode 数据存储目录 -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>${hadoop.tmp.dir}/jn</value>
</property>
<!-- 完全分布式集群名称 -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- 集群中 NameNode 节点都有哪些 -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2,nn3</value>
</property>
<!-- NameNode 的 RPC 通信地址 -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>hadoop101:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>hadoop102:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn3</name>
<value>hadoop103:8020</value>
</property>
<!-- NameNode 的 http 通信地址 -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>hadoop101:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>hadoop102:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn3</name>
<value>hadoop103:9870</value>
</property>
<!-- 指定 NameNode 元数据在 JournalNode 上的存放位置 -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://hadoop101:8485;hadoop102:8485;hadoop103:8485/mycluster</value>
</property>
<!-- 访问代理类: client 用于确定哪个 NameNode 为 Active -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- 配置隔离机制,即同一时刻只能有一台服务器对外响应 -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- 使用隔离机制时需要 ssh 秘钥登录-->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/usr1/.ssh/id_rsa</value>
</property>
<!-- 启用 nn 故障自动转移 -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
3.配置 yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 启用 resourcemanager ha -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- 声明两台 resourcemanager 的地址 -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>cluster-yarn1</value>
</property>
<!--指定 resourcemanager 的逻辑列表-->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2,rm3</value>
</property>
<!-- ========== rm1 的配置 ========== -->
<!-- 指定 rm1 的主机名 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>hadoop101</value>
</property>
<!-- 指定 rm1 的 web 端地址 -->
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>hadoop101:8088</value>
</property>
<!-- 指定 rm1 的内部通信地址 -->
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>hadoop101:8032</value>
</property>
<!-- 指定 AM 向 rm1 申请资源的地址 -->
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>hadoop101:8030</value>
</property>
<!-- 指定供 NM 连接的地址 -->
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>hadoop101:8031</value>
</property>
<!-- ========== rm2 的配置 ========== -->
<!-- 指定 rm2 的主机名 -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>hadoop102</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>hadoop102:8088</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>hadoop102:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>hadoop102:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>hadoop102:8031</value>
</property>
<!-- ========== rm3 的配置 ========== -->
<!-- 指定 rm1 的主机名 -->
<property>
<name>yarn.resourcemanager.hostname.rm3</name>
<value>hadoop103</value>
</property>
<!-- 指定 rm1 的 web 端地址 -->
<property>
<name>yarn.resourcemanager.webapp.address.rm3</name>
<value>hadoop103:8088</value>
</property>
<!-- 指定 rm1 的内部通信地址 -->
<property>
<name>yarn.resourcemanager.address.rm3</name>
<value>hadoop103:8032</value>
</property>
<!-- 指定 AM 向 rm1 申请资源的地址 -->
<property>
<name>yarn.resourcemanager.scheduler.address.rm3</name>
<value>hadoop103:8030</value>
</property>
<!-- 指定供 NM 连接的地址 -->
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm3</name>
<value>hadoop103:8031</value>
</property>
<!-- 指定 zookeeper 集群的地址 -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>hadoop101:2181,hadoop102:2181,hadoop103:2181</value>
</property>
<!-- 启用自动恢复 -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- 指定 resourcemanager 的状态信息存储在 zookeeper 集群 -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- 环境变量的继承 -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLAS
SPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
4.配置mapred-site.xml
<configuration>
<!-- 指定MapReduce程序运行在Yarn上 -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
5.最后配置works
hadoop101
hadoop102
hadoop103
分发配置好的 hadoop 环境到其他节点
6.启动 HDFS-HA 集群
前提先启动zookeeper集群
1) 初始化 HA 在 Zookeeper 中状态:
[hadoop@hadoop101 ~]$ hdfs zkfc -formatZK
2) 在各个 JournalNode 节点上,输入以下命令启动 journalnode 服务
[hadoop@hadoop101 ~]$ hdfs --daemon start journalnode
[hadoop@hadoop102 ~]$ hdfs --daemon start journalnode
[hadoop@hadoop103 ~]$ hdfs --daemon start journalnode
3)在[nn1]上,对其进行格式化, 并启动
[hadoop@hadoop101 ~]$ hdfs namenode -format
[hadoop@hadoop101 ~]$ hdfs --daemon start namenode
4)在[nn2]和[nn3]上,同步 nn1 的元数据信息
[hadoop@hadoop102 ~]$ hdfs namenode -bootstrapStandby
[hadoop@hadoop103 ~]$ hdfs namenode -bootstrapStandby
5)启动[nn2]和[nn3]
[hadoop@hadoop102 ~]$ hdfs --daemon start namenode
[hadoop@hadoop103 ~]$ hdfs --daemon start namenode
6) 启动 HDFS 服务:
[hadoop@hadoop101 ~]$ start-dfs.sh
7) 启动 YARN
[hadoop@hadoop101 ~]$ start-yarn.sh
8)查看hdfs是否 Active
[hadoop@hadoop101 ~]$ hdfs haadmin -getServiceState nn1
- 也可以去 zkCli.sh 客户端查看 Namenode 选举锁节点内容,去到zookeeper目录下:
[hadoop@hadoop101 zookeeper-3.5.7]$ bin/zkCli.sh
[zk: localhost:2181(CONNECTED) 0] get -s /hadoop-ha/mycluster/ActiveStandbyElectorLock
myclusternn2 hadoop102 �>(�>
cZxid = 0xd00000004
ctime = Tue Feb 15 20:18:04 CST 2022
mZxid = 0xd00000004
mtime = Tue Feb 15 20:18:04 CST 2022
pZxid = 0xd00000004
cversion = 0
dataVersion = 0
aclVersion = 0
ephemeralOwner = 0x200002da9ac0000
dataLength = 33
numChildren = 0
9) 查看yarn服务状态
[hadoop@hadoop101 ~]$ yarn rmadmin -getServiceState rm1
- 可以去 zkCli.sh 客户端查看 ResourceManager 选举锁节点内容:
[hadoop@hadoop101 zookeeper-3.5.7]$ bin/zkCli.sh
[zk: localhost:2181(CONNECTED) 0] get -s /yarn-leader-election/cluster-yarn1/ActiveStandbyElectorLock
cluster-yarn1rm3
cZxid = 0xe0000012c
ctime = Tue Feb 15 22:19:03 CST 2022
mZxid = 0xe0000012c
mtime = Tue Feb 15 22:19:03 CST 2022
pZxid = 0xe0000012c
cversion = 0
dataVersion = 0
aclVersion = 0
ephemeralOwner = 0x20000014a5e0012
dataLength = 20
numChildren = 0
PS:如果是用root的用户的话,需要额外配置hadoop-env.sh
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_ZKFC_USER="root"
export HDFS_JOURNALNODE_USER="root"
export YARN_NODEMANAGER_USER="root"
export YARN_RESOURCEMANAGER_USER="root"