
Setting up a highly available Hadoop cluster

Hadoop deployment

  • Prepare three machines; the planned roles are as follows
linux01 10.0.0.155   NameNode DataNode NodeManager
linux02 10.0.0.156   SecondaryNameNode  DataNode  NodeManager ResourceManager
linux04 10.0.0.161   DataNode  NodeManager

1. Install Java

# I used jdk1.8.0_281.tar.gz
# Extract it directly under /usr/local/java/
[linux01 /usr/local/java ]# tar zxvf jdk1.8.0_281.tar.gz
  • Add the environment variables
vi /etc/profile
export JAVA_HOME=/usr/local/java/jdk1.8.0_281/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
PATH=${JAVA_HOME}/bin:$PATH
export PATH
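  • To confirm the JDK is actually picked up, reload the profile and check the version (a quick sanity check; run it on every node):
source /etc/profile
java -version
# should report version 1.8.0_281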

2. Configure hosts

  • First rename the three hosts
hostnamectl set-hostname <hostname>
bash # start a new shell so the new hostname takes effect
  • Configure the hosts mapping on all three machines
vi /etc/hosts
10.0.0.156 linux02
10.0.0.155 linux01
10.0.0.161 linux04
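  • With the hosts file in place on all three machines, name resolution can be checked quickly (a small sketch, run from any node):
ping -c 1 linux01
ping -c 1 linux02
ping -c 1 linux04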

3. Configure passwordless SSH login:

# For example, on linux01: run the following command and just press Enter through every prompt
[linux01 ~]# ssh-keygen -t rsa
# Then use ssh-copy-id to install the local SSH public key on the remote hosts; distribute it to linux02 and linux04
[linux01 ~]# ssh-copy-id linux02
[linux01 ~]# ssh-copy-id linux04
# Fix the permissions:
[linux01 ~]# chmod 700 .ssh/
[linux01 ~]# chmod 600 .ssh/authorized_keys
# You also need to append the public key to authorized_keys on linux01 itself, otherwise start-dfs.sh will still ask for a password later. Background: https://blog.csdn.net/xinneya/article/details/102767327
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
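  • Passwordless login is easy to verify before moving on; each command should print the remote hostname without asking for a password:
[linux01 ~]# ssh linux02 hostname
[linux01 ~]# ssh linux04 hostname
[linux01 ~]# ssh linux01 hostname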

4. Download and extract Hadoop

[linux01 ~]# tar zxvf hadoop-2.7.1.tar.gz
  • Rename the directory
[linux01 ~]# mv hadoop-2.7.1 hadoop  

5. Edit the configuration files

  • Hadoop is installed under /root/hadoop; go into its etc/hadoop directory, i.e. /root/hadoop/etc/hadoop
  • vim hdfs-site.xml (create the name directory in advance; see the mkdir sketch after core-site.xml below)
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address</name>
        <value>linux01:9000</value>
        <description></description>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/root/hadoop/name</value>
        <description></description>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>linux02:50090</value>
        <description></description>
    </property>
</configuration>
# dfs.namenode.name.dir                  # where the NameNode keeps its metadata
# dfs.replication                        # number of block replicas
# dfs.namenode.secondary.http-address    # SecondaryNameNode HTTP address
  • hadoop-env.sh
Change it to: export JAVA_HOME=/usr/local/java/jdk1.8.0_281/
  • core-site.xml (create the tmp directory in advance; see the mkdir sketch below)
<configuration>
        <property>
            <name>fs.defaultFS</name>
            <value>hdfs://linux01:9000</value>
        </property>
        <property>
            <name>hadoop.tmp.dir</name>
            <value>/root/hadoop/tmp</value>
        </property>
        <property>
            <name>dfs.namenode.name.dir</name>
            <value>file://${hadoop.tmp.dir}/dfs/name</value>
        </property>
        <property>
            <name>dfs.datanode.data.dir</name>
            <value>file://${hadoop.tmp.dir}/dfs/data</value>
        </property>
</configuration>
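  • As noted above, the name and tmp directories referenced by these files should exist before formatting; a minimal sketch matching the paths configured above:
mkdir -p /root/hadoop/name
mkdir -p /root/hadoop/tmp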
  • slaves: list the DataNode hosts
linux02
linux04
linux01
  • yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>    
    <value>linux02</value>                          
  </property>  
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>106800</value>
  </property>
</configuration>
  • mapred-site.xml
<configuration>
        <property>
            <name>mapreduce.framework.name</name>
            <value>yarn</value>
        </property>
        <property>
            <name>mapreduce.jobhistory.address</name>
            <value>linux01:10020</value>
        </property>
        <property>
            <name>mapreduce.jobhistory.webapp.address</name>
            <value>linux01:19888</value>
        </property>
</configuration>
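  • Note: the stock Hadoop 2.7.1 tarball ships this file only as a template, so it may need to be copied before editing:
cd /root/hadoop/etc/hadoop
cp mapred-site.xml.template mapred-site.xml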

6. Distribute the configuration files

  • Send the finished configuration to the other nodes
scp -r hadoop/ linux02:$PWD
scp -r hadoop/ linux04:$PWD
  • Configure the Hadoop and YARN environment variables
export HADOOP_HOME=/root/hadoop/
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
PATH=${JAVA_HOME}/bin:$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export PATH
  • Format the NameNode on linux01 (the master node)
hdfs namenode -format

7. Start the services

  • Start HDFS
start-dfs.sh
  • Start YARN
start-yarn.sh
  • Bring up the YARN web UI; the ResourceManager here is configured on linux02, so start it there
yarn-daemon.sh start resourcemanager
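  • A quick way to confirm everything came up is jps on each node plus a probe of the default web UI ports (a sketch; 50070 and 8088 are the Hadoop 2.x defaults and may differ if overridden):
jps
curl -s -o /dev/null -w "%{http_code}\n" http://linux01:50070   # NameNode UI
curl -s -o /dev/null -w "%{http_code}\n" http://linux02:8088    # ResourceManager UI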

8. Quick test

  • Upload a file
hdfs dfs -put t1.txt /
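  • To exercise HDFS and YARN end to end, list the file back and run the bundled wordcount example against it (a sketch; it assumes t1.txt was uploaded as above, /wc-out is just an example output path, and the example jar sits in the default 2.7.1 location):
hdfs dfs -ls /
hadoop jar /root/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar wordcount /t1.txt /wc-out
hdfs dfs -cat /wc-out/part-r-00000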

9. Fully distributed (HA) Hadoop configuration

1. NameNode HA setup

  • Changes to hdfs-site.xml
<configuration>
    <!-- Logical name of the HA cluster (nameservice) -->
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
    <!-- IDs of the NameNodes in the cluster -->
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>
    <!-- RPC address of nn1 -->
    <property>
      <name>dfs.namenode.rpc-address.mycluster.nn1</name>
      <value>linux01:8020</value>
    </property>
    <!-- RPC address of nn2 -->
    <property>
      <name>dfs.namenode.rpc-address.mycluster.nn2</name>
      <value>linux02:8020</value>
    </property>
    <!-- HTTP address of nn1 -->
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>linux01:50070</value>
    </property>

    <!-- HTTP address of nn2 -->
    <property>
          <name>dfs.namenode.http-address.mycluster.nn2</name>
          <value>linux02:50070</value>
    </property>
    <!-- Where the NameNode edit log is stored on the JournalNodes -->
    <property>
          <name>dfs.namenode.shared.edits.dir</name>
            <value>qjournal://linux01:8485;linux02:8485;linux04:8485/mycluster</value>
    </property>
    <!-- Proxy provider the client uses to find the active NameNode and fail over automatically -->
    <property>
          <name>dfs.client.failover.proxy.provider.mycluster</name>
          <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <!-- Fencing method, so that only one NameNode serves clients at any given time -->
    <property>
          <name>dfs.ha.fencing.methods</name>
          <value>sshfence</value>
    </property>
    <!-- sshfence requires passwordless SSH -->
    <property>
          <name>dfs.ha.fencing.ssh.private-key-files</name>
          <value>/root/.ssh/id_rsa</value>
    </property>
    <!-- Storage directory on the JournalNode servers -->
    <property>
          <name>dfs.journalnode.edits.dir</name>
          <value>/root/hadoop/jn</value>
    </property>
    <!-- Disable permission checking -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
    <!-- Enable automatic failover -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
</configuration>
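  • dfs.journalnode.edits.dir points at /root/hadoop/jn, so it does no harm to create the directory up front on all three nodes (the JournalNode can also create it itself):
mkdir -p /root/hadoop/jn    # run on linux01, linux02 and linux04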
  • core-site.xml
<configuration>
        <property>
                <name>fs.defaultFS</name>
                <value>hdfs://mycluster</value>
        </property>
        <property>
                  <name>hadoop.tmp.dir</name>
                  <value>/root/hadoop/tmp</value>
        </property>
        <!-- How long deleted files are kept in the trash, in minutes -->
        <property>
                 <name>fs.trash.interval</name>
                 <value>2</value>
        </property>
        <!-- Trash checkpoint interval; must be less than fs.trash.interval -->
        <property>
                <name>fs.trash.checkpoint.interval</name>
                <value>1</value>
        </property>
        <property>
                <name>hadoop.http.staticuser.user</name>
                <value>root</value>
        </property>
        <property>
                <name>ha.zookeeper.quorum</name>
                <value>linux01:2181,linux02:2181,linux04:2181</value>
         </property>
</configuration>
  • Send the finished configuration to linux02 and linux04
scp -r hadoop/ linux04:$PWD
scp -r hadoop/ linux02:$PWD
  • Configure the ZooKeeper environment variable on all three machines
export ZOOKEEPER_PATH=/opt/zookeeper-3.4.6
PATH=$PATH:$ZOOKEEPER_PATH/bin
export PATH
  • Make sure ZooKeeper has been started on all three machines.
[root@linux01 bin]# zkServer.sh start
[root@linux02 bin]# zkServer.sh start
[root@linux04 bin]# zkServer.sh start
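  • To confirm the ensemble actually formed, check the mode on each node; one should report leader and the other two follower:
[root@linux01 bin]# zkServer.sh status
# Mode: follower (or leader)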
  • Start the QJM (JournalNode) on linux01, linux02 and linux04. JournalNodes are separate processes that keep the edit logs shared between the NameNodes in sync; you need an odd number of them, and at least 3.
[root@linux01 ~]# hadoop-daemon.sh start journalnode
# Use jps to check that it started
[root@linux01 ~]# jps
30604 JournalNode


[root@linux02 ~]# hadoop-daemon.sh start journalnode
[root@linux02 ~]# jps
9786 JournalNode

[root@linux04 ~]# hadoop-daemon.sh start journalnode
[root@linux04 ~]# jps
9781 JournalNode
  • Format the NameNode on linux01
[root@linux01 ~]# hdfs namenode -format
  • Start the NameNode on linux01
[root@linux01 ~]# hadoop-daemon.sh start namenode
# Check
[root@linux01 ~]# jps
595 JournalNode
1242 NameNode
1692 QuorumPeerMain
1327 Jps
  • Have the second NameNode sync from the first
[root@linux02 ~]# hdfs namenode -bootstrapStandby
  • Start the second NameNode
hadoop-daemon.sh start namenode
  • Check the web UIs: the NameNode pages on linux01 and linux02 (port 50070) are both up (screenshots omitted)

  • Manually force a transition to active

[root@linux01 ~]# hdfs haadmin -transitionToActive --forcemanual nn1
  • Check the states of nn1 and nn2: nn1 is active and nn2 is standby
[root@linux01 ~]# hdfs haadmin -getServiceState nn1
active
[root@linux01 ~]# hdfs haadmin -getServiceState nn2
standby
  • Set up automatic failover via ZooKeeper
[root@linux01 ~]# hdfs zkfc -formatZK

# Verify from the ZooKeeper client:
[root@linux01 ~]# zkCli.sh -server linux01:2181
[zk: linux01:2181(CONNECTED) 0] ls /
[..hadoop-ha..]
# Seeing hadoop-ha here means the setup succeeded
  • Start the cluster
[root@linux01 ~]# start-dfs.sh
  • jps output on the three machines
[root@linux01 ~]# jps
4256 DFSZKFailoverController
3745 NameNode
1692 QuorumPeerMain
4316 Jps
3854 DataNode
4062 JournalNode
[root@linux02 ~]# jps
19936 JournalNode
19812 DataNode
30230 QuorumPeerMain
20076 DFSZKFailoverController
20190 Jps
19742 NameNode
[root@linux04 hadoop]# jps
11922 Jps
11620 JournalNode
11404 DataNode
6605 QuorumPeerMain
  • Verification
# Check the states of the two NameNodes
[root@linux01 hadoop]# hdfs haadmin -getServiceState nn2
active
[root@linux01 hadoop]# hdfs haadmin -getServiceState nn1
standby
# nn1 is standby and nn2 is active; kill the NameNode behind nn2
[root@linux02 ~]# kill -9 19742
# Check nn1: it has become active
[root@linux01 hadoop]# hdfs haadmin -getServiceState nn1
active
# nn2 can no longer be reached
# Re-sync nn2 from nn1, then start nn2 again
[root@linux02 ~]# hdfs namenode -bootstrapStandby
[root@linux02 ~]# hadoop-daemon.sh start namenode
# Check the restarted nn2: it is standby
[root@linux02 ~]# hdfs haadmin -getServiceState nn2
standby
# Kill nn1 and see whether the active NameNode fails over to nn2
[root@linux01 ~]# kill -9 3745
[root@linux01 ~]# hdfs haadmin -getServiceState nn2
active
# The active role switched over to nn2

2. YARN HA setup

  • Modify yarn-site.xml
<configuration>
	<property>
		<name>yarn.nodemanager.aux-services</name>
		<value>mapreduce_shuffle</value>
	</property>
	<!-- Log aggregation -->
	<property>
		<name>yarn.log-aggregation-enable</name>
		<value>true</value>
	</property>
	<!-- Job history server -->
	<property>
		<name>yarn.log.server.url</name>
		<value>http://linux01:19888/jobhistory/logs/</value>
	</property>
	<property>
		<name>yarn.log-aggregation.retain-seconds</name>
		<value>86400</value>
	</property>
	<!-- Enable ResourceManager HA -->
	<property>
		<name>yarn.resourcemanager.ha.enabled</name>
		<value>true</value>
	</property>
	<!-- Declare the two ResourceManagers -->
	<property>
		<name>yarn.resourcemanager.cluster-id</name>
		<value>cluster-yarn1</value>
	</property>
	<property>
		<name>yarn.resourcemanager.ha.rm-ids</name>
		<value>rm1,rm2</value>
	</property>
	<property>
		<name>yarn.resourcemanager.hostname.rm1</name>
		<value>linux02</value>
	</property>
	<property>
		<name>yarn.resourcemanager.hostname.rm2</name>
		<value>linux04</value>
	</property>
	<!-- ZooKeeper ensemble address -->
	<property>
		<name>yarn.resourcemanager.zk-address</name>
		<value>linux01:2181,linux02:2181,linux04:2181</value>
	</property>
	<!-- Enable automatic recovery -->
	<property>
		<name>yarn.resourcemanager.recovery.enabled</name>
		<value>true</value>
	</property>
	<!-- Store ResourceManager state in the ZooKeeper cluster -->
	<property>
		<name>yarn.resourcemanager.store.class</name>     
		<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
	</property>
</configuration>
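  • The edited yarn-site.xml has to reach every node before restarting YARN; a minimal sketch, assuming it was edited in /root/hadoop/etc/hadoop on linux01:
[root@linux01 hadoop]# scp yarn-site.xml linux02:$PWD
[root@linux01 hadoop]# scp yarn-site.xml linux04:$PWD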
  • I configured the ResourceManagers on linux02 and linux04, so start YARN from linux02
# Start YARN on linux02
[root@linux02 ~]# start-yarn.sh
# jps shows that the ResourceManager is up
[root@linux02 ~]# jps
7298 NameNode
8355 ResourceManager
30230 QuorumPeerMain
7756 NodeManager
7484 JournalNode
7372 DataNode
8717 Jps
7630 DFSZKFailoverController
# Start YARN on linux04
[root@linux04 ~]# start-yarn.sh
# jps shows that the ResourceManager is up
[root@linux04 ~]# jps
28293 JournalNode
5754 Jps
27931 DataNode
5532 ResourceManager
6605 QuorumPeerMain
28622 NodeManager
  • Check the ResourceManager states
[root@linux01 current]# yarn rmadmin -getServiceState rm1
active
[root@linux01 current]# yarn rmadmin -getServiceState rm2
standby
# rm1 is active and rm2 is standby
  • Verification:
# Kill rm1, which runs on linux02
[root@linux02 ~]# kill -9 8355
# Check the states of rm1 and rm2
[root@linux02 ~]# yarn rmadmin -getServiceState rm2
active
[root@linux02 ~]# yarn rmadmin -getServiceState rm1
21/06/18 09:12:29 INFO ipc.Client: Retrying connect to server: linux02/10.0.0.156:8033. Already tried 0 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=1, sleepTime=1000 MILLISECONDS)
Operation failed: Call From linux02/10.0.0.156 to linux02:8033 failed on connection exception: java.net.ConnectException: 拒绝连接; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused
# rm2 is now active and rm1 cannot be reached


# Restart rm1, then kill rm2 and see whether YARN fails back to rm1
[root@linux02 ~]# start-yarn.sh
# Now rm2 is active and rm1 is standby
[root@linux02 ~]# yarn rmadmin -getServiceState rm2
active
[root@linux02 ~]# yarn rmadmin -getServiceState rm1
standby
# Kill rm2
[root@linux04 ~]# kill -9 5532
# Check rm1
[root@linux04 ~]# yarn rmadmin -getServiceState rm1
active
# rm1 has successfully taken over
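  • As a final end-to-end check, submit a small job and let the HA ResourceManager schedule it (a sketch; the jar path assumes the stock 2.7.1 layout under /root/hadoop):
hadoop jar /root/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 2 10
# the job should finish and print an estimate of Pi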
