Hadoop 集群搭建

#1.配置机器名字
vim /etc/sysconfig/network
NETWORKING=yes  #使用网络
HOSTNAME=bigdata-senior01.yicheng.com  #设置主机名

#2.配置host
vim /etc/hosts
192.168.197.100 bigdata-senior01.yicheng.com
192.168.197.101 bigdata-senior02.yicheng.com
192.168.197.102 bigdata-senior03.yicheng.com

#3.关闭防火墙
systemctl stop firewalld     #立即停止防火墙
systemctl disable firewalld  #禁止开机自启

#4.关闭SELinux
vim /etc/sysconfig/selinux

    # This file controls the state of SELinux on the system.
    # SELINUX= can take one of these three values:
    #     enforcing - SELinux security policy is enforced.
    #     permissive - SELinux prints warnings instead of enforcing.
    #     disabled - No SELinux policy is loaded.
    #SELINUX=enforcing
    SELINUX=disabled
    # SELINUXTYPE= can take one of three two values:
    #     targeted - Targeted processes are protected,
    #     minimum - Modification of targeted policy. Only selected processes are protected. 
    #     mls - Multi Level Security protection.
    SELINUXTYPE=targeted 

#5.查看JDK版本,如果是OpenJDK,需要安装 java-1.8.0-openjdk-devel 才能使用 jps 命令
java -version
    openjdk version "1.8.0_131"
    OpenJDK Runtime Environment (build 1.8.0_131-b12)
    OpenJDK 64-Bit Server VM (build 25.131-b12, mixed mode)
yum install -y  java-1.8.0-openjdk-devel

[root@bigdata-senior01 ~]# jps
41598 Jps
[root@bigdata-senior01 ~]# 

#6.创建用户 hadoop
[root@bigdata-senior01 ~]# groupadd bigdata
[root@bigdata-senior01 ~]# useradd -m -g bigdata hadoop
[root@bigdata-senior01 ~]# ls /home/
alex  hadoop
[root@bigdata-senior01 ~]# ll /home/
总用量 4
drwx------. 14 alex   alex    4096 4月  10 08:19 alex
drwx------.  3 hadoop bigdata   78 4月  10 10:02 hadoop
[root@bigdata-senior01 ~]# 

#7.克隆虚拟机
#a.虚拟机--》右键--》clone--》克隆全部
#b.重新生成网卡MAC地址
#c.启动虚拟机修改网络地址
192.168.197.101 bigdata-senior02.yicheng.com
192.168.197.102 bigdata-senior03.yicheng.com

#8.调整到字符界面运行
[root@bigdata-senior03 ~]# systemctl get-default
graphical.target
[root@bigdata-senior03 ~]# systemctl set-default multi-user.target
Removed symlink /etc/systemd/system/default.target.
Created symlink from /etc/systemd/system/default.target to /usr/lib/systemd/system/multi-user.target.
[root@bigdata-senior03 ~]# 

#9.配置SSH无密码互联
#在bigdata01上生成公钥
 ssh-keygen -t rsa
#分发公钥
ssh-copy-id bigdata-senior01.yicheng.com
ssh-copy-id bigdata-senior02.yicheng.com
ssh-copy-id bigdata-senior03.yicheng.com
#在bigdata02,bigdata03机器上做相同的操作

#服务器功能规划
bigdata-senior01.yicheng.com    bigdata-senior02.yicheng.com    bigdata-senior03.yicheng.com
    NameNode                    ResourceManager
    DataNode                    DataNode                        DataNode
    NodeManager                    NodeManager                        NodeManager
    HistoryServer                                                 SecondaryNameNode

#10.在第一台机器上安装Hadoop
wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz
tar -xvf hadoop-3.3.0.tar.gz

#配置环境变量 export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64/
[hadoop@bigdata-senior01 ~]$ which java
/usr/bin/java
[hadoop@bigdata-senior01 ~]$ ls -lr /user/bin/java
ls: 无法访问/user/bin/java: 没有那个文件或目录
[hadoop@bigdata-senior01 ~]$ ls -lr /usr/bin/java
lrwxrwxrwx. 1 root root 22 4月  10 09:40 /usr/bin/java -> /etc/alternatives/java
[hadoop@bigdata-senior01 ~]$ ls -lrt /etc/alternatives/java
lrwxrwxrwx. 1 root root 73 4月  10 09:40 /etc/alternatives/java -> /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64/jre/bin/java
[hadoop@bigdata-senior01 ~]$

#配置core-site.xml
<configuration>
 <property>
   <name>fs.defaultFS</name>
   <value>hdfs://bigdata-senior01.yicheng.com:8020</value>
 </property>
 <property>
   <name>hadoop.tmp.dir</name>
   <value>/home/hadoop/hadoop-3.3.0/data/tmp</value>
 </property>
</configuration>
#hadoop.tmp.dir为hadoop临时目录的地址,默认情况下,NameNode和DataNode的数据文件都会存在这个目录下的对应子目录下。应该保证此目录是存在的,如果不存在,先创建。

#配置hdfs-site.xml
<configuration>
 <property>
   <name>dfs.namenode.secondary.http-address</name>
   <value>bigdata-senior03.yicheng.com:50090</value>
 </property>
</configuration>
#dfs.namenode.secondary.http-address是指定secondaryNameNode的http访问地址和端口号

#配置slave hadoop 3 中已经修改为 workers 文件了
[hadoop@bigdata-senior01 hadoop]$ vim workers
[hadoop@bigdata-senior01 hadoop]$ pwd
/home/hadoop/hadoop-3.3.0/etc/hadoop
[hadoop@bigdata-senior01 hadoop]$ cat workers 
bigdata-senior01.yicheng.com
bigdata-senior02.yicheng.com
bigdata-senior03.yicheng.com
[hadoop@bigdata-senior01 hadoop]$ 
#slaves文件是指定HDFS上有哪些DataNode节点,hadoop 3中需要修改为workers 不然在主节点执行 start-all.sh的时候
#从节点的datanode服务无法启动,需要手动启动。

#配置yarn-site.xml
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>
<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>bigdata-senior02.yicheng.com</value>
</property>
<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>106800</value>
</property>
#根据规划yarn.resourcemanager.hostname这个指定resourcemanager服务器指向bigdata-senior02.yicheng.com
#yarn.log-aggregation-enable是配置是否启用日志聚集功能。
#yarn.log-aggregation.retain-seconds是配置聚集的日志在HDFS上最多保存多长时间

#配置mapred-site.xml
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>bigdata-senior01.yicheng.com:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>bigdata-senior01.yicheng.com:19888</value>
    </property>
#mapreduce.framework.name设置mapreduce任务运行在yarn上
#mapreduce.jobhistory.address是设置mapreduce的历史服务安装在BigData01机器上。
#mapreduce.jobhistory.webapp.address是设置历史服务的web页面地址和端口号。

#通过scp 分发配置好的Hadoop
scp -r /home/hadoop/hadoop-3.3.0/ bigdata-senior02.yicheng.com:/home/hadoop
scp -r /home/hadoop/hadoop-3.3.0/ bigdata-senior03.yicheng.com:/home/hadoop

#11.执行格式化
#在core-site.xml 中指定dfs.namenode.name.dir dfs.datanode.data.dir 目录
<property>
     <name>dfs.namenode.name.dir</name>
     <value>file://${hadoop.tmp.dir}/dfs/name</value>
  </property>
<property>
     <name>dfs.datanode.data.dir</name>
     <value>file://${hadoop.tmp.dir}/dfs/data</value>
</property>
#保存好后scp 到另外两个机器
[hadoop@bigdata-senior01 hadoop]$ scp /home/hadoop/hadoop-3.3.0/etc/hadoop/core-site.xml bigdata-senior02.yicheng.com:/home/hadoop/hadoop-3.3.0/etc/hadoop
core-site.xml                                                                                                                                                                  100% 1232   600.7KB/s   00:00    
[hadoop@bigdata-senior01 hadoop]$ scp /home/hadoop/hadoop-3.3.0/etc/hadoop/core-site.xml bigdata-senior03.yicheng.com:/home/hadoop/hadoop-3.3.0/etc/hadoop
core-site.xml                                                                                                                                                                  100% 1232   291.8KB/s   00:00    
[hadoop@bigdata-senior01 hadoop]$ 
#因为每次格式化,默认是创建一个集群ID,并写入NameNode和DataNode的VERSION文件中(VERSION文件所在目录为dfs/name/current 和 dfs/data/current),
#重新格式化时,默认会生成一个新的集群ID,如果不删除原来的目录,会导致namenode中的VERSION文件中是新的集群ID,而DataNode中是旧的集群ID,不一致时会报错。
#另一种方法是格式化时指定集群ID参数,指定为旧的集群ID。
#执行格式化操作
[hadoop@bigdata-senior01 hadoop]$ /home/hadoop/hadoop-3.3.0/bin/hdfs namenode -format


#12.启动集群
#启动hdfs
[hadoop@bigdata-senior01 ~]$ /home/hadoop/hadoop-3.3.0/sbin/start-dfs.sh
Starting namenodes on [bigdata-senior01.yicheng.com]
bigdata-senior01.yicheng.com: ERROR: JAVA_HOME is not set and could not be found.
Starting datanodes
localhost: Warning: Permanently added 'localhost' (ECDSA) to the list of known hosts.
localhost: ERROR: JAVA_HOME is not set and could not be found.
Starting secondary namenodes [bigdata-senior03.yicheng.com]
bigdata-senior03.yicheng.com: ERROR: JAVA_HOME is not set and could not be found.
[hadoop@bigdata-senior01 ~]$ echo $JAVA_HOME
/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64/
[hadoop@bigdata-senior01 ~]$ 
#在hadoop-env.sh 中指定JAVA_HOME(把下面的 export 这一行写入 etc/hadoop/hadoop-env.sh;如上所示,仅在当前shell中设置了JAVA_HOME时启动脚本仍然报错)
[hadoop@bigdata-senior01 hadoop]$ export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64/
[hadoop@bigdata-senior01 hadoop]$ scp /home/hadoop/hadoop-3.3.0/etc/hadoop/hadoop-env.sh bigdata-senior02.yicheng.com:/home/hadoop/hadoop-3.3.0/etc/hadoop
[hadoop@bigdata-senior01 hadoop]$ scp /home/hadoop/hadoop-3.3.0/etc/hadoop/hadoop-env.sh bigdata-senior03.yicheng.com:/home/hadoop/hadoop-3.3.0/etc/hadoop
[hadoop@bigdata-senior01 hadoop-3.3.0]$ /home/hadoop/hadoop-3.3.0/sbin/start-dfs.sh
Starting namenodes on [bigdata-senior01.yicheng.com]
Starting datanodes
Starting secondary namenodes [bigdata-senior03.yicheng.com]
bigdata-senior03.yicheng.com: WARNING: /home/hadoop/hadoop-3.3.0/logs does not exist. Creating.
[hadoop@bigdata-senior01 hadoop-3.3.0]$

    #启动yarn
    [hadoop@bigdata-senior01 hadoop-3.3.0]$ /home/hadoop/hadoop-3.3.0/sbin/start-yarn.sh
    Starting resourcemanager
    Starting nodemanagers
    [hadoop@bigdata-senior01 hadoop-3.3.0]$

    #在BigData02上启动ResourceManager:
    [hadoop@bigdata-senior02 hadoop-3.3.0]$ /home/hadoop/hadoop-3.3.0/sbin/yarn-daemon.sh start resourcemanager
    WARNING: Use of this script to start YARN daemons is deprecated.
    WARNING: Attempting to execute replacement "yarn --daemon start" instead.
    WARNING: /home/hadoop/hadoop-3.3.0/logs does not exist. Creating.
    [hadoop@bigdata-senior02 hadoop-3.3.0]$
    #3.3.0 中已经不推荐使用这种启动方式了,直接用yarn命令启动
    [hadoop@bigdata-senior02 bin]$ ./yarn --daemon start resourcemanager
    resourcemanager is running as process 5361.  Stop it first.
    [hadoop@bigdata-senior02 bin]$ 
    #前面已经启动起来,所以此命令只会提示进程已在运行

#hadoop 3.0后直接在BigData02上启动yarn resourcemanager 和nodemanagers 就可以一起启动起来了。


#bigdata01上启动日志服务
[hadoop@bigdata-senior01 hadoop-3.3.0]$ sbin/mr-jobhistory-daemon.sh start historyserver
WARNING: Use of this script to start the MR JobHistory daemon is deprecated.
WARNING: Attempting to execute replacement "mapred --daemon start" instead.
[hadoop@bigdata-senior01 hadoop-3.3.0]$
#3.3.0中这种启动方式同样被抛弃,直接用 mapred命令启动
[hadoop@bigdata-senior03 hadoop-3.3.0]$ ./bin/mapred --daemon start historyserver


#查看 web 页面
#Once the Hadoop cluster is up and running check the web-ui of the components as described below:
#NameNode
http://192.168.197.100:9870/
#ResourceManager
http://192.168.197.101:8088/
#MapReduce JobHistory Server
http://192.168.197.100:19888/

#测试 mapreduce 的 wordcount job
#上传测试文件
[hadoop@bigdata-senior01 data]$ rz

[hadoop@bigdata-senior01 data]$ ls
SingleCluster.html  tmp
[hadoop@bigdata-senior01 data]$ 
#在hdfs 创建输入目录 input
[hadoop@bigdata-senior01 hadoop-3.3.0]$ ./bin/hdfs dfs -mkdir /input
[hadoop@bigdata-senior01 hadoop-3.3.0]$ 
#将文件SingleCluster.html 上传到hdfs
[hadoop@bigdata-senior01 hadoop-3.3.0]$ ./bin/hdfs dfs -put /home/hadoop/hadoop-3.3.0/data/SingleCluster.html /input/SingleCluster.html
[hadoop@bigdata-senior01 hadoop-3.3.0]$ ./bin/hdfs dfs -ls /input
Found 1 items
-rw-r--r--   3 hadoop supergroup      36814 2021-04-10 21:13 /input/SingleCluster.html
[hadoop@bigdata-senior01 hadoop-3.3.0]$ 
#运行hadoop自带的mapreduce Demo
 bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0.jar wordcount /input/SingleCluster.html /output
#报错
    2_0002_000002 exited with  exitCode: 1
    Failing this attempt.Diagnostics: [2021-04-10 21:18:33.832]Exception from container-launch.
    Container id: container_1618053263682_0002_02_000001
    Exit code: 1

    [2021-04-10 21:18:33.838]Container exited with a non-zero exit code 1. Error file: prelaunch.err.
    Last 4096 bytes of prelaunch.err :
    Last 4096 bytes of stderr :
    错误: 找不到或无法加载主类 org.apache.hadoop.mapreduce.v2.app.MRAppMaster


    [2021-04-10 21:18:33.838]Container exited with a non-zero exit code 1. Error file: prelaunch.err.
    Last 4096 bytes of prelaunch.err :
    Last 4096 bytes of stderr :
    错误: 找不到或无法加载主类 org.apache.hadoop.mapreduce.v2.app.MRAppMaster


    For more detailed output, check the application tracking page: http://bigdata-senior02.yicheng.com:8088/cluster/app/application_1618053263682_0002 Then click on links to logs of each attempt.
    . Failing the application.
    2021-04-10 21:18:34,892 INFO mapreduce.Job: Counters: 0

#将执行路径写入到配置文件
[hadoop@bigdata-senior01 hadoop-3.3.0]$ bin/hadoop classpath
/home/hadoop/hadoop-3.3.0/etc/hadoop:/home/hadoop/hadoop-3.3.0/share/hadoop/common/lib/*:/home/hadoop/hadoop-3.3.0/share/hadoop/common/*:/home/hadoop/hadoop-3.3.0/share/hadoop/hdfs:/home/hadoop/hadoop-3.3.0/share/hadoop/hdfs/lib/*:/home/hadoop/hadoop-3.3.0/share/hadoop/hdfs/*:/home/hadoop/hadoop-3.3.0/share/hadoop/mapreduce/*:/home/hadoop/hadoop-3.3.0/share/hadoop/yarn:/home/hadoop/hadoop-3.3.0/share/hadoop/yarn/lib/*:/home/hadoop/hadoop-3.3.0/share/hadoop/yarn/*
vim yarn-site.xml
<property>
    <name>yarn.application.classpath</name>
    <value>/home/hadoop/hadoop-3.3.0/etc/hadoop:/home/hadoop/hadoop-3.3.0/share/hadoop/common/lib/*:/home/hadoop/hadoop-3.3.0/share/hadoop/common/*:/home/hadoop/hadoop-3.3.0/share/hadoop/hdfs:/home/hadoop/hadoop-3.3.0/share/hadoop/hdfs/lib/*:/home/hadoop/hadoop-3.3.0/share/hadoop/hdfs/*:/home/hadoop/hadoop-3.3.0/share/hadoop/mapreduce/*:/home/hadoop/hadoop-3.3.0/share/hadoop/yarn:/home/hadoop/hadoop-3.3.0/share/hadoop/yarn/lib/*:/home/hadoop/hadoop-3.3.0/share/hadoop/yarn/*</value>
</property>

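scp /home/hadoop/hadoop-3.3.0/etc/hadoop/yarn-site.xml bigdata-senior02.yicheng.com:/home/hadoop/hadoop-3.3.0/etc/hadoop
scp /home/hadoop/hadoop-3.3.0/etc/hadoop/yarn-site.xml bigdata-senior03.yicheng.com:/home/hadoop/hadoop-3.3.0/etc/hadoop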

#重新运行hadoop自带的mapreduce Demo
 bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0.jar wordcount /input/SingleCluster.html /output

 [hadoop@bigdata-senior01 hadoop-3.3.0]$ bin/hdfs dfs -ls /output
Found 2 items
-rw-r--r--   3 hadoop supergroup          0 2021-04-10 22:10 /output/_SUCCESS
-rw-r--r--   3 hadoop supergroup      20914 2021-04-10 22:10 /output/part-r-00000
[hadoop@bigdata-senior01 hadoop-3.3.0]$ 
#查看统计结果
[hadoop@bigdata-senior01 hadoop-3.3.0]$ bin/hdfs dfs -cat /output/part-r-00000 | more

 

posted @ 2022-11-18 16:23  Alex-Zeng  阅读(81)  评论(0)  编辑  收藏  举报