1. Provision three virtual machines
2. Create the cluster account and set up SSH trust (passwordless login) among the three hosts; a sketch follows below
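A minimal sketch of the trust setup, assuming the cluster account is called spark (the actual account name is your choice) and the hosts are 10.0.0.5, 10.0.0.6 and 10.0.0.7:
# Run as the cluster account on each of the three nodes
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
for host in 10.0.0.5 10.0.0.6 10.0.0.7; do
    ssh-copy-id ${host}
done
Afterwards, ssh 10.0.0.6 from the master should log in without asking for a password.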
3. Install Java
wget jdk-xxx
rpm -i jdk-xxx
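A quick sanity check after the install (the Oracle JDK rpm normally unpacks under /usr/java, which is what JAVA_HOME below assumes):
java -version        # should report java version "1.8.0_141"
ls /usr/java/        # the jdk1.8.0_141 directory used by JAVA_HOME should be here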
4. Add environment variables (on all machines)
export JAVA_HOME=/usr/java/jdk1.8.0_141
export JRE_HOME=$JAVA_HOME/jre
export PATH=$PATH:$JAVA_HOME/bin:$JAVA_HOME/jre/bin
export CLASSPATH=$CLASSPATH:.:$JAVA_HOME/lib:$JAVA_HOME/jre/lib
export HADOOP_HOME=/data/spark/bin/hadoop
export PATH=$PATH:$HADOOP_HOME/bin/:$HADOOP_HOME/sbin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/data/spark/bin/spark
export PATH=$PATH:$SPARK_HOME/bin
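These exports need to survive new shells; a minimal sketch, assuming they are appended to /etc/profile (any login profile such as ~/.bashrc works too):
# After appending the export lines above, reload the profile and spot-check
source /etc/profile
echo $JAVA_HOME $HADOOP_HOME $SPARK_HOME
which hadoop spark-submit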
5. Set up Hadoop
1>vi $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_141
2>vi $HADOOP_HOME/etc/hadoop/core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://10.0.0.5:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/data/spark/bin/hadoop/tmp</value>
    </property>
</configuration>
3>vi $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///data/spark/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///data1/hdfs-ext,file:///data2/hdfs-ext,file:///data3/hdfs-ext</value>
    </property>
    <property>
        <name>dfs.namenode.checkpoint.dir</name>
        <value>/data/spark/hdfs/namesecondary</value>
    </property>
    <property>
        <name>dfs.namenode.http-address</name>
        <value>0.0.0.0:50070</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>0.0.0.0:50090</value>
    </property>
    <property>
        <name>dfs.datanode.http.address</name>
        <value>0.0.0.0:50075</value>
    </property>
    <property>
        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
        <value>false</value>
    </property>
</configuration>
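The local directories referenced above must exist and be writable by the cluster account before the NameNode is formatted; a sketch, assuming the account is called spark and the /dataN mounts exist on every node:
# Run on every node; the name/namesecondary directories are only used on the master
mkdir -p /data/spark/hdfs/name /data/spark/hdfs/namesecondary
mkdir -p /data1/hdfs-ext /data2/hdfs-ext /data3/hdfs-ext
chown -R spark:spark /data/spark/hdfs /data1/hdfs-ext /data2/hdfs-ext /data3/hdfs-ext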
4>vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>10.0.0.5</value>
    </property>
    <property>
        <name>yarn.nodemanager.local-dirs</name>
        <value>/data/spark/hdfs/nm-local-dir</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>8192</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>4</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>0.0.0.0:8088</value>
    </property>
    <property>
        <name>yarn.nodemanager.webapp.address</name>
        <value>0.0.0.0:8042</value>
    </property>
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-pmem-ratio</name>
        <value>5</value>
    </property>
</configuration>
5>vi $HADOOP_HOME/etc/hadoop/slaves
10.0.0.5
10.0.0.6
10.0.0.7
6> Copy the Hadoop directory to every slave node and set PATH on those nodes as well; a sketch follows below
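A sketch of the copy, assuming the layout from step 4, the passwordless SSH from step 2, and that /data/spark/bin already exists on the slaves:
# Run from the master (10.0.0.5)
for host in 10.0.0.6 10.0.0.7; do
    scp -r /data/spark/bin/hadoop ${host}:/data/spark/bin/
done
The export lines from step 4 also need to be added to each slave's profile.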
7> Format the HDFS NameNode
hdfs namenode -format
8> Start HDFS and check the logs
start-dfs.sh
9> Start YARN and check the logs
start-yarn.sh
10> Check the daemon processes on each node
jps
Be sure to read the logs as well.
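A minimal sketch of reading the daemon logs, assuming the default log location $HADOOP_HOME/logs; the exact file names include the account and hostname, so they differ per machine:
cd $HADOOP_HOME/logs
tail -n 100 hadoop-*-namenode-*.log        # master only
tail -n 100 hadoop-*-datanode-*.log        # every slave
tail -n 100 yarn-*-resourcemanager-*.log   # master only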
11> Run a quick HDFS test and check the logs
cd /xxx
echo "this is a test for hdfs" > 1.txt
hadoop fs -mkdir /spark
hadoop fs -mkdir /spark/test
hadoop fs -appendToFile 1.txt hdfs://10.0.0.5:9000/spark/test/1.txt
hadoop fs -cat hdfs://10.0.0.5:9000/spark/test/1.txt
6. Set up Spark
1> Edit spark-env.sh
mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
vi $SPARK_HOME/conf/spark-env.sh
export SPARK_HOME=/data/spark/bin/spark
export JAVA_HOME=/usr/java/jdk1.8.0_141
export HADOOP_HOME=/data/spark/bin/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_MASTER_IP=10.0.0.5
export SPARK_LOCAL_DIRS=/data/spark/bin/spark
export SPARK_LIBRARY_PATH=.:$JAVA_HOME/lib:$JAVA_HOME/jre/lib:$HADOOP_HOME/lib/native
export SPARK_LOG_DIR=/data/spark/bin/spark/logs
2> Edit spark-defaults.conf
mv $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf
vi $SPARK_HOME/conf/spark-defaults.conf
spark.yarn.jars hdfs://10.0.0.5:9000/spark/jars/*
3> Upload the jars to HDFS
cd $SPARK_HOME/jars
hadoop fs -mkdir /spark/jars
hadoop fs -put * hdfs://10.0.0.5:9000/spark/jars/
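A quick check that the upload landed where spark.yarn.jars points (the file count should match the local $SPARK_HOME/jars):
hadoop fs -ls hdfs://10.0.0.5:9000/spark/jars/ | head
hadoop fs -count hdfs://10.0.0.5:9000/spark/jars/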
4> Edit slaves (not really needed here: the slaves file is only used by Spark's standalone start scripts, and this cluster runs on YARN)
mv $SPARK_HOME/conf/slaves.template $SPARK_HOME/conf/slaves
vi $SPARK_HOME/conf/slaves
10.0.0.5
10.0.0.6
10.0.0.7
5> Interactive test in local (single-node) mode
pyspark --master local[4]
6> Interactive test on the YARN cluster
pyspark --master yarn --deploy-mode client
7> Create a test script: vi test.py
from __future__ import print_function

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Read the file written in the HDFS test above, then print its line count and first line
    spark = SparkSession\
        .builder\
        .appName("PythonPi")\
        .getOrCreate()
    lines = spark.sparkContext.textFile("hdfs://10.0.0.5:9000/spark/test/1.txt")
    num = lines.count()
    p_str = lines.first()
    print("--------------------" + str(num) + "---------------------")
    print("--------------------" + p_str + "---------------------")
    spark.stop()
8> Submit the job in local mode
spark-submit --master local[4] test.py
9> Submit the job to the YARN cluster
spark-submit --master yarn --deploy-mode cluster test.py
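In cluster mode the driver runs inside YARN, so the two printed lines do not appear in the local terminal. A sketch of retrieving them, assuming YARN log aggregation is enabled (otherwise look in the NodeManager's container log directory on the node that ran the driver); the application id below is only a placeholder for the one printed by spark-submit:
yarn logs -applicationId application_xxxxxxxxxxxxx_xxxx | grep -- '--------------------'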