Spark: Submitting Spark Jobs from Java Code
This article is based on spark-1.6.2-bin-hadoop2.6.
Submitting to local mode
The parameters (param) specified in the program are exactly the arguments you would pass to spark-submit on the command line:
import org.apache.spark.deploy.SparkSubmit;

public class Dr {
    public static void main(String[] args) {
        // Same arguments you would pass to spark-submit on the command line
        String[] param = new String[] {
                "--name", "app_name",
                "--master", "local[*]",
                "--class", "Tkmeans",
                "/mnt/tkmeans_2.10-1.0.jar"
        };
        SparkSubmit.main(param);
    }
}
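Calling SparkSubmit.main directly works, but Spark also ships a supported programmatic launcher, org.apache.spark.launcher.SparkLauncher (available since Spark 1.4), which spawns a spark-submit child process. Below is a minimal sketch of the same local submission; the setSparkHome path is an assumption, adjust it to your installation:

import org.apache.spark.launcher.SparkLauncher;

public class LauncherDemo {
    public static void main(String[] args) throws Exception {
        // SparkLauncher runs spark-submit in a child process; it needs to know
        // where the Spark installation lives (SPARK_HOME or setSparkHome()).
        Process spark = new SparkLauncher()
                .setSparkHome("/opt/spark-1.6.2-bin-hadoop2.6") // assumed install path
                .setAppName("app_name")
                .setMaster("local[*]")
                .setMainClass("Tkmeans")
                .setAppResource("/mnt/tkmeans_2.10-1.0.jar")
                .launch();
        int exitCode = spark.waitFor(); // wait for spark-submit to finish
        System.out.println("spark-submit exited with code " + exitCode);
    }
}

Because the job runs in a separate process, a crash inside it cannot take down the submitting JVM.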
Submitting to YARN
When submitting to a YARN cluster with spark-submit, it is enough to point the HADOOP_CONF_DIR environment variable at the directory holding the YARN configuration files. Submitting from a program nominally requires HADOOP_CONF_DIR as well, but setting it has no effect there; the YARN configuration has to be supplied in the program itself:
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.deploy.yarn.Client;
import org.apache.spark.deploy.yarn.ClientArguments;

public class Test {
    public static void main(String[] args) {
        String[] param = new String[] {
                "--name", "test java submit job to yarn",
                "--class", "Tkmeans",
                // "--executor-memory", "1G",
                // "--arg", "hdfs://node101:8020/user/root/log.txt",
                // "--arg", "hdfs://node101:8020/user/root/badLines_yarn_",
                "--jar", "/mnt/tkmeans_2.10-1.0.jar"
        };

        Configuration conf = new Configuration();
        String os = System.getProperty("os.name");
        boolean crossPlatform = false;
        if (os.contains("Windows")) {
            crossPlatform = true;
        }
        // Enable cross-platform submission (needed when submitting from Windows)
        conf.setBoolean("mapreduce.app-submission.cross-platform", crossPlatform);
        conf.set("fs.defaultFS", "hdfs://data60:9000");                     // NameNode address
        conf.set("mapreduce.framework.name", "yarn");                       // use the YARN framework
        conf.set("yarn.resourcemanager.address", "data60:8032");            // ResourceManager address
        conf.set("yarn.resourcemanager.scheduler.address", "data60:8030");  // scheduler address
        conf.set("mapreduce.jobhistory.address", "data60:10020");           // job history server

        System.setProperty("SPARK_YARN_MODE", "true");
        // Keeps the assembly jar from being uploaded on every submission. Only a
        // single jar file can be given here; multiple jars or a directory are not
        // accepted. The SPARK_JAR environment variable served the same purpose
        // but is deprecated.
        System.setProperty("spark.yarn.jar", "hdfs:///jars/spark-assembly-1.6.2-hadoop2.6.0.jar");

        SparkConf sparkConf = new SparkConf();
        ClientArguments cArgs = new ClientArguments(param, sparkConf);
        Client client = new Client(cArgs, conf, sparkConf);
        client.run(); // this style of submission does not report the job status back
    }
}
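As the final comment notes, client.run() gives no feedback on the job's status. One workaround is to call client.submitApplication() instead, which in Spark 1.6 returns the ApplicationId, and then poll YARN with Hadoop's YarnClient. The following sketch would replace the client.run() line above (the imports go at the top of the file), assuming main declares throws Exception; client and conf are the objects from the Test example:

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;

// Submit without blocking, then poll YARN for the application's state.
ApplicationId appId = client.submitApplication();

YarnClient yarnClient = YarnClient.createYarnClient();
yarnClient.init(conf);
yarnClient.start();
YarnApplicationState state;
do {
    Thread.sleep(3000); // poll every three seconds
    ApplicationReport report = yarnClient.getApplicationReport(appId);
    state = report.getYarnApplicationState();
    System.out.println("Application " + appId + " is " + state);
} while (state != YarnApplicationState.FINISHED
        && state != YarnApplicationState.FAILED
        && state != YarnApplicationState.KILLED);
yarnClient.stop();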
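Hard-coding every cluster address is brittle. If the cluster's Hadoop configuration files are readable from the submitting machine, an alternative is to load them into the Configuration directly, which reproduces in code what HADOOP_CONF_DIR does for spark-submit. A minimal sketch, assuming the files live under /etc/hadoop/conf (a hypothetical path, adjust for your cluster):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

// Pull the cluster's own settings instead of hard-coding each property.
Configuration conf = new Configuration();
conf.addResource(new Path("/etc/hadoop/conf/core-site.xml")); // fs.defaultFS and friends
conf.addResource(new Path("/etc/hadoop/conf/yarn-site.xml")); // ResourceManager addresses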