Source Code Analysis of org.apache.spark.launcher.Main
public static void main(String[] argsArray) throws Exception { // org.apache.spark.launcher.Main
  checkArgument(argsArray.length > 0, "Not enough arguments: missing class name.");
  /*
   * Invoked as, for example:
   *   java -cp spark_home/lib/spark-assembly-1.6.0-hadoop2.6.0.jar org.apache.spark.launcher.Main \
   *     org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --name "Spark shell" --master spark://ip:7077
   * This main method ultimately hands "org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main
   * --name "Spark shell" --master spark://luyl152:7077" back to spark-class, which runs it via exec "${CMD[@]}".
   */
  List<String> args = new ArrayList<>(Arrays.asList(argsArray));
  // For spark-shell the first argument is org.apache.spark.deploy.SparkSubmit.
  String className = args.remove(0); // remove the org.apache.spark.deploy.SparkSubmit argument from args
  // Export SPARK_PRINT_LAUNCH_COMMAND=<any non-empty value> in spark-class or another config file
  // to have the launcher print cmd, i.e. the value that spark-class executes via exec "${CMD[@]}".
  boolean printLaunchCommand = !isEmpty(System.getenv("SPARK_PRINT_LAUNCH_COMMAND"));
  AbstractCommandBuilder builder; // create the command builder
  if (className.equals("org.apache.spark.deploy.SparkSubmit")) {
    try {
      // Parse the arguments into the corresponding Spark fields, e.g. the value of --class goes
      // into the mainClass field. Any extra arguments end up in the sparkArgs list of
      // SparkSubmitCommandBuilder.
      builder = new SparkSubmitCommandBuilder(args);
    } catch (IllegalArgumentException e) {
      printLaunchCommand = false;
      System.err.println("Error: " + e.getMessage());
      System.err.println();

      MainClassOptionParser parser = new MainClassOptionParser();
      try {
        parser.parse(args);
      } catch (Exception ignored) {
        // Ignore parsing exceptions.
      }

      List<String> help = new ArrayList<>();
      if (parser.className != null) {
        help.add(parser.CLASS);
        help.add(parser.className);
      }
      help.add(parser.USAGE_ERROR);
      builder = new SparkSubmitCommandBuilder(help);
    }
  } else {
    // If the first argument is not org.apache.spark.deploy.SparkSubmit, use SparkClassCommandBuilder as the builder.
    builder = new SparkClassCommandBuilder(className, args);
  }

  Map<String, String> env = new HashMap<>();
  List<String> cmd = builder.buildCommand(env); // overridden in SparkSubmitCommandBuilder, not the abstract class's version
  if (printLaunchCommand) {
    System.err.println("Spark Command: " + join(" ", cmd));
    System.err.println("========================================");
  }

  if (isWindows()) {
    System.out.println(prepareWindowsCommand(cmd, env));
  } else {
    // In bash, use NULL as the arg separator since it cannot be used in an argument.
    // Return the effective arguments; they reach spark-class's exec "${CMD[@]}" by being printed to stdout.
    /*
     * '\0' and a space are not the same thing: '\0' is the string terminator (ASCII 0) that marks
     * the end of a string, while a space (ASCII 32) is an ordinary, selectable text character.
     * Programs conventionally use '\0' to terminate strings; the two are completely different characters.
     */
    List<String> bashCmd = prepareBashCommand(cmd, env);
    for (String c : bashCmd) {
      System.out.print(c); // print each command token; it is received into spark-class's CMD array
      System.out.print('\0');
    }
  }
}
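To make the '\0' separator concrete, here is a minimal, self-contained sketch (not part of the Spark source; the printed string below is made up) showing why NUL-joined tokens can be split back unambiguously, which is roughly what the read loop in spark-class does with this output:

import java.util.ArrayList;
import java.util.List;

// Illustration only: tokens joined with '\0' can be split back without ambiguity,
// because '\0' cannot appear inside a command-line argument.
public class NulSeparatedDemo {
  public static void main(String[] args) {
    // What launcher.Main might print for a hypothetical command:
    String printed = "java\0-cp\0/opt/spark/jars/*\0org.apache.spark.deploy.SparkSubmit\0";

    // spark-class does the bash equivalent of this loop when building CMD.
    List<String> cmd = new ArrayList<>();
    int start = 0;
    for (int i = 0; i < printed.length(); i++) {
      if (printed.charAt(i) == '\0') {
        cmd.add(printed.substring(start, i));
        start = i + 1;
      }
    }
    System.out.println(cmd); // [java, -cp, /opt/spark/jars/*, org.apache.spark.deploy.SparkSubmit]
  }
}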
// org.apache.spark.launcher.SparkSubmitCommandBuilder
SparkSubmitCommandBuilder(List<String> args) {
  // args is something like: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://ip:7077
  // or: pyspark-shell-main --name PySparkShell
  // or: --master yarn --deploy-mode cluster --name Hbase --verbose --conf xxxx
  // i.e. for the PYSPARK_SHELL, SPARKR_SHELL and RUN_EXAMPLE flavors, the first argument is the corresponding marker name.
  this.allowsMixedArguments = false;
  this.sparkArgs = new ArrayList<>();
  boolean isExample = false;
  List<String> submitArgs = args;

  if (args.size() > 0) {
    switch (args.get(0)) {
      // the first argument is pyspark-shell-main when launched from Python
      case PYSPARK_SHELL:
        this.allowsMixedArguments = true;
        appResource = PYSPARK_SHELL;
        submitArgs = args.subList(1, args.size());
        break;

      case SPARKR_SHELL: // "sparkr-shell-main"
        this.allowsMixedArguments = true;
        appResource = SPARKR_SHELL;
        submitArgs = args.subList(1, args.size());
        break;

      case RUN_EXAMPLE:
        isExample = true;
        submitArgs = args.subList(1, args.size());
    }

    this.isExample = isExample;
    // Assign the values passed in from spark-submit to the corresponding Spark fields,
    // e.g. the value of --class goes into the mainClass field.
    // submitArgs has the PYSPARK_SHELL / SPARKR_SHELL / RUN_EXAMPLE marker stripped off.
    // OptionParser is an inner class of org.apache.spark.launcher.SparkSubmitCommandBuilder:
    //   private class OptionParser extends SparkSubmitOptionParser
    // and it overrides the handle* family of methods.
    OptionParser parser = new OptionParser();
    parser.parse(submitArgs); // parse() itself is implemented in the parent class SparkSubmitOptionParser
    this.isAppResourceReq = parser.isAppResourceReq;
  } else {
    this.isExample = isExample;
    this.isAppResourceReq = false;
  }
}
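For reference, a small standalone illustration (argument values taken from the comments above, not Spark code) of what the constructor receives in the spark-shell and pyspark cases, and how the leading marker is stripped with subList before OptionParser.parse() runs:

import java.util.Arrays;
import java.util.List;

// Illustration only: the constructor either sees spark-submit options directly (spark-shell)
// or a marker such as "pyspark-shell-main" that it consumes itself.
public class BuilderArgsDemo {
  public static void main(String[] args) {
    // spark-shell: no marker, the first token is already a spark-submit option
    List<String> shellArgs = Arrays.asList(
        "--class", "org.apache.spark.repl.Main", "--name", "Spark shell", "--master", "spark://ip:7077");

    // pyspark: the marker "pyspark-shell-main" is consumed by the constructor itself
    List<String> pysparkArgs = Arrays.asList("pyspark-shell-main", "--name", "PySparkShell");
    List<String> submitArgs = pysparkArgs.subList(1, pysparkArgs.size()); // what parse() actually sees

    System.out.println(shellArgs);
    System.out.println(submitArgs); // [--name, PySparkShell]
  }
}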
/**
 * Parse a list of spark-submit command line options.
 * <p>
 * See SparkSubmitArguments.scala for a more formal description of available options.
 *
 * @throws IllegalArgumentException If an error is found during parsing.
 *
 * The arguments are something like: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://ip:7077.
 * The job of this method is to assign the values passed in by spark-submit to the corresponding
 * Spark fields, e.g. the value of --class goes into the mainClass field.
 * org.apache.spark.launcher.SparkSubmitOptionParser#parse
 */
protected final void parse(List<String> args) {
  // spark-submit can also pass SparkConf settings: --conf PROP=VALUE. The full option list is at
  // the end of org.apache.spark.deploy.SparkSubmitArguments, or run spark-submit -h to see it.
  Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");

  int idx = 0;
  for (idx = 0; idx < args.size(); idx++) {
    String arg = args.get(idx);
    String value = null;

    Matcher m = eqSeparatedOpt.matcher(arg);
    if (m.matches()) {
      arg = m.group(1);   // e.g. "--master" from "--master=yarn"
      value = m.group(2); // e.g. "yarn"
    }

    // Look for options with a value.
    // findCliOption matches an argument such as "--class" against the opts table below;
    // if it matches, the canonical name (e.g. "--class") is returned, otherwise null.
    String name = findCliOption(arg, opts);
    if (name != null) {
      if (value == null) {
        // An option that requires a value but is the last argument is an error,
        // e.g. a bare "--class": size is 1, idx is 0, and 1 - 1 == 0.
        if (idx == args.size() - 1) {
          throw new IllegalArgumentException(
              String.format("Missing argument for option '%s'.", arg));
        }
        idx++;
        value = args.get(idx); // the next element is the option's value
      }
      // name is a spark-submit option such as --class, and value is its value.
      // OptionParser is an inner class of org.apache.spark.launcher.SparkSubmitCommandBuilder:
      //   private class OptionParser extends SparkSubmitOptionParser
      // Its handle() override assigns the value to the corresponding field, e.g. the value of
      // --class goes into mainClass (the implementation is straightforward, so it is not shown here).
      if (!handle(name, value)) { // calls the handle() overridden in OptionParser
        break;
      }
      continue;
    }

    // Look for a switch.
    // If nothing matched above, check whether the argument is a switch such as --verbose.
    name = findCliOption(arg, switches);
    if (name != null) {
      if (!handle(name, null)) { // calls the handle() overridden in OptionParser
        break;
      }
      continue;
    }

    if (!handleUnknown(arg)) { // calls the handleUnknown() overridden in OptionParser
      break;
    }
  }

  if (idx < args.size()) {
    idx++;
  }
  // Any remaining arguments are appended to sparkArgs, which was initialized in
  // SparkSubmitCommandBuilder() { this.sparkArgs = new ArrayList<String>(); ... }
  handleExtraArgs(args.subList(idx, args.size())); // calls the handleExtraArgs() overridden in OptionParser
}
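To see what the eqSeparatedOpt pattern actually accepts, here is a standalone snippet (not Spark source) matching the "--option=value" form before the option tables are consulted:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone demo of the "(--[^=]+)=(.+)" pattern used by parse().
public class EqSeparatedOptDemo {
  public static void main(String[] args) {
    Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");

    Matcher m = eqSeparatedOpt.matcher("--master=spark://ip:7077");
    if (m.matches()) {
      System.out.println(m.group(1)); // --master
      System.out.println(m.group(2)); // spark://ip:7077
    }

    // "--conf spark.driver.memory=2g" arrives as two list elements, so only the second
    // one ("spark.driver.memory=2g") would be tested; it does not start with "--",
    // so it is left untouched and later handled by handle(CONF, value).
    System.out.println(eqSeparatedOpt.matcher("spark.driver.memory=2g").matches()); // false
  }
}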
// Some of the option keys defined in org.apache.spark.launcher.SparkSubmitOptionParser:
protected final String CLASS = "--class";
protected final String CONF = "--conf";
protected final String DEPLOY_MODE = "--deploy-mode";
protected final String DRIVER_CLASS_PATH = "--driver-class-path";
protected final String DRIVER_CORES = "--driver-cores";
protected final String DRIVER_JAVA_OPTIONS = "--driver-java-options";
protected final String DRIVER_LIBRARY_PATH = "--driver-library-path";
protected final String DRIVER_MEMORY = "--driver-memory";
protected final String EXECUTOR_MEMORY = "--executor-memory";
protected final String FILES = "--files";
protected final String JARS = "--jars";
protected final String KILL_SUBMISSION = "--kill";
protected final String MASTER = "--master";
protected final String NAME = "--name";
protected final String PACKAGES = "--packages";
protected final String PACKAGES_EXCLUDE = "--exclude-packages";
protected final String PROPERTIES_FILE = "--properties-file";
protected final String PROXY_USER = "--proxy-user";
protected final String PY_FILES = "--py-files";
protected final String REPOSITORIES = "--repositories";
protected final String STATUS = "--status";
protected final String TOTAL_EXECUTOR_CORES = "--total-executor-cores";

// Options that do not take arguments.
protected final String HELP = "--help";
protected final String SUPERVISE = "--supervise";
protected final String USAGE_ERROR = "--usage-error";
protected final String VERBOSE = "--verbose";
protected final String VERSION = "--version";

// Standalone-only options.
// YARN-only options.
protected final String ARCHIVES = "--archives";
protected final String EXECUTOR_CORES = "--executor-cores";
protected final String KEYTAB = "--keytab";
protected final String NUM_EXECUTORS = "--num-executors";
protected final String PRINCIPAL = "--principal";
protected final String QUEUE = "--queue";

/**
 * This is the canonical list of spark-submit options. Each entry in the array contains the
 * different aliases for the same option; the first element of each entry is the "official"
 * name of the option, passed to {@link #handle(String, String)}.
 * <p>
 * Options not listed here nor in the "switch" list below will result in a call to
 * {@link #handleUnknown(String)}.
 * <p>
 * These two arrays are visible for tests.
 */
final String[][] opts = {
  { ARCHIVES }, { CLASS }, { CONF, "-c" }, { DEPLOY_MODE },
  { DRIVER_CLASS_PATH }, { DRIVER_CORES }, { DRIVER_JAVA_OPTIONS }, { DRIVER_LIBRARY_PATH },
  { DRIVER_MEMORY }, { EXECUTOR_CORES }, { EXECUTOR_MEMORY }, { FILES },
  { JARS }, { KEYTAB }, { KILL_SUBMISSION }, { MASTER },
  { NAME }, { NUM_EXECUTORS }, { PACKAGES }, { PACKAGES_EXCLUDE },
  { PRINCIPAL }, { PROPERTIES_FILE }, { PROXY_USER }, { PY_FILES },
  { QUEUE }, { REPOSITORIES }, { STATUS }, { TOTAL_EXECUTOR_CORES },
};

/**
 * List of switches (command line options that do not take parameters) recognized by spark-submit.
 */
final String[][] switches = {
  { HELP, "-h" }, { SUPERVISE }, { USAGE_ERROR }, { VERBOSE, "-v" }, { VERSION },
};
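The lookup against these tables is done by findCliOption, which is not shown in this excerpt. A simplified, self-contained sketch of that alias lookup (illustration only, not the Spark source):

// Simplified illustration of resolving an argument against the alias tables above:
// each row holds the aliases of one option, and the first element is the canonical name.
public class FindCliOptionDemo {
  static final String[][] OPTS = {
    { "--conf", "-c" },
    { "--class" },
    { "--master" },
  };

  static String findCliOption(String name, String[][] available) {
    for (String[] candidates : available) {
      for (String candidate : candidates) {
        if (candidate.equals(name)) {
          return candidates[0]; // always report the canonical spelling
        }
      }
    }
    return null; // unknown: parse() falls through to the switches table, then handleUnknown()
  }

  public static void main(String[] args) {
    System.out.println(findCliOption("-c", OPTS));       // --conf
    System.out.println(findCliOption("--master", OPTS)); // --master
    System.out.println(findCliOption("--bogus", OPTS));  // null
  }
}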
private class OptionParser extends SparkSubmitOptionParser {

  boolean isAppResourceReq = true;

  /**
   * Assigns the values passed in by spark-submit to the corresponding Spark fields.
   */
  @Override
  protected boolean handle(String opt, String value) {
    switch (opt) {
      case MASTER:
        master = value;
        break;
      case DEPLOY_MODE:
        deployMode = value;
        break;
      case PROPERTIES_FILE:
        propertiesFile = value;
        break;
      case DRIVER_MEMORY:
        conf.put(SparkLauncher.DRIVER_MEMORY, value);
        break;
      case DRIVER_JAVA_OPTIONS:
        conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, value);
        break;
      case DRIVER_LIBRARY_PATH:
        conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, value);
        break;
      case DRIVER_CLASS_PATH:
        conf.put(SparkLauncher.DRIVER_EXTRA_CLASSPATH, value);
        break;
      case CONF:
        String[] setConf = value.split("=", 2);
        checkArgument(setConf.length == 2, "Invalid argument to %s: %s", CONF, value);
        conf.put(setConf[0], setConf[1]);
        break;
      case CLASS:
        // The special classes require some special command line handling, since they allow
        // mixing spark-submit arguments with arguments that should be propagated to the shell
        // itself. Note that for this to work, the "--class" argument must come before any
        // non-spark-submit arguments.
        mainClass = value;
        if (specialClasses.containsKey(value)) {
          allowsMixedArguments = true;
          appResource = specialClasses.get(value);
        }
        break;
      ................................................
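The split between SparkSubmitOptionParser.parse() and OptionParser.handle() is a template-method pattern: the base class owns the argument-walking loop, and the subclass decides what each recognized option means. A toy, self-contained version of the same structure (hypothetical class names, not Spark code):

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy illustration of the template-method structure: the base class walks the
// arguments, the subclass gives them meaning via handle().
abstract class ToyOptionParser {
  protected final void parse(List<String> args) {
    for (int i = 0; i < args.size(); i++) {
      String opt = args.get(i);
      String value = (i + 1 < args.size()) ? args.get(++i) : null;
      if (!handle(opt, value)) {
        break; // subclass asked to stop parsing
      }
    }
  }

  protected abstract boolean handle(String opt, String value);
}

class ToySubmitParser extends ToyOptionParser {
  final Map<String, String> settings = new LinkedHashMap<>();

  @Override
  protected boolean handle(String opt, String value) {
    settings.put(opt, value); // Spark's OptionParser assigns to fields such as master/mainClass instead
    return true;
  }

  public static void main(String[] args) {
    ToySubmitParser p = new ToySubmitParser();
    p.parse(Arrays.asList("--master", "spark://ip:7077", "--class", "org.apache.spark.repl.Main"));
    System.out.println(p.settings); // {--master=spark://ip:7077, --class=org.apache.spark.repl.Main}
  }
}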
// org.apache.spark.launcher.SparkSubmitCommandBuilder#buildCommand(Map)
@Override
public List<String> buildCommand(Map<String, String> env)
    throws IOException, IllegalArgumentException {
  // PYSPARK_SHELL_RESOURCE stands for Python, SPARKR_SHELL_RESOURCE for R.
  if (PYSPARK_SHELL.equals(appResource) && isAppResourceReq) {
    return buildPySparkShellCommand(env);
  } else if (SPARKR_SHELL.equals(appResource) && isAppResourceReq) {
    return buildSparkRCommand(env);
  } else {
    // env is still an empty map here; buildSparkSubmitCommand() is what gets called.
    return buildSparkSubmitCommand(env);
  }
}
// org.apache.spark.launcher.SparkSubmitCommandBuilder#buildSparkSubmitCommand
private List<String> buildSparkSubmitCommand(Map<String, String> env)
    throws IOException, IllegalArgumentException {
  // Load the properties file and check whether spark-submit will be running the app's driver
  // or just launching a cluster app. When running the driver, the JVM's argument will be
  // modified to cover the driver's configuration.
  Map<String, String> config = getEffectiveConfig();
  // For standalone, unless --deploy-mode cluster is given, the mode defaults to client, so this is true.
  boolean isClientMode = isClientMode(config);
  // In client mode DRIVER_EXTRA_CLASSPATH cannot be set directly in SparkConf, because the driver
  // JVM has already been started by spark-submit (via reflection); it has to be passed with
  // --driver-class-path instead.
  String extraClassPath = isClientMode ? config.get(SparkLauncher.DRIVER_EXTRA_CLASSPATH) : null;

  List<String> cmd = buildJavaCommand(extraClassPath);
  // Take Thrift Server as daemon
  if (isThriftServer(mainClass)) {
    addOptionString(cmd, System.getenv("SPARK_DAEMON_JAVA_OPTS"));
  }
  // SPARK_SUBMIT_OPTS is what spark-shell uses to push the Java classpath into Scala:
  //   SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
  addOptionString(cmd, System.getenv("SPARK_SUBMIT_OPTS"));

  // We don't want the client to specify Xmx. These have to be set by their corresponding
  // memory flag --driver-memory or configuration entry spark.driver.memory
  String driverExtraJavaOptions = config.get(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS);
  if (!isEmpty(driverExtraJavaOptions) && driverExtraJavaOptions.contains("Xmx")) {
    String msg = String.format("Not allowed to specify max heap(Xmx) memory settings through " +
        "java options (was %s). Use the corresponding --driver-memory or " +
        "spark.driver.memory configuration instead.", driverExtraJavaOptions);
    throw new IllegalArgumentException(msg);
  }

  if (isClientMode) {
    // Figuring out where the memory value come from is a little tricky due to precedence.
    // Precedence is observed in the following order:
    // - explicit configuration (setConf()), which also covers --driver-memory cli argument.
    // - properties file.
    // - SPARK_DRIVER_MEMORY env variable
    // - SPARK_MEM env variable
    // - default value (1g)
    // Take Thrift Server as daemon
    String tsMemory =
        isThriftServer(mainClass) ? System.getenv("SPARK_DAEMON_MEMORY") : null;
    String memory = firstNonEmpty(tsMemory, config.get(SparkLauncher.DRIVER_MEMORY),
        System.getenv("SPARK_DRIVER_MEMORY"), System.getenv("SPARK_MEM"), DEFAULT_MEM);
    cmd.add("-Xmx" + memory); // the driver's max heap defaults to 1g
    addOptionString(cmd, driverExtraJavaOptions);
    mergeEnvPathList(env, getLibPathEnvName(),
        config.get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH));
  }

  cmd.add("org.apache.spark.deploy.SparkSubmit");
  // buildSparkSubmitArgs() returns the spark-submit options and values that were parsed above
  cmd.addAll(buildSparkSubmitArgs());
  return cmd;
}
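The client-mode memory selection above is simply "first non-empty value wins" over the listed sources. A minimal standalone sketch of that precedence (the firstNonEmpty helper here is a local re-implementation for illustration, and the values are made up):

// Minimal illustration of the "first non-empty wins" precedence used for the driver memory.
public class MemoryPrecedenceDemo {
  static String firstNonEmpty(String... candidates) {
    for (String c : candidates) {
      if (c != null && !c.isEmpty()) {
        return c;
      }
    }
    return null;
  }

  public static void main(String[] args) {
    // Hypothetical situation: nothing set explicitly, only SPARK_DRIVER_MEMORY in the environment.
    String tsMemory = null;             // not the Thrift Server daemon
    String configured = null;           // no --driver-memory / spark.driver.memory
    String sparkDriverMemoryEnv = "2g"; // SPARK_DRIVER_MEMORY
    String sparkMemEnv = null;          // SPARK_MEM
    String defaultMem = "1g";           // DEFAULT_MEM

    String memory = firstNonEmpty(tsMemory, configured, sparkDriverMemoryEnv, sparkMemEnv, defaultMem);
    System.out.println("-Xmx" + memory); // -Xmx2g
  }
}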
// org.apache.spark.launcher.SparkSubmitCommandBuilder#buildSparkSubmitArgs
List<String> buildSparkSubmitArgs() {
  List<String> args = new ArrayList<>();
  SparkSubmitOptionParser parser = new SparkSubmitOptionParser();

  if (!allowsMixedArguments && isAppResourceReq) {
    checkArgument(appResource != null, "Missing application resource.");
  }

  if (verbose) {
    args.add(parser.VERBOSE);
  }

  if (master != null) {
    args.add(parser.MASTER);
    args.add(master);
  }

  if (deployMode != null) {
    args.add(parser.DEPLOY_MODE);
    args.add(deployMode);
  }

  if (appName != null) {
    args.add(parser.NAME);
    args.add(appName);
  }

  for (Map.Entry<String, String> e : conf.entrySet()) {
    args.add(parser.CONF);
    args.add(String.format("%s=%s", e.getKey(), e.getValue()));
  }

  if (propertiesFile != null) {
    args.add(parser.PROPERTIES_FILE);
    args.add(propertiesFile);
  }

  if (isExample) {
    jars.addAll(findExamplesJars());
  }

  if (!jars.isEmpty()) {
    args.add(parser.JARS);
    args.add(join(",", jars));
  }

  if (!files.isEmpty()) {
    args.add(parser.FILES);
    args.add(join(",", files));
  }
  ...................................................
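These are the same flags that the public org.apache.spark.launcher.SparkLauncher API assembles when an application is submitted programmatically; such a submission flows through this same SparkSubmitCommandBuilder machinery. A minimal sketch for comparison (the paths, class name and master URL below are hypothetical):

import org.apache.spark.launcher.SparkLauncher;

// Minimal sketch of a programmatic submission via the public SparkLauncher API.
public class LauncherExample {
  public static void main(String[] args) throws Exception {
    Process spark = new SparkLauncher()
        .setSparkHome("/opt/spark")            // hypothetical SPARK_HOME
        .setAppResource("/path/to/my-app.jar") // hypothetical application jar
        .setMainClass("com.example.MyApp")     // hypothetical main class
        .setMaster("spark://ip:7077")
        .setAppName("launcher-demo")
        .setConf(SparkLauncher.DRIVER_MEMORY, "2g")
        .launch();                             // spawns spark-submit as a child process

    spark.waitFor();
  }
}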
// The resulting CMD looks like, for example:
/usr/java/jdk1.7.0_79/bin/java -cp /opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/conf/:/opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/jars/*:/opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/conf/yarn-conf/ -Dscala.usejavacp=true -Xmx1g -XX:MaxPermSize=256m org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --name Spark shell spark-shell

// or:
/usr/java/jdk1.7.0_79/bin/java -cp /opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/conf/:/opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/jars/*:/etc/hadoop/:/etc/hadoop/conf.cloudera.yarn/ -XX:MaxPermSize=256m org.apache.spark.deploy.SparkSubmit --master yarn --deploy-mode cluster --conf spark.driver.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* --conf spark.scheduler.mode=FAIR --conf spark.executorEnv.JAVA_HOME=/usr/java/jdk1.8 --conf spark.yarn.appMasterEnv.JAVA_HOME=/usr/java/jdk1.8 --conf spark.yarn.maxAppAttempts=1 --class opHbase.opHbase.TopHbase --name Hbase --verbose --files /etc/hadoop/conf/log4j.properties,/etc/hive/conf/hive-site.xml --jars hdfs://10.8.18.74:8020/ada/spark/share/tech_component/tc.plat.spark.jar,hdfs://10.8.18.74:8020/ada/spark/share/tech_component/bigdata4i-1.0.jar,hdfs://10.8.18.74:8020/ada/spark/share/tech_component/bigdata-sparklog-1.0.jar,hdfs://108474.server.bigdata.com.cn:8020/user/lyy/App/tc.app.test.opHbase-1.0.jar,hdfs://10.8.18.74:8020/ada/spark/share/tech_component/mysql-connector-java-5.1.24-bin.jar hdfs://108474.server.bigdata.com.cn:8020/user/lyy/App/opHbase.opHbase.jar loglevel=ALL path=hdfs://108474.server.bigdata.com.cn:8020/user/lyy/data/hfile hbtab=hbase_tes