Spark Application Execution Mechanism (1)

  • bin/spark-submit
# Set SPARK_HOME
if [ -z "${SPARK_HOME}" ]; then
  export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0
# Invoke the main method of org.apache.spark.deploy.SparkSubmit
exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"
  • org.apache.spark.deploy.SparkSubmit
  def main(args: Array[String]): Unit = {
    val appArgs = new SparkSubmitArguments(args)
    if (appArgs.verbose) {
      // scalastyle:off println
      printStream.println(appArgs)
      // scalastyle:on println
    }
    appArgs.action match {
      // Parse the arguments and call submit(appArgs)
      case SparkSubmitAction.SUBMIT => submit(appArgs)
      case SparkSubmitAction.KILL => kill(appArgs)
      case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
    }
  }
  • org.apache.spark.deploy.SparkSubmit#submit
  private def submit(args: SparkSubmitArguments): Unit = {
    // First resolve the arguments to obtain the child args, classpath, system properties, and mainClass
    val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)

    def doRunMain(): Unit = {
      if (args.proxyUser != null) {
      // ..... code elided: when a proxy user is set, runMain is wrapped to run as that user
        runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
      } else {
      // If no proxy user is set, runMain is invoked directly here
        runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
      }
    }

  • org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment

    This method is the core of the whole launch process. It contains a large number of if branches and is a bit messy; its return value is described by the scaladoc quoted below:

  /**
   * Prepare the environment for submitting an application.
   * This returns a 4-tuple:
   *   (1) the arguments for the child process,
   *   (2) a list of classpath entries for the child,
   *   (3) a map of system properties, and
   *   (4) the main class for the child
   * Exposed for testing.
   */
1. The core idea is to return the arguments for the child process, the classpath the child needs, the system properties, and the mainClass.
2. Here we only follow the yarn-client and yarn-cluster paths; mesos and standalone are left out of this analysis.
    // Set the cluster manager
    val clusterManager: Int = args.master match {
      case m if m.startsWith("yarn") => YARN
      case m if m.startsWith("spark") => STANDALONE
      case m if m.startsWith("mesos") => MESOS
      case m if m.startsWith("local") => LOCAL
      case _ => printErrorAndExit("Master must start with yarn, spark, mesos, or local"); -1
    }

    // Set the deploy mode; default is client mode
    var deployMode: Int = args.deployMode match {
      case "client" | null => CLIENT
      case "cluster" => CLUSTER
      case _ => printErrorAndExit("Deploy mode must be either client or cluster"); -1
    }
  • The code above determines the cluster manager and the deploy mode. There are four cluster managers: YARN / STANDALONE / MESOS / LOCAL, and two deploy modes: CLIENT / CLUSTER.
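
    To make the branches that follow concrete, here is a small self-contained sketch of how the two resolved values combine into flags such as isYarnCluster, which later selects the yarn-cluster branch. The numeric constant values are placeholders, not copied from Spark.

// Illustrative sketch only: the isYarnCluster derivation mirrors the idea in
// SparkSubmit, but the constant values here are placeholders.
object ModeFlags {
  val YARN = 1; val STANDALONE = 2; val MESOS = 4; val LOCAL = 8
  val CLIENT = 1; val CLUSTER = 2

  def main(args: Array[String]): Unit = {
    val clusterManager = YARN    // e.g. --master yarn
    val deployMode     = CLUSTER // e.g. --deploy-mode cluster

    // The flag used later to pick org.apache.spark.deploy.yarn.Client as the child main class.
    val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER
    println(s"isYarnCluster = $isYarnCluster") // true
  }
}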
    // In client mode, launch the application main class directly
    // In addition, add the main application jar and any added jars (if any) to the classpath
    if (deployMode == CLIENT) {
      childMainClass = args.mainClass
      if (isUserJar(args.primaryResource)) {
        childClasspath += args.primaryResource
      }
      if (args.jars != null) { childClasspath ++= args.jars.split(",") }
      if (args.childArgs != null) { childArgs ++= args.childArgs }
    }
  • In CLIENT mode, the application's own mainClass is used directly. That mainClass is either given with --class or read from the jar's manifest:

    mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class")

    The application jar (and any jars passed via --jars) is also added to the classpath.
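
    As a hedged illustration of that manifest lookup (the jar path below is a made-up placeholder), the Main-Class attribute can be read with java.util.jar.JarFile:

import java.util.jar.JarFile

// Standalone sketch: read the Main-Class attribute from a jar's manifest, the same
// attribute SparkSubmit falls back to when --class is not supplied. "app.jar" is a
// placeholder path.
object ManifestMainClass {
  def main(args: Array[String]): Unit = {
    val jar = new JarFile("app.jar")
    try {
      val mainClass = Option(jar.getManifest)
        .map(_.getMainAttributes.getValue("Main-Class"))
        .orNull
      println(s"Main-Class from manifest: $mainClass")
    } finally {
      jar.close()
    }
  }
}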

   // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
   if (isYarnCluster) {
     childMainClass = "org.apache.spark.deploy.yarn.Client"
     if (args.isPython) {
       childArgs += ("--primary-py-file", args.primaryResource)
       if (args.pyFiles != null) {
         childArgs += ("--py-files", args.pyFiles)
       }
       childArgs += ("--class", "org.apache.spark.deploy.PythonRunner")
     } else if (args.isR) {
       val mainFile = new Path(args.primaryResource).getName
       childArgs += ("--primary-r-file", mainFile)
       childArgs += ("--class", "org.apache.spark.deploy.RRunner")
     } else {
       if (args.primaryResource != SPARK_INTERNAL) {
         childArgs += ("--jar", args.primaryResource)
       }
       childArgs += ("--class", args.mainClass)
     }
     if (args.childArgs != null) {
       args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
     }
   }
  • In CLUSTER mode, the mainClass is set to org.apache.spark.deploy.yarn.Client, which wraps the application's own mainClass.

  • The application's own mainClass is passed to the main method of org.apache.spark.deploy.yarn.Client via the --class argument. Inside Client, this argument is what distinguishes CLUSTER from CLIENT:

    def isClusterMode: Boolean = userClass != null

  • Why does the Client class still need to distinguish CLUSTER from CLIENT? That is covered in detail later; a simplified sketch of the distinction follows.
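
    A hypothetical sketch of that marker (the real parsing lives in ClientArguments; the helper below is only illustrative):

// Hypothetical illustration, not the real ClientArguments parser: in yarn-cluster mode
// SparkSubmit passes "--class <user main>" to yarn.Client, and the presence of that
// user class is exactly what makes isClusterMode true.
object ClusterModeMarker {
  def userClassOf(args: Array[String]): String =
    args.sliding(2).collectFirst { case Array("--class", v) => v }.orNull

  def isClusterMode(args: Array[String]): Boolean = userClassOf(args) != null

  def main(args: Array[String]): Unit = {
    println(isClusterMode(Array("--jar", "app.jar", "--class", "com.example.MyApp"))) // true
    println(isClusterMode(Array("--arg", "driverhost:12345")))                        // false
  }
}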

  • With the mainClass and classpath produced, the next step is to run them.

  /**
   * Run the main method of the child class using the provided launch environment.
   *
   * Note that this main class will not be the one provided by the user if we're
   * running cluster deploy mode or python applications.
   */
  private def runMain(
      childArgs: Seq[String],
      childClasspath: Seq[String],
      sysProps: Map[String, String],
      childMainClass: String,
      verbose: Boolean): Unit = {
    // scalastyle:off println
    if (verbose) {
      printStream.println(s"Main class:
$childMainClass")
      printStream.println(s"Arguments:
${childArgs.mkString("
")}")
      printStream.println(s"System properties:
${sysProps.mkString("
")}")
      printStream.println(s"Classpath elements:
${childClasspath.mkString("
")}")
      printStream.println("
")
    }
    // scalastyle:on println

    // Pick the classloader; look up Java classloader delegation if child-first vs. parent-first loading is unfamiliar.
    val loader =
      if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) {
        new ChildFirstURLClassLoader(new Array[URL](0),
          Thread.currentThread.getContextClassLoader)
      } else {
        new MutableURLClassLoader(new Array[URL](0),
          Thread.currentThread.getContextClassLoader)
      }
    // Install the new classloader as the thread's context classloader
    Thread.currentThread.setContextClassLoader(loader)

    // Add the prepared classpath entries to this classloader so the application's mainClass can be found.
    for (jar <- childClasspath) {
      addJarToClasspath(jar, loader)
    }

    // Set system properties (not environment variables) for the child
    for ((key, value) <- sysProps) {
      System.setProperty(key, value)
    }

    var mainClass: Class[_] = null

    try {
      // Load the mainClass
      mainClass = Utils.classForName(childMainClass)
    } catch {
      case e: ClassNotFoundException =>
        e.printStackTrace(printStream)
        // To use Hive (thriftserver), you need to build Spark with the Hive profiles yourself
        if (childMainClass.contains("thriftserver")) {
          // scalastyle:off println
          printStream.println(s"Failed to load main class $childMainClass.")
          printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
          // scalastyle:on println
        }
        System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
      case e: NoClassDefFoundError =>
        e.printStackTrace(printStream)
        if (e.getMessage.contains("org/apache/hadoop/hive")) {
          // scalastyle:off println
          printStream.println(s"Failed to load hive class.")
          printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
          // scalastyle:on println
        }
        System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
    }

    // SPARK-4170
    // A Scala program can be written either with a main() method or by extending scala.App
    if (classOf[scala.App].isAssignableFrom(mainClass)) {
      printWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
    }

    // Look up the main method via reflection
    val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)
    if (!Modifier.isStatic(mainMethod.getModifiers)) {
      throw new IllegalStateException("The main method in the given main class must be static")
    }

    def findCause(t: Throwable): Throwable = t match {
      case e: UndeclaredThrowableException =>
        if (e.getCause() != null) findCause(e.getCause()) else e
      case e: InvocationTargetException =>
        if (e.getCause() != null) findCause(e.getCause()) else e
      case e: Throwable =>
        e
    }

    try {
      // Invoke the child main class
      mainMethod.invoke(null, childArgs.toArray)
    } catch {
      case t: Throwable =>
        findCause(t) match {
          case SparkUserAppException(exitCode) =>
            System.exit(exitCode)

          case t: Throwable =>
            throw t
        }
    }
  }
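
    Pulled out of its surrounding error handling, the reflective launch that runMain performs boils down to the minimal sketch below; the class name com.example.MyApp is a placeholder.

import java.lang.reflect.Modifier

// Minimal sketch of the load-and-invoke pattern used by runMain: resolve the class
// through the thread's context classloader, find the static main(Array[String]),
// and invoke it with the child arguments.
object ReflectiveLaunch {
  def main(args: Array[String]): Unit = {
    val childMainClass = "com.example.MyApp" // placeholder
    val mainClass = Class.forName(childMainClass, true,
      Thread.currentThread.getContextClassLoader)
    val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
    require(Modifier.isStatic(mainMethod.getModifiers),
      "The main method in the given main class must be static")
    mainMethod.invoke(null, args)
  }
}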
  • In CLIENT mode, the application's mainClass now starts running: typically it builds a SparkConf, creates the SparkContext, and thereby constructs the DAGScheduler, TaskScheduler and YarnClientSchedulerBackend. YarnClientSchedulerBackend in turn starts org.apache.spark.deploy.yarn.Client (a minimal user-application sketch follows the code below):
    override def start() {
    val driverHost = conf.get("spark.driver.host")
    val driverPort = conf.get("spark.driver.port")
    val hostport = driverHost + ":" + driverPort
    sc.ui.foreach { ui => conf.set("spark.driver.appUIAddress", ui.appUIAddress) }

    val argsArrayBuf = new ArrayBuffer[String]()
    argsArrayBuf += ("--arg", hostport)
    argsArrayBuf ++= getExtraClientArguments

    logDebug("ClientArguments called with: " + argsArrayBuf.mkString(" "))
    val args = new ClientArguments(argsArrayBuf.toArray, conf)
    totalExpectedExecutors = args.numExecutors
    client = new Client(args, conf)
    appId = client.submitApplication()

    // SPARK-8687: Ensure all necessary properties have already been set before
    // we initialize our driver scheduler backend, which serves these properties
    // to the executors
    super.start()

    waitForApplication()

    // SPARK-8851: In yarn-client mode, the AM still does the credentials refresh. The driver
    // reads the credentials from HDFS, just like the executors and updates its own credentials
    // cache.
    if (conf.contains("spark.yarn.credentials.file")) {
      YarnSparkHadoopUtil.get.startExecutorDelegationTokenRenewer(conf)
    }
    monitorThread = asyncMonitorApplication()
    monitorThread.start()
  }
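
    For reference, a minimal yarn-client user application might look like the assumed sketch below (not taken from the original post); constructing the SparkContext is what triggers the YarnClientSchedulerBackend.start shown above.

import org.apache.spark.{SparkConf, SparkContext}

// Assumed minimal example of a user main class launched directly in CLIENT mode.
// Creating the SparkContext builds DAGScheduler, TaskScheduler and, for yarn-client,
// YarnClientSchedulerBackend, whose start() submits the application via yarn.Client.
object MyApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("my-app") // the master URL is supplied by spark-submit
    val sc = new SparkContext(conf)
    try {
      val sum = sc.parallelize(1 to 100).reduce(_ + _)
      println(s"sum = $sum")
    } finally {
      sc.stop()
    }
  }
}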
  • In CLUSTER mode, when runMain invokes the child main class, it is the main method of org.apache.spark.deploy.yarn.Client that runs:
    def main(argStrings: Array[String]) {
    if (!sys.props.contains("SPARK_SUBMIT")) {
      logWarning("WARNING: This client is deprecated and will be removed in a " +
        "future version of Spark. Use ./bin/spark-submit with "--master yarn"")
    }

    // Set an env variable indicating we are running in YARN mode.
    // Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes
    System.setProperty("SPARK_YARN_MODE", "true")
    val sparkConf = new SparkConf

    val args = new ClientArguments(argStrings, sparkConf)
    // to maintain backwards-compatibility
    if (!Utils.isDynamicAllocationEnabled(sparkConf)) {
      sparkConf.setIfMissing("spark.executor.instances", args.numExecutors.toString)
    }
    new Client(args, sparkConf).run()
  }
  
    /**
   * Submit an application to the ResourceManager.
   * If set spark.yarn.submit.waitAppCompletion to true, it will stay alive
   * reporting the application's status until the application has exited for any reason.
   * Otherwise, the client process will exit after submission.
   * If the application finishes with a failed, killed, or undefined status,
   * throw an appropriate SparkException.
   */
  def run(): Unit = {
    this.appId = submitApplication()
    if (!launcherBackend.isConnected() && fireAndForget) {
      val report = getApplicationReport(appId)
      val state = report.getYarnApplicationState
      logInfo(s"Application report for $appId (state: $state)")
      logInfo(formatReportDetails(report))
      if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
        throw new SparkException(s"Application $appId finished with status: $state")
      }
    } else {
      val (yarnApplicationState, finalApplicationStatus) = monitorApplication(appId)
      if (yarnApplicationState == YarnApplicationState.FAILED ||
        finalApplicationStatus == FinalApplicationStatus.FAILED) {
        throw new SparkException(s"Application $appId finished with failed status")
      }
      if (yarnApplicationState == YarnApplicationState.KILLED ||
        finalApplicationStatus == FinalApplicationStatus.KILLED) {
        throw new SparkException(s"Application $appId is killed")
      }
      if (finalApplicationStatus == FinalApplicationStatus.UNDEFINED) {
        throw new SparkException(s"The final status of application $appId is undefined")
      }
    }
  }

    
  • Both modes eventually call submitApplication to submit the application to YARN; that step will be covered in detail in another post.
Original article: https://www.cnblogs.com/luckuan/p/5406601.html