Spark [Source Code] - Task Scheduling Source Code Analysis [Part 3]

Preface

In the previous article I covered the DAG stage of scheduling: how Spark splits a job into multiple stages according to wide and narrow dependencies. The final step there was submitting the generated TaskSet to TaskSchedulerImpl.

This time we start from taskScheduler.submitTasks and dig into how the TaskScheduler actually runs, and in particular how the TaskSetManager and the Pool are tied together.

TaskSetManager extends Schedulable; this shared parent is the bridge between it, the Pool, and the scheduling algorithms. Pool extends Schedulable as well. Keep these questions in mind as you read the source.
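To make that relationship concrete, here is a simplified sketch of the Schedulable abstraction (paraphrased and abridged, not the exact trait; the member list varies between Spark versions). Pool implements it as an inner node of the scheduling tree and TaskSetManager as a leaf, which is what lets the FIFO/FAIR scheduling algorithms order both uniformly.

    import scala.collection.mutable.ArrayBuffer

    // Simplified sketch, not the exact Spark trait.
    trait Schedulable {
      def name: String
      def priority: Int     // compared by the FIFO ordering (in practice the job id)
      def stageId: Int      // FIFO tie-breaker
      def weight: Int       // used by the FAIR ordering
      def minShare: Int     // used by the FAIR ordering
      def runningTasks: Int
      def addSchedulable(schedulable: Schedulable): Unit   // only meaningful for Pool
      def getSortedTaskSetQueue: ArrayBuffer[Schedulable]  // the tree flattened in scheduling order
    }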

taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))

taskScheduler.submitTasks()

  override def submitTasks(taskSet: TaskSet) {
    val tasks = taskSet.tasks
    logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
    this.synchronized {
      // Create a TaskSetManager for this TaskSet
      val manager = createTaskSetManager(taskSet, maxTaskFailures)
      val stage = taskSet.stageId
      val stageTaskSets =
        taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
      stageTaskSets(taskSet.stageAttemptId) = manager
      val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
        ts.taskSet != taskSet && !ts.isZombie
      }
      if (conflictingTaskSet) {
        throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
          s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
      }
      // Register the manager with the scheduler: addTaskSetManager is implemented
      // by the SchedulableBuilder (FIFO or FAIR) chosen by the scheduling mode.
      schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

      if (!isLocal && !hasReceivedTask) {
        starvationTimer.scheduleAtFixedRate(new TimerTask() {
          override def run() {
            if (!hasLaunchedTask) {
              logWarning("Initial job has not accepted any resources; " +
                "check your cluster UI to ensure that workers are registered " +
                "and have sufficient resources")
            } else {
              this.cancel()
            }
          }
        }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
      }
      hasReceivedTask = true
    }
    backend.reviveOffers()
  }

CoarseGrainedSchedulerBackend.reviveOffers()

Does this look familiar? If you have already read about how the driver, the application, and the executors are started, it should ring a bell: the driver used the same mechanism back then, simply sending itself an empty message as a check.

A quick recap: CoarseGrainedSchedulerBackend.start() creates driverEndpoint, an RPC endpoint. Its inner class DriverEndpoint implements ThreadSafeRpcEndpoint, which in turn extends the RpcEndpoint interface. CoarseGrainedSchedulerBackend.reviveOffers() simply sends a ReviveOffers message to this RPC endpoint; ReviveOffers itself is just a simple marker message (a case object).
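For reference, reviveOffers itself is essentially a one-liner in the Spark version this series follows; it just fires the message at the driver endpoint:

  override def reviveOffers() {
    driverEndpoint.send(ReviveOffers)
  }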

CoarseGrainedSchedulerBackend.DriverEndpoint.receive()

Another quick recap: DriverEndpoint has two ways of sending a message. send delivers a message without expecting a reply; ask delivers a message and expects a reply. Correspondingly, there are two ways of receiving: receive, which does not reply to the sender, and receiveAndReply, which does.

Looking at the source, you can see that when receive matches the ReviveOffers message it calls makeOffers().
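The relevant branch of DriverEndpoint.receive looks roughly like this (other message cases such as StatusUpdate and KillTask are omitted):

    override def receive: PartialFunction[Any, Unit] = {
      // ... other message cases omitted ...
      case ReviveOffers =>
        makeOffers()   // turn the free resources of alive executors into resource offers
    }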

CoarseGrainedSchedulerBackend.makeOffers()

    private def makeOffers() {
      // Filter out executors that are being killed or have been lost; keep only alive ones
      val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
      // Build WorkerOffers from activeExecutors,
      // i.e. the resources (host, free cores) each executor can currently offer
      val workOffers = activeExecutors.map { case (id, executorData) =>
        new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
      }.toSeq
      // scheduler.resourceOffers allocates the offered resources to tasks,
      // then launchTasks sends the resulting tasks to the executors
      launchTasks(scheduler.resourceOffers(workOffers))
    }

Next, let's see how the resourceOffers() method assigns resources to tasks. Since this is about allocating resources to tasks, it is naturally TaskSchedulerImpl's job.

TaskSchedulerImpl.resourceOffers()

Let's first lay out the overall flow:

TaskSchedulerImpl.resourceOffers() allocates resources across all TaskSets ===> resourceOfferSingleTaskSet() allocates resources for a single TaskSet ===> TaskSetManager.resourceOffer() picks an individual task to launch

  def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
    // Mark each slave as alive and remember its hostname
    // Also track if new executor is added
    var newExecAvail = false
    for (o <- offers) { // iterate over each available WorkerOffer
      executorIdToHost(o.executorId) = o.host  // bind the executor id to its host
      executorIdToTaskCount.getOrElseUpdate(o.executorId, 0) // track the number of tasks running on each executor
      if (!executorsByHost.contains(o.host)) {
        executorsByHost(o.host) = new HashSet[String]()
        executorAdded(o.executorId, o.host)
        newExecAvail = true
      }
      // Record this host under its rack, if rack information is available
      for (rack <- getRackForHost(o.host)) {
        hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
      }
    }

    // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
    val shuffledOffers = Random.shuffle(offers)
    // Build a list of tasks to assign to each worker.
    val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
    // Track the remaining available CPUs on each worker
    val availableCpus = shuffledOffers.map(o => o.cores).toArray
    // Get the TaskSetManagers sorted by the scheduling policy (FIFO or FAIR)
    val sortedTaskSets = rootPool.getSortedTaskSetQueue
    for (taskSet <- sortedTaskSets) {
      logDebug("parentName: %s, name: %s, runningTasks: %s".format(
        taskSet.parent.name, taskSet.name, taskSet.runningTasks))
      // If a new executor has joined,
      // the TaskSetManager needs to recompute its locality levels
      if (newExecAvail) {
        taskSet.executorAdded()
      }
    }

    // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
    // of locality levels so that it gets a chance to launch local tasks on all of them.
    // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
    var launchedTask = false
    for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
      do {
        // resourceOfferSingleTaskSet allocates resources for a single TaskSet;
        // it returns false once no more tasks can be launched
        // at this locality level
        launchedTask = resourceOfferSingleTaskSet(
            taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
      } while (launchedTask)
    }

    if (tasks.size > 0) {
      hasLaunchedTask = true
    }
    return tasks
  }
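A quick note on the rootPool.getSortedTaskSetQueue call above: the ordering is produced by the SchedulingAlgorithm configured for the pool. As a reference point, the FIFO comparator looks roughly like this (abridged from the scheduler package; the FAIR variant additionally weighs runningTasks against minShare and weight):

    private[spark] class FIFOSchedulingAlgorithm extends SchedulingAlgorithm {
      override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
        // priority is the job id, so earlier jobs are scheduled first;
        // within a job, lower stage ids win
        var res = math.signum(s1.priority - s2.priority)
        if (res == 0) {
          res = math.signum(s1.stageId - s2.stageId)
        }
        res < 0
      }
    }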

TaskSchedulerImpl.resourceOfferSingleTaskSet()

Allocating resources for a single TaskSet:

private def resourceOfferSingleTaskSet(
      taskSet: TaskSetManager,
      maxLocality: TaskLocality,
      shuffledOffers: Seq[WorkerOffer],
      availableCpus: Array[Int],
      tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
    var launchedTask = false
    for (i <- 0 until shuffledOffers.size) {
      val execId = shuffledOffers(i).executorId
      val host = shuffledOffers(i).host
      if (availableCpus(i) >= CPUS_PER_TASK) {
        try {
          for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
            tasks(i) += task
            val tid = task.taskId
            taskIdToTaskSetManager(tid) = taskSet
            taskIdToExecutorId(tid) = execId
            executorIdToTaskCount(execId) += 1
            executorsByHost(host) += execId
            availableCpus(i) -= CPUS_PER_TASK
            assert(availableCpus(i) >= 0)
            launchedTask = true
          }
        } catch {
          case e: TaskNotSerializableException =>
            logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
            // Do not offer resources for this task, but don't throw an error to allow other
            // task sets to be submitted.
            return launchedTask
        }
      }
    }
    return launchedTask
  }

TaskSetManager.resourceOffer()

Given a single resource offer from the TaskScheduler (a host, an executor) and a maximum locality level, this method returns a suitable task, if any. Internally, the TaskSetManager adjusts its own locality requirement based on the time since the last successful task launch: if that interval has grown long, it relaxes the locality requirement (for example, from requiring at least PROCESS_LOCAL to accepting NODE_LOCAL); otherwise it keeps the requirement strict. This dynamic adjustment of the locality threshold, known as delay scheduling, increases the chance that tasks get to run at their best locality: resources are offered to the TaskSetManager in small batches over a short period, so moving the threshold dynamically improves the overall locality distribution.
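The delay-scheduling idea behind getAllowedLocalityLevel can be sketched as follows. This is an illustrative simplification, not the actual Spark implementation: localityWaits stands in for the per-level spark.locality.wait settings, and the index walks down myLocalityLevels toward less strict levels.

    // Illustrative sketch of delay scheduling: starting from the current locality
    // level, fall back to the next (worse) level once the configured wait for the
    // current level has elapsed without a successful task launch.
    def allowedLocalityIndex(
        localityWaits: Array[Long],   // wait time in ms allowed at each locality level
        currentIndex: Int,            // index of the current locality level
        lastLaunchTime: Long,         // time of the last successful task launch
        now: Long): Int = {
      var index = currentIndex
      var lastLaunch = lastLaunchTime
      while (index < localityWaits.length - 1 && now - lastLaunch >= localityWaits(index)) {
        lastLaunch += localityWaits(index)  // charge the elapsed wait to this level
        index += 1                          // and relax to the next, less strict level
      }
      index
    }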

def resourceOffer(
      execId: String,
      host: String,
      maxLocality: TaskLocality.TaskLocality)
    : Option[TaskDescription] =
  {
    if (!isZombie) {
      val curTime = clock.getTimeMillis()

      var allowedLocality = maxLocality

      if (maxLocality != TaskLocality.NO_PREF) {
        allowedLocality = getAllowedLocalityLevel(curTime)
        if (allowedLocality > maxLocality) {
          // We're not allowed to search for farther-away tasks
          allowedLocality = maxLocality
        }
      }

      dequeueTask(execId, host, allowedLocality) match {
        case Some((index, taskLocality, speculative)) => {
          // Found a task; do some bookkeeping and return a task description
          val task = tasks(index)
          val taskId = sched.newTaskId()
          // Do various bookkeeping
          copiesRunning(index) += 1
          val attemptNum = taskAttempts(index).size
          val info = new TaskInfo(taskId, index, attemptNum, curTime,
            execId, host, taskLocality, speculative)
          taskInfos(taskId) = info
          taskAttempts(index) = info :: taskAttempts(index)
          // Update our locality level for delay scheduling
          // NO_PREF will not affect the variables related to delay scheduling
          if (maxLocality != TaskLocality.NO_PREF) {
            currentLocalityIndex = getLocalityIndex(taskLocality)
            lastLaunchTime = curTime
          }
          // Serialize and return the task
          val startTime = clock.getTimeMillis()
          val serializedTask: ByteBuffer = try {
            Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser)
          } catch {
            // If the task cannot be serialized, then there's no point to re-attempt the task,
            // as it will always fail. So just abort the whole task-set.
            case NonFatal(e) =>
              val msg = s"Failed to serialize task $taskId, not attempting to retry it."
              logError(msg, e)
              abort(s"$msg Exception during serialization: $e")
              throw new TaskNotSerializableException(e)
          }
          if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
              !emittedTaskSizeWarning) {
            emittedTaskSizeWarning = true
            logWarning(s"Stage ${task.stageId} contains a task of very large size " +
              s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " +
              s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.")
          }
          addRunningTask(taskId)

          // We used to log the time it takes to serialize the task, but task size is already
          // a good proxy to task serialization time.
          // val timeTaken = clock.getTime() - startTime
          val taskName = s"task ${info.id} in stage ${taskSet.id}"
          logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}," +
            s"$taskLocality, ${serializedTask.limit} bytes)")

          sched.dagScheduler.taskStarted(task, info)
          return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId,
            taskName, index, serializedTask))
        }
        case _ =>
      }
    }
    None
  }

CoarseGrainedSchedulerBackend.DriverEndpoint.launchTasks

launchTasks(scheduler.resourceOffers(workOffers))

Now let's take a closer look at launchTasks:

    private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
      for (task <- tasks.flatten) {
        val serializedTask = ser.serialize(task)
        // If the serialized task exceeds the RPC (Akka) frame size limit, abort the TaskSet
        if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
          scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
            try {
              var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
                "spark.akka.frameSize or using broadcast variables for large values."
              msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
                AkkaUtils.reservedSizeBytes)
              taskSetMgr.abort(msg)
            } catch {
              case e: Exception => logError("Exception in error callback", e)
            }
          }
        }
        else {
          val executorData = executorDataMap(task.executorId)
          // Decrement the free cores recorded for the executor that will run this task
          executorData.freeCores -= scheduler.CPUS_PER_TASK
          // Send a LaunchTask message to the executor's endpoint
          executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
        }
      }
    }

When the executorEndpoint receives the LaunchTask message (containing SerializableBuffer(serializedTask)), it starts running the task; the task has now been delivered to its executor. At this point, everything the TaskScheduler does before handing tasks to the executors is complete.
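On the executor side, CoarseGrainedExecutorBackend's receive handles the LaunchTask message roughly as follows (abridged; details differ slightly between Spark versions): the TaskDescription is deserialized and handed to the Executor's task thread pool.

      case LaunchTask(data) =>
        if (executor == null) {
          logError("Received LaunchTask command but executor was null")
          System.exit(1)
        } else {
          // Deserialize the TaskDescription and hand the task to the executor's thread pool
          val taskDesc = ser.deserialize[TaskDescription](data.value)
          logInfo("Got assigned task " + taskDesc.taskId)
          executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
            taskDesc.name, taskDesc.serializedTask)
        }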

Original article: https://www.cnblogs.com/chushiyaoyue/p/7601274.html