ElasticSearch-hadoop saveToEs源码分析

ElasticSearch-hadoop saveToEs源码分析：

类的调用路径关系为:

EsSpark -> 
     EsRDDWriter -> 
           RestService -> 
                  RestRepository -> 
                            RestClient

他们的作用：

EsSpark，读取ES和存储ES的入口
EsRDDWriter，调用RestService创建PartitionWriter,对ES进行数据写入
RestService，负责创建 RestRepository，PartitionWriter
RestRepository，bulk高层抽象，底层利用NetworkClient做真实的http bulk请求

各个类对应的源码追踪如下：

https://github.com/elastic/elasticsearch-hadoop/blob/2.1/spark/core/main/scala/org/elasticsearch/spark/rdd/EsSpark.scala

  def saveToEs(rdd: RDD[_], resource: String) { saveToEs(rdd, Map(ES_RESOURCE_WRITE -> resource)) }
  def saveToEs(rdd: RDD[_], resource: String, cfg: Map[String, String]) {
    saveToEs(rdd, collection.mutable.Map(cfg.toSeq: _*) += (ES_RESOURCE_WRITE -> resource))
  }
  def saveToEs(rdd: RDD[_], cfg: Map[String, String]) {
    CompatUtils.warnSchemaRDD(rdd, LogFactory.getLog("org.elasticsearch.spark.rdd.EsSpark"))

    if (rdd == null || rdd.partitions.length == 0) {
      return
    }

    val sparkCfg = new SparkSettingsManager().load(rdd.sparkContext.getConf)
    val config = new PropertiesSettings().load(sparkCfg.save())
    config.merge(cfg.asJava)

    rdd.sparkContext.runJob(rdd, new EsRDDWriter(config.save()).write _)
  }

https://github.com/elastic/elasticsearch-hadoop/blob/2.1/spark/core/main/scala/org/elasticsearch/spark/rdd/EsRDDWriter.scala

  def write(taskContext: TaskContext, data: Iterator[T]) {
    val writer = RestService.createWriter(settings, taskContext.partitionId, -1, log)

    taskContext.addOnCompleteCallback(() => writer.close())

    if (runtimeMetadata) {
      writer.repository.addRuntimeFieldExtractor(metaExtractor)
    }

    while (data.hasNext) {
      writer.repository.writeToIndex(processData(data))
    }
  }

https://github.com/elastic/elasticsearch-hadoop/blob/2.1/mr/src/main/java/org/elasticsearch/hadoop/rest/RestService.java

    public static PartitionWriter createWriter(Settings settings, int currentSplit, int totalSplits, Log log) {
        Version.logVersion();

        InitializationUtils.discoverEsVersion(settings, log);
        InitializationUtils.discoverNodesIfNeeded(settings, log);
        InitializationUtils.filterNonClientNodesIfNeeded(settings, log);
        InitializationUtils.filterNonDataNodesIfNeeded(settings, log);

        List<String> nodes = SettingsUtils.discoveredOrDeclaredNodes(settings);

        // check invalid splits (applicable when running in non-MR environments) - in this case fall back to Random..
        int selectedNode = (currentSplit < 0) ? new Random().nextInt(nodes.size()) : currentSplit % nodes.size();

        // select the appropriate nodes first, to spread the load before-hand
        SettingsUtils.pinNode(settings, nodes.get(selectedNode));

        Resource resource = new Resource(settings, false);

        log.info(String.format("Writing to [%s]", resource));

        // single index vs multi indices
        IndexExtractor iformat = ObjectUtils.instantiate(settings.getMappingIndexExtractorClassName(), settings);
        iformat.compile(resource.toString());

        RestRepository repository = (iformat.hasPattern() ? initMultiIndices(settings, currentSplit, resource, log) : initSingleIndex(settings, currentSplit, resource, log));

        return new PartitionWriter(settings, currentSplit, totalSplits, repository);
    }

https://github.com/elastic/elasticsearch-hadoop/blob/2.1/mr/src/main/java/org/elasticsearch/hadoop/rest/RestRepository.java

    /**
     * Writes the objects to index.
     *
     * @param object object to add to the index
     */
    public void writeToIndex(Object object) {
        Assert.notNull(object, "no object data given");

        lazyInitWriting();
        doWriteToIndex(command.write(object));
    }

    private void doWriteToIndex(BytesRef payload) {
        // check space first
        if (payload.length() > ba.available()) {
            if (autoFlush) {
                flush();
            }
            else {
                throw new EsHadoopIllegalStateException(
                        String.format("Auto-flush disabled and bulk buffer full; disable manual flush or increase capacity [current size %s]; bailing out", ba.capacity()));
            }
        }

        data.copyFrom(payload);
        payload.reset();

        dataEntries++;
        if (bufferEntriesThreshold > 0 && dataEntries >= bufferEntriesThreshold) {
            if (autoFlush) {
                flush();
            }
            else {
                // handle the corner case of manual flush that occurs only after the buffer is completely full (think size of 1)
                if (dataEntries > bufferEntriesThreshold) {
                    throw new EsHadoopIllegalStateException(
                            String.format(
                                    "Auto-flush disabled and maximum number of entries surpassed; disable manual flush or increase capacity [current size %s]; bailing out",
                                    bufferEntriesThreshold));
                }
            }
        }
    }

    public void flush() {
        BitSet bulk = tryFlush();
        if (!bulk.isEmpty()) {
            throw new EsHadoopException(String.format("Could not write all entries [%s/%s] (maybe ES was overloaded?). Bailing out...", bulk.cardinality(), bulk.size()));
        }
    }

    public BitSet tryFlush() {
        if (log.isDebugEnabled()) {
            log.debug(String.format("Sending batch of [%d] bytes/[%s] entries", data.length(), dataEntries));
        }

        BitSet bulkResult = EMPTY;

        try {
            // double check data - it might be a false flush (called on clean-up)
            if (data.length() > 0) {
                bulkResult = client.bulk(resourceW, data);
                executedBulkWrite = true;
            }
        } catch (EsHadoopException ex) {
            hadWriteErrors = true;
            throw ex;
        }

        // discard the data buffer, only if it was properly sent/processed
        //if (bulkResult.isEmpty()) {
        // always discard data since there's no code path that uses the in flight data
        discard();
        //}

        return bulkResult;
    }

https://github.com/elastic/elasticsearch-hadoop/blob/2.1/mr/src/main/java/org/elasticsearch/hadoop/rest/RestClient.java

    public BitSet bulk(Resource resource, TrackingBytesArray data) {
        Retry retry = retryPolicy.init();
        int httpStatus = 0;

        boolean isRetry = false;

        do {
            // NB: dynamically get the stats since the transport can change
            long start = network.transportStats().netTotalTime;
            Response response = execute(PUT, resource.bulk(), data);
            long spent = network.transportStats().netTotalTime - start;

            stats.bulkTotal++;
            stats.docsSent += data.entries();
            stats.bulkTotalTime += spent;
            // bytes will be counted by the transport layer

            if (isRetry) {
                stats.docsRetried += data.entries();
                stats.bytesRetried += data.length();
                stats.bulkRetries++;
                stats.bulkRetriesTotalTime += spent;
            }

            isRetry = true;

            httpStatus = (retryFailedEntries(response, data) ? HttpStatus.SERVICE_UNAVAILABLE : HttpStatus.OK);
        } while (data.length() > 0 && retry.retry(httpStatus));

        return data.leftoversPosition();
    }