问题:Spark 通过第三方库 spark-excel 读取 Excel 文件时,如果文件过大,会因整个工作簿被一次性加载到内存而导致 OOM(内存溢出)。

/**
   * Reads an Excel workbook into a DataFrame via the crealytics spark-excel
   * connector (https://github.com/crealytics/spark-excel).
   *
   * Setting `maxRowsInMemory` switches the connector to a streaming POI
   * reader, which avoids materializing the whole workbook in memory and
   * prevents OOM on large files.
   *
   * @param sparkSession    active SparkSession used to perform the read
   * @param filePath        path to the Excel file (e.g. an .xlsx workbook)
   * @param header          whether the first row contains column names
   * @param maxRowsInMemory number of rows kept in memory at a time; enables
   *                        the streaming reader (default 20, matching the
   *                        previously hard-coded value)
   * @return the loaded DataFrame
   */
  def sparkExcel(sparkSession: SparkSession,
                 filePath: String,
                 header: Boolean,
                 maxRowsInMemory: Int = 20): DataFrame = {
    // Interpolated string — the original passed a tuple to println,
    // which printed "(----…----,path)" instead of the intended message.
    println(s"--------------------sparkExcel-----: $filePath")
    import com.crealytics.spark.excel._

    // Other supported options (not used here): dataAddress, timestampFormat,
    // excerptSize, workbookPassword — see the spark-excel README.
    val df = sparkSession.read.excel(
      header = header,                   // first row as column names
      treatEmptyValuesAsNulls = true,    // empty cells become null
      inferSchema = false,               // all columns read as strings
      addColorColumns = false,           // no extra cell-color columns
      maxRowsInMemory = maxRowsInMemory  // streaming reader: avoids OOM on big files
    ).load(filePath)

    // Debug preview of the first rows.
    // NOTE(review): side-effecting output — consider removing in production.
    df.show(5)

    df
  }

解决方法:设置 maxRowsInMemory(例如 20)。设置后 spark-excel 会改用流式读取器,逐批读取行而不是把整个工作簿加载进内存,从而避免 OOM。

原文地址:https://www.cnblogs.com/long-yuan/p/13477372.html