Spark 算子整理

combineByKeyWithClassTag

相当于 MapReduce 中的自定义 combine:先在 map 端做预聚合,减少 shuffle 传输的数据量,从而提高任务的执行效率

//Full-signature overload. createCombiner turns the first value V seen for a key
//into the combined type C; mergeValue folds further values into the per-partition
//combiner (map-side pre-aggregation); mergeCombiners merges combiners across
//partitions on the reduce side.
def combineByKeyWithClassTag[C](
      createCombiner: V => C,  //map side: converts the first value for a key into the combined type C
      mergeValue: (C, V) => C,  //map side: pre-aggregation, folds a value into the combiner
      mergeCombiners: (C, C) => C,  //reduce side: merges combiners from different partitions
      partitioner: Partitioner,  //partitioner object controlling how output is distributed
      mapSideCombine: Boolean = true,  //whether map-side combine is enabled; on by default
      serializer: Serializer = null)

//This is the overload used in the example below: only the three combine
//functions are supplied (presumably Spark picks a default partitioner and
//keeps map-side combine enabled — confirm against the Spark API docs).
def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C)

//Overload that additionally takes the number of partitions for the result RDD
//instead of an explicit Partitioner object.
def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      numPartitions: Int)  //number of partitions of the resulting RDD


//Example: for each key, collect its values into a mutable.Set[String]
//(rddA is presumably a pair RDD with String values — defined elsewhere).
rddA.combineByKeyWithClassTag(
        line => mutable.Set(line),      //createCombiner: wraps the first value in a Set, fixing the combined type
        (c1: mutable.Set[String], newLine) => c1 += newLine,    //mergeValue: map-side pre-aggregation, adds a value to the set
        (c1: mutable.Set[String], c2: mutable.Set[String]) => c1 ++= c2    //mergeCombiners: reduce side, unions the two sets
      )      

原文地址:https://www.cnblogs.com/goldenSky/p/12979206.html