RDD的依赖关系：窄依赖（如 map 产生的 OneToOneDependency）中父 RDD 的每个分区最多被一个子分区使用；宽依赖（如 groupByKey 产生的 ShuffleDependency）需要 shuffle。下面通过 RDD 的 dependencies 方法观察这两类依赖。


scala> val personRDD=sc.textFile("/tmp/person.txt")
personRDD: org.apache.spark.rdd.RDD[String] = /tmp/person.txt MapPartitionsRDD[39] at textFile at <console>:25

scala> val ageRDD=personRDD.map(x=>{val arr=x.split(",");(arr(2),1)})
ageRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[41] at map at <console>:27

scala> val grouprdd=ageRDD.groupByKey()
grouprdd: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[42] at groupByKey at <console>:29

scala> grouprdd.dependencies.foreach(dep=>{println(dep.getClass);println(dep.rdd);println(dep.rdd.partitions);println(dep.rdd.partitions.size)})
class org.apache.spark.ShuffleDependency
MapPartitionsRDD[41] at map at <console>:27
[Lorg.apache.spark.Partition;@2e33dd0d
2

scala> personRDD.dependencies.foreach(dep=>{println(dep.getClass);println(dep.rdd);println(dep.rdd.partitions);println(dep.rdd.partitions.size)})
class org.apache.spark.OneToOneDependency
/tmp/person.txt HadoopRDD[38] at textFile at <console>:25
[Lorg.apache.spark.Partition;@5b0f052f
2

scala> ageRDD.dependencies.foreach(dep=>{println(dep.getClass);println(dep.rdd);println(dep.rdd.partitions);println(dep.rdd.partitions.size)})
class org.apache.spark.OneToOneDependency
/tmp/person.txt MapPartitionsRDD[39] at textFile at <console>:25
[Lorg.apache.spark.Partition;@5b0f052f
2

原文地址:https://www.cnblogs.com/playforever/p/9450010.html