Spark RDD basics in spark-shell: distinct, union, intersection, subtract, collect, reduce, take, top

 

scala> val rdd1=sc.parallelize(Array("coffe","coffe","hellp","hellp","pandas","mokey") )
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[8] at parallelize at <console>:24

scala> val rdd1=sc.parallelize(Array("coffe","coffe","hellp","hellp","pandas","mokey"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[9] at parallelize at <console>:24

scala> val rdd2=sc.parallelize(Array("coe","coe","help","help","pandas","mokey"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[10] at parallelize at <console>:24

scala> val rdd1_distinct=rdd1.distinct()
rdd1_distinct: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[13] at distinct at <console>:25

scala> rdd1_distinct.foreach(println)
hellp
mokey
pandas
coffe

scala> val rdd_union=rdd1.union(rdd2)
rdd_union: org.apache.spark.rdd.RDD[String] = UnionRDD[14] at union at <console>:27

scala> rdd1_union.foreach(println)
<console>:24: error: not found: value rdd1_union
       rdd1_union.foreach(println)
       ^

scala> rdd_union.foreach(println)
pandas
mokey
coffe
hellp
coffe
hellp
pandas
mokey
coe
help
help
coe

scala> val rdd_intersection=rdd1.intersession(rdd2)
<console>:27: error: value intersession is not a member of org.apache.spark.rdd.RDD[String]
       val rdd_intersection=rdd1.intersession(rdd2)
                                 ^

scala> val rdd_intersection=rdd1.intersection(rdd2)
rdd_intersection: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[20] at intersection at <console>:27

scala> rdd_intersection.foreach(println)
mokey
pandas

scala> val rdd_sub=rdd1.subtract(rdd2)
rdd_sub: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[24] at subtract at <console>:27

scala> rdd_sub.foreach(prinln)
<console>:26: error: not found: value prinln
       rdd_sub.foreach(prinln)
                       ^

scala> rdd_sub.foreach(println)
coffe
coffe
hellp
hellp

scala>

 

scala> val rdd=sc.parallelize(Array(1,2,2,3))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[25] at parallelize at <console>:24

scala> rdd.collect()
res16: Array[Int] = Array(1, 2, 2, 3)

scala> rdd.reduce((x,y)=>x+y)
res18: Int = 8

scala> rdd.take(2)
res19: Array[Int] = Array(1, 2)

scala> rdd.take(3)
res20: Array[Int] = Array(1, 2, 2)

scala>

scala> rdd.top(1)
res21: Array[Int] = Array(3)

scala> rdd.top(2)
res22: Array[Int] = Array(3, 2)

scala> rdd.top(3)
res23: Array[Int] = Array(3, 2, 2)



 

原文地址:https://www.cnblogs.com/ggzhangxiaochao/p/9237200.html