Spark编程练习题

import org.apache.spark.sql.SparkSession

val spark = SparkSession
.builder()
.appName("Spark SQL basic example")
.enableHiveSupport()
.getOrCreate()

//开启隐式转换
import spark.implicits._

//任务:求data的平均值
import util.Random
val data = for(i<- List.range(1,10)) yield Random.nextInt(100)

//使用RDD编程实现
val rdd = sc.parallelize(data,5)
val mean = rdd.map(_.toDouble).reduce(_+_)/rdd.count
println(mean)

//使用SparkSQL编程实现
val df = data.toDF("value")
df.agg("value"->"avg").show

//任务:统计file中每个词的词频
val file = "wordcount"

//使用RDD编程实现
val rdd = sc.textFile(file)
rdd.flatMap(_.trim.split(" ")).map((_,1)).reduceByKey(_+_).collect

//使用SparkSQL编程实现
val df = spark.read.option("header","false").csv(file).toDF("value")
df.flatMap(row=>row(0).toString.trim.split(" ")).groupBy("value").count.show

//任务:
//有一批学生信息表格,包括name,age,score
//找出score排名前3的学生
val students = List(("LiLei",18,87),
                   ("HanMeiMei",16,77),
                   ("DaChui",16,66),
                   ("Jim",18,80),
                   ("RuHua",20,50))
val n = 3

//使用RDD编程实现
val rdd = sc.parallelize(students)
rdd.sortBy(_._3,ascending = false).take(n)

//使用SparkSQL编程实现
val df = students.toDF("name","age","score")
df.orderBy(df("score").desc).limit(n).show

//任务:求最大值最小值
val data = List(1,7,8,5,3,18,34,23,67,53,9,0,12,8)

//使用RDD编程实现,方案1
val rdd = sc.parallelize(data,3)
val max_value = rdd.reduce((a,b)=> if(a>b) a else b)
val min_value = rdd.reduce((a,b)=> if(a>b) b else a)
println("max_value:" + max_value)
println("min_value:" + min_value)

//使用RDD编程实现,方案2
val rdd = sc.parallelize(data,3)
val temp = rdd.mapPartitions(iterator => {
    var min = Integer.MAX_VALUE
    var max = Integer.MIN_VALUE
    for(x <- iterator){
        if(x>max) max = x
        if(x<min) min = x
    }
    Iterator((min,max))
})
val result = temp.reduce((a,b)=>
          {val min = if(a._1<= b._1) a._1 else b._1
           val max = if(a._2 >= b._2) a._2 else b._2
           (min,max)
          })

//使用SparkSQL编程实现
import org.apache.spark.sql.functions._
val df = data.toDF("value")
df.agg(max("value") as "max_value",min("value") as "min_value").show

//任务:排序并返回序号
val data = List(1,7,8,5,3,18,34,9,0,12,8)

//使用RDD编程实现:方案1
val rdd = sc.parallelize(data,3)
val len = rdd.count
val sortedrdd = rdd.map((_,1)).sortByKey().map(_._1).repartition(1)
val index = sc.parallelize(0 to len.toInt-1,1)
index.zip(sortedrdd).collect

//使用RDD编程实现:方案2
val rdd = sc.parallelize(data,3)
val sortedrdd = rdd.map((_,1)).sortByKey().map(_._1).repartition(1)
var idx = -1
sortedrdd.map(value => {
    idx+=1
    (idx,value)
}).collect

原文地址:https://www.cnblogs.com/hrnn/p/13387189.html