Spark SQL实战

一、程序

 1 package sparklearning
 2 
 3 import org.apache.log4j.Logger
 4 import org.apache.spark.SparkConf
 5 import org.apache.spark.SparkContext
 6 import org.apache.spark.sql.SQLContext
 7 import org.apache.spark.storage.StorageLevel
 8 import org.apache.log4j.Level
 9 
object OnLineTradeStatistics {

  /**
   * Schema of the user file: one space-separated record per line.
   * NOTE(review): "provice" is a typo for "province"; kept as-is because the
   * field name becomes the DataFrame column name and SQL queries may rely on it.
   */
  case class User(userID: String, gender: String, age: Int, registerDate: String, provice: String, career: String)

  /** Schema of the trade-detail file: one space-separated record per line. */
  case class TradeDetail(tradeID: String, tradeDate: String, productID: Int, amount: Int, userID: String)

  /**
   * Loads user and trade-detail files into temp tables and prints the number
   * of trades whose tradeDate starts with "2016".
   */
  def main(args: Array[String]): Unit = {

    // Silence noisy framework logging so program output is readable.
    Logger.getLogger("org.apache.hadoop").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Set up the application (single-JVM local mode).
    val conf = new SparkConf().setAppName("On Line Trade Data").setMaster("local")
    val ctx = new SparkContext(conf)
    val sqlCtx = new SQLContext(ctx)
    import sqlCtx.implicits._

    try {
      // Read the user file: RDD --> DataFrame, register as temp table "user".
      val userDF = ctx.textFile("/home/hadoop/data/on_line_trade_user.txt")
        .map(_.split(" "))
        .map(u => User(u(0), u(1), u(2).toInt, u(3), u(4), u(5)))
        .toDF()
      userDF.registerTempTable("user")
      userDF.persist(StorageLevel.MEMORY_ONLY_SER)

      // Read the trade-detail file: RDD --> DataFrame, register as "trade".
      val tradeDF = ctx.textFile("/home/hadoop/data/on_line_trade_detail.txt")
        .map(_.split(" "))
        .map(u => TradeDetail(u(0), u(1), u(2).toInt, u(3).toInt, u(4)))
        .toDF()
      tradeDF.registerTempTable("trade")
      tradeDF.persist(StorageLevel.MEMORY_ONLY_SER)

      // BUG FIX: the query computes a row COUNT of 2016 trades, but the
      // original message claimed it printed "2016 total money". The label now
      // matches what is actually computed.
      val countOfTrade2016 = sqlCtx.sql("SELECT * FROM trade where tradeDate like '2016%'").count()
      println("2016 trade count: " + countOfTrade2016)
    } finally {
      // BUG FIX: the original never stopped the SparkContext, leaking the
      // driver's resources; ensure shutdown even if the job throws.
      ctx.stop()
    }
  }
}

二、结果

原文地址:https://www.cnblogs.com/liuzhongfeng/p/7017545.html