import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.hive._

/**
 * Reads a comma-separated text file of people, converts it to a DataFrame via
 * the [[Persons]] case class, round-trips it through Parquet, and runs a SQL
 * query over the resulting temp view.
 *
 * Created by soyo on 17-10-12.
 */
final case class Persons(name: String, age: Int)
final case class Record(key: Int, value: String)

object rdd_to_dataframe_parquet {

  // BUG FIX: the original used a plain (non-interpolated) string, so the
  // "${system:user.dir}" placeholder was passed through literally and never
  // resolved; resolve the working directory explicitly via the s-interpolator.
  val warehouseLocation: String = s"file:${System.getProperty("user.dir")}/spark-warehouse"

  val spark: SparkSession = SparkSession.builder()
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .enableHiveSupport()
    .getOrCreate()

  import spark.implicits._

  def main(args: Array[String]): Unit = {
    // BUG FIX: the original mapped to `Person(...)`, but the case class
    // declared above is `Persons` — the file did not compile.
    val df = spark.sparkContext
      .textFile("file:///home/soyo/桌面/spark编程测试数据/people.txt")
      .map(_.split(","))
      .map(x => Persons(x(0), x(1).trim.toInt))
      .toDF()

    // Overwrite so re-running the job does not fail with "path already exists".
    df.write.mode("overwrite").parquet("file:///home/soyo/桌面/spark编程测试数据/people.parquet")

    val parquetFile = spark.read.parquet("file:///home/soyo/桌面/spark编程测试数据/people.parquet")
    parquetFile.createOrReplaceTempView("people")

    val result = spark.sql("select * from people")
    result.show()

    spark.stop()
  } // BUG FIX: the original was missing the closing braces for main and the object.
}
// Note: when a query must combine multiple data sources, union the
// individual results first and register the union as a temp view:
//   val data = result1.union(result2)
//   data.createOrReplaceTempView("data")  // then run subsequent queries against "data"