SparkStreamingTest.scala

/**
 * A minimal Spark Streaming example: word count over a socket stream,
 * with each batch also queried through Spark SQL.
 * Created by root on 9/8/15.
 */
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.sql.SQLContext

object SparkStreamingTest {
  def main(args: Array[String]): Unit = {
    // Create a local StreamingContext with two worker threads and a batch interval of 10 seconds.
    val conf = new SparkConf().setAppName("Spark streaming test").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))
    // Create a DStream that connects to a text server on localhost:9999.
    val lines = ssc.socketTextStream("localhost", 9999)
    // Split each line into words.
    val words = lines.flatMap(_.split(" "))

    // Word count: count each word within the current batch and print the result.
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    wordCounts.print()

    // Bridge Spark Streaming to Spark SQL: register each batch's words as a
    // temporary table and aggregate them with a SQL query.
    words.foreachRDD((rdd: RDD[String], time: Time) => {
      // Reuse a singleton SQLContext across batches.
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      // Convert RDD[String] to a DataFrame with a single "word" column.
      val wordsDataFrame = rdd.map(w => Record(w)).toDF()
      wordsDataFrame.registerTempTable("words")
      val wordsCountDataFrame = sqlContext.sql("select word, count(*) as total from words group by word")
      println(s"======================= $time =======================")
      wordsCountDataFrame.show()
    })

    ssc.start()             // Start the computation.
    ssc.awaitTermination()  // Wait for it to terminate.
  }
}
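To try this locally, start a line-oriented text server before launching the job, for example `nc -lk 9999` in another terminal, then type words into it. Every 10-second batch prints both the reduceByKey counts and the SQL aggregation over that batch.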

// Case class used to give the DataFrame its schema (a single "word" column).
case class Record(word: String)

// Lazily instantiated singleton SQLContext, shared across batches.
object SQLContextSingleton {
  @transient private var instance: SQLContext = _
  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
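
On Spark 2.x this pattern changes: SQLContext is superseded by SparkSession, and the builder already caches one session per JVM. A minimal sketch of the equivalent singleton, assuming Spark 2.x APIs are on the classpath (SparkSessionSingleton is a name chosen here, not a class shipped with Spark):

import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession

object SparkSessionSingleton {
  // getOrCreate() returns the existing session if one was already built,
  // so no explicit null-check is needed.
  def getInstance(sparkContext: SparkContext): SparkSession =
    SparkSession.builder
      .config(sparkContext.getConf)
      .getOrCreate()
}

On 2.x you would also replace registerTempTable with createOrReplaceTempView in the foreachRDD block above.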
Original article: https://www.cnblogs.com/sunflower627/p/4794675.html