sparkStreaming消费kafka-1.0.1方式:direct方式(存储offset到Hbase)

话不多说,可以看上篇博文,关于offset存储到zookeeper

https://www.cnblogs.com/niutao/p/10547718.html

本篇博文主要告诉你如何将offset写到Hbase做存储:

最后存储到Hbase的展现形式:

testDirect:co:1552667595000  column=info:0, timestamp=1552667594784, value=66 
 testDirect:co:1552667595000   column=info:1, timestamp=1552667594784, value=269  
 testDirect:co:1552667595000   column=info:2, timestamp=1552667594784, value=67   
 testDirect:co:1552667600000   column=info:0, timestamp=1552667599864, value=66  
 testDirect:co:1552667600000   column=info:1, timestamp=1552667599864, value=269 
 testDirect:co:1552667600000   column=info:2, timestamp=1552667599864, value=67 
 testDirect:co:1552667605000   column=info:0, timestamp=1552667604778, value=66
 testDirect:co:1552667605000   column=info:1, timestamp=1552667604778, value=269
 testDirect:co:1552667605000   column=info:2, timestamp=1552667604778, value=67 
 testDirect:co:1552667610000   column=info:0, timestamp=1552667609777, value=66 
 testDirect:co:1552667610000   column=info:1, timestamp=1552667609777, value=269
版本:
scala:2.11.8
spark:2.11
hbase:1.2.0-cdh5.14.0


遇到的问题:

`java.lang.IllegalStateException: Consumer is not subscribed to any topics or assigned any partitions`
分析原因:

要从指定的主题或者分区获取数据,必须在调用poll之前先订阅主题或分配分区,否则就会抛出上面的异常。每一次poll,消费者都会尝试使用最后一次消费的offset作为接下来获取数据的起始offset(start offset);最后一次消费的offset也可以通过seek(TopicPartition, long)手动设置,或者由消费者自动设置。
通过源码可以找到:
/**
 * Fetches records for the subscribed topics or assigned partitions, waiting up to
 * {@code timeout} for data to become available.
 *
 * @param timeout maximum time to keep polling; must not be negative
 * @return the fetched records (passed through the interceptors when they are set),
 *         or an empty {@code ConsumerRecords} if nothing arrived before the timeout
 * @throws IllegalArgumentException if {@code timeout} is negative
 * @throws IllegalStateException if the consumer has neither a subscription nor a
 *         user-assigned set of partitions
 */
public ConsumerRecords<K, V> poll(long timeout) {
    acquire();
    try {
        if (timeout < 0)
            throw new IllegalArgumentException("Timeout must not be negative");
        // Nothing is subscribed or assigned: fail fast with an exception
        if (this.subscriptions.hasNoSubscriptionOrUserAssignment())
            throw new IllegalStateException("Consumer is not subscribed to any topics or assigned any partitions");

        // Keep polling for new data until the timeout expires
        long start = time.milliseconds();
        // Time left before the timeout is reached
        long remaining = timeout;
        do {
            // Fetch data; per the original author's note, pollOnce (not shown here) also performs the
            // automatic offset commit when auto-commit is enabled and the offset reset when one is configured
            Map<TopicPartition, List<ConsumerRecord<K, V>>> records = pollOnce(remaining);
            if (!records.isEmpty()) {
                // Before returning the result, send the next round of fetch requests so the
                // caller does not block waiting for them on the following poll
                fetcher.sendFetches();
                client.pollNoWakeup();
                // Without interceptors return the records directly; otherwise pass them through the interceptors
                if (this.interceptors == null)
                    return new ConsumerRecords<>(records);
                else
                    return this.interceptors.onConsume(new ConsumerRecords<>(records));
            }

            long elapsed = time.milliseconds() - start;
            remaining = timeout - elapsed;
        } while (remaining > 0);

        return ConsumerRecords.empty();
    } finally {
        release();
    }
}

解决:

因此,需要先订阅当前的topic才能消费。我之前使用的api是:(适用于非全新的topic,即已经被消费者消费过的)
`val inputDStream1 = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  Assign[String, String](
  fromOffsets.keys,kafkaParams,fromOffsets)
)`

修改:(全新的topic,没有被消费者消费过)
`val inputDStream = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  Subscribe[String, String](topics, kafkaParams)
)`

完整代码:

package offsetInHbase
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010.ConsumerStrategies._
import org.apache.spark.streaming.kafka010.{OffsetRange, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.LocationStrategies._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}
/**
  * Created by angel
  */
/**
  * Spark Streaming driver that consumes one Kafka topic through the direct (kafka010) API
  * and stores the consumed offsets in HBase instead of committing them to Kafka.
  *
  * After every batch one HBase row is written with the key `<topic>:<group>:<batchTimeMillis>`;
  * each partition becomes a column `info:<partition>` whose value is the batch's `untilOffset`.
  * On start-up the most recent such row is read back and used as the starting position.
  */
object KafkaOffsetsBlogStreamingDriver {

  /** HBase column family that holds one column per Kafka partition. */
  private val OffsetColumnFamily: Array[Byte] = Bytes.toBytes("info")

  /**
    * Entry point.
    *
    * Expected arguments, in order: batch duration in seconds, Kafka bootstrap servers,
    * comma separated topics (only the first one is consumed and tracked), consumer group id,
    * HBase table name, ZooKeeper quorum used by Kafka.
    *
    * Example:
    * `5 cdh1:9092,cdh2:9092,cdh3:9092 testDirect co testDirect cdh1:2181,cdh2:2181,cdh3:2181`
    */
  def main(args: Array[String]): Unit = {

    if (args.length < 6) {
      System.err.println("Usage: KafkaDirectStreamTest " +
        "<batch-duration-in-seconds> " +
        "<kafka-bootstrap-servers> " +
        "<kafka-topics> " +
        "<kafka-consumer-group-id> " +
        "<hbase-table-name> " +
        "<kafka-zookeeper-quorum>")
      System.exit(1)
    }

    val batchDuration = args(0)
    val bootstrapServers = args(1)
    // distinct keeps the order given on the command line, so `topic` is always the first topic listed
    val topics = args(2).split(",").distinct
    val topic = topics(0)
    val consumerGroupID = args(3)
    val hbaseTableName = args(4)
    val zkQuorum = args(5)
    val zkKafkaRootDir = "kafka"
    val zkSessionTimeOut = 10000
    val zkConnectionTimeOut = 10000

    val sparkConf = new SparkConf().setAppName("Kafka-Offset-Management-Blog")
    // Default to a local master for workstation testing, but do not override a master
    // that was already supplied (for example through spark-submit --master).
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[4]")
    }
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(batchDuration.toLong))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> bootstrapServers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> consumerGroupID,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val fromOffsets = getLastCommittedOffsets(
      topic,
      consumerGroupID,
      hbaseTableName,
      zkQuorum,
      zkKafkaRootDir,
      zkSessionTimeOut,
      zkConnectionTimeOut)

    // Assign with an empty partition set leaves the consumer without any assignment and fails with
    // "Consumer is not subscribed to any topics or assigned any partitions". When no partition is
    // known yet (brand-new topic), subscribe by topic name instead. Only `topic` is subscribed so
    // that both strategies consume exactly the topic whose offsets are tracked in HBase.
    val consumerStrategy: org.apache.spark.streaming.kafka010.ConsumerStrategy[String, String] =
      if (fromOffsets.isEmpty) Subscribe[String, String](Seq(topic), kafkaParams)
      else Assign[String, String](fromOffsets.keys, kafkaParams, fromOffsets)

    val inputDStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      consumerStrategy
    )

    /*
      For each RDD in the DStream: process the messages, report how many were handled and
      only then persist the offsets, so a failed batch is not recorded as consumed.
    */
    inputDStream.foreachRDD((rdd, batchTime) => {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach(offset =>
        println((offset.topic, offset.partition, offset.fromOffset, offset.untilOffset)))
      val processed = rdd.map(message => processMessage(message)).count()
      println("Number of messages processed " + processed)
      saveOffsets(topic, consumerGroupID, offsetRanges, hbaseTableName, batchTime) //save the offsets to HBase
    })

    ssc.start()
    ssc.awaitTermination()
  }

  /** Dummy processing step that simply returns the message as is. */
  private def processMessage(message: ConsumerRecord[String, String]): ConsumerRecord[String, String] =
    message

  /**
    * Builds the HBase client configuration used by this driver.
    *
    * NOTE(review): Hadoop's `Configuration.addResource(String)` looks the name up on the classpath,
    * so this path is presumably only picked up when it is resolvable that way - confirm.
    */
  private def createHBaseConf(): org.apache.hadoop.conf.Configuration = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.addResource("src/main/resources/hbase-site.xml")
    hbaseConf
  }

  /**
    * Saves the end offsets of one batch into HBase.
    *
    * @param topicName      topic whose offsets are stored; ranges of other topics are ignored
    * @param groupId        consumer group id, part of the row key
    * @param offsetRanges   offset ranges of the batch
    * @param hbaseTableName table that stores the offsets
    * @param batchTime      batch time, its milliseconds form the last part of the row key
    */
  private def saveOffsets(
      topicName: String,
      groupId: String,
      offsetRanges: Array[OffsetRange],
      hbaseTableName: String,
      batchTime: org.apache.spark.streaming.Time): Unit = {
    val topicRanges = offsetRanges.filter(_.topic == topicName)
    // A Put without any column is rejected by HBase, so there is nothing to store for an empty batch.
    if (topicRanges.nonEmpty) {
      val conn = ConnectionFactory.createConnection(createHBaseConf())
      try {
        val table = conn.getTable(TableName.valueOf(hbaseTableName))
        try {
          val rowKey = topicName + ":" + groupId + ":" + String.valueOf(batchTime.milliseconds)
          val put = new Put(Bytes.toBytes(rowKey))
          topicRanges.foreach { range =>
            put.addColumn(
              OffsetColumnFamily,
              Bytes.toBytes(range.partition.toString),
              Bytes.toBytes(range.untilOffset.toString))
          }
          table.put(put)
        } finally {
          table.close()
        }
      } finally {
        conn.close()
      }
    }
  }

  /**
    * Reads the number of partitions of a topic from Kafka's ZooKeeper tree.
    *
    * @return the partition count, or 0 when ZooKeeper has no entry for the topic
    */
  private def countPartitionsInZookeeper(
      topicName: String,
      zkQuorum: String,
      zkRootDir: String,
      sessionTimeout: Int,
      connectionTimeOut: Int): Int = {
    val zkUrl = zkQuorum + "/" + zkRootDir
    val (zkClient, zkConnection) =
      ZkUtils.createZkClientAndConnection(zkUrl, sessionTimeout, connectionTimeOut)
    // Third argument: ZooKeeper security is not enabled.
    val zkUtils = new ZkUtils(zkClient, zkConnection, false)
    try {
      zkUtils.getPartitionsForTopics(Seq(topicName)).get(topicName).map(_.size).getOrElse(0)
    } finally {
      zkUtils.close()
    }
  }

  /**
    * Returns the last committed offsets for all the partitions of a topic, read from HBase.
    *
    *  - CASE 1: the job is started for the first time (nothing in HBase). Every partition known to
    *    ZooKeeper is returned with offset 0.
    *  - CASE 2: the job is restarted and the number of partitions did not change. The offsets are
    *    returned as stored in HBase.
    *  - CASE 3: the job is restarted and partitions were added. Old partitions keep the offset stored
    *    in HBase, newly added partitions start at 0.
    *
    * A partition whose column is missing from the stored row is also started at 0 instead of
    * failing. The result is empty only when neither ZooKeeper nor HBase knows any partition.
    */
  private def getLastCommittedOffsets(
      topicName: String,
      groupId: String,
      hbaseTableName: String,
      zkQuorum: String,
      zkRootDir: String,
      sessionTimeout: Int,
      connectionTimeOut: Int): Map[TopicPartition, Long] = {

    val zkPartitionCount =
      countPartitionsInZookeeper(topicName, zkQuorum, zkRootDir, sessionTimeout, connectionTimeOut)

    //Connect to HBase to retrieve last committed offsets
    val conn = ConnectionFactory.createConnection(createHBaseConf())
    try {
      val table = conn.getTable(TableName.valueOf(hbaseTableName))
      try {
        // Reversed scan from "now" down to timestamp 0: the first row returned is the latest batch.
        val startRow = topicName + ":" + groupId + ":" + String.valueOf(System.currentTimeMillis())
        val stopRow = topicName + ":" + groupId + ":" + 0
        val scan = new Scan()
          .setStartRow(Bytes.toBytes(startRow))
          .setStopRow(Bytes.toBytes(stopRow))
          .setReversed(true)
        val scanner = table.getScanner(scan)
        try {
          val latestRow = Option(scanner.next())
          // Each cell of the row is one partition column; no row (or no cells) means 0 partitions.
          val hbasePartitionCount =
            latestRow.flatMap(row => Option(row.listCells())).map(_.size()).getOrElse(0)
          val partitionCount = math.max(zkPartitionCount, hbasePartitionCount)

          (0 until partitionCount).map { partition =>
            val stored = latestRow.flatMap { row =>
              Option(row.getValue(OffsetColumnFamily, Bytes.toBytes(partition.toString)))
            }
            val offset = stored.map(bytes => Bytes.toString(bytes).toLong).getOrElse(0L)
            new TopicPartition(topicName, partition) -> offset
          }.toMap
        } finally {
          scanner.close()
        }
      } finally {
        table.close()
      }
    } finally {
      conn.close()
    }
  }
}



原文地址:https://www.cnblogs.com/niutao/p/10547823.html