Zero data loss with Kafka + HBase

The example below uses the Spark Streaming direct Kafka stream and persists the consumed offsets to an HBase table after each batch, so a restarted job resumes from the last committed offsets instead of losing or skipping data.

package kafkautils

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

import scala.collection.JavaConversions._
/**
* Created by zhoucw at 4:52 PM.
*
*/
object KafkaHbaseManager {

// Save this batch's offsets to HBase (left as an exercise in the original post)
def saveOffsets(TOPIC_NAME:String,GROUP_ID:String,offsetRanges:Array[OffsetRange],
hbaseTableName:String,batchTime: org.apache.spark.streaming.Time) ={
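// Hedged sketch, not the original author's code: write the untilOffset of every partition of this batch
// into a single HBase row. The row key layout TOPIC:GROUP:batchTime matches the reversed scan in
// getLastestOffsets below; the column family name "offsets" is an assumption about the table schema.
val hbaseConf = HBaseConfiguration.create()
val conn = ConnectionFactory.createConnection(hbaseConf)
val table = conn.getTable(TableName.valueOf(hbaseTableName))
val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds)
val put = new Put(rowKey.getBytes)
for (offsetRange <- offsetRanges) {
  put.addColumn(Bytes.toBytes("offsets"), Bytes.toBytes(offsetRange.partition.toString),
    Bytes.toBytes(offsetRange.untilOffset.toString))
}
table.put(put)
conn.close()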

}


// Get the number of partitions of the topic from ZooKeeper
def getNumberOfPartitionsForTopicFromZK(TOPIC_NAME:String,GROUP_ID:String,
zkQuorum:String,zkRootDir:String,sessTimeout:Int,connTimeOut:Int): Int ={
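// Hedged sketch, not the original author's code: read the topic's partition count from ZooKeeper using
// the Kafka 0.9/0.10 kafka.utils.ZkUtils API; zkQuorum and zkRootDir are joined into the connect string
// the same way the arguments are passed from main() below.
val zkUrl = zkQuorum + "/" + zkRootDir
val zkClientAndConnection = ZkUtils.createZkClientAndConnection(zkUrl, sessTimeout, connTimeOut)
val zkUtils = new ZkUtils(zkClientAndConnection._1, zkClientAndConnection._2, false)
val zkNumberOfPartitionsForTopic = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size
zkClientAndConnection._1.close()
zkClientAndConnection._2.close()
zkNumberOfPartitionsForTopic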

}

// Compute the offsets to start the stream from (the branch bodies were left as an exercise in the original post)
def getLastestOffsets(TOPIC_NAME:String,GROUP_ID:String,hTableName:String,
zkQuorum:String,zkRootDir:String,sessTimeout:Int,connTimeOut:Int):Map[TopicAndPartition,Long] ={


val zKNumberOfPartitions = getNumberOfPartitionsForTopicFromZK(TOPIC_NAME, GROUP_ID, zkQuorum, zkRootDir, sessTimeout, connTimeOut)


val hbaseConf = HBaseConfiguration.create()

// Get the last committed offsets from HBase
val conn = ConnectionFactory.createConnection(hbaseConf)
val table = conn.getTable(TableName.valueOf(hTableName))
val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
val scan = new Scan()
val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true))
val result = scanner.next()

var hbaseNumberOfPartitions = 0 // number of partitions found in HBase
if (result != null){
// use the number of columns in the latest HBase row as the partition count
hbaseNumberOfPartitions = result.listCells().size()
}

val fromOffsets = collection.mutable.Map[TopicAndPartition,Long]()
if(hbaseNumberOfPartitions == 0){
// no offsets stored yet: initialize every partition from the beginning
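// Hedged sketch (not in the original): start every partition of the topic from offset 0.
for (partition <- 0 until zKNumberOfPartitions) {
  fromOffsets += (TopicAndPartition(TOPIC_NAME, partition) -> 0L)
}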

} else if(zKNumberOfPartitions > hbaseNumberOfPartitions){
// handle partitions that were newly added to the Kafka topic since the last run
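// Hedged sketch (not in the original): restore the stored offsets for the old partitions (column family
// "offsets" is an assumption) and start the newly added partitions from offset 0.
for (partition <- 0 until hbaseNumberOfPartitions) {
  val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"), Bytes.toBytes(partition.toString)))
  fromOffsets += (TopicAndPartition(TOPIC_NAME, partition) -> fromOffset.toLong)
}
for (partition <- hbaseNumberOfPartitions until zKNumberOfPartitions) {
  fromOffsets += (TopicAndPartition(TOPIC_NAME, partition) -> 0L)
}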

} else {
// resume from the offsets of the previous run
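// Hedged sketch (not in the original): resume every partition from the offset committed in the most
// recent HBase row.
for (partition <- 0 until hbaseNumberOfPartitions) {
  val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"), Bytes.toBytes(partition.toString)))
  fromOffsets += (TopicAndPartition(TOPIC_NAME, partition) -> fromOffset.toLong)
}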

}

scanner.close()
conn.close()
fromOffsets.toMap
}

def main(args: Array[String]): Unit = {
// getLastCommittedOffsets("mytest1", "testp", "stream_kafka_offsets", "spark123:12181", "kafka0.9", 30000, 30000)

val processingInterval = 2
val brokers = "spark123:9092"
val topics = "mytest1"
// Create context with 2 second batch interval
val sparkConf = new SparkConf().setAppName("kafkahbase").setMaster("local[2]")
// Create direct kafka stream with brokers and topics
val topicsSet = topics.split(",").toSet
val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers,
"auto.offset.reset" -> "smallest")


val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
val groupId = "testp"
val hbaseTableName = "spark_kafka_offsets"

// Obtain the Kafka direct stream
//val kafkaStream = createMyDirectKafkaStream(ssc, kafkaParams, zkClient, topicsSet, "testp")
val messageHandler = (mmd : MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
val fromOffsets = getLastestOffsets("mytest1", groupId,hbaseTableName , "spark123:12181", "kafka0.9", 30000, 30000)


val kafkaStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)


kafkaStream.foreachRDD((rdd,btime) => {
if(!rdd.isEmpty()){
println("==========================:" + rdd.count() )
println("==========================btime:" + btime )
saveOffsets(topics, groupId, rdd.asInstanceOf[HasOffsetRanges].offsetRanges, hbaseTableName, btime)
}

})


//val offsetsRanges:Array[OffsetRange] = null

ssc.start()
ssc.awaitTermination()


}
}
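
A note on the assumed HBase schema (the original post does not show it): the sketches above keep one row per batch with row key TOPIC:GROUP:batchTime and one column per partition under a column family named "offsets", so the offsets table would have to be created beforehand, for example with create 'spark_kafka_offsets', 'offsets' in the HBase shell. Because offsets are saved only after a batch has been processed, a restarted job re-reads at most the last uncommitted batch, which gives at-least-once semantics unless the downstream writes are idempotent.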
