Spark Streaming into HBase

1. The usual way of writing to HBase

import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object consumerStreaming {
//  Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
//  Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
  def main(args: Array[String]): Unit = {
    var masterUrl="local[*]"
    if(args.length>2){
      masterUrl=args(0)
    }
    val conf = new SparkConf().setMaster(masterUrl).setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(1))
    val brokers = PropertieUtil.getString("kafka.brokers")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "cloudera_mirrormaker",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // configuration
    val topics = Array("TIANQIAN")
    val tableName =  PropertieUtil.getString("hbase.tableName")

    val hbaseZookeeperList = PropertieUtil.getString("hbase.zookeeper")
    // receive data
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    // transform the data and write it to HBase
    val events = stream.map(r => r.value())
    events.foreachRDD(rdd=>{
      rdd.foreachPartition(partition=>{
        // HBase configuration: one table instance per partition
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum", hbaseZookeeperList)
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
        hbaseConf.set("hbase.defaults.for.version.skip", "true")
        val StatTable = new HTable(hbaseConf, TableName.valueOf(tableName))
        // buffer writes on the client and flush once per partition
        StatTable.setAutoFlush(false, false)
        StatTable.setWriteBufferSize(3 * 1024 * 1024)
        partition.foreach(r=>{
          val JsonObject = JSONObject.fromObject(r)
          val protocol = JsonObject.get("protocol")
          println(protocol)
          val data = JsonObject.get("data")
          println(data)
          val start = data.toString.substring(0, 3)
          val zho = data.toString.substring(3, 16)
          val end = data.toString.substring(16, 19)
          val sui = data.toString.substring(19)
          // write to HBase; row key = zho + sui
          val put = new Put(Bytes.toBytes(zho + sui))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("START"), Bytes.toBytes(start))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("ZHO"), Bytes.toBytes(zho))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("END"), Bytes.toBytes(end))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("SUI"), Bytes.toBytes(sui))
          StatTable.put(put)
        })
        StatTable.flushCommits()
        StatTable.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()

  }
}
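
The programs in this post read their connection settings through a PropertieUtil helper that is not included here. It is presumably a thin wrapper around java.util.Properties; a minimal sketch, assuming the settings live in a config.properties file on the classpath (the file name is an assumption, not from the original), could look like this:

import java.util.Properties

// Assumed helper (not part of the original post): loads key/value settings
// from a properties file on the classpath.
object PropertieUtil {
  private val props: Properties = {
    val p = new Properties()
    // "config.properties" is an assumed resource name; adjust to your project.
    val in = getClass.getClassLoader.getResourceAsStream("config.properties")
    if (in != null) {
      p.load(in)
      in.close()
    }
    p
  }

  def getString(key: String): String = props.getProperty(key)
}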

2. A full example: producing to Kafka and consuming from Kafka into HBase with Spark Streaming

Producing the data

import java.text.SimpleDateFormat
import java.util.{Date, Properties}
import net.sf.json.JSONObject
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.util.Random
object YunLiProdicer {
  def main(args: Array[String]): Unit = {
    val brokers = PropertieUtil.getString("kafka.brokers")
    val topics = PropertieUtil.getString("kafka.topics")
    val props: Properties = new Properties()
    props.put("bootstrap.servers", brokers)
    props.put("acks", "1")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    var n: Int = 0
    val kafkaProducer = new KafkaProducer[String, String](props)
    while (true){
            val a1 = Math.random() * 1000 + 10
            val MACHNO = a1.formatted("%.0f")
            val a2 = Math.random() * 1001 + 10
            val LINENO = a2.formatted("%.0f")
            val LABELNO = (new Random).nextInt(1000)
            val SITETIME = NowDate
            val LNG = (new Random).nextInt(1000)
            val LAT = (new Random).nextInt(30)
            val VELOCITY = (new Random).nextInt(1000)
            val ORIENTATION = (new Random).nextInt(1000)
            val ISUPDOWN = (new Random).nextInt(2)
            val event = new JSONObject() 
            val event2 = new JSONObject() 
            event.put("MACHNO",MACHNO)
            event.put("LINENO",LINENO)
            event.put("LABELNO",LABELNO)
            event.put("SITETIME",SITETIME)
            event.put("LNG",LNG)
            event.put("LAT",LAT)
            event.put("VELOCITY",VELOCITY)
            event.put("ORIENTATION",ORIENTATION)
            event.put("ISUPDOWN",0)

            // full packet: wrap the payload
            event2.put("V","V7")
            event2.put("data",event.toString())
            kafkaProducer.send(new ProducerRecord[String,String](topics,"176",event2.toString()))
            n=n+1
            Thread.sleep(0)
      println(n+" "+"发送数据个数")
    }
    def NowDate(): String = {
      val now: Date = new Date()
      val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val date = dateFormat.format(now)
      return date
    }
  }
}

Consumer code that writes to HBase

import java.text.SimpleDateFormat
import java.util.Date
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

// verified working
object XXXConsumerStreaming {
//  Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
//  Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
  def main(args: Array[String]): Unit = {
    var masterUrl="local[*]"
    if(args.length>2){
      masterUrl="yarn-cluster"
    }
    val conf = new SparkConf().setAppName("NetworkWordCount").setMaster(masterUrl)
    val sc = new SparkContext(conf)
    val spark = new SQLContext(sc)
    val ssc = new StreamingContext(sc, Seconds(10))
    val brokers = PropertieUtil.getString("kafka.brokers")
    val topic = PropertieUtil.getString("kafka.topics")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "cloudera_mirrormaker",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // configuration
    val topics = Set(topic)
    val tableName =  PropertieUtil.getString("hbase.tablenamehn")

    var lastRdd:RDD[Row] = null
    val hbaseZookeeperList = PropertieUtil.getString("hbase.zookeeper")
    // receive data
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    // transform the data and write it to HBase
    val events = stream.map(r => r.value())
    // HBase configuration
    sc.hadoopConfiguration.set("hbase.zookeeper.quorum", hbaseZookeeperList)
    sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    // process each micro-batch
    events.foreachRDD(rdd=>{
      // convert the batch to a DataFrame
      import spark.implicits._
      val frame: DataFrame = rdd.map(rdd2 => DataToArray(rdd2)).map(r =>(r(0), r(1), r(2), r(3), r(4), r(5),r(6),r(7),r(8))).
        toDF("MACHNO","LINENO","LABELNO","SITETIME","LNG","LAT","VELOCITY","ORIENTATION","ISUPDOWN")
      frame.select(frame("MACHNO"),frame("LINENO"),frame("LABELNO"),frame("SITETIME").cast(TimestampType),frame("LNG"),frame("LAT"),frame("VELOCITY"),
        frame("ORIENTATION"),frame("ISUPDOWN")).createOrReplaceTempView("TEST")
      val sql =
        """
          |SELECT MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN
          |FROM (
          |SELECT *,ROW_NUMBER() OVER (PARTITION BY MACHNO,LINENO,ISUPDOWN ORDER BY SITETIME DESC) TT
          |FROM TEST
          |) WHERE TT = 1
        """.stripMargin
      val df: DataFrame = spark.sql(sql)
      df.createOrReplaceTempView("TESTA")
      println("计算结果")
      df.show()
      if (lastRdd==null){
        lastRdd=df.rdd
      }else{
        val df1: DataFrame = lastRdd.map(r => (r(0).toString, r(1).toString, r(2).toString, r(3).toString, r(4).toString, r(5).toString, r(6).toString, r(7).toString, r(8).toString))
          .toDF("MACHNO", "LINENO", "LABELNO", "SITETIME", "LNG", "LAT", "VELOCITY", "ORIENTATION", "ISUPDOWN")
        df1.select(df1("MACHNO"),df1("LINENO"),df1("LABELNO"),df1("SITETIME").cast(TimestampType),df1("LNG"),df1("LAT"),df1("VELOCITY"),
          df1("ORIENTATION"),df1("ISUPDOWN")).createOrReplaceTempView("TESTB")
        val sql2 =
          """
            |select MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN from TESTA
            |UNION ALL
            |select MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN from TESTB
          """.stripMargin
        val df2: DataFrame = spark.sql(sql2)
        println("关联结果")
        df2.show()
        df2.createOrReplaceTempView("TESTC")
        val sql3 =
          """
            |SELECT MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN
            |FROM (
            |SELECT *,ROW_NUMBER() OVER (PARTITION BY MACHNO,LINENO,ISUPDOWN ORDER BY SITETIME DESC) TT
            |FROM TESTC
            |) WHERE TT = 1
          """.stripMargin
        val df4: DataFrame = spark.sql(sql3)
        println(":")
        df4.show()
        lastRdd=df4.rdd
        sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        val job = Job.getInstance(sc.hadoopConfiguration)
        job.setOutputKeyClass(classOf[ImmutableBytesWritable])
        job.setOutputValueClass(classOf[Result])
        job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
        df4.rdd.map(r=>{
          // write to HBase; row key = MACHNO + LINENO + ISUPDOWN
          val INSTIME: String = NowDate
          val put=new Put(Bytes.toBytes(r(0).toString+r(1).toString+r(8).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("MACHNO"),Bytes.toBytes(r(0).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("LINENO"),Bytes.toBytes(r(1).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("LABELNO"),Bytes.toBytes(r(2).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("SITETIME"),Bytes.toBytes(r(3).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("LNG"),Bytes.toBytes(r(4).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("LAT"),Bytes.toBytes(r(5).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("VELOCITY"),Bytes.toBytes(r(6).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("ORIENTATION"),Bytes.toBytes(r(7).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("ISUPDOWN"),Bytes.toBytes(r(8).toString))
          put.addColumn(Bytes.toBytes("C"),Bytes.toBytes("INSTIME"),Bytes.toBytes(INSTIME.toString))
          (new ImmutableBytesWritable, put)
        }).saveAsNewAPIHadoopDataset(job.getConfiguration())
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }

  // parse one Kafka message: unwrap the "data" JSON and return its fields in order
  def DataToArray(rdd: String): ArrayBuffer[String] = {
    val JsonObject=JSONObject.fromObject(rdd)
    val data = JsonObject.get("data")
    val JsonObject1=JSONObject.fromObject(data)
    val MACHNO= JsonObject1.get("MACHNO").toString
    val LINENO= JsonObject1.get("LINENO").toString
    val LABELNO= JsonObject1.get("LABELNO").toString
    val SITETIME= JsonObject1.get("SITETIME").toString
    val LNG= JsonObject1.get("LNG").toString
    val LAT= JsonObject1.get("LAT").toString
    val VELOCITY= JsonObject1.get("VELOCITY").toString
    val ORIENTATION= JsonObject1.get("ORIENTATION").toString
    val ISUPDOWN= JsonObject1.get("ISUPDOWN").toString
    val array: ArrayBuffer[String] = new ArrayBuffer[String]()
    array.append(MACHNO)
    array.append(LINENO)
    array.append(LABELNO)
    array.append(SITETIME)
    array.append(LNG)
    array.append(LAT)
    array.append(VELOCITY)
    array.append(ORIENTATION)
    array.append(ISUPDOWN)
    array
  }
  // compacts a "yyyy-MM-dd HH:mm:ss" timestamp into "yyyyMMddHHmmss"
  def StrDateToStr(SITETIME: String): String = {
    val year: String = SITETIME.substring(0, 4)
    val month: String = SITETIME.substring(5, 7)
    val day: String = SITETIME.substring(8, 10)
    val hour: String = SITETIME.substring(11, 13)
    val minute: String = SITETIME.substring(14, 16)
    val second: String = SITETIME.substring(17, 19)
    year + month + day + hour + minute + second
  }
  def NowDate(): String = {
    val now: Date = new Date()
    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val date = dateFormat.format(now)
    return date
  }
}
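
Both consumers assume the target HBase tables already exist, with a column family named c (first program) or C (second program) to match the Put calls above. As a hedged sketch using the HBase 1.x client API (the same generation as the HTable class used above), the table for the second program could be created once like this; the table and family names simply mirror the code above:

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

// One-off table setup, assuming the same ZooKeeper quorum as the streaming jobs.
object CreateHbaseTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", PropertieUtil.getString("hbase.zookeeper"))
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    try {
      // table and family names mirror the consumer above; adjust if your schema differs
      val descriptor = new HTableDescriptor(TableName.valueOf(PropertieUtil.getString("hbase.tablenamehn")))
      descriptor.addFamily(new HColumnDescriptor("C"))
      if (!admin.tableExists(descriptor.getTableName)) {
        admin.createTable(descriptor)
      }
    } finally {
      admin.close()
      connection.close()
    }
  }
}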

 

posted @ 2018-10-25 11:08  聚云