Spark Streaming to HBase
1. The usual approach: writing to HBase directly with HTable puts
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object consumerStreaming {
  // Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
  // Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
  def main(args: Array[String]): Unit = {
    var masterUrl = "local[*]"
    if (args.length > 2) {
      masterUrl = args(0)
    }
    val conf = new SparkConf().setMaster(masterUrl).setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(1))

    val brokers = PropertieUtil.getString("kafka.brokers")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "cloudera_mirrormaker",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // configuration
    val topics = Array("TIANQIAN")
    val tableName = PropertieUtil.getString("hbase.tableName")
    val hbaseZookeeperList = PropertieUtil.getString("hbase.zookeeper")

    // receive the data from Kafka
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // transform the records and write them to HBase
    val events = stream.map(r => r.value())
    events.foreachRDD(rdd => {
      rdd.foreachPartition(partition => {
        // HBase configuration: one HTable per partition
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum", hbaseZookeeperList)
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
        hbaseConf.set("hbase.defaults.for.version.skip", "true")
        val StatTable = new HTable(hbaseConf, TableName.valueOf(tableName))
        // buffer the puts client-side and flush once per partition
        StatTable.setAutoFlush(false, false)
        StatTable.setWriteBufferSize(3 * 1024 * 1024)
        partition.foreach(r => {
          val JsonObject = JSONObject.fromObject(r)
          val protocol = JsonObject.get("protocol")
          println(protocol)
          val data = JsonObject.get("data")
          println(data)
          val start = data.toString.substring(0, 3)
          val zho = data.toString.substring(3, 16)
          val end = data.toString.substring(16, 19)
          val sui = data.toString.substring(19)
          // write the parsed fields to HBase
          val put = new Put(Bytes.toBytes(zho + sui))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("START"), Bytes.toBytes(start))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("ZHO"), Bytes.toBytes(zho))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("END"), Bytes.toBytes(end))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("SUI"), Bytes.toBytes(sui))
          StatTable.put(put)
        })
        StatTable.flushCommits()
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
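The code above (and the example in the next section) reads its Kafka and HBase settings through a PropertieUtil helper that is not shown here. A minimal sketch of what such a helper could look like, assuming the settings live in a config.properties file on the classpath; the object name and the keys come from the calls above, everything else is an assumption:

import java.util.Properties

// Hypothetical sketch of the PropertieUtil helper used above; the original is not shown.
// Assumes a config.properties file on the classpath with keys such as
// kafka.brokers, kafka.topics, hbase.tableName and hbase.zookeeper.
object PropertieUtil {
  private val props: Properties = {
    val p = new Properties()
    val in = getClass.getClassLoader.getResourceAsStream("config.properties")
    if (in != null) {
      p.load(in)
      in.close()
    }
    p
  }

  def getString(key: String): String = props.getProperty(key)
}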
2. A complete example: producing data to Kafka, then consuming it with Spark Streaming and writing it to HBase
Producing the data
import java.text.SimpleDateFormat
import java.util.{Date, Properties}

import net.sf.json.JSONObject
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.util.Random

object YunLiProdicer {
  def main(args: Array[String]): Unit = {
    val brokers = PropertieUtil.getString("kafka.brokers")
    val topics = PropertieUtil.getString("kafka.topics")
    val props: Properties = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("bootstrap.servers", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    props.put("request.required.acks", "1")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    var n: Int = 0
    val kafkaProducer = new KafkaProducer[String, String](props)
    while (true) {
      // build a random test record
      val a1 = Math.random() * 1000 + 10
      val MACHNO = a1.formatted("%.0f")
      val a2 = Math.random() * 1001 + 10
      val LINENO = a2.formatted("%.0f")
      val LABELNO = (new Random).nextInt(1000)
      val SITETIME = NowDate
      val LNG = (new Random).nextInt(1000)
      val LAT = (new Random).nextInt(30)
      val VELOCITY = (new Random).nextInt(1000)
      val ORIENTATION = (new Random).nextInt(1000)
      val ISUPDOWN = (new Random).nextInt(2)

      val event = new JSONObject()
      val event2 = new JSONObject()
      event.put("MACHNO", MACHNO)
      event.put("LINENO", LINENO)
      event.put("LABELNO", LABELNO)
      event.put("SITETIME", SITETIME)
      event.put("LNG", LNG)
      event.put("LAT", LAT)
      event.put("VELOCITY", VELOCITY)
      event.put("ORIENTATION", ORIENTATION)
      event.put("ISUPDOWN", ISUPDOWN)
      // full message: a small envelope carrying the payload as a JSON string in "data"
      event2.put("V", "V7")
      event2.put("data", event.toString())

      kafkaProducer.send(new ProducerRecord[String, String](topics, "176", event2.toString()))
      n = n + 1
      Thread.sleep(0)
      println(n + " messages sent")
    }

    def NowDate(): String = {
      val now: Date = new Date()
      val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      dateFormat.format(now)
    }
  }
}
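For completeness, a hedged sketch of the build dependencies these jobs roughly need. The exact versions are assumptions; match the Spark, Kafka, and HBase versions to your cluster (the HTable/flushCommits calls in section 1 imply an HBase 1.x client):

// build.sbt sketch -- versions are assumptions, adjust to your cluster
name := "spark-streaming-hbase-demo"
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"                 % "2.3.0" % "provided",
  "org.apache.spark" %% "spark-sql"                  % "2.3.0" % "provided",
  "org.apache.spark" %% "spark-streaming"            % "2.3.0" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.3.0",
  "org.apache.hbase"  % "hbase-client"               % "1.2.0",
  "org.apache.hbase"  % "hbase-server"               % "1.2.0", // TableOutputFormat lives here in HBase 1.x
  "net.sf.json-lib"   % "json-lib"                   % "2.4" classifier "jdk15"
)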
Consuming the data and writing it to HBase
import java.text.SimpleDateFormat
import java.util.Date

import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

// verified working
object XXXConsumerStreaming {
  // Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
  // Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
  def main(args: Array[String]): Unit = {
    var masterUrl = "local[*]"
    if (args.length > 2) {
      masterUrl = "yarn-cluster"
    }
    val conf = new SparkConf().setAppName("NetworkWordCount").setMaster(masterUrl)
    val sc = new SparkContext(conf)
    val spark = new SQLContext(sc)
    val ssc = new StreamingContext(sc, Seconds(10))

    val brokers = PropertieUtil.getString("kafka.brokers")
    val topic = PropertieUtil.getString("kafka.topics")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "cloudera_mirrormaker",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // configuration
    val topics = Set(topic)
    val tableName = PropertieUtil.getString("hbase.tablenamehn")
    var lastRdd: RDD[Row] = null
    val hbaseZookeeperList = PropertieUtil.getString("hbase.zookeeper")

    // receive the data from Kafka
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // transform the records and write them to HBase
    val events = stream.map(r => r.value())

    // HBase configuration
    sc.hadoopConfiguration.set("hbase.zookeeper.quorum", hbaseZookeeperList)

    // process each batch
    events.foreachRDD(rdd => {
      // convert the batch to a DataFrame
      import spark.implicits._
      val frame: DataFrame = rdd.map(rdd2 => DataToArray(rdd2))
        .map(r => (r(0), r(1), r(2), r(3), r(4), r(5), r(6), r(7), r(8)))
        .toDF("MACHNO", "LINENO", "LABELNO", "SITETIME", "LNG", "LAT", "VELOCITY", "ORIENTATION", "ISUPDOWN")
      frame.select(frame("MACHNO"), frame("LINENO"), frame("LABELNO"), frame("SITETIME").cast(TimestampType),
        frame("LNG"), frame("LAT"), frame("VELOCITY"), frame("ORIENTATION"), frame("ISUPDOWN"))
        .createOrReplaceTempView("TEST")
      // keep only the latest record per (MACHNO, LINENO, ISUPDOWN) in this batch
      val sql =
        """
          |SELECT MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN
          |FROM (
          |SELECT *,ROW_NUMBER() OVER (PARTITION BY MACHNO,LINENO,ISUPDOWN ORDER BY SITETIME DESC) TT
          |FROM TEST
          |) WHERE TT = 1
        """.stripMargin
      val df: DataFrame = spark.sql(sql)
      df.createOrReplaceTempView("TESTA")
      println("current batch result")
      df.show()

      if (lastRdd == null) {
        lastRdd = df.rdd
      } else {
        // union with the result carried over from the previous batch
        val df1: DataFrame = lastRdd.map(r => (r(0).toString, r(1).toString, r(2).toString, r(3).toString,
            r(4).toString, r(5).toString, r(6).toString, r(7).toString, r(8).toString))
          .toDF("MACHNO", "LINENO", "LABELNO", "SITETIME", "LNG", "LAT", "VELOCITY", "ORIENTATION", "ISUPDOWN")
        df1.select(df1("MACHNO"), df1("LINENO"), df1("LABELNO"), df1("SITETIME").cast(TimestampType),
          df1("LNG"), df1("LAT"), df1("VELOCITY"), df1("ORIENTATION"), df1("ISUPDOWN"))
          .createOrReplaceTempView("TESTB")
        val sql2 =
          """
            |select MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN from TESTA
            |UNION ALL
            |select MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN from TESTB
          """.stripMargin
        val df2: DataFrame = spark.sql(sql2)
        println("union result")
        df2.show()
        df2.createOrReplaceTempView("TESTC")
        // deduplicate the union, again keeping only the latest record per key
        val sql3 =
          """
            |SELECT MACHNO,LINENO,LABELNO,SITETIME,LNG,LAT,VELOCITY,ORIENTATION,ISUPDOWN
            |FROM (
            |SELECT *,ROW_NUMBER() OVER (PARTITION BY MACHNO,LINENO,ISUPDOWN ORDER BY SITETIME DESC) TT
            |FROM TESTC
            |) WHERE TT = 1
          """.stripMargin
        val df4: DataFrame = spark.sql(sql3)
        println("deduplicated result")
        df4.show()
        lastRdd = df4.rdd

        // write the result to HBase through TableOutputFormat
        sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        val job = new Job(sc.hadoopConfiguration)
        job.setOutputKeyClass(classOf[ImmutableBytesWritable])
        job.setOutputValueClass(classOf[Result])
        job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

        df4.rdd.map(r => {
          val INSTIME: String = NowDate
          // row key: MACHNO + LINENO + ISUPDOWN
          val put = new Put(Bytes.toBytes(r(0).toString + r(1).toString + r(8).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("MACHNO"), Bytes.toBytes(r(0).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("LINENO"), Bytes.toBytes(r(1).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("LABELNO"), Bytes.toBytes(r(2).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("SITETIME"), Bytes.toBytes(r(3).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("LNG"), Bytes.toBytes(r(4).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("LAT"), Bytes.toBytes(r(5).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("VELOCITY"), Bytes.toBytes(r(6).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("ORIENTATION"), Bytes.toBytes(r(7).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("ISUPDOWN"), Bytes.toBytes(r(8).toString))
          put.addColumn(Bytes.toBytes("C"), Bytes.toBytes("INSTIME"), Bytes.toBytes(INSTIME))
          (new ImmutableBytesWritable, put)
        }).saveAsNewAPIHadoopDataset(job.getConfiguration())
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }

  // parse the JSON envelope and flatten the "data" payload into an ordered array of fields
  def DataToArray(rdd: String): ArrayBuffer[String] = {
    val JsonObject = JSONObject.fromObject(rdd)
    val data = JsonObject.get("data")
    val JsonObject1 = JSONObject.fromObject(data)
    val MACHNO = JsonObject1.get("MACHNO").toString
    val LINENO = JsonObject1.get("LINENO").toString
    val LABELNO = JsonObject1.get("LABELNO").toString
    val SITETIME = JsonObject1.get("SITETIME").toString
    val LNG = JsonObject1.get("LNG").toString
    val LAT = JsonObject1.get("LAT").toString
    val VELOCITY = JsonObject1.get("VELOCITY").toString
    val ORIENTATION = JsonObject1.get("ORIENTATION").toString
    val ISUPDOWN = JsonObject1.get("ISUPDOWN").toString
    val array: ArrayBuffer[String] = new ArrayBuffer[String]()
    array.append(MACHNO)
    array.append(LINENO)
    array.append(LABELNO)
    array.append(SITETIME)
    array.append(LNG)
    array.append(LAT)
    array.append(VELOCITY)
    array.append(ORIENTATION)
    array.append(ISUPDOWN)
    array
  }

  // "yyyy-MM-dd HH:mm:ss" -> "yyyyMMddHHmmss"
  def StrDateToStr(SITETIME: String): String = {
    val year: String = SITETIME.substring(0, 4)
    val mouth: String = SITETIME.substring(5, 7)
    val day: String = SITETIME.substring(8, 10)
    val shi: String = SITETIME.substring(11, 13)
    val fen: String = SITETIME.substring(14, 16)
    val miao: String = SITETIME.substring(17, 19)
    year + mouth + day + shi + fen + miao
  }

  def NowDate(): String = {
    val now: Date = new Date()
    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    dateFormat.format(now)
  }
}
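One caveat about both consumers: they set enable.auto.commit to false but never commit offsets, so a restarted job simply falls back to auto.offset.reset and may skip or reprocess data. If that matters, the kafka-0-10 integration lets you commit the processed offset ranges back to Kafka once the HBase write has succeeded. A minimal sketch against the `stream` returned by KafkaUtils.createDirectStream above; the HBase write itself is elided:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}

// Sketch: commit Kafka offsets only after the batch has been written to HBase.
// `stream` is the DStream returned by KafkaUtils.createDirectStream above; the
// offset ranges must be read from that original RDD, before any map/transform.
stream.foreachRDD { rdd =>
  val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

  // ... parse rdd.map(_.value()) and write the records to HBase here ...

  // commitAsync stores the offsets in Kafka itself under this consumer group
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}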