Spark读取HBase的数据
// Builds per-flow traffic baselines from the HBase table "FLOW".
//
// Steps:
//   1. Scan column family "F" of table "FLOW" through TableInputFormat.
//   2. Group the DBYTES samples by "<SADDR>_<SPORT>_<PROTO>" (key1) and by the
//      "<startHour>:<startMinute>_<endHour>:<endMinute>" window derived from
//      STIME / LTIME (key2).
//   3. For every (key1, key2) group print mean +/- 1.960 * sqrt(variance) as the
//      upper / lower baseline.
//      NOTE(review): 1.960 looks like the two-sided 95% normal quantile — confirm.
//
// Relies on names that are expected to be in scope from outside this block:
// `sc` (SparkContext), HBaseConfiguration, Path, TableInputFormat, Bytes,
// SimpleDateFormat, Vectors, Statistics, MultivariateStatisticalSummary and
// HashMap. HashMap must be scala.collection.mutable.HashMap, because the code
// mutates it in place through a `val`.
val conf = HBaseConfiguration.create()
conf.addResource(new Path("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/hbase/conf/hbase-site.xml"))
conf.addResource(new Path("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/hadoop/etc/hadoop/core-site.xml"))
conf.set(TableInputFormat.INPUT_TABLE, "FLOW")

// Optional scan filter (disabled): keep only rows whose basic:age is >= 18.
//val scan = new Scan()
//conf.set(TableInputFormat.SCAN, convertScanToString(scan))
/*
scan.setFilter(new SingleColumnValueFilter("basic".getBytes, "age".getBytes,
  CompareOp.GREATER_OR_EQUAL, Bytes.toBytes(18)))
*/

val usersRDD = sc.newAPIHadoopRDD(
  conf,
  classOf[TableInputFormat],
  classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
  classOf[org.apache.hadoop.hbase.client.Result])

// Extract plain values on the executors BEFORE collecting. The raw
// (ImmutableBytesWritable, Result) pairs are not java.io.Serializable, so
// collecting them directly fails with the default Java serializer.
// Tuple layout: (rowKey, ip, port, startTime, endTime, protocol, dbytes).
val flowRows = usersRDD.map { case (_, result) =>
  def column(qualifier: String): String =
    Bytes.toString(result.getValue("F".getBytes, qualifier.getBytes))

  (
    Bytes.toInt(result.getRow),
    column("SADDR"),
    column("SPORT"),
    column("STIME"),
    column("LTIME"),
    column("PROTO"),
    // NOTE(review): throws if DBYTES is missing or not numeric, exactly as before.
    column("DBYTES").toDouble
  )
}.collect()

// One scan instead of count() + collect(): the row count is the array length.
val data1 = flowRows.length
val sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSSSS")
println("data length:" + data1)

// key1 ("ip_port_protocol") -> key2 ("H:m_H:m" window) -> DBYTES samples.
val map = HashMap[String, HashMap[String, collection.mutable.ArrayBuffer[Double]]]()

flowRows.foreach { case (key, ip, port, startTimeLong, endTimeLong, protocol, sum) =>
  println("Key:" + key)
  println("ip:" + ip + ",port:" + port + ",startTime:" + startTimeLong + ",endTime:" + endTimeLong + ",protocol:" + protocol + ",sum:" + sum)

  // Example grouping:
  //ip+port+udp,14:02 14:07 List
  //ip+port+tcp,15:02 15:07 List
  val startTimeDate = sf.parse(startTimeLong)
  val endTimeLongDate = sf.parse(endTimeLong)
  // java.util.Date#getHours / #getMinutes are deprecated but kept so the
  // generated window keys stay exactly as before.
  val startHours = startTimeDate.getHours
  val startMinutes = startTimeDate.getMinutes
  val endHours = endTimeLongDate.getHours
  val endMinutes = endTimeLongDate.getMinutes

  val key1 = ip + "_" + port + "_" + protocol
  println("key1:" + key1)
  val key2 = s"$startHours:${startMinutes}_$endHours:$endMinutes"
  println("key2:" + key2)

  val tmpMap = map.get(key1)
  tmpMap match {
    case Some(windows) =>
      println("--------------------map is not null:" + tmpMap.size + "--------------------")
      // Fix: a window seen for the first time under an existing key1 used to be
      // dropped silently; create its sample buffer on demand instead.
      windows.getOrElseUpdate(key2, collection.mutable.ArrayBuffer[Double]()) += sum

    case None =>
      println("--------------------map is null--------------------")
      // key1 has not been seen yet, so this is a brand-new ip/port/protocol.
      val secondMap = HashMap[String, collection.mutable.ArrayBuffer[Double]]()
      secondMap += (key2 -> collection.mutable.ArrayBuffer[Double](sum))
      map += (key1 -> secondMap)
  }
  println("map size-----------------:" + map.size)
}
println("map size:" + map.size)

map.foreach { case (resultKey1, resultVal1) =>
  println("--------------------Statistics start --------------------")
  println("resultKey1:" + resultKey1)

  resultVal1.foreach { case (resultKey2, resultVal2) =>
    println("resultKey2:" + resultKey2)
    println("-----------------resultVal2:" + resultVal2.length)
    resultVal2.foreach(sample => println("------------------------f:" + sample))

    // Column statistics over the one-dimensional DBYTES samples of this group.
    val dataArray = resultVal2.map(sample => Vectors.dense(sample))
    val summary: MultivariateStatisticalSummary = Statistics.colStats(sc.parallelize(dataArray))
    val mean = summary.mean.apply(0)
    val variance = summary.variance.apply(0)

    // println("--------------------mean:" + summary.mean + " --------------------")
    println("--------------------variance:" + summary.variance + " --------------------")
    println("--------------------mean apply 0:" + mean + " --------------------")
    println("--------------------variance apply 0:" + variance + " --------------------")

    // Baseline band: mean +/- 1.960 standard deviations.
    val upbase = mean + 1.960 * Math.sqrt(variance)
    val downbase = mean - 1.960 * Math.sqrt(variance)
    println("------------------- " + upbase + " ---------- " + downbase)

    //resultMap.put(key, value)
    //result3 +=(upbase -> downbase)
    println("ip port:" + resultKey1 + ",time:" + resultKey2 + ",upbase:" + upbase + ",downbase:" + downbase)
  }
}

println("--------------------baseLine end --------------------")
sc.stop()