1、数据格式
aaaa 201701 11.1,3.8,2.5
aaaa 201702 2.1,3.3,2.5
aaaa 201703 34.1,3.2,2.0
aaaa 201704 2.2,3.3,2.5
aaaa 201705 13.1,3.5,2.5
aaaa 201706 22.4,3.3,2.5
aaaa 201707 2.1,3.3,2.0
aaaa 201708 10.1,4.3,2.5
bbbb 201701 2.8,3.3,2.5
bbbb 201703 2.2,3.3,4.2
bbbb 201704 2.1,3.3,2.5
bbbb 201705 2.3,3.7,2.5
bbbb 201709 2.1,3.4,2.5
bbbb 201719 2.1,3.3,2.5
bbbb 201712 2.1,3.3,2.0
2、处理流程
package streaming

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import redis.RedisClient
import redis.clients.jedis.Jedis
import org.apache.log4j.Logger
import spire.math.Interval

import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.util.Try

/**
  * Created by hadoop on 2017/7/18.
  */

/**
  * One parsed input record.
  *
  * @param kk grouping key (first field of the input line)
  * @param tt integer second field of the input line
  *           (NOTE(review): looks like a yyyyMM period in the sample data — confirm)
  * @param va raw, still-textual numeric values from the comma-separated third field
  */
case class Da(kk: String, tt: Int, va: mutable.Buffer[String])

/**
  * Spark Streaming job that watches a directory for text files, groups the
  * records by key and prints, per key, the mean of every value column.
  */
object Streaming {

  /** Separator between the key, the integer field and the value list. */
  private val FieldSeparator = "\t"

  /** Separator between the individual values of the third field. */
  private val ValueSeparator = ","

  /**
    * Parses one input line of the form `key<TAB>int<TAB>v1,v2,...`.
    *
    * The line is split only once. A line that has fewer than three fields,
    * a non-integer second field, or any value that is not a valid `Double`
    * yields `None`, so a single bad line cannot fail the whole batch.
    *
    * @param line raw text line read from the stream
    * @return the parsed record, or `None` when the line is malformed
    */
  private def parseLine(line: String): Option[Da] = {
    val fields = line.split(FieldSeparator)
    if (fields.length < 3) {
      None
    } else {
      val values = fields(2).split(ValueSeparator).toBuffer
      for {
        number <- Try(fields(1).toInt).toOption
        // Validate here so the later toDouble in columnMeans cannot throw.
        if values.forall(v => Try(v.toDouble).isSuccess)
      } yield Da(fields(0), number, values)
    }
  }

  /**
    * Computes the mean of each value column over all records of one key.
    *
    * Only the columns present in every record are averaged (the minimum
    * width across the group), so every mean is taken over the full row count
    * and ragged rows cannot cause an out-of-bounds access. An empty group, or
    * a group containing a record whose `va` is `null`, yields an empty buffer.
    *
    * @param rows all records sharing one key
    * @return one mean per common column, in column order
    */
  private def columnMeans(rows: Iterable[Da]): ArrayBuffer[Double] = {
    // Vector gives cheap repeated traversal; the original indexed a List by
    // position inside nested loops, which is quadratic in the group size.
    val table = rows.map(_.va).toVector
    val rowCount = table.size
    val width =
      if (table.isEmpty || table.contains(null)) 0
      else table.map(_.size).min

    val means = ArrayBuffer[Double]()
    for (col <- 0 until width) {
      var total = 0.0
      for (values <- table) {
        val value = values(col).toDouble
        // Debug trace of every value that contributes to a mean.
        println("--" + value)
        total += value
      }
      means += total / rowCount
    }
    means
  }

  /**
    * Entry point: builds a local streaming context with a 5 second batch
    * interval, reads new text files from `D:\tt\`, and prints the per-key
    * column means of every batch until the job is terminated.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("streaming..").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))

    // Alternative sources that were tried earlier, kept for reference:
    //val fileStream = ssc.fileStream("D:\\tttttttttttt\\")
    //fileStream.print()
    //val lines = ssc.socketTextStream("192.168.25.129",9999)
    val lines = ssc.textFileStream("D:\\tt\\")

    /*var jd = new Jedis("192.168.25.128",6379)
    var jdv = jd.get("aaaa")
    println("aaaa: "+jdv)
    var broadcast = sc.broadcast(jd.get("aaaa"))
    println(broadcast)*/

    // Parse the lines; malformed ones are dropped instead of failing the batch.
    val data = lines.flatMap(line => parseLine(line).toList)

    // Group the records by key. The number of records per key is simply the
    // size of each group, so no separate count + join (extra shuffle) is needed.
    val grouped = data.map(record => (record.kk, record)).groupByKey()

    // Per key: the mean of each value column.
    val result = grouped.map { case (_, rows) => columnMeans(rows) }

    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}