博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅  :: 管理

spark streaming求均值

Posted on 2017-07-27 16:45  来碗酸梅汤  阅读(370)  评论(0)  编辑  收藏  举报

1、数据格式

aaaa	201701	11.1,3.8,2.5
aaaa	201702	2.1,3.3,2.5
aaaa	201703	34.1,3.2,2.0
aaaa	201704	2.2,3.3,2.5
aaaa	201705	13.1,3.5,2.5
aaaa	201706	22.4,3.3,2.5
aaaa	201707	2.1,3.3,2.0
aaaa	201708	10.1,4.3,2.5
bbbb	201701	2.8,3.3,2.5
bbbb	201703	2.2,3.3,4.2
bbbb	201704	2.1,3.3,2.5
bbbb	201705	2.3,3.7,2.5
bbbb	201709	2.1,3.4,2.5
bbbb	201719	2.1,3.3,2.5
bbbb	201712	2.1,3.3,2.0

 2、处理流程

package streaming

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import redis.RedisClient
import redis.clients.jedis.Jedis
import org.apache.log4j.Logger
import spire.math.Interval

import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
/**
  * One record of the input stream.
  *
  * Created by hadoop on 2017/7/18.
  *
  * @param kk key the records are grouped by
  * @param tt integer tag carried alongside the key (the sample data uses yyyyMM periods)
  * @param va the record's values, kept as unparsed strings
  */
case class Da(
  kk: String,
  tt: Int,
  va: mutable.Buffer[String]
)
/**
  * Spark Streaming job that prints, for every micro-batch and every key, the column-wise
  * mean of the comma-separated values found in newly arrived text files.
  *
  * Expected line format: `key<TAB>period<TAB>v1,v2,...`
  * (for example `aaaa<TAB>201701<TAB>11.1,3.8,2.5`).
  *
  * Usage: `Streaming [inputDir]` - `inputDir` defaults to [[Streaming.DefaultInputDir]].
  */
object Streaming {

  import scala.util.Try

  /** Directory watched when no path is given on the command line. */
  private val DefaultInputDir = "D:\\tt\\"

  /**
    * Parses one input line into a [[Da]].
    *
    * The line is split on tabs exactly once. Lines with fewer than three fields, a
    * non-integer period, no values, or a non-numeric value yield `None`, so a single bad
    * record cannot fail the whole batch.
    *
    * @param line raw text line
    * @return the parsed record, or `None` when the line is malformed
    */
  private def parseLine(line: String): Option[Da] =
    line.split("\t") match {
      case Array(key, period, values, _*) =>
        for {
          tt <- Try(period.trim.toInt).toOption
          fields = values.split(",")
          // Validate here so the numeric conversion in the aggregation cannot throw.
          if fields.nonEmpty && fields.forall(f => Try(f.toDouble).isSuccess)
        } yield Da(key, tt, fields.toBuffer)
      case _ => None
    }

  /**
    * Element-wise sum of two rows.
    *
    * `zip` stops at the shorter input, so when rows have different widths only the
    * columns present in every row are kept instead of raising an index error.
    */
  private def addColumns(a: Vector[Double], b: Vector[Double]): Vector[Double] =
    a.zip(b).map { case (x, y) => x + y }

  /**
    * Entry point: builds the streaming pipeline, starts it and blocks until termination.
    *
    * @param args optional `args(0)`: directory to watch for new text files
    */
  def main(args: Array[String]): Unit = {
    // setIfMissing keeps the local default but lets `spark-submit --master` take effect.
    val conf = new SparkConf()
      .setAppName("streaming..")
      .setIfMissing("spark.master", "local[4]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))

    val inputDir = args.headOption.getOrElse(DefaultInputDir)
    val lines = ssc.textFileStream(inputDir)

    // Malformed lines are dropped rather than crashing the job.
    val records = lines.flatMap(line => parseLine(line).toList)

    // Carry (column sums, row count) per key through a single reduceByKey. This replaces
    // the former groupByKey + count + join and lets Spark combine partial sums per
    // partition before the shuffle.
    val means = records
      .map(rec => (rec.kk, (rec.va.map(_.toDouble).toVector, 1L)))
      .reduceByKey((a, b) => (addColumns(a._1, b._1), a._2 + b._2))
      .mapValues { case (sums, count) => sums.map(_ / count) }

    // Each printed element is (key, column means), so the output says which key it is for.
    means.print()
    ssc.start()
    ssc.awaitTermination()
  }

}