Flume to Kafka: collecting data and storing results in Redis, example (ip.txt)

Flume tails an access log into the Kafka topic ip01 through a Kafka channel; a Spark Streaming job consumes that topic with the Kafka 0.10 direct API, counts records per (province, city, access method), and writes both the aggregated counts and the consumer offsets to Redis inside a transaction.

ip.scala

package ip

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ip {
  Logger.getLogger("org").setLevel(Level.WARN)
  def main(args: Array[String]): Unit = {
    // Create the StreamingContext with a 3-second batch interval
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(3))

    // Kafka topic to consume
    val topic = "ip01"
    val topics = Array(topic)
    // Consumer group id (also used as the Redis key for the stored offsets)
    val groupid = "IPoffsets"
    // Kafka connection parameters
    val params = Map(
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      // Where to start consuming when no stored offset is found
      "auto.offset.reset" -> "earliest",
      // Offsets are committed manually (stored in Redis), so disable auto commit
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Check whether offsets for this group already exist in Redis
    val offsets = DataMyRedis.getOffset(groupid, topic)

    // Create the Kafka direct stream, resuming from the stored offsets if any
    val stream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream(
        ssc,
        LocationStrategies.PreferConsistent,
        if (offsets.nonEmpty)
          ConsumerStrategies.Subscribe[String, String](topics, params, offsets)
        else
          ConsumerStrategies.Subscribe[String, String](topics, params)
      )

    stream.foreachRDD(rdd => {
      // Debug output: print the raw records (runs on the executors)
      rdd.foreach(println(_))
      // Collect the offset ranges of this batch
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Parse each record and count by (province, city, access method)
      val ip1: RDD[((String, String, String), Int)] = rdd.map(tp => {
        val splits = tp.value().split("[|]")
        val province = splits(6)
        val city = splits(7)
        val fangshi = splits(9)
        ((province, city, fangshi), 1)
      }).reduceByKey(_ + _)

      // Store the results together with the offsets
      DataMyRedis.saveDataOffset(ip1, ranges, groupid)
    })
    ssc.start()
    ssc.awaitTermination()

  }
}

DataMyRedis.scala

package ip

import java.util

import day15.Jpoods
import org.apache.kafka.common.TopicPartition
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.OffsetRange

import scala.collection.mutable

object DataMyRedis {
  // Save the aggregated results and the Kafka offsets to Redis in a single transaction
  def saveDataOffset(result: RDD[((String, String, String), Int)],
                     ranges: Array[OffsetRange],
                     groupid: String): Unit = {
    result.foreachPartition(partition => {
      // Get a Jedis connection for this partition
      val jedis = Jpoods.getJedis()
      // Open a Redis transaction
      val transaction = jedis.multi()
      try {
        // Store the counts in the hash "IP1", keyed by province:city:accessMethod
        partition.foreach(tp => {
          transaction.hincrBy("IP1", tp._1._1 + ":" + tp._1._2 + ":" + tp._1._3, tp._2)
        })
        // Store the until-offset of every partition under the group id
        for (o <- ranges) {
          transaction.hset(groupid, o.topic + ":" + o.partition, o.untilOffset.toString)
        }
        transaction.exec()
      } catch {
        case e: Exception =>
          println("Error while writing to Redis, discarding the transaction: " + e)
          transaction.discard()
      } finally {
        jedis.close()
      }
    })
  }


  // Read the offsets previously stored in Redis for a consumer group
  def getOffset(groupid: String, topic: String): mutable.Map[TopicPartition, Long] = {
    val offset = mutable.Map[TopicPartition, Long]()
    val jedis = Jpoods.getJedis()
    // Each hash field is "topic:partition" and each value is the until-offset
    import scala.collection.JavaConversions._
    val map: util.Map[String, String] = jedis.hgetAll(groupid)
    for (o <- map.toList) {
      val Array(t, p) = o._1.split(":")
      offset += new TopicPartition(t, p.toInt) -> o._2.toLong
    }
    jedis.close()
    offset
  }




}
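
To verify what the job wrote, the two hashes can be read back out of Redis. Below is a minimal sketch (not part of the original post) that dumps the aggregated counts from the "IP1" hash and the stored offsets for the "IPoffsets" group; it reuses the Jpoods pool defined next, and the object name CheckRedis is made up for illustration.

package ip

import day15.Jpoods

// Hypothetical helper, only for inspecting what saveDataOffset wrote to Redis
object CheckRedis {
  def main(args: Array[String]): Unit = {
    val jedis = Jpoods.getJedis()
    import scala.collection.JavaConversions._

    // Aggregated counts: field = "province:city:accessMethod", value = count
    jedis.hgetAll("IP1").foreach { case (field, count) =>
      println(s"$field -> $count")
    }

    // Stored offsets: field = "topic:partition", value = untilOffset
    jedis.hgetAll("IPoffsets").foreach { case (tp, offset) =>
      println(s"$tp -> $offset")
    }

    jedis.close()
  }
}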

Jpoods.scala

package day15

import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.{Jedis, JedisPool}

object Jpoods {
  // Pool configuration
  private val conf = new GenericObjectPoolConfig()
  conf.setMaxIdle(2000)
  conf.setMaxTotal(5000)

  // Jedis connection pool (pass the config so it actually takes effect)
  private val jpoods = new JedisPool(conf, "192.168.186.150", 6379)

  // Get a Jedis connection from the pool and switch to database 6
  def getJedis(): Jedis = {
    val jedis = jpoods.getResource()
    jedis.select(6)
    jedis
  }




}
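
For completeness, here is a sketch of the sbt build definition the three Scala files above depend on. The version numbers below are assumptions (the post does not state them) and should be matched to the Spark, Kafka and Redis clients actually installed on the cluster.

build.sbt

// Sketch only; adjust versions to your environment
name := "ip-streaming"

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"                 % "2.3.0",
  "org.apache.spark" %% "spark-streaming"            % "2.3.0",
  // Kafka 0.10+ direct stream API (KafkaUtils, ConsumerStrategies, OffsetRange)
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.3.0",
  // Jedis client (brings in the commons-pool2 config used by Jpoods)
  "redis.clients"     % "jedis"                      % "2.9.0"
)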

Flume agent configuration file  flum-kafka.conf

a1.sources = r1
a1.channels = c1

# Define the source: tail the access log
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /usr/local/apache-flume-1.8.0-bin/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /root/myde/logs/access.log


# Define the channel: a Kafka channel that writes events straight to the topic ip01
# (parseAsFlumeEvent = false keeps the raw log line as the Kafka record value)
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop01:9092,hadoop02:9092,hadoop03:9092
a1.channels.c1.kafka.topic = ip01
a1.channels.c1.parseAsFlumeEvent = false

# Bind the source to the channel
a1.sources.r1.channels = c1

 
