实际业务代码开发

数据清洗

时间工具类开发: DateUtils.scala

package com.imooc.utils

import java.util.Date
import org.apache.commons.lang3.time.FastDateFormat

/**
  * Date/time helpers used when cleaning access-log records.
  *
  * Converts timestamps written as `yyyy-MM-dd HH:mm:ss` into the compact
  * `yyyyMMddHHmmss` form.
  */
object DateUtils {

  // Layout of the timestamps in the raw log, e.g. 2019-03-31 06:00:00
  val YYYYMMDDHHMMSS_FORMAT: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  // Compact layout emitted for cleaned records, e.g. 20190331060000
  val TARGET_FORMAT: FastDateFormat = FastDateFormat.getInstance("yyyyMMddHHmmss")

  /**
    * Parses a `yyyy-MM-dd HH:mm:ss` timestamp.
    *
    * @param time timestamp text in the raw-log layout
    * @return the value of `getTime` on the parsed `Date` (epoch milliseconds)
    */
  def getTime(time: String): Long = {
    val parsed: Date = YYYYMMDDHHMMSS_FORMAT.parse(time)
    parsed.getTime
  }

  /**
    * Re-formats a `yyyy-MM-dd HH:mm:ss` timestamp as `yyyyMMddHHmmss`.
    *
    * Note: despite the method name, the target pattern keeps the seconds.
    *
    * @param time timestamp text in the raw-log layout
    * @return the same instant rendered with [[TARGET_FORMAT]]
    */
  def parseToMinute(time: String): String = {
    val epochMillis = getTime(time)
    TARGET_FORMAT.format(new Date(epochMillis))
  }

  /** Manual smoke check: prints the converted form of one sample timestamp. */
  def main(args: Array[String]): Unit = {
    val sample = "2019-03-31 06:12:00"
    println(parseToMinute(sample))
  }
}

定义数据结果

ClickLog.scala

package com.imooc
/**
  * A single access-log record after cleaning.
  *
  * @param time       time of the access
  * @param ip         IP address the access came from
  * @param courseId   id of the hands-on course that was accessed
  * @param statusCode status code produced by the access
  * @param referer    referer of the access
  */
case class ClickLog(time: String, ip: String, courseId: Int, statusCode: Int, referer: String)
//    10.25.63.72   2019-03-31 06:17:01 "GET /class/153.html HTTP/1.1"   500 https://cn.bing.com/search?q=Spark Streaming实战

主代码:

ImoocStatStreamingApp.scala

package com.imooc

import com.imooc.utils.DateUtils
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Spark Streaming job that reads access-log lines from Kafka, cleans them
  * into [[ClickLog]] records and prints each batch.
  *
  * Expected arguments: `<zkQuorum> <groupId> <topics> <numThreads>`, where
  * `topics` is a comma-separated list and `numThreads` is the number of
  * consumer threads per topic.
  */
object ImoocStatStreamingApp {

  import scala.util.Try

  // A raw line has five tab-separated fields: ip, time, request, status, referer.
  private val ExpectedFieldCount = 5

  /**
    * Extracts the course id from a request path such as `/class/153.html`.
    *
    * @param url request path taken from the log line
    * @return the numeric course id, or 0 when the path is not a course page
    *         or its id cannot be read
    */
  private def extractCourseId(url: String): Int =
    if (!url.startsWith("/class")) 0
    else {
      // "/class/153.html".split("/") == Array("", "class", "153.html")
      val segments = url.split("/")
      if (segments.length < 3) 0
      else {
        val page = segments(2)
        val dot = page.lastIndexOf(".")
        // No extension or a non-numeric id: treat it as "not a course page"
        // instead of throwing inside the streaming job.
        if (dot < 0) 0 else Try(page.substring(0, dot).toInt).getOrElse(0)
      }
    }

  /**
    * Turns one raw log line into a [[ClickLog]].
    *
    * Sample input (fields separated by tabs):
    * `10.25.63.72  2019-03-31 06:17:01  "GET /class/153.html HTTP/1.1"  500  https://cn.bing.com/search?q=...`
    *
    * Sample cleaned result:
    * `20190331065001,29.198.72.55,153,500,https://search.yahoo.com/search?p=...`
    *
    * @param line raw tab-separated log line
    * @return the cleaned record, or `None` when the line is malformed (too few
    *         fields, unparsable timestamp or status code, request without a path)
    */
  private def parseLine(line: String): Option[ClickLog] = {
    val infos = line.split("\t")
    if (infos.length < ExpectedFieldCount) None
    else {
      // infos(2) = "GET /class/153.html HTTP/1.1"  ->  path = /class/153.html
      val requestParts = infos(2).split(" ")
      if (requestParts.length < 2) None
      else
        Try {
          ClickLog(
            DateUtils.parseToMinute(infos(1)),
            infos(0),
            extractCourseId(requestParts(1)),
            infos(3).toInt,
            infos(4))
        }.toOption
    }
  }

  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      System.err.println("Usage: <zkQuorum> <groupId> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, groupId, topics, numThreads) = args
    val sparkconf = new SparkConf()
      .setAppName("ImoocStatStreamingApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkconf, Seconds(60))

    // Every listed topic is consumed with the same number of threads.
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val message = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

    // Each Kafka message is a (key, value) pair; the log line is the value.
    val logs = message.map(_._2)

    // Malformed lines are dropped rather than failing the whole batch; records
    // with courseId == 0 (not a course page) are filtered out as before.
    val cleanData = logs
      .flatMap(line => parseLine(line).toList)
      .filter(clicklog => clicklog.courseId != 0)

    cleanData.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
posted @ 2019-04-30 17:17  BBBone  阅读(134)  评论(0)  编辑  收藏  举报