实际业务代码开发
数据清洗
时间工具类开发: DateUtils.scala
package com.imooc.utils
import java.util.Date
import org.apache.commons.lang3.time.FastDateFormat
/**
 * Date/time helpers used to normalise access-log timestamps.
 */
object DateUtils {

  /** Format of the timestamp as it appears in the raw log, e.g. 2019-03-31 06:00:00. */
  val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  /** Compact output format without separators, e.g. 20190331060000. */
  val TARGET_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss")

  /**
   * Parses a timestamp written in the raw-log format.
   *
   * @param time timestamp text such as "2019-03-31 06:00:00"
   * @return the value of `getTime` on the parsed date (epoch milliseconds)
   */
  def getTime(time: String): Long = {
    val parsed = YYYYMMDDHHMMSS_FORMAT.parse(time)
    parsed.getTime
  }

  /**
   * Re-renders a raw-log timestamp in the compact target format.
   *
   * Note: despite the name, the target pattern keeps the seconds
   * ("yyyyMMddHHmmss"), so the result is not truncated to the minute.
   *
   * @param time timestamp text such as "2019-03-31 06:12:00"
   * @return the same instant formatted with [[TARGET_FORMAT]]
   */
  def parseToMinute(time: String): String = {
    val millis = getTime(time)
    TARGET_FORMAT.format(new Date(millis))
  }

  /** Manual smoke test: prints the converted form of a sample timestamp. */
  def main(args: Array[String]): Unit = {
    val sample = "2019-03-31 06:12:00"
    println(parseToMinute(sample))
  }
}
定义数据结果
ClickLog.scala
package com.imooc
/**
 * One access-log record after cleaning.
 *
 * @param time       time of the access
 * @param ip         IP address recorded for the access
 * @param courseId   id of the hands-on course that was accessed
 * @param statusCode status code produced by the access
 * @param referer    referer of the access
 */
case class ClickLog(
  time: String,
  ip: String,
  courseId: Int,
  statusCode: Int,
  referer: String
)
// 10.25.63.72 2019-03-31 06:17:01 "GET /class/153.html HTTP/1.1 500 https://cn.bing.com/search?q=Spark Streaming实战
主代码:
ImoocStatStreamingApp.scala
package com.imooc
import com.imooc.utils.DateUtils
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Spark Streaming job that reads access-log lines from Kafka, cleans each line
 * into a [[ClickLog]] and prints the cleaned records of every batch.
 */
object ImoocStatStreamingApp {

  import scala.util.Try

  /**
   * Converts one raw log line into a [[ClickLog]].
   *
   * The line is split on tabs and read as: ip, time, request line, status code,
   * referer. Sample (tabs shown as spaces):
   * 10.25.63.72 2019-03-31 06:17:01 "GET /class/153.html HTTP/1.1 500 https://cn.bing.com/search?q=...
   *
   * Any failure while picking the fields apart (too few fields, a request line
   * without a URL, a non-numeric course id or status code, an unparsable
   * timestamp) is caught so that a single dirty line cannot fail the whole batch.
   *
   * @param line raw log line
   * @return the cleaned record, or None when the line is malformed or does not
   *         refer to a course page (course id 0)
   */
  private def parseClickLog(line: String): Option[ClickLog] =
    Try {
      val infos = line.split("\t")

      // infos(2) is the request line, e.g. "GET /class/153.html HTTP/1.1";
      // its second space-separated token is the URL, e.g. /class/153.html
      val url = infos(2).split(" ")(1)

      // Extract the numeric course id from /class/<id>.html; 0 marks "not a course page".
      val courseId =
        if (url.startsWith("/class")) {
          val courseIdHTML = url.split("/")(2)
          courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt
        } else {
          0
        }

      // Cleaned result, e.g.
      // 20190331065001,29.198.72.55,153,500,https://search.yahoo.com/search?p=...
      ClickLog(DateUtils.parseToMinute(infos(1)), infos(0), courseId, infos(3).toInt, infos(4))
    }.toOption.filter(_.courseId != 0)

  /**
   * Entry point.
   *
   * @param args exactly four values: zkQuorum, groupId, comma-separated topics,
   *             numThreads (receiver threads per topic)
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      System.err.println("Usage: <zkQuorum> <groupId> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, groupId, topics, numThreads) = args

    val sparkconf = new SparkConf()
      .setAppName("ImoocStatStreamingApp")
      .setMaster("local[2]")
    // One micro-batch every 60 seconds.
    val ssc = new StreamingContext(sparkconf, Seconds(60))

    // Every listed topic is consumed with the same number of threads.
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val message = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

    // Keep only the second element of each received pair (the log line itself).
    val logs = message.map(_._2)

    // Malformed lines and non-course requests yield None and are dropped here.
    val cleanData = logs.flatMap(line => parseClickLog(line).toList)

    cleanData.print()

    ssc.start()
    ssc.awaitTermination()
  }
}