使用 Spark 分析点击流日志数据

日志数据样式

101.226.167.227 - - [18/Sep/2013:19:49:10 +0000] "GET /29171365/ HTTP/1.1" 302 161 "http://www.leonarding.com/29171365/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.167.224 - - [18/Sep/2013:19:49:10 +0000] "GET /35489/ HTTP/1.1" 302 161 "http://www.leonarding.com/35489/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.169.195 - - [18/Sep/2013:19:49:39 +0000] "GET /8092037/ HTTP/1.1" 302 161 "http://www.leonarding.com/8092037/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.169.196 - - [18/Sep/2013:19:49:39 +0000] "GET /7607759/ HTTP/1.1" 302 161 "http://www.leonarding.com/7607759/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.168.253 - - [18/Sep/2013:19:49:39 +0000] "GET /685769/ HTTP/1.1" 302 161 "http://www.leonarding.com/685769/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.168.254 - - [18/Sep/2013:19:49:47 +0000] "GET /9842/ HTTP/1.1" 302 161 "http://www.leonarding.com/9842/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.168.251 - - [18/Sep/2013:19:49:48 +0000] "GET /9240380/ HTTP/1.1" 302 161 "http://www.leonarding.com/9240380/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
101.226.169.196 - - [18/Sep/2013:19:49:48 +0000] "GET /8197055/ HTTP/1.1" 302 161 "http://www.leonarding.com/8197055/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"

一、PV 值的计算(总访问量,即日志的总行数)

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Computes the PV (page views) of an access log, i.e. the number of
 * non-blank log lines.
 *
 * Usage: `pv [logPath]` -- `logPath` defaults to `D://access.log` when omitted.
 */
object pv {

  /** Log file that is read when no path is given on the command line. */
  private val DefaultLogPath = "D://access.log"

  /**
   * Entry point: reads the log file and prints `PV:<count>`.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)
    // 1. Build the configuration; the app name now matches what the job computes.
    val sparkConf: SparkConf = new SparkConf().setAppName("PV").setMaster("local[2]")
    // 2. Create the SparkContext.
    val sc = new SparkContext(sparkConf)
    try {
      sc.setLogLevel("warn")
      // 3. Read the log file, one record per line.
      val data: RDD[String] = sc.textFile(logPath)
      // 4. Blank lines are not requests, so they must not inflate the PV figure.
      val pageViews = data.filter(_.trim.nonEmpty).count()
      println(s"PV:$pageViews")
    } finally {
      // 5. Always release the context, even when the job fails.
      sc.stop()
    }
  }
}

 

二、UV 的计算(按 IP 去重计数)

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Computes the UV (unique visitors) of an access log by counting the
 * distinct values of the first space-separated field of every line
 * (the client IP in the sample data).
 *
 * Usage: `IpCount [logPath]` -- `logPath` defaults to `d://access.log` when omitted.
 */
object IpCount {

  /** Log file that is read when no path is given on the command line. */
  private val DefaultLogPath = "d://access.log"

  /**
   * Entry point: reads the log file and prints `ipCount:<count>`.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)
    val conf = new SparkConf().setAppName("IpCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      sc.setLogLevel("warn")
      val lines: RDD[String] = sc.textFile(logPath)
      // Trim and drop blank lines first: on a whitespace-only line
      // `split(" ")` returns an empty array (index 0 would throw), and on a
      // blank or leading-space line it returns "" which would be counted as
      // an extra "IP". After trimming, the first token is always non-empty.
      val ips = lines
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(_.split(" ")(0))
      // De-duplicate the IPs and count them.
      val ipCount = ips.distinct().count()
      println(s"ipCount:$ipCount")
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}

 

三、TopN(出现次数最多的前 5 个访问地址)

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Prints the five most frequent addresses found in an access log.
 *
 * The address of a line is the text that follows the first `http://` up to
 * the next `" ` (quote + space); in the sample data this is the referer field.
 * Lines without `http://`, or with nothing after it, are ignored.
 *
 * Usage: `wzCount [logPath]` -- `logPath` defaults to `d://access.log` when omitted.
 */
object wzCount {

  /** Log file that is read when no path is given on the command line. */
  private val DefaultLogPath = "d://access.log"

  /** Marker that introduces the address inside a log line. */
  private val UrlMarker = "http://"

  /** Number of entries printed. */
  private val TopN = 5

  /**
   * Extracts the address that follows the first `http://` in `line`.
   *
   * Uses index arithmetic instead of `split(...)(i)`, which throws when the
   * marker is the last thing on the line and cuts the address short when the
   * marker occurs a second time (e.g. inside a query string).
   *
   * @return the address without the `http://` prefix, or `None` when the line
   *         has no marker or nothing follows it
   */
  private def extractAddress(line: String): Option[String] = {
    val start = line.indexOf(UrlMarker)
    if (start < 0) None
    else {
      val rest = line.substring(start + UrlMarker.length)
      val end = rest.indexOf("\" ")
      val address = if (end < 0) rest else rest.substring(0, end)
      if (address.isEmpty) None else Some(address)
    }
  }

  /**
   * Entry point: reads the log file and prints the top addresses as
   * `(address,count)` pairs, highest count first.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)
    val conf = new SparkConf().setAppName("wc").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      sc.setLogLevel("warn")
      // Read the log file, one record per line.
      val data: RDD[String] = sc.textFile(logPath)
      // Keep only the lines that actually carry an address.
      val addresses = data.flatMap(line => extractAddress(line).toList)
      // Count the occurrences of every address.
      val counts = addresses.map((_, 1)).reduceByKey(_ + _)
      // `top` returns the largest elements in descending order without
      // sorting the whole data set.
      counts
        .top(TopN)(Ordering.by[(String, Int), Int](_._2))
        .foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}

 

posted @ 2020-04-23 17:38  牛皮糖比  阅读(280)  评论(0编辑  收藏  举报