Spark 分析点击流日志数据
日志数据样式
101.226.167.227 - - [18/Sep/2013:19:49:10 +0000] "GET /29171365/ HTTP/1.1" 302 161 "http://www.leonarding.com/29171365/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.167.224 - - [18/Sep/2013:19:49:10 +0000] "GET /35489/ HTTP/1.1" 302 161 "http://www.leonarding.com/35489/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.169.195 - - [18/Sep/2013:19:49:39 +0000] "GET /8092037/ HTTP/1.1" 302 161 "http://www.leonarding.com/8092037/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.169.196 - - [18/Sep/2013:19:49:39 +0000] "GET /7607759/ HTTP/1.1" 302 161 "http://www.leonarding.com/7607759/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.168.253 - - [18/Sep/2013:19:49:39 +0000] "GET /685769/ HTTP/1.1" 302 161 "http://www.leonarding.com/685769/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.168.254 - - [18/Sep/2013:19:49:47 +0000] "GET /9842/ HTTP/1.1" 302 161 "http://www.leonarding.com/9842/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; 
.NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.168.251 - - [18/Sep/2013:19:49:48 +0000] "GET /9240380/ HTTP/1.1" 302 161 "http://www.leonarding.com/9240380/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider" 101.226.169.196 - - [18/Sep/2013:19:49:48 +0000] "GET /8197055/ HTTP/1.1" 302 161 "http://www.leonarding.com/8197055/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MDDR; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322; Tablet PC 2.0); 360Spider"
一、pv值的计算(总访问量)
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Computes the PV (page views, i.e. total number of requests) of an access log.
 *
 * Each line of the log file is counted as one page view.
 */
object pv {

  /** Log file that is read when no path is given on the command line. */
  private val DefaultLogPath = "D://access.log"

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse.
   *             When absent, [[DefaultLogPath]] is used.
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)

    // 1. Create the SparkConf. The application name matches what this job computes (PV).
    val sparkConf: SparkConf = new SparkConf().setAppName("PV").setMaster("local[2]")

    // 2. Create the SparkContext.
    val sc = new SparkContext(sparkConf)
    try {
      sc.setLogLevel("warn")

      // 3. Read the log file, one record per line.
      val data: RDD[String] = sc.textFile(logPath)

      // 4. PV is simply the number of records.
      println(s"PV:${data.count()}")
    } finally {
      // 5. Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
二、UV 的计算(按 IP 去重计数)
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Computes the UV figure of an access log as the number of distinct client IPs.
 *
 * The client IP is taken to be the first space-separated token of each line.
 */
object IpCount {

  /** Log file that is read when no path is given on the command line. */
  private val DefaultLogPath = "d://access.log"

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse.
   *             When absent, [[DefaultLogPath]] is used.
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)

    val conf = new SparkConf().setAppName("IpCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      sc.setLogLevel("warn")

      val lines: RDD[String] = sc.textFile(logPath)

      // Take the first space-separated token of every line.
      // `headOption` is used instead of `(0)` because `String.split` drops trailing
      // empty strings, so a line made only of spaces yields an empty array.
      // Empty tokens (blank lines, lines starting with a space) are not IPs and are dropped.
      val ips: RDD[String] = lines
        .flatMap(_.split(" ").headOption)
        .filter(_.nonEmpty)

      // De-duplicate the IPs and count them.
      val ipCount = ips.distinct().count()
      println(s"ipCount:$ipCount")
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
三、TopN(访问次数最多的前 5 个地址)
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Prints the most frequently occurring `http://` addresses found in an access log.
 *
 * For every line that contains `http://`, the text between the first `http://`
 * and the following `" ` (closing quote plus space) is taken as the address;
 * the `http://` prefix itself is not part of the reported address.
 */
object wzCount {

  /** Log file that is read when no path is given on the command line. */
  private val DefaultLogPath = "d://access.log"

  /** Number of top entries printed. */
  private val TopN = 5

  /** Marker that introduces an address inside a log line. */
  private val UrlMarker = "http://"

  /** Sequence that terminates a quoted field inside a log line. */
  private val FieldEnd = "\" "

  /**
   * Extracts the address that follows the first `http://` in `line`.
   *
   * Index arithmetic is used instead of `split(...)(1)`: `String.split` drops
   * trailing empty strings, so a line ending in `http://` would throw, and a
   * URL embedding a second `http://` would be cut short.
   *
   * @param line one raw log line
   * @return the address without the `http://` prefix, or `None` when the line
   *         has no `http://` or nothing follows it
   */
  private def extractAddress(line: String): Option[String] = {
    val start = line.indexOf(UrlMarker)
    if (start < 0) None
    else {
      val rest = line.substring(start + UrlMarker.length)
      val end = rest.indexOf(FieldEnd)
      val address = if (end < 0) rest else rest.substring(0, end)
      if (address.isEmpty) None else Some(address)
    }
  }

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse.
   *             When absent, [[DefaultLogPath]] is used.
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)

    val conf = new SparkConf().setAppName("wc").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      sc.setLogLevel("warn")

      // Read the log file, one record per line.
      val data: RDD[String] = sc.textFile(logPath)

      // Keep only lines carrying an address and pull that address out.
      // NOTE(review): the first "http://" on a line is used; when the referer field is "-"
      // this may be a URL inside the user-agent string - confirm this is intended.
      val addresses: RDD[String] = data.flatMap(line => extractAddress(line))

      // Count the occurrences of each address.
      val counts: RDD[(String, Int)] = addresses.map((_, 1)).reduceByKey(_ + _)

      // Sort by count, highest first, and print the leading entries.
      counts.sortBy(_._2, ascending = false).take(TopN).foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}