学习日志---9
日志分析---从10000条数据中统计各个浏览器占比,数据格式如下
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getadv HTTP/1.1" 200 813 "www.neusoft.com" "-" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 "neuedu/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.134.244:80 200 0.027 0.027
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
117.35.88.11 - - [10/Nov/2016:00:01:02 +0800] "GET /article/ajaxcourserecommends?id=124 HTTP/1.1" 200 2345 "www.neusoft.com" "http://www.neusoft.com/code/1852" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36" "-" 10.100.136.65:80 200 0.616 0.616
182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.neuedu.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userdynamic HTTP/1.1" 200 19501 "www.neusoft.com" "-" cid=0&timestamp=1478707261847&uid=2871142&touid=2871142&page=1&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=3837a5bf27ea718fe18bda6c53fbbc14 "neuedu/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.195 0.195
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
114.248.161.26 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getcourseintro HTTP/1.1" 200 2510 "www.neusoft.com" "-" cid=283&secrect=86b720f312c2b25da3b20e59e7c89780&timestamp=1478707261951&token=4c144b3f4314178b9527d1e91ecc0fac&uid=3372975 "neuedu/5.0.2 (iPhone; iOS 8.4.1; Scale/2.00)" "-" 10.100.136.65:80 200 0.007 0.008
120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getmediainfo_ver2 HTTP/1.1" 200 633 "www.neusoft.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707261945&token=9bbdba949aec02735e59e0868b538e19&uid=4203162 "neuedu/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.049 0.049
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
112.10.136.45 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.neuedu.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.006 0.006
211.162.33.31 - - [10/Nov/2016:00:01:02 +0800] "GET /u/card HTTP/1.1" 200 331 "www.neusoft.com" "http://www.neusoft.com/code/2053" - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" "-" 10.100.136.65:80 200 0.371 0.371
116.22.196.70 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser HTTP/1.1" 200 54 "www.neusoft.com" "http://www.neusoft.com/code/3500" mid=3500&time=60 "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0" "-" 10.100.134.244:80 200 0.026 0.026
难点:
1.从每一行中找到描述浏览器信息的字符串;
2.从这字符串中解析出浏览器;
解决:
1.多观察可以发现,每一行中第七个双引号(")之后紧跟的就是浏览器信息(User-Agent),该字段到第八个双引号为止;可以先用下面的方法定位第七个双引号的位置:
1 /** 2 * 获取指定字符串中指定标识符出现的索引位置 3 **/ 4 private int getCharacterPosition(String value, String operator, int index) { 5 Matcher slashMatcher = Pattern.compile(operator).matcher(value); 6 int mIdex = 0; 7 while (slashMatcher.find()) { 8 mIdex++; 9 10 if (mIdex == index) { 11 break; 12 } 13 } 14 return slashMatcher.start(); 15 }
2.使用GitHub上现成的工具UserAgentParser,使用方法如下
1 userAgentParser = new UserAgentParser(); 2 String userAgentString = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.13) Gecko/20100914 Firefox/3.5.13 (.NET CLR 3.5.30729)"; 3 String browser = userAgentParser.browser(userAgentString);
全部代码如下:
TravelMapper.java
package travel;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;

/**
 * Maps every access-log line to a {@code (browser, "1")} pair.
 *
 * <p>In the sample log format the seventh double quote of a line opens the User-Agent
 * field and the eighth closes it; that field is handed to {@link UserAgentParser}.
 * Lines that do not contain a seventh quote are counted and skipped instead of failing
 * the task.
 */
public class TravelMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Matches one double quote; compiled once instead of once per record. */
    private static final Pattern QUOTE = Pattern.compile("\"");

    /** 1-based ordinal of the quote that opens the User-Agent field. */
    private static final int USER_AGENT_QUOTE_INDEX = 7;

    /** Counter group used to report lines that were skipped. */
    private static final String COUNTER_GROUP = "TravelMapper";

    /** Constant value emitted for every record; the reducer only counts values. */
    private static final Text ONE = new Text("1");

    /** Reused output key. */
    private final Text browserKey = new Text();

    /** Created once per task in {@link #setup} rather than once per input line. */
    private UserAgentParser userAgentParser;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        userAgentParser = new UserAgentParser();
    }

    /**
     * Emits the browser name of one log line.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of the access log
     * @param context sink for the {@code (browser, "1")} pair and for the skip counters
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        String userAgentField = extractUserAgent(value.toString());
        if (userAgentField == null) {
            // Fewer than seven quotes: not a line in the expected format.
            context.getCounter(COUNTER_GROUP, "MALFORMED_LINES").increment(1);
            return;
        }

        UserAgent agent = userAgentParser.parse(userAgentField);
        String browser = (agent == null) ? null : agent.getBrowser();
        if (browser == null) {
            // NOTE(review): whether the parser can return null is not visible here; the guard
            // only prevents a NullPointerException when the output key is built.
            context.getCounter(COUNTER_GROUP, "UNPARSED_USER_AGENTS").increment(1);
            return;
        }

        browserKey.set(browser);
        context.write(browserKey, ONE);
    }

    /**
     * Extracts the text between the seventh and eighth double quote of a line.
     *
     * @param line one access-log line
     * @return the User-Agent field (up to the end of the line when the closing quote is
     *         missing), or {@code null} when the line has fewer than seven quotes
     */
    private static String extractUserAgent(String line) {
        Matcher quotes = QUOTE.matcher(line);
        int seen = 0;
        while (quotes.find()) {
            seen++;
            if (seen == USER_AGENT_QUOTE_INDEX) {
                int begin = quotes.end();
                int end = quotes.find() ? quotes.start() : line.length();
                return line.substring(begin, end);
            }
        }
        return null;
    }
}
TravelReducer.java
package travel;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Counts the values received for each browser key and writes that count as a percentage
 * of the total number of log records.
 *
 * <p>The total is read from the job configuration property {@value #TOTAL_RECORDS_KEY}
 * and defaults to {@value #DEFAULT_TOTAL_RECORDS}, the data-set size the original
 * {@code count / 100} formula assumed.
 */
public class TravelReducer extends Reducer<Text, Text, Text, Text> {

    /** Configuration property holding the total number of input records. */
    static final String TOTAL_RECORDS_KEY = "travel.total.records";

    /** Record count assumed when the property is not set. */
    static final long DEFAULT_TOTAL_RECORDS = 10000L;

    private long totalRecords = DEFAULT_TOTAL_RECORDS;

    /**
     * Reads the record total from the job configuration.
     *
     * @throws IllegalArgumentException if the configured total is not positive
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        totalRecords = context.getConfiguration().getLong(TOTAL_RECORDS_KEY, DEFAULT_TOTAL_RECORDS);
        if (totalRecords <= 0) {
            throw new IllegalArgumentException(
                    TOTAL_RECORDS_KEY + " must be positive but was " + totalRecords);
        }
    }

    /**
     * Writes {@code (browser, "<percent>%")} for one browser.
     *
     * @param arg0 the browser name
     * @param arg1 one value per log record attributed to that browser; only counted
     * @param arg2 sink for the result pair
     */
    @Override
    protected void reduce(Text arg0, Iterable<Text> arg1,
            Reducer<Text, Text, Text, Text>.Context arg2) throws IOException, InterruptedException {
        long count = 0;
        for (Text ignored : arg1) {
            count++;
        }
        // With the default total of 10000 this equals the original count / 100.
        double percent = count * 100.0 / totalRecords;
        arg2.write(arg0, new Text(percent + "%"));
    }
}
MyJob.java
1 package travel; 2 3 import org.apache.hadoop.conf.Configuration; 4 import org.apache.hadoop.conf.Configured; 5 import org.apache.hadoop.fs.Path; 6 import org.apache.hadoop.io.Text; 7 import org.apache.hadoop.mapreduce.Job; 8 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 import org.apache.hadoop.util.Tool; 11 import org.apache.hadoop.util.ToolRunner; 12 13 14 15 public class MyJob extends Configured implements Tool{ 16 17 public static void main(String[] args) throws Exception { 18 System.setProperty("hadoop.home.dir", "E:\\hadoop"); 19 MyJob myJob=new MyJob(); 20 ToolRunner.run(myJob, null); 21 } 22 public int run(String[] args) throws Exception { 23 // TODO Auto-generated method stub 24 Configuration conf=new Configuration(); 25 conf.set("fs.default.name", "hdfs://192.168.137.11:9000"); 26 Job job=Job.getInstance(conf); 27 job.setJarByClass(MyJob.class); 28 job.setMapperClass(TravelMapper.class); 29 job.setReducerClass(TravelReducer.class); 30 job.setOutputKeyClass(Text.class); 31 job.setOutputValueClass(Text.class); 32 job.setMapOutputKeyClass(Text.class); 33 job.setMapOutputValueClass(Text.class); 34 FileInputFormat.addInputPath(job, new Path("/hadoop/test.log")); 35 FileOutputFormat.setOutputPath(job, new Path("/hadoop/TravelResult")); 36 job.waitForCompletion(true); 37 38 return 0; 39 } 40 41 }
结果: