学习日志---9

日志分析---从10000条数据中统计各个浏览器占比,数据格式如下

183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getadv HTTP/1.1" 200 813 "www.neusoft.com" "-" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 "neuedu/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.134.244:80 200 0.027 0.027
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
117.35.88.11 - - [10/Nov/2016:00:01:02 +0800] "GET /article/ajaxcourserecommends?id=124 HTTP/1.1" 200 2345 "www.neusoft.com" "http://www.neusoft.com/code/1852" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36" "-" 10.100.136.65:80 200 0.616 0.616
182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.neuedu.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userdynamic HTTP/1.1" 200 19501 "www.neusoft.com" "-" cid=0&timestamp=1478707261847&uid=2871142&touid=2871142&page=1&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=3837a5bf27ea718fe18bda6c53fbbc14 "neuedu/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.195 0.195
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
114.248.161.26 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getcourseintro HTTP/1.1" 200 2510 "www.neusoft.com" "-" cid=283&secrect=86b720f312c2b25da3b20e59e7c89780&timestamp=1478707261951&token=4c144b3f4314178b9527d1e91ecc0fac&uid=3372975 "neuedu/5.0.2 (iPhone; iOS 8.4.1; Scale/2.00)" "-" 10.100.136.65:80 200 0.007 0.008
120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getmediainfo_ver2 HTTP/1.1" 200 633 "www.neusoft.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707261945&token=9bbdba949aec02735e59e0868b538e19&uid=4203162 "neuedu/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.049 0.049
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
112.10.136.45 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.neuedu.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.006 0.006
211.162.33.31 - - [10/Nov/2016:00:01:02 +0800] "GET /u/card HTTP/1.1" 200 331 "www.neusoft.com" "http://www.neusoft.com/code/2053" - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" "-" 10.100.136.65:80 200 0.371 0.371
116.22.196.70 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser HTTP/1.1" 200 54 "www.neusoft.com" "http://www.neusoft.com/code/3500" mid=3500&time=60 "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0" "-" 10.100.134.244:80 200 0.026 0.026

难点:

  1.从每一行中找到描述浏览器信息的字符串;

  2.从该字符串中解析出浏览器名称;

解决:

  1.多观察可以发现,每一行中第七个双引号(")与第八个双引号之间的字符串即为浏览器(User-Agent)信息,可以先通过以下方法定位第七个双引号的位置

 

 1  /**
 2   * 获取指定字符串中指定标识符出现的索引位置
 3   **/
 4 private int getCharacterPosition(String value, String operator, int index) {
 5         Matcher slashMatcher = Pattern.compile(operator).matcher(value);
 6         int mIdex = 0;
 7         while (slashMatcher.find()) {
 8             mIdex++;
 9 
10             if (mIdex == index) {
11                 break;
12             }
13         }
14         return slashMatcher.start();
15     }

 

  2.使用GitHub上现成的工具UserAgentParser,使用方法如下

 

1 userAgentParser = new UserAgentParser();
2 String userAgentString = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.13) Gecko/20100914 Firefox/3.5.13 (.NET CLR 3.5.30729)";
3 String browser = userAgentParser.browser(userAgentString);

全部代码如下:

TravelMapper.java

 1 package travel;
 2 
 3 import java.io.IOException;
 4 import java.util.regex.Matcher;
 5 import java.util.regex.Pattern;
 6 
 7 import org.apache.hadoop.io.LongWritable;
 8 import org.apache.hadoop.io.Text;
 9 import org.apache.hadoop.mapreduce.Mapper;
10 
11 import com.kumkee.userAgent.UserAgent;
12 import com.kumkee.userAgent.UserAgentParser;
13 
14 public class TravelMapper extends Mapper<LongWritable, Text, Text, Text> {
15 
16     @Override
17     protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
18             throws IOException, InterruptedException {
19         // TODO 自动生成的方法存根
20         String line=value.toString();
21         UserAgentParser userAgentParser= new UserAgentParser();
22         String source = line.substring(getCharacterPosition(line, "\"", 7) + 1);
23         UserAgent agent = userAgentParser.parse(source);
24         String browser = agent.getBrowser();
25 
26         // 通过上下文把map的处理结果输出
27         context.write(new Text(browser), new Text("1"));
28     }
29 
30     private int getCharacterPosition(String value, String operator, int index) {
31         Matcher slashMatcher = Pattern.compile(operator).matcher(value);
32         int mIdex = 0;
33         while (slashMatcher.find()) {
34             mIdex++;
35 
36             if (mIdex == index) {
37                 break;
38             }
39         }
40         return slashMatcher.start();
41     }
42     
43 }

TravelReducer.java

 1 package travel;
 2 
 3 import java.io.IOException;
 4 
 5 import org.apache.hadoop.io.Text;
 6 import org.apache.hadoop.mapreduce.Reducer;
 7 
 8 public class TravelReducer extends Reducer<Text, Text, Text, Text> {
 9 
10     @Override
11     protected void reduce(Text arg0, Iterable<Text> arg1,
12             Reducer<Text, Text, Text, Text>.Context arg2) throws IOException, InterruptedException {
13         // TODO 自动生成的方法存根
14 
15         int count=0;
16         for (Text text : arg1) {
17             count++;
18             
19         }
20         double percent=(double)count/100;
21         arg2.write(arg0, new Text(percent+"%"));
22     }
23     
24 
25 }

MyJob.java

 1 package travel;
 2 
 3 import org.apache.hadoop.conf.Configuration;
 4 import org.apache.hadoop.conf.Configured;
 5 import org.apache.hadoop.fs.Path;
 6 import org.apache.hadoop.io.Text;
 7 import org.apache.hadoop.mapreduce.Job;
 8 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 9 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
10 import org.apache.hadoop.util.Tool;
11 import org.apache.hadoop.util.ToolRunner;
12 
13 
14 
15 public class MyJob extends Configured implements Tool{
16     
17     public static void main(String[] args) throws Exception {
18         System.setProperty("hadoop.home.dir", "E:\\hadoop");
19         MyJob myJob=new MyJob();
20         ToolRunner.run(myJob, null);
21     }
22     public int run(String[] args) throws Exception {
23         // TODO Auto-generated method stub
24         Configuration conf=new Configuration();
25         conf.set("fs.default.name", "hdfs://192.168.137.11:9000");
26         Job job=Job.getInstance(conf);
27         job.setJarByClass(MyJob.class);
28         job.setMapperClass(TravelMapper.class);
29         job.setReducerClass(TravelReducer.class);
30         job.setOutputKeyClass(Text.class);
31         job.setOutputValueClass(Text.class);
32         job.setMapOutputKeyClass(Text.class);
33         job.setMapOutputValueClass(Text.class);
34         FileInputFormat.addInputPath(job, new Path("/hadoop/test.log"));
35         FileOutputFormat.setOutputPath(job, new Path("/hadoop/TravelResult"));
36         job.waitForCompletion(true);
37         
38         return 0;
39     }
40 
41 }

结果:

posted @ 2018-07-19 08:54  遗风遗风丶  阅读(150)  评论(0)  编辑  收藏  举报