Hadoop-MR实现日志清洗(三)
Hadoop-MR实现日志清洗(三)
5.论坛请求日志清洗解析
请求日志的清洗主要是指过滤掉跟后续统计无关的数据,包括爬虫数据、静态资源数据、无用数据列等。根据需要,清洗过程中也可以对部分数据域进行数据转换(比如日期格式),以便简化后续的数据加工/统计分析。
对日志的清洗逻辑上也是分为编写map、reduce、run(main)函数,在对输入数据处理时,日志的提取过滤较为复杂,通常是将文件处理的方法单独编写作为解析类,由map调用相关的方法。
5.1解析日志的各个域
单独编写的解析类,给map函数调用
package com.leeyk99.hadoop; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; /** * 解析日志的每个数据列:日志的数据域大致可分为:IP 、"-"、"-"、TIME、URL、STATUS、STREAM、?、?等等 * @author LIN */ public class FieldParser { public static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); public static final SimpleDateFormat FORMAT=new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); /** * 解析日志记录 * @return 数组,含有五个元素,分别是IP\TIME\URL\STAUS\流量 */ public String[] parseLog(String line){ String ip=parseIP(line); String time=parseTime(line); String url=parseURL(line); String status=parseStatus(line); String stream=parseStream(line); String[] fields=new String[5]; fields[0]=ip; fields[1]=time; fields[2]=url; fields[3]=status; fields[4]=stream; //String[] fields=new String[]{ip,time,url,status,stream}; return fields; } private String parseStream(String line) { try{ final String trim = line.substring(line.lastIndexOf("\"")+1).trim(); String stream = trim.split(" ")[1]; return stream; }catch (ArrayIndexOutOfBoundsException e){ e.printStackTrace(); System.out.println(line); }finally{ return null; } } private String parseStatus(String line) { final String trim = line.substring(line.lastIndexOf("\"")+1).trim(); String status = trim.split(" ")[0]; return status; } private String parseURL(String line) { final String trim = line.split("\"")[1].trim(); String url = trim; return url; } private String parseTime(String line) { final String trim = line.split("\"")[0].trim(); String time = trim.split(" ")[3].substring(1); Date date=parseDateFormat(time);//原始字符串解析成date才能方便格式化为指定的字符串样式 time=dateFormat.format(date);//转成20180903101923格式 return time; } private String parseIP(String line) { final String trim = line.split(" ")[0].trim(); String ip = trim; return ip; } /** * 日志时间转换 18/Sep/2013:16:16:16 * @author LIN * @param 18/Sep/2013:16:16:16 */ private Date parseDateFormat(String time){ Date formatTime=new Date(); try{ 
formatTime =FORMAT.parse(time);//FORMAT.parse解析String类型返回Date类型,FORMAT.format解析Date类型返回字符串类型 }catch (ParseException e){ e.printStackTrace(); } return formatTime; } }
5.2编写map函数
这里也演示了如何传递并输出多个字段。
package com.leeyk99.hadoop.mapreduce; import com.leeyk99.hadoop.FieldParser; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class LogMapper extends Mapper<LongWritable,Text,LongWritable,Text> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //super.map(key, value, context); String line=value.toString(); FieldParser fieldParser=new FieldParser(); String[] record=fieldParser.parseLog(line); /*数据预处理*/ //1.过滤指定字符串开头的数据 if( record[2].startsWith("GET /uc_server") || record[2].startsWith("GET /static") ){ //测试过滤数据 return; } //2.数据域加工,这里是字符串截取 if( record[2].startsWith("GET /")){ record[2]=record[2].substring("GET /".length()-1);//或者5 }else if(record[2].startsWith("POST /")){ record[2]=record[2].substring("POST /".length()-1); } if (record[2].endsWith(" HTTP/1.1")){ //System.out.println("1"+record[2]); record[2]=record[2].substring(0,record[2].length()-" HTTP/1.1".length()); //System.out.println("2"+record[2]); } //3.列裁剪,进一步选取指定的列 Text outPutValue=new Text(); outPutValue.set(record[0]+"\001"+record[1]+"\001"+record[2]); //指定了\001分隔符 /*map输出,这个输出key使用的是LongWritable,输出的还是行号,没有像往常使用Text(维度) 输出是Text,不像我们平时的IntWritable或DoubleWritable,这个不是在reduce中进行与group by类似计算的*/ context.write(key,outPutValue); } }
5.3编写reducer函数
package com.leeyk99.hadoop.mapreduce; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; public class LogReducer extends Reducer<LongWritable, Text, Text, NullWritable> { @Override protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { //super.reduce(key, values, context); for(Text value : values){ context.write(value, NullWritable.get()); } } }
5.4编写入口函数(main函数、run函数)
package com.leeyk99.hadoop.mapreduce; //import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; //import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; //import java.io.File; import java.net.URI; public class LogParser extends Configured implements Tool { //不能使用小写override //@Override 实现接口的方法不能注释为重写,一直红色波浪线提示不合规,程序运行正常,找了好久这个位置的异常。 public int run(String[] args) throws Exception{ if(args.length != 2){ System.err.printf("Usage: %s [generic options ] <input> <output> \n",getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } //getClass() 、getConf() //方法1:Hadoop权威指南写法 /*Job job=new Job(getConf(),"Log parser"); job.setJarByClass(getClass());*/ //方法二:main写法,最简单写法 Job job=new Job(); job.setJarByClass(getClass());//getClass() 获取类名 LogParser.class job.setJobName("Log parser"); //方法三:Configuration写法,网上写法 /*Configuration conf=new Configuration(); //String[] otherArgs=new GenericOptionsParser(conf,args).getRemainingArgs(); Job job=new Job(conf, "Job_001");//新建一个job对象,并给了job任务名 job.setJarByClass(LogParser.class); //指定class //FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //输入路径 //FileOutputFormat.setOutputPath(job,new Path(otherArgs[1])); //输出路径*/ FileInputFormat.addInputPath(job,new Path(args[0])); FileOutputFormat.setOutputPath(job,new Path(args[1])); job.setMapperClass(LogMapper.class); job.setMapOutputKeyClass(LongWritable.class); //与Reducer的不一致,需要指定 job.setMapOutputValueClass(Text.class); /*使用这个后,map一直卡在22%不动,因为map的输出是<LongWritable,Text>,如果使用Combiner后,输出与reducer一致<Text, 
NullWritable>, 这种输出是不能作为Reducer的输入的,因为输入要求是<LongWritable,Text>*/ //job.setCombinerClass(LogReducer.class); job.setReducerClass(LogReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); //Hdfs 输出目录删除 FileSystem fs= FileSystem.get(new URI(args[0]),getConf()); Path outPath=new Path(args[1]); if(fs.exists(outPath)){ fs.delete(outPath,true); } return job.waitForCompletion(true)?0:1; } public static void main(String[] args) throws Exception { int exitCode=ToolRunner.run(new LogParser(),args); System.exit(exitCode); } //使用本地,Hdfs 输出目录应该怎么删除呢 /*private static void delDir(String path){ File f=new File(path); if(f.exists()){ if(f.isDirectory()){ String[] items=f.list(); for( String item : items ){ File f2=new File(path+"/"+item); if(f2.isDirectory()){ delDir(path+"/"+item); } else{ f2.delete(); } } } f.delete(); //删除文件或者最后的空目录 } else{ System.out.println("Output directory does not exist ."); } }*/ }