[MapReduce] 6. Cleaning Nginx access log data: extracting the ip, time, and url from each record
1. First we need a util helper class
package cn.cutter.demo.hadoop.mapreduce.nginxlog.util;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

/**
 * @ClassName AccessLogUtil
 * @Description Splits an Nginx access-log line into its individual fields
 * @Author xiaof
 * @Date 2019/5/15 22:07
 * @Version 1.0
 **/
public class AccessLogUtil {

    // Note: SimpleDateFormat is not thread-safe; this is acceptable here because each map task runs single-threaded.
    public static final SimpleDateFormat FORMAT = new SimpleDateFormat(
            "d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    public static final SimpleDateFormat dateformat1 = new SimpleDateFormat(
            "yyyyMMddHHmmss");

    /**
     * Parse the English-locale time string
     *
     * @param string
     * @return
     */
    private static Date parseDateFormat(String string) {
        Date parse = null;
        try {
            parse = FORMAT.parse(string);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return parse;
    }

    /**
     * Parse one log record
     *
     * @param line
     * @return an array of 6 elements: ip, time, url, status, traffic, and the last quoted field (the User-Agent in the sample below)
     */
    public static String[] parse(String line) {
        String ip = parseIP(line);
        String time = parseTime(line);
        String url = parseURL(line);
        String status = parseStatus(line);
        String traffic = parseTraffic(line);
        String sourcePath = parseSource(line);

        return new String[] { ip, time, url, status, traffic, sourcePath };
    }

    private static String parseTraffic(String line) {
        // the response size is the second token between the first and second quoted fields
        int start = line.indexOf("\"");
        int second = line.indexOf("\"", start + 1);
        int three = line.indexOf("\"", second + 1);
        final String trim = line.substring(second + 1, three).trim();
        String traffic = trim.split(" ")[1];
        return traffic;
    }

    private static String parseStatus(String line) {
        // the HTTP status code is the first token between the first and second quoted fields
        int start = line.indexOf("\"");
        int second = line.indexOf("\"", start + 1);
        int three = line.indexOf("\"", second + 1);
        final String trim = line.substring(second + 1, three).trim();
        String status = trim.split(" ")[0];
        return status;
    }

    private static String parseURL(String line) {
        // the request line ("GET /path HTTP/1.1") is the first quoted field
        final int first = line.indexOf("\"");
        final int second = line.indexOf("\"", first + 1);
        String url = line.substring(first + 1, second);
        return url;
    }

    private static String parseTime(String line) {
        // the timestamp sits between "[" and "+0800]"
        final int first = line.indexOf("[");
        final int last = line.indexOf("+0800]");
        String time = line.substring(first + 1, last).trim();
        Date date = parseDateFormat(time);
        return dateformat1.format(date);
    }

    private static String parseIP(String line) {
        // the client IP is everything before the first "-"
        String ip = line.substring(0, line.indexOf("-")).trim();
        return ip;
    }

    private static String parseSource(String line) {
        // the last quoted field of the record (the User-Agent in the sample below)
        final int end = line.lastIndexOf("\"");
        final int start = line.lastIndexOf("\"", end - 1);
        String sourcePath = line.substring(start + 1, end).trim();
        return sourcePath;
    }

    public static void main(String args[]) {
        String s1 = "10.25.24.133 - admin [07/Mar/2019:14:19:53 +0800] \"GET /oss-eureka-server/console HTTP/1.1\" 200 21348 \"http://218.200.65.200:9425/oss-web/main.jsp\" \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36\"\n";

        String result[] = AccessLogUtil.parse(s1);
        for (int i = 0; i < result.length; ++i) {
            System.out.println(result[i]);
        }
    }
}
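The substring-based parsing above assumes every line follows the combined-log layout; a truncated or malformed line would make the indexOf/substring calls fail with a StringIndexOutOfBoundsException. A minimal pre-check sketch that could be added to AccessLogUtil (the isParsable helper is hypothetical, not part of the original class):

    /**
     * Hypothetical guard: returns true only when the line contains the delimiters
     * that parse(...) relies on, so malformed lines can be skipped up front.
     */
    public static boolean isParsable(String line) {
        if (line == null) {
            return false;
        }
        int bracket = line.indexOf('[');
        // count the double quotes; three quoted fields need at least six of them
        long quotes = line.chars().filter(c -> c == '"').count();
        return bracket >= 0
                && line.indexOf("+0800]", bracket) > bracket
                && line.indexOf('-') > 0
                && quotes >= 6;
    }

A mapper could call this before AccessLogUtil.parse and simply return on lines that fail the check.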
2. The map class
package cn.cutter.demo.hadoop.mapreduce.nginxlog.map;

import cn.cutter.demo.hadoop.mapreduce.nginxlog.util.AccessLogUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ProjectName: cutter-point
 * @Package: cn.cutter.demo.hadoop.mapreduce.nginxlog.map
 * @ClassName: NginxAccessLogCleanMap
 * @Author: xiaof
 * @Description: parses each access-log line and emits ip, time and url
 * @Date: 2019/5/17 11:12
 * @Version: 1.0
 */
public class NginxAccessLogCleanMap extends Mapper<LongWritable, Text, LongWritable, Text> {

    Text outputValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // parse the current line into its individual fields
        String data[] = AccessLogUtil.parse(value.toString());

        // assemble the first three fields (ip, time, url) and emit them downstream
        outputValue.set(data[0] + "\t" + data[1] + "\t" + data[2]);
        context.write(key, outputValue);
    }
}
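To sanity-check the assembled value locally without starting a cluster, the mapper's concatenation can be reproduced directly against AccessLogUtil. The class below is an illustration only (not part of the job), reusing the same sample line as the util's main method:

    package cn.cutter.demo.hadoop.mapreduce.nginxlog.util;

    /**
     * Illustration only: reproduces the value NginxAccessLogCleanMap emits for one line.
     */
    public class MapValuePreview {
        public static void main(String[] args) {
            String line = "10.25.24.133 - admin [07/Mar/2019:14:19:53 +0800] "
                    + "\"GET /oss-eureka-server/console HTTP/1.1\" 200 21348 "
                    + "\"http://218.200.65.200:9425/oss-web/main.jsp\" "
                    + "\"Mozilla/5.0 (Windows NT 10.0; Win64; x64)\"";
            String[] data = AccessLogUtil.parse(line);
            // same layout as the mapper: ip \t time \t url
            System.out.println(data[0] + "\t" + data[1] + "\t" + data[2]);
            // prints: 10.25.24.133    20190307141953    GET /oss-eureka-server/console HTTP/1.1
        }
    }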
3. The reduce class
package cn.cutter.demo.hadoop.mapreduce.nginxlog.reduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ProjectName: cutter-point
 * @Package: cn.cutter.demo.hadoop.mapreduce.nginxlog.reduce
 * @ClassName: NginxAccessLogCleanReduce
 * @Author: xiaof
 * @Description: data cleaning
 * @Date: 2019/5/17 11:21
 * @Version: 1.0
 */
public class NginxAccessLogCleanReduce extends Reducer<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // drop the key (the byte offset) and write every cleaned record out
        for (Text v : values) {
            context.write(v, NullWritable.get());
        }
    }
}
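Because the map output key is the byte offset of the line, each reduce call normally receives exactly one value, so this reducer's only job is to strip the key. If identical cleaned records should also be collapsed, a hypothetical variant (not part of this job) is to key the map output on the cleaned record itself and let the shuffle de-duplicate:

    package cn.cutter.demo.hadoop.mapreduce.nginxlog.reduce;

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;

    /**
     * Hypothetical variant: assumes the mapper emits (cleaned record, NullWritable),
     * so the shuffle groups duplicates and one line is written per distinct record.
     */
    public class NginxAccessLogDedupReduce extends Reducer<Text, NullWritable, Text, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }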
4. The driver class
package cn.cutter.demo.hadoop.mapreduce.nginxlog;

import cn.cutter.demo.hadoop.mapreduce.nginxlog.map.NginxAccessLogCleanMap;
import cn.cutter.demo.hadoop.mapreduce.nginxlog.reduce.NginxAccessLogCleanReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * @ProjectName: cutter-point
 * @Package: cn.cutter.demo.hadoop.mapreduce.nginxlog
 * @ClassName: NginxAccessLogClean
 * @Author: xiaof
 * @Description: hadoop jar ./cutter-point-service1.jar NginxAccessLogClean /user/xiaof/nginx /user/xiaof/nginx/output
 * @Date: 2019/5/17 11:25
 * @Version: 1.0
 */
public class NginxAccessLogClean {

    public static void main(String args[]) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

        System.out.println("Entering NginxAccessLogClean.main");

        Configuration conf = new Configuration();
//        conf.set("mapreduce.job.jar", "cutter-point-service1.jar"); // set this to the name of your exported jar file
        conf.set("fs.default.name", "hdfs://jyh-zhzw-inline-27:9000"); // fs.defaultFS is the non-deprecated key for this setting
        conf.set("dfs.client.use.datanode.hostname", "true");

        GenericOptionsParser optionsParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionsParser.getRemainingArgs();

        // print the remaining (non-generic) arguments: input path and output path
        for (int i = 0; i < remainingArgs.length; ++i) {
            System.out.println(remainingArgs[i]);
        }

        Job job = Job.getInstance(conf, NginxAccessLogClean.class.getName());
        job.setJarByClass(NginxAccessLogClean.class);

        job.setMapperClass(NginxAccessLogCleanMap.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(NginxAccessLogCleanReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, remainingArgs[0]);
        FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));

        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI(remainingArgs[0]), conf);
        Path outPath = new Path(remainingArgs[1]);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        boolean success = job.waitForCompletion(true);
        if (success) {
            System.out.println("Clean process success!");
        } else {
            System.out.println("Clean process failed!");
        }
    }
}
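As an aside, the manual GenericOptionsParser and FileSystem bookkeeping above can be delegated to Hadoop's ToolRunner. Below is a sketch of the same job wrapped in the Tool interface; the class name NginxAccessLogCleanTool and the output-path handling via Path.getFileSystem are assumptions, not part of the original code:

    package cn.cutter.demo.hadoop.mapreduce.nginxlog;

    import cn.cutter.demo.hadoop.mapreduce.nginxlog.map.NginxAccessLogCleanMap;
    import cn.cutter.demo.hadoop.mapreduce.nginxlog.reduce.NginxAccessLogCleanReduce;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    // Sketch only: ToolRunner parses -D / -fs / -conf generic options into getConf()
    // before run() is invoked, so no manual GenericOptionsParser call is needed.
    public class NginxAccessLogCleanTool extends Configured implements Tool {

        @Override
        public int run(String[] args) throws Exception {
            Job job = Job.getInstance(getConf(), "nginx-access-log-clean");
            job.setJarByClass(NginxAccessLogCleanTool.class);

            job.setMapperClass(NginxAccessLogCleanMap.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);

            job.setReducerClass(NginxAccessLogCleanReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

            FileInputFormat.setInputPaths(job, new Path(args[0]));
            Path outPath = new Path(args[1]);
            // delete a stale output directory through the path's own file system
            outPath.getFileSystem(getConf()).delete(outPath, true);
            FileOutputFormat.setOutputPath(job, outPath);

            return job.waitForCompletion(true) ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            System.exit(ToolRunner.run(new NginxAccessLogCleanTool(), args));
        }
    }

With this layout the cluster address comes from core-site.xml or a -fs option instead of the hard-coded fs.default.name setting.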
Data source:
The original file data format (screenshot):
The data after cleaning (screenshot):