E-commerce Project in Practice - Raw Data ETL (Part 9)
1. Why ETL matters
With an ETL step in place, the cleaning job itself needs no reduce phase (more on this in section 2).
Open the job classes PageStartApp.java, ProvinceStartApp.java, and PVStartApp.java in turn and note the following:
(1) All three jobs read the same input: access/raw/input/trackinfo_20130721.data
(2) trackinfo_20130721.data is 166 MB:
[hadoop@hadoop000 data]$ ll -lh trackinfo_20130721.data
-rw-r--r-- 1 hadoop hadoop 166M Dec 8 2018 trackinfo_20130721.data
(3) Now suppose trackinfo_20130721.data were 50 TB of data. The main method of each of the three jobs should not each read those 50 TB via:
FileInputFormat.setInputPaths(job, new Path("access/raw/input/trackinfo_20130721.data"));
(4) The problem
Every MR job reads the full raw log file it needs to process, so the same data is scanned once per job.
This is what we need to optimize.
(5) A new concept: ETL
The full raw dataset is awkward to compute on directly; it is better to run one pre-processing pass first and then do the per-dimension statistical analysis on the result.
That is, parse out the fields we need (for example, resolve ip ===> city/region information)
and strip out the fields we do not need.
Once the three analyses (PageStartApp.java, ProvinceStartApp.java, PVStartApp.java) run on the ETL'd data instead of the raw logs, we are done.
(6) Fields required after ETL:
ip, time, url, page_id, country, province, city
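For a concrete picture, one cleaned record, in the tab-separated order the ETLApp mapper below actually emits (ip, country, province, time, page_id; url and city are commented out there), might look like this. The values are invented purely for illustration:

106.3.114.42	China	Beijing	2013-07-21 12:12:01	B000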
2. Add ETLApp.java
src/main/java/project/mrv2/ETLApp.java
(1) The ETL job needs no reduce phase: it only writes the cleaned records out to a directory, which a mapper alone can handle. Note that the job must also set the number of reduce tasks to 0; otherwise Hadoop still runs a single default identity reducer.
(2) The output of static class MyMapper needs no meaningful key (NullWritable is used); the value is the whole cleaned record emitted as one Text.
package project.mrv2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import project.utils.ContentUtils;
import project.utils.LogParser;

import java.io.IOException;
import java.util.Map;

public class ETLApp {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        // Delete the output directory if it already exists so the job can be rerun
        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path("input/etl");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(ETLApp.class);

        // Map-only job: no reducer class is set, and the number of reduce tasks
        // is explicitly 0 so the map output goes straight to the output directory
        job.setNumReduceTasks(0);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("access/raw/input/trackinfo_20130721.data"));
        FileOutputFormat.setOutputPath(job, outputPath);

        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

        private LogParser logParser;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            logParser = new LogParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // value is one raw log line
            String log = value.toString();

            // Parse the line into the fields we care about
            Map<String, String> info = logParser.parser(log);
            String ip = info.get("ip");
            String country = info.get("country");
            String province = info.get("province");
            //String city = info.get("city");
            String url = info.get("url");   // the url extracted in LogParser.java
            String time = info.get("time");
            String pageId = ContentUtils.getPageId(url);

            // Join the fields with tabs
            StringBuilder builder = new StringBuilder();
            builder.append(ip).append("\t");
            builder.append(country).append("\t");
            builder.append(province).append("\t");
            //builder.append(city).append("\t");
            //builder.append(url).append("\t");
            builder.append(time).append("\t");
            builder.append(pageId);

            // Emit the cleaned record; no key is needed
            context.write(NullWritable.get(), new Text(builder.toString()));
        }
    }
}
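With the cleaned data sitting in input/etl, the downstream statistics jobs no longer need any IP parsing. A minimal sketch of what the province mapper could look like once it reads the ETL output; the class name is illustrative and the field index follows the tab order emitted above, this is not the project's actual mrv2 code:

// Hypothetical mapper for a v2 province job that reads ETL output instead of raw logs
static class ProvinceMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private LongWritable ONE = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // ETL output layout: ip \t country \t province \t time \t pageId
        String[] fields = value.toString().split("\t");
        if (fields.length >= 3) {
            // province is the third field (index 2)
            context.write(new Text(fields[2]), ONE);
        }
    }
}

The reducer side can remain a plain per-province sum, unchanged from the v1 ProvinceStartApp.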
3. Modify LogParser.java
src/main/java/project/utils/LogParser.java
package project.utils;

import org.apache.commons.lang.StringUtils;

import java.util.HashMap;
import java.util.Map;

// Log parsing: extract ip/region/url/time fields from one raw log line
public class LogParser {

    IPParser ipParser = IPParser.getInstance();

    public Map<String, String> parser(String log) {
        Map<String, String> info = new HashMap<>();

        if (StringUtils.isNotBlank(log)) {
            // Fields are separated by \001; the IP sits at index 13
            String[] splits = log.split("\001");

            String ip = splits[13];
            String country = "-";
            String province = "-";
            String city = "-";
            IPParser.RegionInfo regionInfo = ipParser.analyseIp(ip);
            if (regionInfo != null) {
                country = regionInfo.getCountry();
                province = regionInfo.getProvince();
                city = regionInfo.getCity();
            }
            info.put("ip", ip);
            info.put("country", country);
            info.put("province", province);
            info.put("city", city);

            String url = splits[1];
            info.put("url", url);

            String time = splits[17];
            info.put("time", time);
        }
        return info;
    }
}
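As a quick sanity check, the parser can be exercised on a single fabricated line. A minimal sketch, assuming IPParser and its IP library are on the classpath; the sample values are made up and only populate the indices LogParser actually reads (1 = url, 13 = ip, 17 = time):

// Hypothetical smoke test for LogParser; the input line is fabricated
import java.util.Arrays;
import java.util.Map;
import project.utils.LogParser;

public class LogParserSmokeTest {
    public static void main(String[] args) {
        // Build a \001-separated line with 18 fields; only indices 1, 13 and 17 matter here
        String[] fields = new String[18];
        Arrays.fill(fields, "-");
        fields[1] = "http://www.example.com/item.html";
        fields[13] = "106.3.114.42";
        fields[17] = "2013-07-21 12:12:01";
        String log = String.join("\001", fields);

        Map<String, String> info = new LogParser().parser(log);
        // Region fields fall back to "-" when the IP cannot be resolved
        System.out.println(info.get("ip") + " -> " + info.get("province"));
    }
}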