A Hadoop/HDFS/HBase optimization example
Requirement:
A url_type has to be derived for the url field of records read from HDFS. Doing this with a Hive left outer join was extremely slow, so the URL-to-type mapping was loaded into HBase instead, and the tagging is done in a MapReduce job that exploits HBase's fast random lookups.
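The post assumes the URL-to-type mapping has already been imported into HBase but does not show how. Below is a minimal loader sketch, assuming the url_rule table with column family url_type and qualifier type that the improved job later reads, and a hypothetical local file of url|type pairs (the UrlRuleLoader class and the input format are illustrative, not from the original post):

package com.bonc.db;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

// Hypothetical one-off loader: reads "url|type" lines and writes them into
// the url_rule table so the MapReduce job can fetch the type by row key.
public class UrlRuleLoader {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "url_rule");
        table.setAutoFlush(false); // buffer puts client-side for bulk loading
        BufferedReader in = new BufferedReader(new FileReader(args[0]));
        String line;
        while ((line = in.readLine()) != null) {
            String[] kv = line.split("\\|");
            if (kv.length < 2) {
                continue; // skip malformed lines
            }
            Put put = new Put(kv[0].getBytes()); // row key: URL host (or IP)
            put.add("url_type".getBytes(), "type".getBytes(), kv[1].getBytes());
            table.put(put);
        }
        in.close();
        table.flushCommits(); // push any buffered puts
        table.close();
    }
}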
The initial MapReduce program looked like this:
package com.bonc.db;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

public class DWA_S_D_USE_MB_COUNT_BASE2 {
    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE");
        job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE2.class);
        job.setMapperClass(DataCleanMapper.class);
        job.setReducerClass(DataCleanReduce.class);
        job.setNumReduceTasks(150);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class DataCleanMapper extends
            Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String lines = value.toString();
            String[] strs = lines.split("\\|");
            ParesURL pu = new ParesURL(); // the author's URL parser (class not shown)
            String url = "NULL";
            if (strs.length > 25) {
                url = pu.execute(strs[25], "HOST"); // extract the host part of the URL
            }
            String keys = "";
            String values = "";
            if (strs.length > 16) {
                keys = strs[0] + "|" + strs[1] + "|" + strs[2] + "|" + strs[3]
                        + "|" + strs[4] + "|" + use_seg(strs[5]) + "|"
                        + strs[11] + "|" + strs[16] + "|" + url + "|" + strs[7]
                        + "|" + strs[8] + "|" + strs[9] + "|" + strs[10] + "|";
            }
            if (strs.length > 15) {
                values = url + "|" + strs[13] + "|" + strs[15] + "|" + "1";
            }
            context.write(new Text(keys), new Text(values));
        }

        // extract the hour (00-23) from a timestamp string, or "**" if invalid
        public String use_seg(String start_date) {
            String s = "**";
            if (start_date.length() > 23) {
                String hour = start_date.substring(11, 13);
                if (isNum(hour) && Integer.parseInt(hour) >= 0
                        && Integer.parseInt(hour) <= 23) {
                    s = hour;
                }
            }
            return s;
        }

        public static boolean isNum(String str) {
            return str.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
        }
    }

    public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
        private HTable table;

        @Override
        protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
                throws IOException, InterruptedException {
            String keys = arg0.toString();
            String value[] = { "" };
            String url = "NULL";
            String visitIP = "NULL";
            String value2 = "NULL";
            for (Text c : arg1) {
                value = c.toString().split("\\|");
                if (value.length > 0) {
                    url = value[0];
                }
                if (value.length > 1) {
                    visitIP = value[1];
                }
                if (value.length > 2) {
                    value2 = value[2];
                }
            }
            // one random HBase Get per record -- this turned out to be the bottleneck
            String matchResult = urlMatch(url);
            if (matchResult.equals("NULL")) {
                matchResult = urlMatch(visitIP); // fall back to matching by visit IP
            }
            String output = matchResult + "|" + value2 + "|" + "1";
            context.write(new Text(keys), new Text(output));
        }

        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            super.cleanup(context);
            table.close();
        }

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            super.setup(context);
            // HBaseMain.conf: shared HBase configuration (class not shown)
            HTablePool pool = new HTablePool(HBaseMain.conf, 1000);
            table = (HTable) pool.getTable("22222");
        }

        public String urlMatch(String url) {
            String s = "NULL";
            if (url != null && !url.equals("NULL")) {
                try {
                    Get getu = new Get(url.getBytes());
                    Result ru = table.get(getu);
                    if (!ru.isEmpty()) {
                        s = new String(ru.getValue("123".getBytes(),
                                "456".getBytes()));
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return s;
        }
    }
}
This turned out to be very slow, mainly because every single record triggered its own random Get against HBase. The approach was therefore changed: the row keys are first assembled into a list, and the lookups are then issued in one batch. The running time dropped to roughly half of the original.
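The core of the change is HTable.get(List<Get>), which fetches a whole list of rows in one batched call instead of issuing a separate RPC per record. A minimal sketch of the two access patterns (class and method names are illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;

public class BatchGetSketch {
    // One RPC per record: the slow pattern used by the first job.
    static Result lookupOne(HTable table, String url) throws IOException {
        return table.get(new Get(url.getBytes()));
    }

    // One batched call for a whole group of records: the improved pattern.
    static Result[] lookupMany(HTable table, List<String> urls) throws IOException {
        List<Get> gets = new ArrayList<Get>(urls.size());
        for (String u : urls) {
            gets.add(new Get(u.getBytes()));
        }
        return table.get(gets); // HTable.get(List<Get>) fetches the rows in bulk
    }
}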
The improved code (note that the mapper now keys its output on i % 10000, so each reduce() call receives a whole bucket of records that can share one batched lookup):
package com.bonc.db;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

public class DWA_S_D_USE_MB_COUNT_BASE {
    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE2");
        job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE.class);
        job.setMapperClass(DataCleanMapper.class);
        job.setReducerClass(DataCleanReduce.class);
        job.setNumReduceTasks(150);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class DataCleanMapper extends
            Mapper<LongWritable, Text, Text, Text> {
        public static long i = 0;

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // "|" + "1" is appended so that trailing empty fields still
            // survive the split and every field can be emitted.
            String lines = value.toString() + "|" + "1";
            String[] strs = lines.split("\\|");
            ParesURL pu = new ParesURL();
            String url = "NULL";
            String keys = "";
            String values = "";
            if (strs.length > 25) {
                i++;
                if (!strs[25].startsWith("http://")) {
                    strs[25] = "http://" + strs[25];
                }
                url = pu.execute(EmptyParse(strs[25]), "HOST");
                keys = EmptyParse(strs[0]) + "|" + EmptyParse(strs[1]) + "|"
                        + EmptyParse(strs[2]) + "|" + EmptyParse(strs[3]) + "|"
                        + EmptyParse(strs[4]) + "|"
                        + EmptyParse(use_seg(strs[5])) + "|"
                        + EmptyParse(strs[11]) + "|" + EmptyParse(strs[16])
                        + "|" + EmptyParse(url) + "|" + EmptyParse(strs[7])
                        + "|" + EmptyParse(strs[8]) + "|" + EmptyParse(strs[9])
                        + "|" + EmptyParse(strs[10]) + "|";
                values = EmptyParse(url) + "|" + EmptyParse(strs[13]) + "|"
                        + EmptyParse(strs[15]) + "|" + i;
                // key on i % 10000 so that each reduce() call receives a
                // whole bucket of records that can share one batched lookup
                context.write(new Text(String.valueOf(i % 10000)), new Text(
                        keys + values));
            }
        }

        // extract the hour (00-23) from a timestamp string, or "**" if invalid
        public String use_seg(String start_date) {
            String s = "**";
            if (start_date.length() > 23) {
                String hour = start_date.substring(11, 13);
                if (isNum(hour) && Integer.parseInt(hour) >= 0
                        && Integer.parseInt(hour) <= 23) {
                    s = hour;
                }
            }
            return s;
        }

        public static boolean isNum(String str) {
            return str.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
        }

        // replace empty fields with the literal "NULL"
        public static String EmptyParse(String str) {
            if (str == null || str.isEmpty()) {
                return "NULL";
            }
            return str;
        }
    }

    public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
        private HTable table;

        @Override
        protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
                throws IOException, InterruptedException {
            List<Get> lg = new ArrayList<Get>();       // lookups by URL host
            List<Get> li = new ArrayList<Get>();       // fallback lookups by visit IP
            List<String> lo = new ArrayList<String>(); // pass-through key fields
            List<String> useragent = new ArrayList<String>();
            // first pass: assemble the row keys for the whole bucket
            for (Text c : arg1) {
                String[] value = c.toString().split("\\|");
                String url = value[13];
                String visitIP = value[14];
                String output = value[0] + "|" + value[1] + "|" + value[2] + "|"
                        + value[3] + "|" + value[4] + "|" + value[5] + "|"
                        + value[6] + "|" + value[7] + "|" + value[8] + "|"
                        + value[9] + "|" + value[10] + "|" + value[11] + "|"
                        + value[12] + "|";
                lg.add(new Get(url.getBytes()));
                li.add(new Get(visitIP.getBytes()));
                lo.add(output);
                useragent.add(value[15]);
            }
            // second pass: two batched round trips instead of one Get per record
            Result ru[] = table.get(lg);
            Result ri[] = table.get(li);
            for (int i = 0; i < lo.size(); i++) {
                String urlMatch = "NULL"; // reset the match for every record
                if (!ru[i].isEmpty()) {
                    urlMatch = new String(ru[i].getValue("url_type".getBytes(),
                            "type".getBytes()));
                } else if (!ri[i].isEmpty()) {
                    urlMatch = new String(ri[i].getValue("url_type".getBytes(),
                            "type".getBytes()));
                }
                String reduceoutput = urlMatch + "|" + useragent.get(i) + "|" + "1";
                context.write(new Text(lo.get(i)), new Text(reduceoutput));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            super.cleanup(context);
            table.close();
        }

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            super.setup(context);
            HTablePool pool = new HTablePool(HBaseMain.conf, 1000);
            table = (HTable) pool.getTable("url_rule");
        }

        // single-row lookup kept from the first version; no longer called by reduce()
        public String urlMatch(String url) {
            String s = "NULL";
            if (url != null && !url.equals("NULL")) {
                try {
                    Get getu = new Get(url.getBytes());
                    Result ru = table.get(getu);
                    if (!ru.isEmpty()) {
                        s = new String(ru.getValue("123123".getBytes(),
                                "123".getBytes()));
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return s;
        }
    }
}
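For completeness, both jobs take the input and output paths as their two command-line arguments; a typical invocation (jar name assumed) looks like:

hadoop jar urlmatch.jar com.bonc.db.DWA_S_D_USE_MB_COUNT_BASE <input path> <output path>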
Limited resources can spark a person's creativity. Let that stand as the conclusion.