HBase Rowkey Design: Avoiding Data Hotspots

1. Case Analysis

Common ways to avoid data hotspots are salting, hashing, and key reversal, usually combined with pre-splitting the table into regions.
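Salting prepends a computable bucket id to the key; hashing prepends part of a digest of the key; reversal is what the program below uses. A minimal sketch of the first two (these helper methods are illustrative, not an HBase API):

import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class RowkeyUtil {

    // Salting: prefix the key with a bucket id derived from the key itself,
    // so writes spread across pre-split buckets while a reader can still
    // recompute the prefix before a Get.
    public static String saltedKey(String key, int buckets) {
        int bucket = (key.hashCode() & Integer.MAX_VALUE) % buckets;
        return bucket + "|" + key;
    }

    // Hashing: prefix the key with the first bytes of its MD5 digest,
    // which distributes keys evenly and is also deterministic for reads.
    public static String hashedKey(String key)
            throws NoSuchAlgorithmException, UnsupportedEncodingException {
        byte[] digest = MessageDigest.getInstance("MD5").digest(key.getBytes("UTF-8"));
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 4; i++) {
            sb.append(String.format("%02x", digest[i]));
        }
        return sb.append('|').append(key).toString();
    }
}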

In the raw data here, the first field is an epoch timestamp and the second is a phone number; using either directly as the rowkey would concentrate writes and quickly create a hotspot. We therefore build the rowkey by reversing the phone number and combining it with the formatted timestamp, which avoids the hotspot while still supporting efficient queries.

2. Code

package beifeng.hadoop.hbase;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Rowkey design principles:
 *  1. Keep the rowkey short.
 *  2. Guarantee uniqueness (e.g. with a salt column or an MD5 hash).
 *  3. Avoid creating data hotspots.
 *  4. Satisfy as many query patterns as possible.
 */
public class LoadData extends Configured implements Tool {

    /**
     * A composite key of reversed phone number and timestamp spreads
     * the writes while still serving the application's queries well.
     */
    public static class LoadDataMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        // Converts the raw epoch-millisecond timestamp to a formatted date string
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        private Text mapOutputValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] splited = line.split("\t");

            // Format the first field (epoch millis) as a date string
            String formatDate = sdf.format(new Date(Long.parseLong(splited[0].trim())));
            // Reverse the phone number so sequential numbers scatter across regions
            String phoneNumber = splited[1];
            String reversePhoneNumber = new StringBuilder(phoneNumber).reverse().toString();

            // rowkey = reversed phone + "|" + formatted time; append the whole input line
            String rowKeyString = reversePhoneNumber + "|" + formatDate;
            mapOutputValue.set(rowKeyString + "\t" + line);
            context.write(key, mapOutputValue);
        }
    }

    public static class LoadDataReducer extends TableReducer<LongWritable, Text, NullWritable> {

        // HBase column family that holds all the fields
        private static final String COLUMN_FAMILY = "info";

        @Override
        protected void reduce(LongWritable key, Iterable<Text> values,
                Reducer<LongWritable, Text, NullWritable, Mutation>.Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                String[] splited = value.toString().split("\t");
                String rowKey = splited[0];
                // splited[2] is the phone number, already encoded in the rowkey
                Put put = new Put(rowKey.getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "reportTime".getBytes(), splited[1].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "apmac".getBytes(), splited[3].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "acmac".getBytes(), splited[4].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "host".getBytes(), splited[5].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "siteType".getBytes(), splited[6].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "upPackNum".getBytes(), splited[7].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "downPackNum".getBytes(), splited[8].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "unPayLoad".getBytes(), splited[9].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "downPayLoad".getBytes(), splited[10].getBytes());
                put.add(COLUMN_FAMILY.getBytes(), "httpStatus".getBytes(), splited[11].getBytes());
                context.write(NullWritable.get(), put);
            }
        }
    }

    public static void createTable(String tableName) throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "beifeng01");

        HBaseAdmin admin = new HBaseAdmin(conf);

        TableName tName = TableName.valueOf(tableName);

        HTableDescriptor htd = new HTableDescriptor(tName);
        HColumnDescriptor hcd = new HColumnDescriptor("info");
        htd.addFamily(hcd);

        // Drop and recreate the table if it already exists
        if (admin.tableExists(tName)) {
            System.out.println(tableName + " exists, recreating the table");
            admin.disableTable(tName);
            admin.deleteTable(tName);
        }
        admin.createTable(htd);
        admin.close();
        System.out.println("created new table " + tableName);
    }

    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        conf.set("hbase.zookeeper.quorum", "beifeng01");
        conf.set(TableOutputFormat.OUTPUT_TABLE, "phoneLog");

        createTable("phoneLog");

        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(this.getClass());
        job.setNumReduceTasks(1);

        // map class
        job.setMapperClass(LoadDataMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        // reduce class
        job.setReducerClass(LoadDataReducer.class);
        job.setOutputFormatClass(TableOutputFormat.class);

        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        boolean isSucceeded = job.waitForCompletion(true);

        return isSucceeded ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();

        // Hard-coded HDFS input path
        args = new String[] {"hdfs://hbase/data/input/HTTP_20130313143750.data"};
        int status = ToolRunner.run(conf, new LoadData(), args);
        System.exit(status);
    }
}
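Note that createTable above creates the table with a single region, so early writes still land on one region server until natural splits occur. The opening section mentions combining these key designs with pre-splitting; a minimal sketch of that idea (the split points are my assumption based on the reversed-phone rowkey, not part of the original program):

import java.io.IOException;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class PreSplitTable {
    // Create the table pre-split on the first rowkey character. A reversed
    // phone number starts with the phone's last digit, which is roughly
    // uniform over 0-9, so ten regions share the load from the first put.
    public static void createPreSplitTable(HBaseAdmin admin, String tableName) throws IOException {
        HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
        htd.addFamily(new HColumnDescriptor("info"));
        byte[][] splitKeys = new byte[9][];
        for (int i = 1; i <= 9; i++) {
            splitKeys[i - 1] = String.valueOf(i).getBytes(); // "1" .. "9"
        }
        admin.createTable(htd, splitKeys); // regions: (-inf,"1"), ["1","2"), ..., ["9",+inf)
    }
}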

After the job finishes, scan the table to check the result:

hbase(main):004:0> scan 'phoneLog', {LIMIT => 2}
ROW                                COLUMN+CELL                                                                                         
 01787706731|2013031314048         column=info:acmac, timestamp=1544022103345, value=120.196.100.82                                    
 01787706731|2013031314048         column=info:apmac, timestamp=1544022103345, value=00-FD-07-A4-7B-08:CMCC                            
 01787706731|2013031314048         column=info:downPackNum, timestamp=1544022103345, value=2                                           
 01787706731|2013031314048         column=info:downPayLoad, timestamp=1544022103345, value=120                                         
 01787706731|2013031314048         column=info:host, timestamp=1544022103345, value=                                                   
 01787706731|2013031314048         column=info:httpStatus, timestamp=1544022103345, value=200                                          
 01787706731|2013031314048         column=info:reportTime, timestamp=1544022103345, value=1363157988072                                
 01787706731|2013031314048         column=info:siteType, timestamp=1544022103345, value=                                               
 01787706731|2013031314048         column=info:unPayLoad, timestamp=1544022103345, value=120                                           
 01787706731|2013031314048         column=info:upPackNum, timestamp=1544022103345, value=2                                             
 10007032831|2013031314045         column=info:acmac, timestamp=1544022103345, value=120.196.100.99                                    
 10007032831|2013031314045         column=info:apmac, timestamp=1544022103345, value=20-7C-8F-70-68-1F:CMCC                            
 10007032831|2013031314045         column=info:downPackNum, timestamp=1544022103345, value=3                                           
 10007032831|2013031314045         column=info:downPayLoad, timestamp=1544022103345, value=180                                         
 10007032831|2013031314045         column=info:host, timestamp=1544022103345, value=                                                   
 10007032831|2013031314045         column=info:httpStatus, timestamp=1544022103345, value=200                                          
 10007032831|2013031314045         column=info:reportTime, timestamp=1544022103345, value=1363157985079                                
 10007032831|2013031314045         column=info:siteType, timestamp=1544022103345, value=                                               
 10007032831|2013031314045         column=info:unPayLoad, timestamp=1544022103345, value=360                                           
 10007032831|2013031314045         column=info:upPackNum, timestamp=1544022103345, value=6  
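Because all rows for one phone number now share the reversed-number prefix, a per-phone query is a cheap prefix scan rather than a full-table filter. A sketch using the same client API as the program above (table and quorum names taken from the program; the helper itself is illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.PrefixFilter;

public class QueryByPhone {
    // Fetch all rows for one phone number: reverse it to rebuild the
    // rowkey prefix, then run a prefix scan.
    public static void scanPhone(String phoneNumber) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "beifeng01");

        String prefix = new StringBuilder(phoneNumber).reverse().append('|').toString();
        Scan scan = new Scan(prefix.getBytes());             // start at the first possible match
        scan.setFilter(new PrefixFilter(prefix.getBytes())); // end the scan once keys pass the prefix

        HTable table = new HTable(conf, "phoneLog");
        try {
            ResultScanner scanner = table.getScanner(scan);
            for (Result result : scanner) {
                System.out.println(result);
            }
            scanner.close();
        } finally {
            table.close();
        }
    }
}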

 
