Operating HBase with MapReduce
I. Importing data into HBase
1. Configure hbase-site.xml to point at HDFS
<configuration>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://bigdata-senior01.home.com:9000/hbase</value>
  </property>
  <property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper</value>
  </property>
  <property>
    <name>hbase.unsafe.stream.capability.enforce</name>
    <value>false</value>
    <description>
      Controls whether HBase will check for stream capabilities (hflush/hsync).
      Disable this if you intend to run on LocalFileSystem, denoted by a rootdir
      with the 'file://' scheme, but be mindful of the NOTE below.
      WARNING: Setting this to false blinds you to potential data loss and
      inconsistent system state in the event of process and/or node failures.
      If HBase is complaining of an inability to use hsync or hflush it's most
      likely not a false positive.
    </description>
  </property>
</configuration>
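Before moving on, it can be worth confirming that a client JVM can actually reach this HBase instance with the configuration above on its classpath. The following is a minimal sketch, not part of the import job itself; it only assumes the hbase-client dependency listed in the next step and a running cluster:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class HBaseSmokeTest {
    public static void main(String[] args) throws Exception {
        // Picks up hbase-site.xml from the classpath
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Listing the existing tables confirms the cluster is reachable
            for (TableName name : admin.listTableNames()) {
                System.out.println(name.getNameAsString());
            }
        }
    }
}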
2. Dependencies
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-client</artifactId>
  <version>3.2.0</version>
</dependency>
<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-client</artifactId>
  <version>2.0.4</version>
</dependency>
<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-mapreduce</artifactId>
  <version>2.0.4</version>
</dependency>
3. Mapper
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Input: plain text lines; output: the row key bytes as the key and an HBase Mutation as the value.
public class ImportFromFile {
}

public class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> {

    // Counter
    public enum Counters { LINES }

    private byte[] family = null;
    private byte[] qualifier = null;

    /**
     * Called once at the beginning of the task.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the family:qualifier spec from the job configuration;
        // it is supplied on the command line and stored there by the driver.
        String column = context.getConfiguration().get("conf.column");
        ColParser parser = new ColParser();
        parser.parse(column);
        if (!parser.isValid())
            throw new IOException("family or qualifier error");
        family = parser.getFamily();
        qualifier = parser.getQualifier();
    }

    /**
     * Called once for each key/value pair in the input split.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        try {
            String line = value.toString();
            // Hash each line to build the row key; adjust to your own needs.
            byte[] rowKey = DigestUtils.md5(line);
            Put put = new Put(rowKey);
            put.addColumn(this.family, this.qualifier, Bytes.toBytes(line));
            context.write(new ImmutableBytesWritable(rowKey), put);
            context.getCounter(Counters.LINES).increment(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    class ColParser {
        private byte[] family;
        private byte[] qualifier;
        private boolean valid;

        public byte[] getFamily() { return family; }
        public byte[] getQualifier() { return qualifier; }
        public boolean isValid() { return valid; }

        public void parse(String value) {
            try {
                String[] sValue = value.split(":");
                if (sValue == null || sValue.length < 2
                        || sValue[0].isEmpty() || sValue[1].isEmpty()) {
                    valid = false;
                    return;
                }
                family = Bytes.toBytes(sValue[0]);
                qualifier = Bytes.toBytes(sValue[1]);
                valid = true;
            } catch (Exception e) {
                valid = false;
            }
        }
    }
}
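One thing worth calling out about this mapper: because the row key is the MD5 hash of the entire line, identical input lines map to the same row key and simply overwrite each other, so the table ends up deduplicated. A small standalone sketch of that behavior, assuming only commons-codec and the HBase util classes on the classpath:

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hbase.util.Bytes;

public class RowKeyDemo {
    public static void main(String[] args) {
        // Two identical input lines hash to the same 16-byte MD5 row key,
        // so the second Put would overwrite the first one in HBase.
        byte[] k1 = DigestUtils.md5("{\"author\":\"alice\"}");
        byte[] k2 = DigestUtils.md5("{\"author\":\"alice\"}");
        System.out.println(Bytes.toHex(k1));
        System.out.println(Bytes.toHex(k2));
        System.out.println(Bytes.equals(k1, k2)); // true
    }
}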
4. The driver (main)
import org.apache.commons.cli.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

public class ImportFromFile {
    // private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    public static final String NAME = "ImportFromFile";

    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option option = new Option("t", "table", true, "table name (must not be empty)");
        option.setArgName("table-name");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("c", "column", true, "column family and qualifier (must not be empty)");
        option.setArgName("family:qualifier");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("i", "input", true, "input file or directory");
        option.setArgName("path-in-HDFS");
        option.setRequired(true);
        options.addOption(option);

        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String input = cmd.getOptionValue("i");
        String column = cmd.getOptionValue("c");
        // Stored in the configuration so the mapper can read it back in setup().
        conf.set("conf.column", column);

        Job job = Job.getInstance(conf, "Import from file " + input + " into table " + table);
        job.setJarByClass(ImportFromFile.class);
        job.setMapperClass(ImportMapper.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Writable.class);
        job.setNumReduceTasks(0); // map-only job, no reduce phase needed
        FileInputFormat.addInputPath(job, new Path(input));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
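As an aside, the later examples in this post wire the table output through TableMapReduceUtil instead of setting TableOutputFormat by hand. For comparison, the explicit output-format lines in main() above could likely be replaced by the fragment below (a sketch of the equivalent wiring, not a tested change to this driver):

// Passing null as the reducer class only configures the output side
// (TableOutputFormat plus the target table); setNumReduceTasks(0) keeps the job map-only.
TableMapReduceUtil.initTableReducerJob(table, null, job);
job.setNumReduceTasks(0);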
5. Running the job
First create the table in the HBase shell:

create 'importTable','data'

Then upload the jar to the cluster and run it (the input file must already be in HDFS):

hadoop jar ImportFromFile.jar -t importTable -i /input/test-data.txt -c data:json
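After the job finishes, a quick way to confirm the rows landed is a bounded scan against the table. This is a minimal sketch; the table and column names match this example, and Scan.setLimit is available in HBase 2.x:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyImport {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("importTable"))) {
            // Only look at the first few rows of the data:json column
            Scan scan = new Scan()
                    .addColumn(Bytes.toBytes("data"), Bytes.toBytes("json"))
                    .setLimit(3);
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result result : scanner) {
                    System.out.println(Bytes.toHex(result.getRow()) + " -> "
                            + Bytes.toString(result.getValue(Bytes.toBytes("data"), Bytes.toBytes("json"))));
                }
            }
        }
    }
}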
II. Reading data from HBase for computation
Building on the previous example, this job pulls the rows back out of HBase and counts how often each author appears.
One additional dependency is needed:
<dependency>
  <groupId>com.googlecode.json-simple</groupId>
  <artifactId>json-simple</artifactId>
  <version>1.1.1</version>
</dependency>
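The mapper below assumes each cell value is a JSON object with an "author" field. A tiny sketch of what json-simple does with such a record (the sample string is made up for illustration):

import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class JsonShapeDemo {
    public static void main(String[] args) throws Exception {
        // A made-up record in the shape AnalyzeMapper expects.
        String line = "{\"author\":\"alice\",\"title\":\"hello\"}";
        JSONObject json = (JSONObject) new JSONParser().parse(line);
        System.out.println(json.get("author")); // alice
    }
}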
1. Mapper
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

public class AnalyzeMapper extends TableMapper<Text, IntWritable> {

    private JSONParser parser = new JSONParser();

    public enum Counters { ROWS, COLS, ERROR, VALID }

    private IntWritable ONE = new IntWritable(1);

    /**
     * Called once for each key/value pair in the input split.
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        context.getCounter(Counters.ROWS).increment(1);
        String val = null;
        try {
            for (Cell cell : value.listCells()) {
                context.getCounter(Counters.COLS).increment(1);
                val = Bytes.toStringBinary(cell.getValueArray(),
                        cell.getValueOffset(), cell.getValueLength());
                JSONObject json = (JSONObject) parser.parse(val);
                String author = (String) json.get("author");
                if (context.getConfiguration().get("conf.debug") != null)
                    System.out.println("Author: " + author);
                context.write(new Text(author), ONE);
                context.getCounter(Counters.VALID).increment(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("Row: " + Bytes.toStringBinary(key.get()) + ", JSON: " + val);
            context.getCounter(Counters.ERROR).increment(1);
        }
    }
}
2. Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class AnalyzeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Called once for each key.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable one : values) count++;
        if (context.getConfiguration().get("conf.debug") != null)
            System.out.println("Author: " + key.toString() + ", Count: " + count);
        context.write(key, new IntWritable(count));
    }
}
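Note that this reducer counts the number of values it receives rather than summing them, so it cannot double as a combiner: a combiner would pre-aggregate partial counts that the reducer would then merely count again. If a combiner is wanted, a sum-based variant along these lines should work (a sketch, not part of the original post):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Sums partial counts instead of counting values, so it is safe to register
// both as the combiner and as the reducer (hypothetical variant).
public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) sum += value.get();
        context.write(key, new IntWritable(sum));
    }
}

With this variant, setting it as both combiner and reducer (job.setCombinerClass / job.setReducerClass) would cut down the intermediate data shuffled to the reduce phase.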
3. The driver (main)
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

public class AnalyzeData {

    private static final Log LOG = LogFactory.getLog(AnalyzeData.class);
    public static final String NAME = "AnalyzeData";

    /**
     * Parse the command line parameters.
     *
     * @param args The parameters to parse.
     * @return The parsed command line.
     * @throws org.apache.commons.cli.ParseException When the parsing of the parameters fails.
     */
    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("t", "table", true, "table to read from (must exist)");
        o.setArgName("table-name");
        o.setRequired(true);
        options.addOption(o);

        o = new Option("c", "column", true, "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);

        o = new Option("o", "output", true, "the directory to write to");
        o.setArgName("path-in-HDFS");
        o.setRequired(true);
        options.addOption(o);

        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String column = cmd.getOptionValue("c");
        String output = cmd.getOptionValue("o");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if (!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family, qualifier);

        Job job = Job.getInstance(conf, "Analyze data in " + table);
        job.setJarByClass(AnalyzeData.class);
        TableMapReduceUtil.initTableMapperJob(table, scan, AnalyzeMapper.class,
                Text.class, IntWritable.class, job);
        job.setMapperClass(AnalyzeMapper.class);
        job.setReducerClass(AnalyzeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(job, new Path(output));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The ColumnParser helper class used by the driver above (and again in the next example):

import org.apache.hadoop.hbase.util.Bytes;

public class ColumnParser {
    private byte[] family;
    private byte[] qualifier;
    private boolean valid;

    public byte[] getFamily() { return family; }
    public byte[] getQualifier() { return qualifier; }
    public boolean isValid() { return valid; }

    public void parse(String value) {
        try {
            String[] sValue = value.split(":");
            if (sValue == null || sValue.length < 2
                    || sValue[0].isEmpty() || sValue[1].isEmpty()) {
                valid = false;
                return;
            }
            family = Bytes.toBytes(sValue[0]);
            qualifier = Bytes.toBytes(sValue[1]);
            valid = true;
        } catch (Exception e) {
            valid = false;
        }
    }
}
4. Running the job
hadoop jar AnalyzeData.jar -t importTable -c data:json -o /output9

Result (excerpt):

...
AnalyzeMapper$Counters
    COLS=993
    ERROR=6
    ROWS=993
    VALID=987
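The same counters can also be read programmatically once waitForCompletion returns, instead of only scraping them from the job output. A fragment for the driver above (a sketch; it assumes the final System.exit line is deferred until after the counters are read):

// Replacement for the last line of AnalyzeData.main() (sketch):
boolean success = job.waitForCompletion(true);
long rows   = job.getCounters().findCounter(AnalyzeMapper.Counters.ROWS).getValue();
long errors = job.getCounters().findCounter(AnalyzeMapper.Counters.ERROR).getValue();
System.out.println("rows scanned: " + rows + ", JSON parse errors: " + errors);
System.exit(success ? 0 : 1);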
III. Reading from HBase, computing, and writing back to HBase
This job reads the JSON strings stored by the first example, splits each one into key-value pairs, and writes them back to HBase with each JSON key as the column qualifier and its value as the cell value.
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

public class ParseJson {

    private static final String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    private static final Log LOG = LogFactory.getLog(ParseJson.class);
    public static final String NAME = "ParseJson";

    public enum Counters { ROWS, COLS, VALID, ERROR }

    static class ParseMapper extends TableMapper<ImmutableBytesWritable, Mutation> {

        private JSONParser parser = new JSONParser();
        private byte[] columnFamily = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            columnFamily = Bytes.toBytes(context.getConfiguration().get("conf.columnFamily"));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            context.getCounter(Counters.ROWS).increment(1);
            String val = null;
            try {
                Put put = new Put(key.get());
                for (Cell cell : value.listCells()) {
                    context.getCounter(Counters.COLS).increment(1);
                    val = Bytes.toStringBinary(cell.getValueArray(),
                            cell.getValueOffset(), cell.getValueLength());
                    JSONObject json = (JSONObject) parser.parse(val);
                    for (Object jsonKey : json.keySet()) {
                        Object jsonValue = json.get(jsonKey);
                        put.addColumn(columnFamily, Bytes.toBytes(jsonKey.toString()),
                                Bytes.toBytes(jsonValue.toString()));
                    }
                }
                context.write(key, put);
                context.getCounter(Counters.VALID).increment(1);
            } catch (Exception e) {
                e.printStackTrace();
                System.err.println("Error: " + e.getMessage() + ", Row: "
                        + Bytes.toStringBinary(key.get()) + ", JSON: " + val);
                context.getCounter(Counters.ERROR).increment(1);
            }
        }
    }

    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("i", "input", true, "table to read from (must exist)");
        o.setArgName("input-table-name");
        o.setRequired(true);
        options.addOption(o);

        o = new Option("o", "output", true, "table to write to (must exist)");
        o.setArgName("output-table-name");
        o.setRequired(true);
        options.addOption(o);

        o = new Option("c", "column", true, "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);

        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // conf.set("hbase.master","192.168.31.10");
        // conf.set("hbase.zookeeper.quorum", "192.168.31.10");
        // conf.set("hbase.rootdir","hdfs://bigdata-senior01.home.com:9000/hbase");
        // conf.set("hbase.zookeeper.property.dataDir","hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper");
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String input = cmd.getOptionValue("i");
        String output = cmd.getOptionValue("o");
        String column = cmd.getOptionValue("c");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if (!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family, qualifier);
        conf.set("conf.columnFamily", Bytes.toStringBinary(family));

        Job job = Job.getInstance(conf, "Parse data in " + input + ", write to " + output);
        job.setJarByClass(ParseJson.class);
        TableMapReduceUtil.initTableMapperJob(input, scan, ParseMapper.class,
                ImmutableBytesWritable.class, Put.class, job);
        TableMapReduceUtil.initTableReducerJob(output, IdentityTableReducer.class, job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Run:
hadoop jar ParseJson.jar -i importTable -c data:json -o importTable
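To spot-check the result, one of the rewritten rows can be fetched and its columns listed; the JSON keys should now appear as qualifiers in the data family. A minimal sketch (it takes the row key in hex on the command line, for example one printed by the earlier verification scan):

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyParsedColumns {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("importTable"))) {
            // Row key supplied in hex as the first argument.
            byte[] rowKey = Bytes.fromHex(args[0]);
            Result result = table.get(new Get(rowKey).addFamily(Bytes.toBytes("data")));
            // Print every qualifier/value in the data family; JSON keys such as
            // "author" should show up here after ParseJson has run.
            for (Map.Entry<byte[], byte[]> entry
                    : result.getFamilyMap(Bytes.toBytes("data")).entrySet()) {
                System.out.println(Bytes.toString(entry.getKey()) + " = "
                        + Bytes.toString(entry.getValue()));
            }
        }
    }
}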