HBase数据快速导入之ImportTsv&Bulkload
导入数据最快的方式,可以略过WAL直接生产底层HFile文件
(环境:centos6.5、Hadoop2.6.0、HBase0.98.9)
1.SHELL方式
1.1 ImportTsv直接导入
命令:bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv
Usage: importtsv -Dimporttsv.columns=a,b,c <tablename> <inputdir>
测试:
1.1.1在HBase中创建好表
create ‘testImport1’,’cf’
1.1.2准备数据文件sample1.csv,并上传到HDFS,内容为:
1,"tom"
2,"sam"
3,"jerry"
4,"marry"
5,"john
1.1.3使用导入命令导入
bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator="," -Dimporttsv.columns=HBASE_ROW_KEY,cf testImport1 /sample1.csv
1.1.4结果
1.2先通过ImportTsv生产HFile文件,再通过completeBulkload导入HBase
1.2.1使用刚才的源数据并创建新表
create ‘testImport2’,’cf’
1.2.2使用命令生产HFile文件
bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator="," -Dimporttsv.bulk.output=hfile_tmp -Dimporttsv.columns=HBASE_ROW_KEY,cf testImport2 /sample1.csv
1.2.3在HDFS上的中间结果
1.2.4使用命令将HFile文件导入HBase
hadoop jar lib/hbase-server-0.98.9-hadoop2.jar completebulkload hfile_tmp testImport2
1.2.5结果
注:1.如果出现缺包错误提示,则把HBase的jar包包含到hadoop的classpath中;2.运行该命令的本质是一个hdfs的mv操作,并不会启动MapReduce。
2.API代码方式
代码的方式更灵活一点,许多东西可以自定义。
直接贴代码吧:
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FsShell; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2; import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; public class BulkLoadJob { static Logger logger = LoggerFactory.getLogger(BulkLoadJob.class); public static class BulkLoadMap extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> { public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] valueStrSplit = value.toString().split("\t"); String hkey = valueStrSplit[0]; String family = valueStrSplit[1].split(":")[0]; String column = valueStrSplit[1].split(":")[1]; String hvalue = valueStrSplit[2]; final byte[] rowKey = Bytes.toBytes(hkey); final ImmutableBytesWritable HKey = new ImmutableBytesWritable(rowKey); // Put HPut = new Put(rowKey); // byte[] cell = Bytes.toBytes(hvalue); // HPut.add(Bytes.toBytes(family), Bytes.toBytes(column), cell); KeyValue kv = new KeyValue(rowKey, Bytes.toBytes(family), Bytes.toBytes(column), Bytes.toBytes(hvalue)); context.write(HKey, kv); } } public static void main(String[] args) throws Exception { Configuration conf = HBaseConfiguration.create(); conf.set("hbase.zookeeper.property.clientPort", "2182"); conf.set("hbase.zookeeper.quorum", "msg801,msg802,msg803"); conf.set("hbase.master", "msg801:60000"); String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); String inputPath = dfsArgs[0]; System.out.println("source: " + dfsArgs[0]); String outputPath = dfsArgs[1]; System.out.println("dest: " + dfsArgs[1]); HTable hTable = null; try { Job job = Job.getInstance(conf, "Test Import HFile & Bulkload"); job.setJarByClass(BulkLoadJob.class); job.setMapperClass(BulkLoadJob.BulkLoadMap.class); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(KeyValue.class); // speculation job.setSpeculativeExecution(false); job.setReduceSpeculativeExecution(false); // in/out format job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(HFileOutputFormat2.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); hTable = new HTable(conf, dfsArgs[2]); HFileOutputFormat2.configureIncrementalLoad(job, hTable); if (job.waitForCompletion(true)) { FsShell shell = new FsShell(conf); try { shell.run(new String[] { "-chmod", "-R", "777", dfsArgs[1] }); } catch (Exception e) { logger.error("Couldnt change the file permissions ", e); throw new IOException(e); } // 加载到hbase表 LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf); // 两种方式都可以 // 方式一 String[] loadArgs = { outputPath, dfsArgs[2] }; loader.run(loadArgs); // 方式二 // loader.doBulkLoad(new Path(outputPath), hTable); } else { logger.error("loading failed."); System.exit(1); } } catch (IllegalArgumentException e) { e.printStackTrace(); } finally { if (hTable != null) { hTable.close(); } } } }
2.1创建新表
create ‘testImport3’,’fm1’,’fm2’
2.2创建sample2.csv,并上传到HDFS,内容为:
key1 fm1:col1 value1
key1 fm1:col2 value2
key1 fm2:col1 value3
key4 fm1:col1 value4
使用命令:
hadoop jar BulkLoadJob.jar hdfs://msg/sample2,csv hdfs://msg/HFileOut testImport3
注:1.mapper中使用KeyValue和Put都可以;2.注意jar包的classpath;3.如果Hadoop是HA,则需要使用HA的名字,比如我们的active namenode名称为msg801,但是HA的nameservice为msg,则HDFS的路径必须使用hdfs://msg而不能使用hdfs://msg801:9000(WHY?)。
具体报错为:
IllegalArgumentException: Wrong FS: hdfs://msg801:9000/HFileOut/fm2/bbab9d883a574d518cdcb304d1e681e9, expected: hdfs://msg |