Importing Data into HBase with Spark
Cluster environment: one master and three workers, with Spark running in Spark on YARN mode.
There are several ways to import data into HBase with Spark:
1. Small volumes: simply call the HBase client API's single or batch put methods (a minimal sketch follows this list).
2. Large volumes: first generate HFiles, then bulk-load the HFiles into HBase.
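For the first case, a minimal sketch using the HBase client API might look like this (the table, family, and qualifier names reuse those from the bulk-load example below; the row key and value are illustrative):

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class SimplePut {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("DBSTK.STKFSTEST"))) {
            List<Put> puts = new ArrayList<>();
            Put put = new Put(Bytes.toBytes("rowkey-001"));  // row key (illustrative)
            put.addColumn(Bytes.toBytes("FS"),               // column family
                          Bytes.toBytes("SP"),               // column qualifier
                          Bytes.toBytes("10.5"));            // cell value
            puts.add(put);
            table.put(puts); // batch write; table.put(put) handles a single row
        }
    }
}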
The rest of this post focuses on the second method:
This method mainly relies on two methods of the Spark Java API:
1. textFile: turns a local or HDFS file into an RDD
2. flatMapToPair: for each input line, emits all of its key-value pairs as a single Iterator (this handles multiple column families and multiple columns)
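Before the full program, here is a minimal sketch of how the two calls fit together (jsc is an existing JavaSparkContext; parseLine is a hypothetical helper that turns one CSV line into its Tuple2<ImmutableBytesWritable, KeyValue> pairs):

JavaRDD<String> lines = jsc.textFile(inputPath); // one RDD element per text line
JavaPairRDD<ImmutableBytesWritable, KeyValue> hfileRdd =
        lines.flatMapToPair(line -> parseLine(line).iterator()) // emit every cell of the row
             .sortByKey(); // HFiles must be written in row-key order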
The full code is as follows:
package scala;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.storage.StorageLevel;

import scala.Tuple2; // redundant inside package scala, but required if this class is moved elsewhere

public class HbaseBulkLoad {

    private static final String ZKconnect = "slave1,slave2,slave3:2181";
    private static final String HDFS_ADDR = "hdfs://master:8020";
    private static final String TABLE_NAME = "DBSTK.STKFSTEST"; // table name
    private static final String COLUMN_FAMILY = "FS";           // column family

    public static void run(String[] args) throws Exception {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", ZKconnect);
        configuration.set("fs.defaultFS", HDFS_ADDR);
        configuration.set("dfs.replication", "1");

        String inputPath = args[0];
        String outputPath = args[1];

        Job job = Job.getInstance(configuration, "Spark Bulk Loading HBase Table:" + TABLE_NAME);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class); // output key class
        job.setMapOutputValueClass(KeyValue.class);             // output value class
        job.setOutputFormatClass(HFileOutputFormat2.class);
        FileInputFormat.addInputPaths(job, inputPath); // input path

        FileSystem fs = FileSystem.get(configuration);
        Path output = new Path(outputPath);
        if (fs.exists(output)) {
            fs.delete(output, true); // remove the output path if it already exists
        }
        fs.close();
        FileOutputFormat.setOutputPath(job, output); // HFile output path

        // Initialize the SparkContext.
        // Do not call setMaster("local[*]") here: a master set in code overrides the
        // --master flag passed to spark-submit, and the job would not run on YARN.
        SparkConf sparkConf = new SparkConf().setAppName("HbaseBulkLoad");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        // Read the data file
        JavaRDD<String> lines = jsc.textFile(inputPath);
        lines.persist(StorageLevel.MEMORY_AND_DISK_SER());

        JavaPairRDD<ImmutableBytesWritable, KeyValue> hfileRdd = lines.flatMapToPair(
                new PairFlatMapFunction<String, ImmutableBytesWritable, KeyValue>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> call(String text) throws Exception {
                List<Tuple2<ImmutableBytesWritable, KeyValue>> tps =
                        new ArrayList<Tuple2<ImmutableBytesWritable, KeyValue>>();
                if (null == text || text.length() < 1) {
                    return tps.iterator(); // must not return null
                }
                String[] resArr = text.split(",");
                if (resArr != null && resArr.length == 14) {
                    byte[] rowkeyByte = Bytes.toBytes(resArr[0] + resArr[3] + resArr[4] + resArr[5]);
                    byte[] columnFamily = Bytes.toBytes(COLUMN_FAMILY);
                    ImmutableBytesWritable ibw = new ImmutableBytesWritable(rowkeyByte);
                    // Columns: EP,HP,LP,MK,MT,SC,SN,SP,ST,SY,TD,TM,TQ,UX (lexicographic order).
                    // Note: column families and column qualifiers must be emitted in lexicographic
                    // order here (with multiple families, keep the families sorted too); row key
                    // ordering is handed off to Spark's sortByKey below.
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("EP"), Bytes.toBytes(resArr[9]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("HP"), Bytes.toBytes(resArr[7]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("LP"), Bytes.toBytes(resArr[8]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("MK"), Bytes.toBytes(resArr[13]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("MT"), Bytes.toBytes(resArr[4]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("SC"), Bytes.toBytes(resArr[0]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("SN"), Bytes.toBytes(resArr[1]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("SP"), Bytes.toBytes(resArr[6]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("ST"), Bytes.toBytes(resArr[5]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("SY"), Bytes.toBytes(resArr[2]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("TD"), Bytes.toBytes(resArr[3]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("TM"), Bytes.toBytes(resArr[11]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("TQ"), Bytes.toBytes(resArr[10]))));
                    tps.add(new Tuple2<>(ibw, new KeyValue(rowkeyByte, columnFamily, Bytes.toBytes("UX"), Bytes.toBytes(resArr[12]))));
                }
                return tps.iterator();
            }
        }).sortByKey();

        Connection connection = ConnectionFactory.createConnection(configuration);
        TableName tableName = TableName.valueOf(TABLE_NAME);
        HFileOutputFormat2.configureIncrementalLoad(job, connection.getTable(tableName),
                connection.getRegionLocator(tableName));

        // Generate the HFiles
        hfileRdd.saveAsNewAPIHadoopFile(outputPath, ImmutableBytesWritable.class, KeyValue.class,
                HFileOutputFormat2.class, job.getConfiguration());

        // Bulk load the HFiles into the table
        Table table = connection.getTable(tableName);
        Admin admin = connection.getAdmin();
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(configuration);
        load.doBulkLoad(new Path(outputPath), admin, table, connection.getRegionLocator(tableName));

        table.close();
        admin.close();
        connection.close();
        jsc.close();
    }

    public static void main(String[] args) {
        try {
            long start = System.currentTimeMillis();
            // Test paths are hard-coded here and override any command-line arguments.
            args = new String[]{"hdfs://master:8020/test/test.txt", "hdfs://master:8020/test/hfile/test"};
            run(args);
            long end = System.currentTimeMillis();
            System.out.println("Import finished, total time: " + (end - start) / 1000 + "s");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
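Once the bulk load finishes, a quick sanity check with the HBase client API confirms the rows are visible. A minimal sketch (it just scans and prints the first five rows; the cutoff is arbitrary):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;

public class VerifyLoad {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3:2181"); // same quorum as above
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("DBSTK.STKFSTEST"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            int shown = 0;
            for (Result r : scanner) {
                System.out.println(r); // prints the row key and its cells
                if (++shown >= 5) break; // peek at the first five rows only
            }
        }
    }
}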
Package the code into a jar, upload it to the cluster, and submit it with the following command:
./spark-submit --master yarn-client --executor-memory 4G --driver-memory 1G \
  --num-executors 100 --executor-cores 4 --total-executor-cores 400 \
  --conf spark.default.parallelism=1000 \
  --class scala.HbaseBulkLoad /home/hadoop/app/hadoop/data/spark-hbase-test.jar
Note that --total-executor-cores only takes effect in standalone and Mesos deployments; on YARN the executor count is governed by --num-executors, so that flag has no effect here.
This run only imported 50,000 rows as a test. In a larger test importing 15 GB of data (roughly 150 million rows), the Spark import was not faster than the equivalent MapReduce bulk load.