Big Data Journey Week10_day01 (Inserting data into HBase by directly creating HFiles)

package com.wyh.parctise;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class HDFStoHFile {
    /**
     * Map stage: each input line of /data/students.txt is expected to be a comma-separated
     * record of the form id,name,age,gender,clazz. The id becomes the row key, and every
     * other field becomes one KeyValue in the "info" column family.
     */
    public static class HdfsToHFileMap extends Mapper<LongWritable,Text,ImmutableBytesWritable,KeyValue>{
        @Override
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            String[] split = v1.toString().split(",");
            String id = split[0];
            //the student id becomes the row key of the output
            ImmutableBytesWritable key = new ImmutableBytesWritable(id.getBytes());

            //one KeyValue per column, all under the "info" column family
            KeyValue name = new KeyValue(id.getBytes(), "info".getBytes(), "name".getBytes(), split[1].getBytes());
            KeyValue age = new KeyValue(id.getBytes(), "info".getBytes(), "age".getBytes(), split[2].getBytes());
            KeyValue gender = new KeyValue(id.getBytes(), "info".getBytes(), "gender".getBytes(), split[3].getBytes());
            KeyValue clazz = new KeyValue(id.getBytes(), "info".getBytes(), "clazz".getBytes(), split[4].getBytes());

            //emit the row key together with each KeyValue
            context.write(key,name);
            context.write(key,age);
            context.write(key,gender);
            context.write(key,clazz);
        }
    }



    public static void main(String[] args) throws Exception {
        //create an HBase-aware configuration (loads hbase-default.xml / hbase-site.xml)
        Configuration conf = HBaseConfiguration.create();
        //create the job and give it a name
        Job job = Job.getInstance(conf);
        job.setJobName("HDFStoHfile");

        job.setJarByClass(HDFStoHFile.class);

        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);

        //set the map stage
        job.setMapperClass(HdfsToHFileMap.class);

        //set the reduce stage to KeyValueSortReducer, a ready-made reducer that HBase provides to sort the KeyValues of each row
        job.setReducerClass(KeyValueSortReducer.class);

        //open the target HBase table stu4 (it must already exist)
        HTable stu4 = new HTable(conf, "stu4");

        //configure the job for a bulk load into this table: sets HFileOutputFormat2 as the output format and a partitioner that matches the table's region boundaries
        HFileOutputFormat2.configureIncrementalLoad(job,stu4);

        //HDFS path of the source data and output directory for the generated HFiles
        FileInputFormat.addInputPath(job, new Path("/data/students.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/data/hfile1"));

        //submit the job and wait for it to finish
        job.waitForCompletion(true);


    }
}

Prerequisites: the target table must already exist in HBase, and the source data must already be on HDFS.
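The input file /data/students.txt is assumed to be plain text with one comma-separated record per line (id,name,age,gender,clazz), matching what the mapper splits on. If the stu4 table has not been created yet, it can be created up front with the same old-style Java client API used in this post; a minimal sketch, assuming the column family is "info" and the same ZooKeeper quorum as in the loader class below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateStu4Table {
    public static void main(String[] args) throws Exception {
        //load the HBase defaults and point the client at ZooKeeper (assumed quorum)
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "testmaster:2181,testnode1:2181,testnode2:2181,testnode3:2181");

        HBaseAdmin admin = new HBaseAdmin(conf);
        //the MapReduce job above writes every column into the "info" family
        HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("stu4"));
        desc.addFamily(new HColumnDescriptor("info"));
        if (!admin.tableExists("stu4")) {
            admin.createTable(desc);
        }
        admin.close();
    }
}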

 

2、Bulk-load the generated HFiles into the HBase table

package com.wyh.parctise;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class LoadHfileToHbase {
    public static void main(String[] args) throws Exception {
        //create an HBase-aware configuration and point the client at the ZooKeeper quorum
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "testmaster:2181,testnode1:2181,testnode2:2181,testnode3:2181");

        HTable stu4 = new HTable(conf, "stu4");

        //move the HFiles generated under /data/hfile1 into the regions of table stu4
        LoadIncrementalHFiles loadIncrementalHFiles = new LoadIncrementalHFiles(conf);
        loadIncrementalHFiles.doBulkLoad(new Path("/data/hfile1"), stu4);
    }
}

 

 

Note: both programs are run from a packaged jar. Package the whole project together with its dependencies, otherwise the HBase classes will not be on the classpath in the Hadoop environment and the job will fail. Add the following to pom.xml (this is build configuration, not a dependency):

    <build>
        <plugins>
            <!-- compiler插件, 设定JDK版本 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>1.8</source>
                    <target>1.8</target>
                    <showWarnings>true</showWarnings>
                </configuration>
            </plugin>


            <!-- 带依赖jar 插件-->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

        </plugins>

    </build>

 

Then package the project and run it in the Hadoop environment by specifying the fully qualified class name, roughly as in the example below.
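For example, assuming the assembled jar is named bulkload-jar-with-dependencies.jar (the actual name depends on the project's artifactId and version, so treat it as a placeholder), the two steps would look roughly like this:

# step 1: run the MapReduce job that writes the HFiles to /data/hfile1 (hypothetical jar name)
hadoop jar bulkload-jar-with-dependencies.jar com.wyh.parctise.HDFStoHFile

# step 2: bulk-load those HFiles into the stu4 table
hadoop jar bulkload-jar-with-dependencies.jar com.wyh.parctise.LoadHfileToHbase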

posted @ 2020-01-08 21:41  Xiaohu_BigData