MapReduce - Reading a File and Writing into HBase

A MapReduce job can write directly into HBase by using TableOutputFormat as the job's output format: the mapper emits Put objects keyed by row key, and the framework delivers them to the target table, so no reduce phase is needed.

The code is as follows:

package com.hbase.mapreduce;
 
import java.io.IOException;
 
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
 
/**
* @author:FengZhen
* @create: 2018-09-14
*/
public class ImportFromFile extends Configured implements Tool{
 
    private static String addr="HDP233,HDP232,HDP231";
    private static String port="2181";
    public static final String NAME = "ImportFromFile";
    public enum Counters { LINES }
     
    static class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
         
        private byte[] family = null;
        private byte[] qualifier = null;
         
        @Override
        protected void setup(Mapper<LongWritable, Text, ImmutableBytesWritable, Put>.Context context)
                throws IOException, InterruptedException {
            String column = context.getConfiguration().get("conf.column");
            byte[][] colkey = KeyValue.parseColumn(Bytes.toBytes(column));
            family = colkey[0];
            if (colkey.length > 1) {
                qualifier = colkey[1];
            }
        }
         
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, ImmutableBytesWritable, Put>.Context context)
                throws IOException, InterruptedException {
            try {
                String lineString = value.toString();
                // The row key is derived from the MD5 hash of the line contents
                byte[] rowkey = DigestUtils.md5(lineString);
                Put put = new Put(rowkey);
                // Store the raw line data in one column of the target table
                put.addColumn(family, qualifier, Bytes.toBytes(lineString));
                context.write(new ImmutableBytesWritable(rowkey), put);
                context.getCounter(Counters.LINES).increment(1L);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
     
    /**
     * Parse the command-line arguments using the Apache Commons CLI classes.
     * @param args
     * @return
     */
    private static CommandLine parseArgs(String[] args) {
        Options options = new Options();
        Option option = new Option("t", "table", true, "table to import into - must exist");
        option.setArgName("table-name");
        option.setRequired(true);
        options.addOption(option);
         
        option = new Option("c", "column", true, "column to store row data into - must exist");
        option.setArgName("family:qualifier");
        option.setRequired(true);
        options.addOption(option);
         
        option = new Option("i", "input", true, "the directory or file to read from");
        option.setArgName("path-in-HDFS");
        option.setRequired(true);
        options.addOption(option);
         
        options.addOption("d", "debug", false, "switch on DEBUG log level");
         
        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            e.printStackTrace();
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(1);
        }
        return cmd;
    }
     
    public int run(String[] arg0) throws Exception {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum",addr);
        configuration.set("hbase.zookeeper.property.clientPort", port);
        //String[] otherArgs = new GenericOptionsParser(configuration, arg0).getRemainingArgs();
        //CommandLine commandLine = parseArgs(arg0);
         
//      String table = commandLine.getOptionValue("t");
//      String input = commandLine.getOptionValue("i");
//      String column = commandLine.getOptionValue("c");
         
        String table = arg0[0];
        String input = arg0[1];
        String column = arg0[2];
        configuration.set("conf.column", column);
         
        Job job = Job.getInstance(configuration);
        job.setJobName("ImportFromFile");
        job.setJarByClass(ImportFromFile.class);
        job.setMapperClass(ImportMapper.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Writable.class);
        // This is a map-only job; with zero reduce tasks the framework skips the reduce phase entirely
        job.setNumReduceTasks(0);
         
        FileInputFormat.addInputPath(job, new Path(input));
        return job.waitForCompletion(true) ? 0 : 1;
    }
     
    public static void main(String[] args) throws Exception {
        String[] params = new String[] {"test_table_mr", "hdfs://fz/data/fz/input/hbase", "data:info"};
        int exitCode = ToolRunner.run(new ImportFromFile(), params);
        System.exit(exitCode);
    }
}
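
TableOutputFormat only writes into a table that already exists, so the target table (here test_table_mr with column family data, matching the "data:info" column used above) has to be created beforehand. Below is a minimal sketch of creating it with the HBase client API; the class name is hypothetical, and the ZooKeeper quorum, table name, and column family are simply taken from the example above and may need adjusting for your cluster.

package com.hbase.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

/**
 * Creates the target table for ImportFromFile if it does not exist yet.
 * A sketch only; settings mirror the MapReduce example above.
 */
public class CreateImportTable {

    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "HDP233,HDP232,HDP231");
        conf.set("hbase.zookeeper.property.clientPort", "2181");

        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableName tableName = TableName.valueOf("test_table_mr");
            if (!admin.tableExists(tableName)) {
                // One column family "data", matching the "data:info" column
                // written by the ImportFromFile job.
                HTableDescriptor descriptor = new HTableDescriptor(tableName);
                descriptor.addFamily(new HColumnDescriptor("data"));
                admin.createTable(descriptor);
            }
        }
    }
}

Once the table exists, the job can be submitted with hadoop jar (or run directly from an IDE, as the hard-coded params array in main suggests). Because job.setNumReduceTasks(0) makes it a map-only job, every Put emitted by the mapper is written straight to the table through TableOutputFormat; no output path on HDFS is needed.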

 
