HBase Chapter 5: Operating HBase with MapReduce
A pitfall that is easy to run into:
When operating HBase from a MapReduce job, running the jar may fail with an error like java.lang.NoClassDefFoundError. This usually means the Hadoop runtime classpath does not include the HBase jars. The fix is to edit hadoop-env.sh and append a line at the end, for example:
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/home/hadoop/apps/hbase/lib/*
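The HADOOP_CLASSPATH line above fixes the classpath of the JVM that launches the job. For the map and reduce tasks running on the worker nodes, TableMapReduceUtil can additionally ship the HBase jars with the job via the distributed cache; the short initTableMapperJob/initTableReducerJob overloads used in the example below already request this by default. The sketch here only shows the explicit call, and the class name is illustrative, not part of the original example:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class ShipHBaseJarsExample {
    public static void main(String[] args) throws IOException {
        Configuration config = HBaseConfiguration.create();
        Job job = Job.getInstance(config);
        job.setJarByClass(ShipHBaseJarsExample.class);
        // Adds the HBase client/server jars (and their transitive dependencies) to the
        // job's distributed cache so the map/reduce tasks can load them on any node,
        // independently of each node's Hadoop classpath.
        TableMapReduceUtil.addDependencyJars(job);
    }
}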
Worked example:
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.itcast.hbase</groupId>
  <artifactId>hbase</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
    <!-- Keep the client and server artifacts on the same version to avoid classpath conflicts -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.4.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.4.0</version>
    </dependency>
  </dependencies>
</project>
HbaseWordCount.java
package cn.itcast.bigdata.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class HbaseWordCount {

    private final static String tableName = "word";    // input table
    private final static String colf = "content";      // column family
    private final static String col = "info";          // column qualifier
    private final static String tableName2 = "stat";   // output table

    private final static IntWritable one = new IntWritable(1);
    private final static Text word = new Text();

    private static Configuration config;
    private static Connection connection;

    // Mapper: reads rows from the "word" table and emits <word, 1> pairs.
    static class MyMapper extends TableMapper<Text, IntWritable> {

        @Override
        protected void map(ImmutableBytesWritable key, Result value,
                Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Read the content:info cell of the current row. The table has a single
            // column family, so this is the whole text line for the row.
            String line = Bytes.toString(value.getValue(Bytes.toBytes(colf), Bytes.toBytes(col)));
            if (line == null || line.isEmpty()) {
                return; // skip rows without a content:info cell
            }
            // Split on spaces and emit each word with a count of 1.
            for (String w : line.split(" ")) {
                word.set(w);
                context.write(word, one);
            }
        }
    }

    // Reducer: sums the counts for each word and writes a Put to the "stat" table.
    static class MyReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, ImmutableBytesWritable, Mutation>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            // The row key is the word itself; the count is stored in content:info as a string.
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col), Bytes.toBytes(String.valueOf(sum)));
            context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
        }
    }

    // Initialize the configuration: ZooKeeper quorum and client port, then open a connection.
    private static void init() throws IOException {
        config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "hadoop2,hadoop3,hadoop4");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        connection = ConnectionFactory.createConnection(config);
        createTables();
    }

    // (Re)create the HBase tables and load sample data into the "word" table.
    private static void createTables() throws IOException {
        Admin admin = connection.getAdmin();
        // Drop each table if it already exists.
        if (admin.tableExists(TableName.valueOf(tableName))) {
            System.out.println("table " + tableName + " already exists, dropping it");
            admin.disableTable(TableName.valueOf(tableName));
            admin.deleteTable(TableName.valueOf(tableName));
        }
        if (admin.tableExists(TableName.valueOf(tableName2))) {
            System.out.println("table " + tableName2 + " already exists, dropping it");
            admin.disableTable(TableName.valueOf(tableName2));
            admin.deleteTable(TableName.valueOf(tableName2));
        }
        // Create the input and output tables, each with a single column family.
        HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(tableName));
        desc.addFamily(new HColumnDescriptor(colf));
        admin.createTable(desc);
        HTableDescriptor desc2 = new HTableDescriptor(TableName.valueOf(tableName2));
        desc2.addFamily(new HColumnDescriptor(colf));
        admin.createTable(desc2);
        // Insert sample rows; Table.put(List<Put>) sends them to the server as one batch.
        Table table = connection.getTable(TableName.valueOf(tableName));
        List<Put> lp = new ArrayList<Put>();
        Put p1 = new Put(Bytes.toBytes("1"));
        p1.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col),
                Bytes.toBytes("The Apache Hadoop software library is a framework"));
        lp.add(p1);
        Put p2 = new Put(Bytes.toBytes("2"));
        p2.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col),
                Bytes.toBytes("The common utilities that support the other Hadoop modules"));
        lp.add(p2);
        Put p3 = new Put(Bytes.toBytes("3"));
        p3.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col),
                Bytes.toBytes("Hadoop by reading the documentation"));
        lp.add(p3);
        Put p4 = new Put(Bytes.toBytes("4"));
        p4.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col),
                Bytes.toBytes("Hadoop from the release page"));
        lp.add(p4);
        Put p5 = new Put(Bytes.toBytes("5"));
        p5.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col),
                Bytes.toBytes("Hadoop on the mailing list"));
        lp.add(p5);
        table.put(lp);
        table.close();
        admin.close();
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        init();
        Job job = Job.getInstance(config);
        job.setJarByClass(HbaseWordCount.class);

        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes(colf), Bytes.toBytes(col));
        // Mapper that reads from HBase: input table name, scan, mapper class, output key/value classes, job.
        TableMapReduceUtil.initTableMapperJob(tableName, scan, MyMapper.class, Text.class, IntWritable.class, job);
        // Reducer that writes to HBase: output table name, reducer class, job.
        TableMapReduceUtil.initTableReducerJob(tableName2, MyReducer.class, job);

        boolean success = job.waitForCompletion(true);
        connection.close();
        System.exit(success ? 0 : 1);
    }
}
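After the job completes, the counts end up in the stat table as strings under content:info. The following is a minimal sketch, not part of the original example, for reading them back; it reuses the table, column family, qualifier, and ZooKeeper settings from the listing above, and the class name is illustrative:

package cn.itcast.bigdata.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class PrintStatTable {
    public static void main(String[] args) throws IOException {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "hadoop2,hadoop3,hadoop4");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection connection = ConnectionFactory.createConnection(config);
             Table table = connection.getTable(TableName.valueOf("stat"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                // The row key is the word; content:info holds the count written by the reducer.
                String word = Bytes.toString(result.getRow());
                String count = Bytes.toString(result.getValue(Bytes.toBytes("content"), Bytes.toBytes("info")));
                System.out.println(word + " = " + count);
            }
        }
    }
}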
Walkthrough of the example code:
1. init() first sets up the HBase configuration, mainly the ZooKeeper quorum (the cluster addresses) and the ZooKeeper client port.
2. It then creates the HBase tables word and stat, and loads sample data into the word table.
3. Finally the MapReduce job runs: it reads the rows from the word table, counts the words, and writes the results into the stat table. Note that the word and stat tables must already exist before the MapReduce code is executed; the job itself does not create them (a minimal pre-flight check is sketched below).
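To illustrate that note, a simple pre-flight check like the sketch below can be run before submitting the job. It is not part of the original example and only assumes the table names and ZooKeeper settings used above:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CheckTablesExist {
    public static void main(String[] args) throws IOException {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "hadoop2,hadoop3,hadoop4");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection connection = ConnectionFactory.createConnection(config);
             Admin admin = connection.getAdmin()) {
            // The MapReduce job reads from "word" and writes to "stat";
            // neither table is created by the job itself.
            if (!admin.tableExists(TableName.valueOf("word"))
                    || !admin.tableExists(TableName.valueOf("stat"))) {
                throw new IllegalStateException("Create the word and stat tables before submitting the job");
            }
            System.out.println("word and stat tables exist; safe to submit the job");
        }
    }
}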