Operating on Hive tables and their data with Spark + HCatalog
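
The operator below reads all of its runtime parameters from an stdin.xml file on HDFS, parsed by OperatorParamXml.parseStdinXml (not included in this post). The real element names and nesting are whatever that parser expects; purely as an illustration of the keys the code looks up (userName, dbName, inputTabName, operatorName, jobinstanceid, fieldCount and the numbered fieldNameN/fieldTypeN entries), a hypothetical flat layout could look like this:

<!-- Hypothetical stdin.xml sketch: element names are inferred from the map keys
     read in main(); the actual schema is defined by OperatorParamXml.parseStdinXml. -->
<operator>
    <userName>xdf</userName>
    <dbName>default</dbName>
    <inputTabName>test</inputTabName>
    <operatorName>LowerUpperCaseConvert</operatorName>
    <jobinstanceid>job_0001</jobinstanceid>
    <fieldCount>2</fieldCount>
    <fieldName1>name</fieldName1>
    <fieldType1>string</fieldType1>
    <fieldName2>age</fieldName2>
    <fieldType2>int</fieldType2>
</operator>

The complete operator source follows.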

package iie.hadoop.hcatalog.spark;
 
import iie.udps.common.hcatalog.SerHCatInputFormat;
import iie.udps.common.hcatalog.SerHCatOutputFormat;
 
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
 
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.spark.Accumulator;
import org.apache.spark.SerializableWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde.serdeConstants;
//import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.thrift.TException;
 
import scala.Tuple2;
 
/**
 * Spark + HCatalog example: copies a Hive table and writes one column of the
 * source table to the new table in upper case. Example source table:
 * create table test(name String, age int);
 * Submit with: spark-submit --master yarn-cluster --class
 * iie.hadoop.hcatalog.spark.LowerUpperCaseConvert /home/xdf/test.jar -c
 * /user/xdf/stdin.xml
 *
 * @author xiaodongfang
 *
 */
public class LowerUpperCaseConvert {
 
    private static Accumulator<Integer> inputDataCount;
    private static Accumulator<Integer> outputDataCount;
 
    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws Exception {
 
        if (args.length < 2) {
            System.err.println("Usage: <-c> <stdin.xml>");
            System.exit(1);
        }
 
        String stdinXml = args[1];
        String userName = null;
        String jobinstanceid = null;
        String operatorName = null;
        String dbName = null;
        String inputTabName = null;
        String operFieldName = null;
        int fieldCount = 0;
 
        // Read the stdin.xml parameter file from HDFS
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream dis = fs.open(new Path(stdinXml));
        InputStreamReader isr = new InputStreamReader(dis, "utf-8");
        BufferedReader read = new BufferedReader(isr);
        String tempString = "";
        String xmlParams = "";
        while ((tempString = read.readLine()) != null) {
            xmlParams += "\n" + tempString;
        }
        read.close();
        xmlParams = xmlParams.substring(1);
 
        // Extract the parameter values parsed from the XML
        OperatorParamXml operXML = new OperatorParamXml();
        List<Map> list = operXML.parseStdinXml(xmlParams);
        userName = list.get(0).get("userName").toString();
        dbName = list.get(0).get("dbName").toString();
        inputTabName = list.get(0).get("inputTabName").toString();
        operatorName = list.get(0).get("operatorName").toString();
        jobinstanceid = list.get(0).get("jobinstanceid").toString();
        fieldCount = Integer.parseInt(list.get(0).get("fieldCount").toString());
 
        // Collect the field names and types for the output table
        ArrayList<String> fieldName = new ArrayList<String>();
        ArrayList<String> fieldType = new ArrayList<String>();
        for (int i = 1; i <= fieldCount; i++) {
            fieldName.add(list.get(0).get("fieldName" + i).toString());
            fieldType.add(list.get(0).get("fieldType" + i).toString());
        }
        String[] fieldNames = new String[fieldCount];
        String[] fieldTypes = new String[fieldCount];
 
        // Generate a name for the output table
        String outputTable = "tmp_" + UUID.randomUUID().toString().replace('-', '_');
 
        // Copy the field names and types into arrays
        for (int j = 0; j < fieldCount; j++) {
            fieldNames[j] = fieldName.get(j);
            fieldTypes[j] = fieldType.get(j);
            System.out.println("====fieldName=====" + fieldNames[j]);
            System.out.println("====fieldType=====" + fieldTypes[j]);
        }
        System.out.println("====fieldCount=====" + fieldCount);
 
        // Create the output Hive table with the same schema as the input table
        HCatSchema schema = getHCatSchema(dbName, inputTabName);
        createTable(dbName, outputTable, schema);
 
        // Convert the selected column of the input table to upper case and write it to the output table
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("LowerUpperCaseConvert"));
        inputDataCount = jsc.accumulator(0);
        outputDataCount = jsc.accumulator(0);
 
        // Name and position of the field to transform
        operFieldName = fieldNames[0];
        System.out.println("====operFieldName======" + operFieldName);
        int position = schema.getPosition(operFieldName);
 
        JavaRDD<SerializableWritable<HCatRecord>> rdd1 = LowerUpperCaseConvert
                .lowerUpperCaseConvert(jsc, dbName, inputTabName, position);
        LowerUpperCaseConvert.storeToTable(rdd1, dbName, outputTable);
        jsc.stop();
 
        // Build the parameters for the output XML file
        List<Map> listOut = new ArrayList<Map>();
        Map<String, String> mapOut = new HashMap<String, String>();
        mapOut.put("jobinstanceid", jobinstanceid);
        mapOut.put("dbName", dbName);
        mapOut.put("outputTable", outputTable);
        mapOut.put("inputDataCount", inputDataCount.value().toString());
        mapOut.put("outputDataCount", outputDataCount.value().toString());
 
        String operFieldType = fieldTypes[0]; // type of the field being transformed
        if (operFieldType.equalsIgnoreCase("String")) {
            // Write the normal stdout.xml
            listOut.add(mapOut);
            String hdfsOutXml = "/user/" + userName + "/optasks/"
                    + jobinstanceid + "/" + operatorName + "/out"
                    + "/stdout.xml";
            operXML.genStdoutXml(hdfsOutXml, listOut);
        } else {
            // Write the error stderr.xml
            String errorMessage = "fieldType is not string!!!";
            String errotCode = "80001";
            mapOut.put("errorMessage", errorMessage);
            mapOut.put("errotCode", errotCode);
            listOut.add(mapOut);
            String hdfsErrorXml = "/user/" + userName + "/optasks/"
                    + jobinstanceid + "/" + operatorName + "/out"
                    + "/stderr.xml";
            operXML.genStderrXml(hdfsErrorXml, listOut);
        }
        System.exit(0);
    }
 
    /**
     * Reads the input table through SerHCatInputFormat and returns an RDD in
     * which the field at the given position has been converted to upper case.
     */
    @SuppressWarnings("rawtypes")
    public static JavaRDD<SerializableWritable<HCatRecord>> lowerUpperCaseConvert(
            JavaSparkContext jsc, String dbName, String inputTabName,
            int position) throws IOException {
 
        Configuration inputConf = new Configuration();
        SerHCatInputFormat.setInput(inputConf, dbName, inputTabName);
 
        JavaPairRDD<WritableComparable, SerializableWritable> rdd = jsc
                .newAPIHadoopRDD(inputConf, SerHCatInputFormat.class,
                        WritableComparable.class, SerializableWritable.class);
 
        final Broadcast<Integer> posBc = jsc.broadcast(position);
        // Obtain the table record set
        JavaRDD<SerializableWritable<HCatRecord>> result = null;
        final Accumulator<Integer> output = jsc.accumulator(0);
        final Accumulator<Integer> input = jsc.accumulator(0);
 
        result = rdd
                .map(new Function<Tuple2<WritableComparable, SerializableWritable>, SerializableWritable<HCatRecord>>() {
 
                    private static final long serialVersionUID = -2362812254158054659L;
 
                    private final int position = posBc.getValue().intValue();
 
                    public SerializableWritable<HCatRecord> call(
                            Tuple2<WritableComparable, SerializableWritable> v)
                            throws Exception {
                        HCatRecord record = (HCatRecord) v._2.value();
                        // +1 on the input port (count one input record)
                        input.add(1);
                        List<Object> newRecord = new ArrayList<Object>(record
                                .size());
                        for (int i = 0; i < record.size(); ++i) {
                            newRecord.add(record.get(i));
                        }
                        // Upper-case the value at the selected position; on
                        // success count one output record (there is no
                        // separate error port in this version).
                        newRecord.set(position, newRecord.get(position)
                                .toString().toUpperCase());
                        output.add(1);
                        return new SerializableWritable<HCatRecord>(
                                new DefaultHCatRecord(newRecord)); // return the converted record
                    }
                });
        inputDataCount = input;
        outputDataCount = output;
        return result;
    }
 
    /**
     * Writes the given RDD of HCatRecords into the target Hive table through
     * SerHCatOutputFormat.
     */
    @SuppressWarnings("rawtypes")
    public static void storeToTable(
            JavaRDD<SerializableWritable<HCatRecord>> rdd, String dbName,
            String tblName) {
        Job outputJob = null;
        try {
            outputJob = Job.getInstance();
            outputJob.setJobName("lowerUpperCaseConvert");
            outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
            outputJob.setOutputKeyClass(WritableComparable.class);
            outputJob.setOutputValueClass(SerializableWritable.class);
            SerHCatOutputFormat.setOutput(outputJob,
                    OutputJobInfo.create(dbName, tblName, null));
            HCatSchema schema = SerHCatOutputFormat.getTableSchema(outputJob
                    .getConfiguration());
            SerHCatOutputFormat.setSchema(outputJob, schema);
        } catch (IOException e) {
            e.printStackTrace();
        }
 
        // Save the RDD into the target table
        rdd.mapToPair(
                new PairFunction<SerializableWritable<HCatRecord>, WritableComparable, SerializableWritable<HCatRecord>>() {
 
                    private static final long serialVersionUID = -4658431554556766962L;
 
                    @Override
                    public Tuple2<WritableComparable, SerializableWritable<HCatRecord>> call(
                            SerializableWritable<HCatRecord> record)
                            throws Exception {
                        return new Tuple2<WritableComparable, SerializableWritable<HCatRecord>>(
                                NullWritable.get(), record);
                    }
                }).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());
    }
 
    // Create the table in the Hive metastore with the given schema, dropping any existing table with the same name
    public static void createTable(String dbName, String tblName,
            HCatSchema schema) {
        HiveMetaStoreClient client = null;
        try {
            HiveConf hiveConf = HCatUtil.getHiveConf(new Configuration());
            try {
                client = HCatUtil.getHiveClient(hiveConf);
            } catch (MetaException e) {
                e.printStackTrace();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (client.tableExists(dbName, tblName)) {
                client.dropTable(dbName, tblName);
            }
        } catch (TException e) {
            e.printStackTrace();
        }
 
        List<FieldSchema> fields = HCatUtil.getFieldSchemaList(schema
                .getFields());
        System.out.println(fields);
        Table table = new Table();
        table.setDbName(dbName);
        table.setTableName(tblName);
 
        StorageDescriptor sd = new StorageDescriptor();
        sd.setCols(fields);
        table.setSd(sd);
        sd.setInputFormat(RCFileInputFormat.class.getName());
        sd.setOutputFormat(RCFileOutputFormat.class.getName());
        sd.setParameters(new HashMap<String, String>());
        sd.setSerdeInfo(new SerDeInfo());
        sd.getSerdeInfo().setName(table.getTableName());
        sd.getSerdeInfo().setParameters(new HashMap<String, String>());
        sd.getSerdeInfo().getParameters()
                .put(serdeConstants.SERIALIZATION_FORMAT, "1");
        sd.getSerdeInfo().setSerializationLib(
                org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class
                        .getName());
        Map<String, String> tableParams = new HashMap<String, String>();
        table.setParameters(tableParams);
        try {
            client.createTable(table);
            System.out.println("Create table successfully!");
        } catch (TException e) {
            e.printStackTrace();
            return;
        } finally {
            client.close();
        }
    }
 
    // Obtain the HCatSchema of an existing table
    public static HCatSchema getHCatSchema(String dbName, String tblName) {
        Job outputJob = null;
        HCatSchema schema = null;
        try {
            outputJob = Job.getInstance();
            outputJob.setJobName("getHCatSchema");
            outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
            outputJob.setOutputKeyClass(WritableComparable.class);
            outputJob.setOutputValueClass(SerializableWritable.class);
            SerHCatOutputFormat.setOutput(outputJob,
                    OutputJobInfo.create(dbName, tblName, null));
            schema = SerHCatOutputFormat.getTableSchema(outputJob
                    .getConfiguration());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return schema;
    }
}
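
After a successful run the generated table name (tmp_<uuid>) is reported in stdout.xml together with the record counters. A minimal verification sketch, assuming the same custom SerHCatInputFormat wrapper from iie.udps.common.hcatalog is on the classpath, might read a few converted records back; the database and table name arguments are placeholders to be taken from stdout.xml:

package iie.hadoop.hcatalog.spark;

import iie.udps.common.hcatalog.SerHCatInputFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.spark.SerializableWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

/**
 * Hypothetical verification sketch (not part of the original operator): reads
 * the output table back through the same SerHCatInputFormat wrapper and prints
 * a few records.
 */
public class VerifyConvertedTable {

    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws Exception {
        String dbName = args[0];   // e.g. "default"
        String tblName = args[1];  // the tmp_<uuid> table name reported in stdout.xml

        Configuration conf = new Configuration();
        SerHCatInputFormat.setInput(conf, dbName, tblName);

        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("VerifyConvertedTable"));
        JavaPairRDD<WritableComparable, SerializableWritable> rdd = jsc
                .newAPIHadoopRDD(conf, SerHCatInputFormat.class,
                        WritableComparable.class, SerializableWritable.class);

        // Print the first few records; the selected column should be upper case.
        for (Tuple2<WritableComparable, SerializableWritable> t : rdd.take(5)) {
            System.out.println(((HCatRecord) t._2.value()).getAll());
        }
        jsc.stop();
    }
}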

  
