Java - Hive - Reading and Writing ORC Files
Reading an ORC file
@Test
public void readOrc() throws IOException {
    Configuration conf = new Configuration();
    // Open the ORC file and build a reader over its embedded schema.
    Reader reader = OrcFile.createReader(new Path("/tmp/Orc.orc"), OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    // nextBatch() refills the batch and returns false once the file is exhausted.
    while (rows.nextBatch(batch)) {
        System.out.println(batch.toString());
    }
    rows.close();
}
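batch.toString() is only a debug dump of the whole batch. To get at individual values you index into the column vectors inside the while loop. A minimal fragment, assuming the struct<x:int,y:int> schema from the single-row write example below:

    // Inside the read loop, in place of the println: pull per-row values
    // out of the column vectors (int columns surface as LongColumnVector).
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    LongColumnVector y = (LongColumnVector) batch.cols[1];
    for (int row = 0; row < batch.size; row++) {
        System.out.println(x.vector[row] + "," + y.vector[row]);
    }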
Writing an ORC file --- a single row
@Test
public void writeLine3() throws IOException {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
            OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    // Integer columns are backed by LongColumnVector regardless of width.
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    LongColumnVector y = (LongColumnVector) batch.cols[1];
    // Claim the next row slot in the batch.
    int row = batch.size++;
    x.vector[row] = 2;
    y.vector[row] = 2 * 3;
    // Flush any buffered rows before closing.
    if (batch.size != 0) {
        writer.addRowBatch(batch);
        batch.reset();
    }
    writer.close();
}
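Two details worth calling out: in the vectorized API every ORC integer type (tinyint through bigint) is stored in a LongColumnVector, which is why an int column is cast to LongColumnVector above; and OrcFile.createWriter refuses to replace an existing file by default, so re-running this test against an existing /tmp/Orc.orc fails unless .overwrite(true) is set, as the multi-row example below does.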
Writing an ORC file --- multiple rows
@Test
public void writeLine2() throws IOException {
    String[] lines = new String[]{"1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd",
            "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd",
            "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd",
            "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd"};
    // String[] lines = new String[]{"1,2,4", "1,2,3", "1,2,3", "1,2,3", "1,2,3", "1,2,3", "1,2,3", "1,2,3"};
    Configuration conf = new Configuration();
    // Type names in the schema string are lowercase ("string", not "String").
    TypeDescription schema = TypeDescription.fromString("struct<field1:string,field2:string,field3:string>");
    // TypeDescription schema = TypeDescription.fromString("struct<field1:int,field2:int,field3:int>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
            OrcFile.writerOptions(conf).setSchema(schema).overwrite(true));
    VectorizedRowBatch batch = schema.createRowBatch();
    List<ColumnVector> columnVectors = new ArrayList<>();
    for (int i = 0; i < batch.numCols; i++) {
        columnVectors.add(batch.cols[i]);
    }
    for (String line : lines) {
        String[] columns = line.split(",");
        // Claim the next row slot in the batch.
        int row = batch.size++;
        for (int i = 0; i < columns.length; i++) {
            // Dispatch on the concrete vector class; exact class-name matching
            // avoids subclassing surprises (e.g. Decimal64ColumnVector extends
            // LongColumnVector, so an instanceof chain would be order-sensitive).
            switch (columnVectors.get(i).getClass().getName()) {
                case "org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector":
                    byte[] bytes = columns[i].getBytes(StandardCharsets.UTF_8);
                    ((BytesColumnVector) columnVectors.get(i)).setVal(row, bytes, 0, bytes.length);
                    break;
                case "org.apache.hadoop.hive.ql.exec.vector.LongColumnVector":
                    ((LongColumnVector) columnVectors.get(i)).vector[row] = Long.parseLong(columns[i]);
                    break;
                case "org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector":
                    ((Decimal64ColumnVector) columnVectors.get(i)).set(row, HiveDecimal.create(columns[i]));
                    break;
                case "org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector":
                    ((DecimalColumnVector) columnVectors.get(i)).set(row, HiveDecimal.create(columns[i]));
                    break;
                case "org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector":
                    ((DoubleColumnVector) columnVectors.get(i)).vector[row] = Double.parseDouble(columns[i]);
                    break;
                case "org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector":
                    ((TimestampColumnVector) columnVectors.get(i)).set(row, java.sql.Timestamp.valueOf(columns[i]));
                    break;
            }
        }
        // Flush a full batch once per row, after every column of the row has
        // been set; flushing mid-row would reset a half-written row.
        if (batch.size == batch.getMaxSize()) {
            writer.addRowBatch(batch);
            batch.reset();
        }
    }
    // Flush the final partial batch.
    if (batch.size != 0) {
        writer.addRowBatch(batch);
        batch.reset();
    }
    writer.close();
}
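A batch created with createRowBatch() holds 1024 rows by default, so the 16 lines above end up in the final flush. For a round-trip check, the file can be read back column by column. A minimal sketch, assuming the three-string-column file written by writeLine2 above; each string is reconstructed from the (start, length) slice of the BytesColumnVector buffers:

    @Test
    public void readBack() throws IOException {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/tmp/Orc.orc"), OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            BytesColumnVector f1 = (BytesColumnVector) batch.cols[0];
            BytesColumnVector f2 = (BytesColumnVector) batch.cols[1];
            BytesColumnVector f3 = (BytesColumnVector) batch.cols[2];
            for (int row = 0; row < batch.size; row++) {
                // Each value is a slice into the vector's shared byte buffers.
                System.out.println(
                        new String(f1.vector[row], f1.start[row], f1.length[row], StandardCharsets.UTF_8) + ","
                      + new String(f2.vector[row], f2.start[row], f2.length[row], StandardCharsets.UTF_8) + ","
                      + new String(f3.vector[row], f3.start[row], f3.length[row], StandardCharsets.UTF_8));
            }
        }
        rows.close();
    }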
Required jars and imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.*;
import org.junit.Test;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
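All of these classes ship with the org.apache.orc:orc-core Maven artifact, which transitively pulls in org.apache.hive:hive-storage-api (the column vector and HiveDecimal classes); junit:junit supplies the @Test annotation. Add those two dependencies, with versions matching your cluster, to run the examples.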