Sample Java code for generating an ORC file with the ORC core writer API.
The example covers a basic int column, a string column, and a list-of-strings (array) column.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Generates an ORC file.
 */
public class WriteToOrcFile {

    public static class Bean {
        int id;
        String name;
        String[] strs;

        public Bean(int id, String name, String[] strs) {
            this.id = id;
            this.name = name;
            this.strs = strs;
        }

        public int getId() { return id; }
        public void setId(int id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String[] getStrs() { return strs; }
        public void setStrs(String[] strs) { this.strs = strs; }
    }

    public static void main(String[] args) throws IOException {
        new WriteToOrcFile().writeOrc("myfile.orc");
    }

    public void writeOrc(String filename) throws IOException {
        Configuration conf = new Configuration();

        List<Bean> list = new ArrayList<>();
        list.add(new Bean(1, "1", new String[]{"1", "1", "1"}));
        list.add(new Bean(2, "2", new String[]{"2", "2", "2"}));
        list.add(new Bean(3, "3", new String[]{"3", "3", "3"}));

        // Declare the type of each column.
        TypeDescription schema = TypeDescription.createStruct()
                .addField("id", TypeDescription.createInt())
                .addField("name", TypeDescription.createString())
                .addField("strs", TypeDescription.createList(TypeDescription.createString()));

        // Write the ORC file to a local path.
        String path = "/temp/" + filename;
        File file = new File(path);
        if (file.exists()) { // Delete the file if it already exists.
            System.out.println("ORC file already exists, deleting it");
            file.delete();
        }

        // Configure the writer.
        Writer writer = OrcFile.createWriter(new Path(path),
                OrcFile.writerOptions(conf)
                        .setSchema(schema)
                        .stripeSize(67108864)
                        .bufferSize(64 * 1024)
                        .blockSize(128 * 1024 * 1024)
                        .rowIndexStride(10000)
                        .blockPadding(true)
                        // The default codec is ZLIB. Compared with SNAPPY, ZLIB
                        // compresses better (smaller files) but costs more CPU time.
                        .compress(CompressionKind.ZLIB));

        VectorizedRowBatch batch = schema.createRowBatch();
        // Grab a reference to each column vector.
        LongColumnVector idCol = (LongColumnVector) batch.cols[0];
        BytesColumnVector nameCol = (BytesColumnVector) batch.cols[1];
        ListColumnVector strsCol = (ListColumnVector) batch.cols[2];

        // Convert the beans into the columnar ORC representation.
        for (Bean o : list) {
            int row = batch.size++;
            // int, long, double, etc. are written via vector[row].
            idCol.vector[row] = o.getId();
            // String and other byte-based types are written via setVal.
            nameCol.setVal(row, o.getName().getBytes(StandardCharsets.UTF_8));
            // A list column records, per row, an offset into the shared child
            // vector plus the element count; the offset must be the current
            // childCount, not an increment of the previous offset.
            String[] strs = o.getStrs();
            strsCol.offsets[row] = strsCol.childCount;
            strsCol.lengths[row] = strs.length;
            strsCol.childCount += strs.length;
            strsCol.child.ensureSize(strsCol.childCount, true);
            for (int i = 0; i < strs.length; i++) {
                ((BytesColumnVector) strsCol.child)
                        .setVal((int) strsCol.offsets[row] + i, strs[i].getBytes(StandardCharsets.UTF_8));
            }
            // Flush the batch only when it is full, not once per row.
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        // Flush any remaining rows before closing.
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
        writer.close();
        System.out.println("ORC file written");
    }
}
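To sanity-check the output, the same ORC core API can read the file back. Below is a minimal sketch; the class name ReadFromOrcFile is illustrative, and the hard-coded path mirrors the writer above. It walks each batch, decoding the string and list columns from the raw byte vectors.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Reads back the file produced by WriteToOrcFile and prints each row.
 */
public class ReadFromOrcFile {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/temp/myfile.orc"),
                OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            LongColumnVector idCol = (LongColumnVector) batch.cols[0];
            BytesColumnVector nameCol = (BytesColumnVector) batch.cols[1];
            ListColumnVector strsCol = (ListColumnVector) batch.cols[2];
            BytesColumnVector elems = (BytesColumnVector) strsCol.child;
            for (int r = 0; r < batch.size; r++) {
                // Decode the string column from its byte vector slice.
                String name = new String(nameCol.vector[r], nameCol.start[r],
                        nameCol.length[r], StandardCharsets.UTF_8);
                StringBuilder sb = new StringBuilder();
                sb.append(idCol.vector[r]).append(", ").append(name).append(", [");
                // The list column gives an offset and length into the child vector.
                long start = strsCol.offsets[r];
                for (long i = 0; i < strsCol.lengths[r]; i++) {
                    if (i > 0) sb.append(", ");
                    int c = (int) (start + i);
                    sb.append(new String(elems.vector[c], elems.start[c],
                            elems.length[c], StandardCharsets.UTF_8));
                }
                sb.append(']');
                System.out.println(sb);
            }
        }
        rows.close();
    }
}

Running the writer and then this reader should print the three rows in order, e.g. "1, 1, [1, 1, 1]", confirming that the list offsets and lengths were written correctly.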