使用 Java 生成 ORC 文件的示例代码

示例中的列类型包含 int 等基本类型、String 以及字符串数组（ORC 的 list 类型）。
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

/**
 * Example that writes a small ORC file with three columns: an int ({@code id}),
 * a string ({@code name}) and a list of strings ({@code strs}).
 */
public class WriteToOrcFile {

    /**
     * Plain row holder for one record written to the ORC file.
     */
    public static class Bean {
        int id;
        String name;
        String[] strs;

        /**
         * @param id   value for the {@code id} column
         * @param name value for the {@code name} column
         * @param strs values for the {@code strs} list column
         */
        public Bean(int id, String name, String[] strs) {
            this.id = id;
            this.name = name;
            this.strs = strs;
        }

        /** @return the value of the {@code id} column */
        public int getId() {
            return id;
        }

        /** @param id new value for the {@code id} column */
        public void setId(int id) {
            this.id = id;
        }

        /** @return the value of the {@code name} column */
        public String getName() {
            return name;
        }

        /** @param name new value for the {@code name} column */
        public void setName(String name) {
            this.name = name;
        }

        /** @return the values of the {@code strs} list column (the array itself, not a copy) */
        public String[] getStrs() {
            return strs;
        }

        /** @param strs new values for the {@code strs} list column */
        public void setStrs(String[] strs) {
            this.strs = strs;
        }
    }

    /**
     * Writes the sample data to {@code /temp/myfile.orc}.
     *
     * @param args ignored
     * @throws IOException  if the file cannot be written
     * @throws SQLException declared by {@link #writeOrc(String)}
     */
    public static void main(String[] args) throws IOException, SQLException {
        new WriteToOrcFile().writeOrc("myfile.orc");
    }

    /**
     * Writes three hard-coded sample rows to {@code /temp/<filename>} as an ORC file,
     * replacing any file that already exists at that path.
     *
     * @param filename name of the file to create inside {@code /temp/}
     * @throws IOException  if an existing file cannot be deleted or the ORC writer fails
     * @throws SQLException kept in the signature for backward compatibility; nothing in
     *                      this method throws it
     */
    public void writeOrc(String filename) throws IOException, SQLException {
        Configuration conf = new Configuration();
        List<Bean> list = new ArrayList<>();
        list.add(new Bean(1, "1", new String[]{"1", "1", "1"}));
        list.add(new Bean(2, "2", new String[]{"2", "2", "2"}));
        list.add(new Bean(3, "3", new String[]{"3", "3", "3"}));

        // Declare the data type of every column.
        TypeDescription schema = TypeDescription.createStruct()
                .addField("id", TypeDescription.createInt())
                .addField("name", TypeDescription.createString())
                .addField("strs", TypeDescription.createList(TypeDescription.createString()));

        // The ORC file is written to a local path.
        String path = "/temp/" + filename;
        File file = new File(path);
        if (file.exists()) {
            // Remove a stale file first so the writer starts from scratch.
            System.out.println("orc文件文件存在，删除");
            if (!file.delete()) {
                throw new IOException("Unable to delete existing file: " + path);
            }
        }

        // Writer options.
        Writer writer = OrcFile.createWriter(new Path(path), OrcFile.writerOptions(conf)
                .setSchema(schema)
                .stripeSize(67108864)
                .bufferSize(64 * 1024)
                .blockSize(128 * 1024 * 1024)
                .rowIndexStride(10000)
                .blockPadding(true)
                // ZLIB is the default codec; compared with Snappy it produces smaller
                // files but spends more time compressing.
                .compress(CompressionKind.ZLIB));
        try {
            VectorizedRowBatch batch = schema.createRowBatch();
            // Typed references to each column vector of the batch.
            LongColumnVector idCol = (LongColumnVector) batch.cols[0];
            BytesColumnVector nameCol = (BytesColumnVector) batch.cols[1];
            ListColumnVector strsCol = (ListColumnVector) batch.cols[2];
            BytesColumnVector strsChild = (BytesColumnVector) strsCol.child;

            for (Bean bean : list) {
                int row = batch.size++;
                // Integer-like types are stored directly in the vector array.
                idCol.vector[row] = bean.getId();
                // Strings are copied into the column's byte buffer.
                nameCol.setVal(row, bean.getName().getBytes(StandardCharsets.UTF_8));

                // List column: this row's elements occupy
                // child[offset .. offset + length), appended after the elements
                // already stored in the current batch.
                String[] strs = bean.getStrs();
                int start = strsCol.childCount;
                strsCol.offsets[row] = start;
                strsCol.lengths[row] = strs.length;
                strsCol.childCount += strs.length;
                // Grow the child vector if needed, keeping what is already in it.
                strsChild.ensureSize(strsCol.childCount, true);
                for (int i = 0; i < strs.length; i++) {
                    strsChild.setVal(start + i, strs[i].getBytes(StandardCharsets.UTF_8));
                }

                // Flush only when the batch is full rather than after every row.
                if (batch.size == batch.getMaxSize()) {
                    writer.addRowBatch(batch);
                    batch.reset();
                }
            }
            // Flush the rows left in the last, partially filled batch.
            if (batch.size != 0) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        } finally {
            // Always release the writer, even when a write fails.
            writer.close();
        }
        System.out.println("orc文件写出完成");
    }
}
posted on 2024-07-30 18:28 Yr-Zhang 阅读(52) 评论(0) 编辑收藏举报
刷新页面返回顶部
☆☆☆★☆☆☆

导航

公告

orc使用java生成文件的示例代码