orc格式文件
1、Hive支持
创建表时指定orc格式即可:
create table tmp.orc_test(id bigint, name string, age int) stored as orc TBLPROPERTIES('orc.compress'='SNAPPY')
压缩格式有"SNAPPY"和 "ZLIB"两种,需要哪种格式指定即可
2、SPARK支持
Spark读:
df = spark.read.orc("/tmp/test/orc_data") # 读出来的数据是一个dataframe
Spark写:
df.write.format("orc").save("/tmp/test/orc_data2")
3、Hadoop Streaming支持
3.1、读orc文件,输出text (常用查看orc文件)
hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
-libjars /usr/local/hive-1.2.0/lib/hive-exec-1.2.0-SNAPSHOT.jar \
-mapper /bin/cat -reducer /bin/cat \
-input /tmp/test/orc_test1 \
-output /tmp/test/orc_streaming_test3 \
-inputformat org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
3.2、读orc文件,写orc文件
hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
-libjars orc_maprd_test.jar \
-D orc.mapred.output.schema="struct<id:string,name:string,sex:string,age:string>" \
-input /tmp/test/orc_streaming_test \
-output /tmp/test/orc_streaming_test2 \
-inputformat org.apache.orc.mapred.OrcInputFormat \
-outputformat org.apache.orc.mapred.OrcOutputFormat \
-mapper is.orc.MyMapper -reducer is.orc.MyReducer
例子:
mvn依赖
<!--orc文件-->
<dependency>
  <groupId>org.apache.orc</groupId>
  <artifactId>orc-core</artifactId>
  <version>1.2.3</version>
</dependency>
<dependency>
  <groupId>org.apache.orc</groupId>
  <artifactId>orc-mapreduce</artifactId>
  <version>1.1.0</version>
</dependency>
<dependency>
  <groupId>com.yammer.metrics</groupId>
  <artifactId>metrics-core</artifactId>
  <version>2.2.0</version>
</dependency>
编写orc文件
/** * 编写ORC文件 * https://orc.apache.org/docs/mapreduce.html */ public class OrcWriterMR { public static class OrcWriterMapper extends Mapper<LongWritable,Text,NullWritable,OrcStruct> { //要创建的ORC文件中的字段类型 private TypeDescription schema = TypeDescription.fromString( //"struct<str:string>" "struct<datano:bigint,datatime:bigint,type:int,val:int>" ); private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema); private final NullWritable outKey = NullWritable.get(); public void map(LongWritable key, Text value, Context output) throws IOException, InterruptedException { if(!"".equals(value.toString())){ //String lineStr = value.toString().trim(); //pair.setFieldValue("str",new Text(lineStr)); String[] lineStrs = value.toString().split("\\,"); pair.setFieldValue("datano",new LongWritable(Long.parseLong(lineStrs[0]))); pair.setFieldValue("datatime",new LongWritable(Long.parseLong(lineStrs[1]))); pair.setFieldValue("type",new IntWritable(Integer.parseInt(lineStrs[2]))); pair.setFieldValue("val",new IntWritable(Integer.parseInt(lineStrs[3]))); output.write(outKey, pair); } } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); //conf.set("orc.mapred.output.schema","struct<str:string>"); conf.set("orc.mapred.output.schema","struct<datano:bigint,datatime:bigint,type:int,val:int>"); Job job = Job.getInstance(conf); job.setJarByClass(OrcWriterMR.class); job.setJobName("Writter"); String in = "file:///C:/Users/Administrator/Desktop/CAN.txt"; String out = "file:///C:/Users/Administrator/Desktop/CAN1.orc"; job.setMapperClass(OrcWriterMapper.class); job.setInputFormatClass(TextInputFormat.class); job.setNumReduceTasks(0); job.setOutputFormatClass(OrcOutputFormat.class); FileInputFormat.addInputPath(job, new Path(in)); OrcOutputFormat.setOutputPath(job, new Path(out)); job.waitForCompletion(true); } }
读取orc文件编写成text文件
/** * 读取orc文件编写成text文件 */ public class OrcReaderMR { public static class OrcMap extends Mapper<NullWritable,OrcStruct,NullWritable,Text> { Text text = new Text(); public void map(NullWritable key, OrcStruct value, Context output) throws IOException, InterruptedException { StringBuffer sb= new StringBuffer(); if (!"".equals(value.getFieldValue(0).toString())){ sb.append(value.getFieldValue(0).toString()+ "\t"); } text.set(sb.toString()); output.write(NullWritable.get(),text); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(OrcReaderMR.class); job.setJobName("OrcReaderMR"); String in = "file:///C:/Users/Administrator/Desktop/gps1/gps1.orc"; String out = "file:///C:/Users/Administrator/Desktop/CAN信息"; job.setMapperClass(OrcMap.class); OrcInputFormat.addInputPath(job, new Path(in)); FileOutputFormat.setOutputPath(job, new Path(out)); job.setInputFormatClass(OrcInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setNumReduceTasks(0); job.waitForCompletion(true); }