将parquet schema转换成avro schema
1.引入依赖
<!-- Parquet: parquet-avro supplies the org.apache.parquet.avro classes (AvroSchemaConverter) -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-avro</artifactId>
    <version>1.10.0</version>
</dependency>
<!-- Parquet: parquet-hadoop supplies the org.apache.parquet.hadoop classes (ParquetFileReader, HadoopInputFile) -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.10.0</version>
</dependency>
<!-- Hadoop: client library for the org.apache.hadoop classes used below (Configuration, Path) -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
</dependency>
2.从parquet文件的footer读取parquet schema
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

// Hadoop configuration used to resolve the file system for the path below.
Configuration config = new Configuration();
Path parquetPath = new Path("file:///Users/lintong/Downloads/xxxx.snappy.parquet");

// Declared outside the try block so the schema remains usable after the reader is closed.
MessageType parquetSchema;

// ParquetFileReader holds an open input stream; try-with-resources guarantees it is
// closed even if reading the footer throws.
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(parquetPath, config))) {
    // The schema is taken from the file metadata stored in the Parquet footer.
    parquetSchema = reader.getFooter().getFileMetaData().getSchema();
}

System.out.println(parquetSchema);
输出
message TestSerializer { optional binary string1 (UTF8); optional int32 int1; optional int32 tinyint1; optional int32 smallint1; optional int64 bigint1; optional boolean boolean1; optional double float1; optional double double1; optional group list1 (LIST) { repeated binary array (UTF8); } optional group map1 (LIST) { repeated group array { optional binary key (UTF8); optional int32 value; } } optional group struct1 { optional int32 sInt; optional boolean sBoolean; optional binary sString (UTF8); } optional binary enum1 (UTF8); optional int32 nullableint; }
3.将parquet schema转换成avro schema
import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroSchemaConverter;

// Build the converter from the same Hadoop Configuration that was used to read the file.
AvroSchemaConverter schemaConverter = new AvroSchemaConverter(config);

// Translate the Parquet MessageType read from the footer into an Avro Schema.
Schema avroSchema = schemaConverter.convert(parquetSchema);

// Printing the Schema emits its JSON representation.
System.out.println(avroSchema);
输出
{ "type":"record", "name":"TestSerializer", "fields":[ { "name":"string1", "type":[ "null", "string" ], "default":null }, { "name":"int1", "type":[ "null", "int" ], "default":null }, { "name":"tinyint1", "type":[ "null", "int" ], "default":null }, { "name":"smallint1", "type":[ "null", "int" ], "default":null }, { "name":"bigint1", "type":[ "null", "long" ], "default":null }, { "name":"boolean1", "type":[ "null", "boolean" ], "default":null }, { "name":"float1", "type":[ "null", "double" ], "default":null }, { "name":"double1", "type":[ "null", "double" ], "default":null }, { "name":"list1", "type":[ "null", { "type":"array", "items":"string" } ], "default":null }, { "name":"map1", "type":[ "null", { "type":"array", "items":{ "type":"record", "name":"array", "fields":[ { "name":"key", "type":[ "null", "string" ], "default":null }, { "name":"value", "type":[ "null", "int" ], "default":null } ] } } ], "default":null }, { "name":"struct1", "type":[ "null", { "type":"record", "name":"struct1", "fields":[ { "name":"sInt", "type":[ "null", "int" ], "default":null }, { "name":"sBoolean", "type":[ "null", "boolean" ], "default":null }, { "name":"sString", "type":[ "null", "string" ], "default":null } ] } ], "default":null }, { "name":"enum1", "type":[ "null", "string" ], "default":null }, { "name":"nullableint", "type":[ "null", "int" ], "default":null } ] }
参考:https://stackoverflow.com/questions/54159454/how-to-convert-parquet-schema-to-avro-in-java-scala
本文只发表于博客园和tonglin0325的博客,作者:tonglin0325,转载请注明原文链接:https://www.cnblogs.com/tonglin0325/p/5323978.html