protobuf如何还原proto源文件及描述字符串中左括弧的意义
语法分析
和通常的有语法的结构一样,proto的编译也经过词法(tokenize)和语法(parse)两个阶段,相关代码分别在tokenizer.cc和parser.cc两个文件中。
/// @file: protobuf\src\google\protobuf\compiler\parser.cc
// Parses a single top-level statement of a .proto file and stores the result
// in `file`. The leading token selects the statement kind: an empty statement
// (";"), "message", "enum", "service", "extend", "import", "package" or
// "option". Returns the result of the matching sub-parser; any other token
// reports an error through AddError() and returns false.
bool Parser::ParseTopLevelStatement(FileDescriptorProto* file,
                                    const LocationRecorder& root_location) {
  // A lone ";" is an empty statement and is accepted silently.
  if (TryConsumeEndOfDeclaration(";", NULL)) {
    return true;
  }

  if (LookingAt("message")) {
    // The index of the element about to be appended is read before the
    // element is added.
    const int next_index = file->message_type_size();
    LocationRecorder location(root_location,
                              FileDescriptorProto::kMessageTypeFieldNumber,
                              next_index);
    return ParseMessageDefinition(file->add_message_type(), location, file);
  }

  if (LookingAt("enum")) {
    const int next_index = file->enum_type_size();
    LocationRecorder location(root_location,
                              FileDescriptorProto::kEnumTypeFieldNumber,
                              next_index);
    return ParseEnumDefinition(file->add_enum_type(), location, file);
  }

  if (LookingAt("service")) {
    const int next_index = file->service_size();
    LocationRecorder location(root_location,
                              FileDescriptorProto::kServiceFieldNumber,
                              next_index);
    return ParseServiceDefinition(file->add_service(), location, file);
  }

  if (LookingAt("extend")) {
    LocationRecorder location(root_location,
                              FileDescriptorProto::kExtensionFieldNumber);
    return ParseExtend(
        file->mutable_extension(), file->mutable_message_type(), root_location,
        FileDescriptorProto::kMessageTypeFieldNumber, location, file);
  }

  if (LookingAt("import")) {
    return ParseImport(file->mutable_dependency(),
                       file->mutable_public_dependency(),
                       file->mutable_weak_dependency(), root_location, file);
  }

  if (LookingAt("package")) {
    return ParsePackage(file, root_location, file);
  }

  if (LookingAt("option")) {
    LocationRecorder location(root_location,
                              FileDescriptorProto::kOptionsFieldNumber);
    return ParseOption(file->mutable_options(), location, file,
                       OPTION_STATEMENT);
  }

  // None of the known statement keywords matched.
  AddError("Expected top-level statement (e.g. \"message\").");
  return false;
}
DescriptorTable
每个proto文件生成的cpp文件中都有一串特别醒目的字符串,该字符串位于一个DescriptorTable结构的const char* descriptor字段,它也是整个proto文件生成FileDescriptorProto结构序列化之后的内容。
/// @file: protobuf\src\google\protobuf\generated_message_reflection.h
// This struct tries to reduce unnecessary padding.
// The num_xxx might not be close to their respective pointer, but this saves
// padding.
// Per-.proto-file table emitted into every generated .pb.cc file. The code
// generator fills it with an aggregate initializer, so the member order below
// must match the order of the values written by the generator.
struct PROTOBUF_EXPORT DescriptorTable {
// Emitted as the literal `false` by the generator; `mutable` so it can be
// changed through a const table. Presumably set once the file has been
// registered -- confirm against the runtime code.
mutable bool is_initialized;
// Emitted as "true"/"false" from the generator's `eager` flag.
bool is_eager;
int size; // of serialized descriptor
// Name of the generated char array holding the serialized
// FileDescriptorProto of the whole file.
const char* descriptor;
// The .proto file name, emitted as a string literal.
const char* filename;
// Points at the generated `<desc_table>_once` flag.
once_flag* once;
// Tables of the files this file depends on; the generator emits nullptr
// when there are no dependencies.
const DescriptorTable* const* deps;
// Number of entries in `deps`.
int num_deps;
// Number of message generators for this file.
int num_messages;
const MigrationSchema* schemas;
const Message* const* default_instances;
const uint32* offsets;
// update the following descriptor arrays.
// The generator emits nullptr here when the file defines no messages.
Metadata* file_level_metadata;
const EnumDescriptor** file_level_enum_descriptors;
const ServiceDescriptor** file_level_service_descriptors;
};
生成的cpp代码
在生成的cpp文件中类似于2e这种莫名其妙的字符串是从哪里来的呢?例如这个变量名assign_descriptors_table_tsecer_2eproto中的2e,从代码实现看也很简单,就是把不属于数字字母(ascii_isalnum)的字符转换为"_"+16进制字符串。所以assign_descriptors_table_tsecer_2eproto中的"_2e"就是由"tsecer.proto"中"."字符的内码"2e"前面加上"_"组成;同样,如果文件名中有"_",那么会转换为"_5f"。
/// @file: protobuf\src\google\protobuf\compiler\cpp\cpp_helpers.cc
// Convert a file name into a valid identifier.
// Converts a file name into a valid identifier.
//
// Alphanumeric characters (per ascii_isalnum) are copied unchanged. Every
// other character is replaced by "_" followed by its hex code, so
// "tsecer.proto" becomes "tsecer_2eproto".
//
// @param filename  File name to convert.
// @return          The identifier-safe form of `filename`.
std::string FilenameIdentifier(const std::string& filename) {
  std::string result;
  // The output is at least as long as the input.
  result.reserve(filename.size());
  // Range-for avoids comparing a signed index against the unsigned size().
  for (const char c : filename) {
    if (ascii_isalnum(c)) {
      result.push_back(c);
    } else {
      // Not alphanumeric. To avoid any possibility of name conflicts we
      // use the hex code for the character.
      StrAppend(&result, "_", strings::Hex(static_cast<uint8_t>(c)));
    }
  }
  return result;
}
// Builds a per-file unique symbol name of the form
// "<name>_<FilenameIdentifier(filename)>". `options` is not used here.
std::string UniqueName(const std::string& name, const std::string& filename,
                       const Options& options) {
  std::string unique = name;
  unique += "_";
  unique += FilenameIdentifier(filename);
  return unique;
}
///@file: protobuf\src\google\protobuf\compiler\cpp\cpp_file.cc
// Emits the reflection bootstrap code for one .proto file into `printer`.
// In this excerpt that is the `once_flag`, the DescriptorTable aggregate
// initializer, and a weak `<desc_table>_getter()` accessor. Parts of the
// original function are elided (marked with "///...").
void FileGenerator::GenerateReflectionInitializationCode(io::Printer* printer) {
Formatter format(printer, variables_);
///....
// Embed the descriptor. We simply serialize the entire
// FileDescriptorProto/ and embed it as a string literal, which is parsed and
// built into real descriptors at initialization time.
const std::string protodef_name =
UniqueName("descriptor_table_protodef", file_, options_);
///...
// The DescriptorTable itself.
// Should be "bool eager = NeedsEagerDescriptorAssignment(file_, options_);"
// however this might cause a tsan failure in superroot b/148382879,
// so disable for now.
bool eager = false;
// The values written between the braces follow the member order of
// DescriptorTable; the leading literal `false` is is_initialized.
format(
"static ::$proto_ns$::internal::once_flag $desc_table$_once;\n"
"const ::$proto_ns$::internal::DescriptorTable $desc_table$ = {\n"
" false, $1$, $2$, $3$, \"$filename$\", \n"
" &$desc_table$_once, $4$, $5$, $6$,\n"
" schemas, file_default_instances, $tablename$::offsets,\n"
" $7$, $file_level_enum_descriptors$, "
"$file_level_service_descriptors$,\n"
"};\n"
// This function exists to be marked as weak.
// It can significantly speed up compilation by breaking up LLVM's SCC in
// the .pb.cc translation units. Large translation units see a reduction
// of more than 35% of walltime for optimized builds.
// Without the weak attribute all the messages in the file, including all
// the vtables and everything they use become part of the same SCC through
// a cycle like:
// GetMetadata -> descriptor table -> default instances ->
// vtables -> GetMetadata
// By adding a weak function here we break the connection from the
// individual vtables back into the descriptor table.
"PROTOBUF_ATTRIBUTE_WEAK const ::$proto_ns$::internal::DescriptorTable* "
"$desc_table$_getter() {\n"
" return &$desc_table$;\n"
"}\n"
"\n",
// Positional arguments:
//   $1$ is_eager, $2$ serialized size, $3$ name of the embedded descriptor
//   array, $4$ deps array (or nullptr), $5$ num_deps, $6$ number of
//   messages, $7$ file-level metadata array (or nullptr).
eager ? "true" : "false", file_data.size(), protodef_name,
num_deps == 0 ? "nullptr" : variables_["desc_table"] + "_deps", num_deps,
message_generators_.size(),
message_generators_.empty() ? "nullptr"
: variables_["file_level_metadata"]);
}
table的注册
既然源文件中有序列化之后的字符串,就可以在运行时把这个字符串反序列化成一个描述实例。
(gdb) bt
#0 google::protobuf::EncodedDescriptorDatabase::Add (this=0x61dc90, encoded_file_descriptor=0x7ffff7ab6e20 <descriptor_table_protodef_google_2fprotobuf_2fany_2eproto>, size=212)
at google/protobuf/descriptor_database.cc:556
#1 0x00007ffff78f3486 in google::protobuf::DescriptorPool::InternalAddGeneratedFile (encoded_file_descriptor=0x7ffff7ab6e20 <descriptor_table_protodef_google_2fprotobuf_2fany_2eproto>,
size=212) at google/protobuf/descriptor.cc:1357
#2 0x00007ffff79adc27 in google::protobuf::(anonymous namespace)::AddDescriptorsImpl (table=0x7ffff7dd8b80 <descriptor_table_google_2fprotobuf_2fany_2eproto>)
at google/protobuf/generated_message_reflection.cc:2767
#3 0x00007ffff79adc60 in google::protobuf::(anonymous namespace)::AddDescriptors (table=0x7ffff7dd8b80 <descriptor_table_google_2fprotobuf_2fany_2eproto>)
at google/protobuf/generated_message_reflection.cc:2778
#4 0x00007ffff79add3c in google::protobuf::internal::AddDescriptorsRunner::AddDescriptorsRunner (this=0x7ffff7dda284 <dynamic_init_dummy_google_2fprotobuf_2fany_2eproto>,
table=0x7ffff7dd8b80 <descriptor_table_google_2fprotobuf_2fany_2eproto>) at google/protobuf/generated_message_reflection.cc:2813
#5 0x00007ffff78e885e in __static_initialization_and_destruction_0 (__initialize_p=1, __priority=102) at google/protobuf/any.pb.cc:76
#6 0x00007ffff78e8874 in _GLOBAL__sub_I.00102_any.pb.cc(void) () at ./google/protobuf/port_undef.inc:128
#7 0x00007ffff7dea4c3 in _dl_init_internal () from /lib64/ld-linux-x86-64.so.2
#8 0x00007ffff7ddc1aa in _dl_start_user () from /lib64/ld-linux-x86-64.so.2
#9 0x0000000000000001 in ?? ()
#10 0x00007fffffffe5c5 in ?? ()
#11 0x0000000000000000 in ?? ()
(gdb)
在生成的源代码中有一个全局变量
// Force running AddDescriptors() at dynamic initialization time.
PROTOBUF_ATTRIBUTE_INIT_PRIORITY static ::PROTOBUF_NAMESPACE_ID::internal::AddDescriptorsRunner dynamic_init_dummy_tsecer_2eproto(&descriptor_table_tsecer_2eproto);
会触发internal::AddDescriptorsRunner构造函数,进而触发下面的流程,并调用file.ParseFromArray来解析这个字符串来还原原始的proto文件内容。
/// @file: protobuf\src\google\protobuf\generated_message_reflection.cc
// Registers one generated file with the runtime: hands the serialized
// descriptor bytes (table->descriptor, table->size) to the generated
// DescriptorPool and registers the table with the MessageFactory.
// Earlier parts of the function are elided in this excerpt.
// NOTE(review): frame #2 of the backtrace quoted earlier in this file shows
// the caller of DescriptorPool::InternalAddGeneratedFile as
// AddDescriptorsImpl(table), not AssignDescriptorsImpl -- confirm the
// function name and signature against generated_message_reflection.cc.
void AssignDescriptorsImpl(const DescriptorTable* table, bool eager) {
///...
// Register the descriptor of this file.
DescriptorPool::InternalAddGeneratedFile(table->descriptor, table->size);
MessageFactory::InternalRegisterGeneratedFile(table);
}
/// @file: google/protobuf/descriptor_database.cc:556
// Parses `size` bytes at `encoded_file_descriptor` as a FileDescriptorProto
// and adds the parsed file to the index, together with the (pointer, size)
// pair of the encoded bytes. Logs an error and returns false when the bytes
// do not parse; otherwise returns the result of index_->AddFile().
bool EncodedDescriptorDatabase::Add(const void* encoded_file_descriptor,
                                    int size) {
  FileDescriptorProto file;
  const bool parsed = file.ParseFromArray(encoded_file_descriptor, size);
  if (!parsed) {
    GOOGLE_LOG(ERROR) << "Invalid file descriptor data passed to "
                         "EncodedDescriptorDatabase::Add().";
    return false;
  }
  return index_->AddFile(file, std::make_pair(encoded_file_descriptor, size));
}
pb内部Descriptor的proto
在pb内部,把proto文件本身的message也定义成了proto文件。要注意的是proto文件不是通过这个描述自动解析的,这里只是通过定义生成了指定格式的容器和对应的操作接口,另外,由于使用proto描述,所以也自动拥有了proto的序列化、反序列化两个重要的基础功能。
// @file: protobuf\src\google\protobuf\descriptor.proto
// Describes a field within a message.
message FieldDescriptorProto {
enum Type {
// 0 is reserved for errors.
// Order is weird for historical reasons.
TYPE_DOUBLE = 1;
TYPE_FLOAT = 2;
// Not ZigZag encoded. Negative numbers take 10 bytes. Use TYPE_SINT64 if
// negative values are likely.
TYPE_INT64 = 3;
TYPE_UINT64 = 4;
// Not ZigZag encoded. Negative numbers take 10 bytes. Use TYPE_SINT32 if
// negative values are likely.
TYPE_INT32 = 5;
TYPE_FIXED64 = 6;
TYPE_FIXED32 = 7;
TYPE_BOOL = 8;
TYPE_STRING = 9;
// Tag-delimited aggregate.
// Group type is deprecated and not supported in proto3. However, Proto3
// implementations should still be able to parse the group wire format and
// treat group fields as unknown fields.
TYPE_GROUP = 10;
TYPE_MESSAGE = 11; // Length-delimited aggregate.
// New in version 2.
TYPE_BYTES = 12;
TYPE_UINT32 = 13;
TYPE_ENUM = 14;
TYPE_SFIXED32 = 15;
TYPE_SFIXED64 = 16;
TYPE_SINT32 = 17; // Uses ZigZag encoding.
TYPE_SINT64 = 18; // Uses ZigZag encoding.
}
enum Label {
// 0 is reserved for errors
LABEL_OPTIONAL = 1;
LABEL_REQUIRED = 2;
LABEL_REPEATED = 3;
}
optional string name = 1;
optional int32 number = 3;
optional Label label = 4;
// If type_name is set, this need not be set. If both this and type_name
// are set, this must be one of TYPE_ENUM, TYPE_MESSAGE or TYPE_GROUP.
optional Type type = 5;
字符串可视化
这里为什么使用8进制而没有使用更容易阅读的16进制呢?从代码上猜测:如果是8进制,就可以直接把数值操作(除法或者取模)的结果加上'0'来获得对应的数字字符;如果使用16进制,这个结果可能要经过一个表格转换之类的(因为 '0' + 10 != 'A')。
/// @file: protobuf\src\google\protobuf\stubs\strutil.cc
// ----------------------------------------------------------------------
// Escapes 'src' using C-style escape sequences, and appends the escaped string
// to 'dest'. This version is faster than calling CEscapeInternal as it computes
// the required space using a lookup table, and also does not do any special
// handling for Hex or UTF-8 characters.
// ----------------------------------------------------------------------
// Appends the C-escaped form of `src` to `*dest`.
//
// Newline, carriage return, tab, double quote, single quote and backslash are
// written as a backslash followed by one character. Any other byte for which
// isprint() is false is written as a backslash followed by three octal
// digits. All remaining bytes are copied unchanged.
void CEscapeAndAppend(StringPiece src, std::string *dest) {
  const size_t escaped_len = CEscapedLength(src);
  if (escaped_len == src.size()) {
    // The escaped length equals the input length: copy the bytes as they are.
    dest->append(src.data(), src.size());
    return;
  }

  // Grow the destination once, then write through a raw cursor.
  const size_t old_len = dest->size();
  dest->resize(old_len + escaped_len);
  char* out = &(*dest)[old_len];

  for (StringPiece::size_type pos = 0; pos < src.size(); ++pos) {
    const unsigned char ch = static_cast<unsigned char>(src[pos]);

    // Second character of a two-character escape; 0 means "no short form".
    char short_escape = 0;
    switch (ch) {
      case '\n': short_escape = 'n';  break;
      case '\r': short_escape = 'r';  break;
      case '\t': short_escape = 't';  break;
      case '\"': short_escape = '\"'; break;
      case '\'': short_escape = '\''; break;
      case '\\': short_escape = '\\'; break;
      default: break;
    }

    if (short_escape != 0) {
      *out++ = '\\';
      *out++ = short_escape;
    } else if (isprint(ch)) {
      *out++ = ch;
    } else {
      // Three octal digits, most significant first.
      *out++ = '\\';
      *out++ = '0' + ch / 64;
      *out++ = '0' + (ch % 64) / 8;
      *out++ = '0' + ch % 8;
    }
  }
}
栗子
描述文件
tsecer@harry: cat tsecer.proto
syntax = "proto3";
message tsecer {
bytes harry = 1;
int32 fry = 3;
};
tsecer@harry:
左括弧
在字符串描述中,经常会遇到醒目的左括弧字符,它是从哪里来?
其实这个是由前面
optional Type type = 5;
这个字段生成的:
由于'('字符的十六进制内码是0x28,对应的二进制是0b00101000,结合protobuf的编码格式:最低3个bit值为0,表示类型是一个VARINT;剩余的高位内容为5,对应这里type字段的tag值5。这个结果刚好是左括号而已。在左括弧之后,就是type的具体值了。在下面的例子中,输出中有两个左括弧,分别是"(\014"和"(\005",括弧之后的八进制转换为10进制之后分别为12和5。
// Escaped bytes of the serialized descriptor for tsecer.proto. Inside each
// field entry the tag bytes follow the FieldDescriptorProto field numbers:
//   "\n"   (0x0a) -> name   (field 1), followed by a length and the text
//   "\030" (0x18) -> number (field 3), followed by the field number
//   " "    (0x20) -> label  (field 4), followed by the Label value
//   "("    (0x28) -> type   (field 5), followed by the Type value
// so "(\014" is type 12 (TYPE_BYTES) and "(\005" is type 5 (TYPE_INT32).
const char descriptor_table_protodef_tsecer_2eproto[] =
"\n\014tsecer.proto\"$\n\006tsecer\022\r\n\005harry\030\001 \001(\014\022"
"\013\n\003fry\030\003 \001(\005b\006proto3"
;
对照前面的enum Type枚举类型,这两个值分别是
TYPE_INT32 = 5;
TYPE_BYTES = 12;
这也和proto文件中fry(int32)、harry(bytes)两个字段的声明一致。
descriptor.cc和descriptor.pb.cc
descriptor.pb.cc是protoc根据proto文件自动生成的,而descriptor.cc则是人工编写的代码。
///@file: protobuf\src\google\protobuf\descriptor.cc
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
descriptor.pb.cc只是作为一个解析之后的容器,它的数据也是由编译器在完成proto语法分析之后添加的。
在descriptor.cc文件中(人工)实现了DebugString接口,所以descriptor经过DebugString输出的格式跟常规Message输出的格式并不一样。
tsecer@harry: cat main.cpp
#include <iostream>
#include <fstream>
#include <string>
#include "tsecer.pb.h"
#include <google/protobuf/util/json_util.h>
using namespace std;
// Demo program: prints the descriptor of the generated `tsecer` message via
// Descriptor::DebugString(), then fills the bytes field `harry` and prints
// the message via Message::DebugString(), so the two output formats can be
// compared. Output goes through std::cout because <iostream> is the header
// this file includes; printf would need <cstdio>.
int main(int argc, char* argv[]) {
  // Four leading control bytes (0, 1, 2, 3) followed by printable text.
  char str[] = "\00\01\02\03HelloWorld";
  tsecer msg;
  const auto* desc = msg.GetDescriptor();
  std::cout << desc->DebugString() << '\n';
  // sizeof(str) also counts the terminating NUL of the array, which is why
  // the printed value ends with an extra "\000".
  msg.set_harry(std::string(str, sizeof(str)));
  std::cout << msg.DebugString() << '\n';
  return 0;
}
tsecer@harry: ./a.out
message tsecer {
bytes harry = 1;
int32 fry = 3;
}
harry: "\000\001\002\003HelloWorld\000"
tsecer@harry: