protobuf如何还原proto源文件及描述字符串中左括弧的意义

语法分析

和通常的有语法的结构一样，proto的编译也经过词法(tokenize)和语法(parse)两个阶段，相关代码分别在tokenizer.cc和parser.cc两个文件中。

/// @file: protobuf\src\google\protobuf\compiler\parser.cc
// Parses a single top-level statement of a .proto file into `file`,
// dispatching on the token the parser is currently looking at. The result of
// the selected sub-parser is returned; an unrecognized statement is reported
// through AddError() and yields false.
bool Parser::ParseTopLevelStatement(FileDescriptorProto* file,
                                    const LocationRecorder& root_location) {
  // A lone ";" is an empty statement and is simply accepted.
  if (TryConsumeEndOfDeclaration(";", NULL)) {
    return true;
  }

  if (LookingAt("message")) {
    // The index is read before add_message_type() appends the new entry.
    const int message_index = file->message_type_size();
    LocationRecorder location(
        root_location, FileDescriptorProto::kMessageTypeFieldNumber,
        message_index);
    return ParseMessageDefinition(file->add_message_type(), location, file);
  }

  if (LookingAt("enum")) {
    const int enum_index = file->enum_type_size();
    LocationRecorder location(
        root_location, FileDescriptorProto::kEnumTypeFieldNumber, enum_index);
    return ParseEnumDefinition(file->add_enum_type(), location, file);
  }

  if (LookingAt("service")) {
    const int service_index = file->service_size();
    LocationRecorder location(
        root_location, FileDescriptorProto::kServiceFieldNumber,
        service_index);
    return ParseServiceDefinition(file->add_service(), location, file);
  }

  if (LookingAt("extend")) {
    LocationRecorder location(root_location,
                              FileDescriptorProto::kExtensionFieldNumber);
    return ParseExtend(
        file->mutable_extension(), file->mutable_message_type(), root_location,
        FileDescriptorProto::kMessageTypeFieldNumber, location, file);
  }

  if (LookingAt("import")) {
    return ParseImport(file->mutable_dependency(),
                       file->mutable_public_dependency(),
                       file->mutable_weak_dependency(), root_location, file);
  }

  if (LookingAt("package")) {
    return ParsePackage(file, root_location, file);
  }

  if (LookingAt("option")) {
    LocationRecorder location(root_location,
                              FileDescriptorProto::kOptionsFieldNumber);
    return ParseOption(file->mutable_options(), location, file,
                       OPTION_STATEMENT);
  }

  // None of the known statement keywords matched.
  AddError("Expected top-level statement (e.g. \"message\").");
  return false;
}

DescriptorTable

每个proto文件生成的cpp文件中都有一串特别醒目的字符串，该字符串位于一个DescriptorTable结构的const char* descriptor字段，它也是整个proto文件生成FileDescriptorProto结构序列化之后的内容。

/// @file: protobuf\src\google\protobuf\generated_message_reflection.h

// Per-file table that the generated .pb.cc defines as a constant aggregate;
// the member order below is the order used by that generated initializer.
//
// This struct tries to reduce unnecessary padding.
// The num_xxx might not be close to their respective pointer, but this saves
// padding.
struct PROTOBUF_EXPORT DescriptorTable {
  // Emitted as `false` in generated tables.
  mutable bool is_initialized;
  // The generator currently always emits "false" here (eager assignment is
  // disabled in the generator).
  bool is_eager;
  int size;  // of serialized descriptor
  // Serialized FileDescriptorProto, embedded in the generated file as a
  // string literal; `size` is its byte length.
  const char* descriptor;
  // Name of the .proto file this table was generated from.
  const char* filename;
  // Points at the `<table>_once` flag generated next to the table.
  once_flag* once;
  // Tables of the files this file depends on; generated as nullptr when
  // num_deps is 0.
  const DescriptorTable* const* deps;
  int num_deps;
  // Number of message generators of the file at generation time.
  int num_messages;
  const MigrationSchema* schemas;
  const Message* const* default_instances;
  const uint32* offsets;
  // update the following descriptor arrays.
  // Generated as nullptr when the file has no messages.
  Metadata* file_level_metadata;
  const EnumDescriptor** file_level_enum_descriptors;
  const ServiceDescriptor** file_level_service_descriptors;
};

生成的cpp代码

在生成的cpp文件中类似于_2e这种莫名其妙的字符串是从哪里来的呢？例如这个变量名assign_descriptors_table_tsecer_2eproto中的_2e，从代码实现看也很简单，就是把不属于数字字母(ascii_isalnum)的字符转换为"_"+16进制字符串。所以assign_descriptors_table_tsecer_2eproto中的"_2e"就是由"tsecer.proto"中"."字符的内码"2e"加上前缀"_"组成；同样，如果文件名中有"_"，那么会转换为"_5f"。

/// @file: protobuf\src\google\protobuf\compiler\cpp\cpp_helpers.cc
// Convert a file name into a valid identifier.
//
// Alphanumeric characters (per ascii_isalnum) are copied through unchanged.
// Every other character is replaced by "_" followed by the hexadecimal code
// of its byte value, e.g. "." becomes "_2e".
//
// @param filename  File name to convert.
// @return          The identifier built from `filename`.
std::string FilenameIdentifier(const std::string& filename) {
  std::string result;
  // Range-for avoids comparing a signed index against the unsigned size().
  for (const char ch : filename) {
    if (ascii_isalnum(ch)) {
      result.push_back(ch);
    } else {
      // Not alphanumeric.  To avoid any possibility of name conflicts we
      // use the hex code for the character. The cast to uint8_t keeps bytes
      // >= 0x80 from being sign-extended on platforms where char is signed.
      StrAppend(&result, "_", strings::Hex(static_cast<uint8_t>(ch)));
    }
  }
  return result;
}

// Builds "<name>_<identifier>", where <identifier> is FilenameIdentifier()
// applied to `filename`. `options` is accepted but not consulted here.
std::string UniqueName(const std::string& name, const std::string& filename,
                       const Options& options) {
  std::string unique = name;
  unique += "_";
  unique += FilenameIdentifier(filename);
  return unique;
}
///@file: protobuf\src\google\protobuf\compiler\cpp\cpp_file.cc
// Excerpt (elided parts are marked with ///...): prints, through `printer`,
// the once_flag, the DescriptorTable constant and the weak
// `<desc_table>_getter()` function for this file.
// NOTE(review): `file_data` and `num_deps` are used below but declared in the
// elided parts of the function.
void FileGenerator::GenerateReflectionInitializationCode(io::Printer* printer) {
  Formatter format(printer, variables_);
///....
  // Embed the descriptor.  We simply serialize the entire
  // FileDescriptorProto/ and embed it as a string literal, which is parsed and
  // built into real descriptors at initialization time.
  const std::string protodef_name =
      UniqueName("descriptor_table_protodef", file_, options_);
///...
  // The DescriptorTable itself.
  // Should be "bool eager = NeedsEagerDescriptorAssignment(file_, options_);"
  // however this might cause a tsan failure in superroot b/148382879,
  // so disable for now.
  bool eager = false;
  // Positional placeholders in the template below, in argument order:
  //   $1$ eager flag ("true"/"false")   $2$ file_data.size()
  //   $3$ protodef_name                 $4$ "<desc_table>_deps" or "nullptr"
  //   $5$ num_deps                      $6$ message_generators_.size()
  //   $7$ file-level metadata variable or "nullptr"
  // Named placeholders such as $desc_table$ are presumably resolved from
  // variables_ -- TODO confirm against Formatter.
  format(
      "static ::$proto_ns$::internal::once_flag $desc_table$_once;\n"
      "const ::$proto_ns$::internal::DescriptorTable $desc_table$ = {\n"
      "  false, $1$, $2$, $3$, \"$filename$\", \n"
      "  &$desc_table$_once, $4$, $5$, $6$,\n"
      "  schemas, file_default_instances, $tablename$::offsets,\n"
      "  $7$, $file_level_enum_descriptors$, "
      "$file_level_service_descriptors$,\n"
      "};\n"
      // This function exists to be marked as weak.
      // It can significantly speed up compilation by breaking up LLVM's SCC in
      // the .pb.cc translation units. Large translation units see a reduction
      // of more than 35% of walltime for optimized builds.
      // Without the weak attribute all the messages in the file, including all
      // the vtables and everything they use become part of the same SCC through
      // a cycle like:
      // GetMetadata -> descriptor table -> default instances ->
      //   vtables -> GetMetadata
      // By adding a weak function here we break the connection from the
      // individual vtables back into the descriptor table.
      "PROTOBUF_ATTRIBUTE_WEAK const ::$proto_ns$::internal::DescriptorTable* "
      "$desc_table$_getter() {\n"
      "  return &$desc_table$;\n"
      "}\n"
      "\n",
      eager ? "true" : "false", file_data.size(), protodef_name,
      num_deps == 0 ? "nullptr" : variables_["desc_table"] + "_deps", num_deps,
      message_generators_.size(),
      message_generators_.empty() ? "nullptr"
                                  : variables_["file_level_metadata"]);
}

table的注册

既然源文件中有序列化之后的字符串，就可以在运行时把这个字符串反序列化成一个描述实例。

(gdb) bt
#0  google::protobuf::EncodedDescriptorDatabase::Add (this=0x61dc90, encoded_file_descriptor=0x7ffff7ab6e20 <descriptor_table_protodef_google_2fprotobuf_2fany_2eproto>, size=212)
    at google/protobuf/descriptor_database.cc:556
#1  0x00007ffff78f3486 in google::protobuf::DescriptorPool::InternalAddGeneratedFile (encoded_file_descriptor=0x7ffff7ab6e20 <descriptor_table_protodef_google_2fprotobuf_2fany_2eproto>, 
    size=212) at google/protobuf/descriptor.cc:1357
#2  0x00007ffff79adc27 in google::protobuf::(anonymous namespace)::AddDescriptorsImpl (table=0x7ffff7dd8b80 <descriptor_table_google_2fprotobuf_2fany_2eproto>)
    at google/protobuf/generated_message_reflection.cc:2767
#3  0x00007ffff79adc60 in google::protobuf::(anonymous namespace)::AddDescriptors (table=0x7ffff7dd8b80 <descriptor_table_google_2fprotobuf_2fany_2eproto>)
    at google/protobuf/generated_message_reflection.cc:2778
#4  0x00007ffff79add3c in google::protobuf::internal::AddDescriptorsRunner::AddDescriptorsRunner (this=0x7ffff7dda284 <dynamic_init_dummy_google_2fprotobuf_2fany_2eproto>, 
    table=0x7ffff7dd8b80 <descriptor_table_google_2fprotobuf_2fany_2eproto>) at google/protobuf/generated_message_reflection.cc:2813
#5  0x00007ffff78e885e in __static_initialization_and_destruction_0 (__initialize_p=1, __priority=102) at google/protobuf/any.pb.cc:76
#6  0x00007ffff78e8874 in _GLOBAL__sub_I.00102_any.pb.cc(void) () at ./google/protobuf/port_undef.inc:128
#7  0x00007ffff7dea4c3 in _dl_init_internal () from /lib64/ld-linux-x86-64.so.2
#8  0x00007ffff7ddc1aa in _dl_start_user () from /lib64/ld-linux-x86-64.so.2
#9  0x0000000000000001 in ?? ()
#10 0x00007fffffffe5c5 in ?? ()
#11 0x0000000000000000 in ?? ()
(gdb)

在生成的源代码中有一个全局变量

  // Force running AddDescriptors() at dynamic initialization time.                            
  PROTOBUF_ATTRIBUTE_INIT_PRIORITY static ::PROTOBUF_NAMESPACE_ID::internal::AddDescriptorsRunner dynamic_init_dummy_tsecer_2eproto(&descriptor_table_tsecer_2eproto);

会触发internal::AddDescriptorsRunner构造函数，进而触发下面的流程，并调用file.ParseFromArray来解析这个字符串来还原原始的proto文件内容。

/// @file: protobuf\src\google\protobuf\generated_message_reflection.cc
// Excerpt with elided parts (///...).
// NOTE(review): the gdb backtrace quoted earlier in this article shows
// AddDescriptorsImpl (generated_message_reflection.cc:2767) as the direct
// caller of DescriptorPool::InternalAddGeneratedFile -- confirm that this
// excerpt carries the right function name and signature.
void AssignDescriptorsImpl(const DescriptorTable* table, bool eager) {
///...
  // Register the descriptor of this file.
  // The pool receives the embedded serialized descriptor and its byte length.
  DescriptorPool::InternalAddGeneratedFile(table->descriptor, table->size);
  // The message factory receives the whole table.
  MessageFactory::InternalRegisterGeneratedFile(table);
}

/// @file: google/protobuf/descriptor_database.cc:556
bool EncodedDescriptorDatabase::Add(const void* encoded_file_descriptor,
                                    int size) {
  FileDescriptorProto file;
  if (file.ParseFromArray(encoded_file_descriptor, size)) {
    return index_->AddFile(file, std::make_pair(encoded_file_descriptor, size));
  } else {
    GOOGLE_LOG(ERROR) << "Invalid file descriptor data passed to "
                  "EncodedDescriptorDatabase::Add().";
    return false;
  }
}

pb内部Descriptor的proto

在pb内部，把proto文件本身的message也定义成了proto文件。要注意的是proto文件不是通过这个描述自动解析的，这里只是通过定义生成了指定格式的容器和对应的操作接口，另外，由于使用proto描述，所以也自动拥有了proto的序列化、反序列化两个重要的基础功能。

// @file: protobuf\src\google\protobuf\descriptor.proto
// Describes a field within a message.
message FieldDescriptorProto {
  enum Type {
    // 0 is reserved for errors.
    // Order is weird for historical reasons.
    TYPE_DOUBLE = 1;
    TYPE_FLOAT = 2;
    // Not ZigZag encoded.  Negative numbers take 10 bytes.  Use TYPE_SINT64 if
    // negative values are likely.
    TYPE_INT64 = 3;
    TYPE_UINT64 = 4;
    // Not ZigZag encoded.  Negative numbers take 10 bytes.  Use TYPE_SINT32 if
    // negative values are likely.
    TYPE_INT32 = 5;
    TYPE_FIXED64 = 6;
    TYPE_FIXED32 = 7;
    TYPE_BOOL = 8;
    TYPE_STRING = 9;
    // Tag-delimited aggregate.
    // Group type is deprecated and not supported in proto3. However, Proto3
    // implementations should still be able to parse the group wire format and
    // treat group fields as unknown fields.
    TYPE_GROUP = 10;
    TYPE_MESSAGE = 11;  // Length-delimited aggregate.

    // New in version 2.
    TYPE_BYTES = 12;
    TYPE_UINT32 = 13;
    TYPE_ENUM = 14;
    TYPE_SFIXED32 = 15;
    TYPE_SFIXED64 = 16;
    TYPE_SINT32 = 17;  // Uses ZigZag encoding.
    TYPE_SINT64 = 18;  // Uses ZigZag encoding.
  }

  enum Label {
    // 0 is reserved for errors
    LABEL_OPTIONAL = 1;
    LABEL_REQUIRED = 2;
    LABEL_REPEATED = 3;
  }

  optional string name = 1;
  optional int32 number = 3;
  optional Label label = 4;

  // If type_name is set, this need not be set.  If both this and type_name
  // are set, this must be one of TYPE_ENUM, TYPE_MESSAGE or TYPE_GROUP.
  optional Type type = 5;

字符串可视化

这里为什么使用8进制而没有使用更容易阅读的16进制呢？从代码上猜测：如果是8进制就可以直接把数值操作(除法或者取模)的结果加上'0'来获得对应的字符；如果使用16进制，这个结果可能要经过一个表格转换之类的(因为 '0' + 10 != 'A')。

/// @file: protobuf\src\google\protobuf\stubs\strutil.cc
// ----------------------------------------------------------------------
// Appends a C-style escaped copy of 'src' to 'dest'.
//
// The space needed is obtained up front from CEscapedLength(); when it equals
// src.size() the bytes are appended as they are. Otherwise 'dest' is grown
// once and filled in place: \n \r \t \" \' \\ become two-character escapes,
// other bytes for which isprint() is false become a backslash plus three
// octal digits, and everything else is copied through. No special handling
// for Hex or UTF-8 characters.
// ----------------------------------------------------------------------
void CEscapeAndAppend(StringPiece src, std::string *dest) {
  const size_t needed = CEscapedLength(src);
  if (needed == src.size()) {
    dest->append(src.data(), src.size());
    return;
  }

  const size_t old_len = dest->size();
  dest->resize(old_len + needed);
  char* out = &(*dest)[old_len];

  for (StringPiece::size_type pos = 0; pos < src.size(); ++pos) {
    const unsigned char ch = static_cast<unsigned char>(src[pos]);

    // Second character of a two-character escape, or '\0' if `ch` has none.
    char mnemonic = '\0';
    switch (ch) {
      case '\n': mnemonic = 'n'; break;
      case '\r': mnemonic = 'r'; break;
      case '\t': mnemonic = 't'; break;
      case '\"': mnemonic = '\"'; break;
      case '\'': mnemonic = '\''; break;
      case '\\': mnemonic = '\\'; break;
      default: break;
    }

    if (mnemonic != '\0') {
      *out++ = '\\';
      *out++ = mnemonic;
    } else if (isprint(ch)) {
      *out++ = ch;
    } else {
      // Three octal digits: each is a small number added to '0'.
      *out++ = '\\';
      *out++ = '0' + ch / 64;
      *out++ = '0' + (ch % 64) / 8;
      *out++ = '0' + ch % 8;
    }
  }
}

栗子

描述文件

tsecer@harry: cat  tsecer.proto   
syntax = "proto3";

message tsecer {
bytes harry = 1;
int32   fry   = 3;
};

tsecer@harry:

左括弧

在字符串描述中，经常会遇到醒目的左括弧字符，它是从哪里来？

其实这个是由前面
optional Type type = 5;
这个字段生成的：

由于'('字符的十六进制内码是0x28，对应的二进制是0b00101000，结合protobuf的编码格式：最低3个bit值为0，表示类型是一个VARINT；剩余内容为5，对应这里type字段的tag值为5。这个结果刚好是左括号而已。在左括弧之后，就是type的具体值了。在开始的例子中，输出中有两个左括弧，分别是"(\014"和"(\005"，括弧之后的八进制转换为10进制之后分别为12和5。

const char descriptor_table_protodef_tsecer_2eproto[] =                            
  "\n\014tsecer.proto\"$\n\006tsecer\022\r\n\005harry\030\001 \001(\014\022"       
  "\013\n\003fry\030\003 \001(\005b\006proto3"                                     
  ;

对应前面enum Type枚举类型，分别对应

    TYPE_INT32 = 5;
    TYPE_BYTES = 12;

这也和proto文件中的声明(bytes harry、int32 fry)一致。

descriptor.cc和descriptor.pb.cc

descriptor.pb.cc是protoc根据proto文件自动生成的，而descriptor.cc则是人工编写的代码。

///@file: protobuf\src\google\protobuf\descriptor.cc
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:

descriptor.pb.cc只是作为一个解析之后的容器，它的数据也是由编译器在完成proto语法分析之后添加的。
在descriptor.cc文件中(人工)实现了DebugString接口，所以descriptor经过DebugString输出的格式跟常规Message输出的格式并不一样。

tsecer@harry: cat main.cpp
#include <iostream>
#include <fstream>
#include <string>
#include "tsecer.pb.h"
#include <google/protobuf/util/json_util.h>

using namespace std;

int main(int argc, char* argv[]) {
    char str[] = "\00\01\02\03HelloWorld";
tsecer msg;
auto desc = msg.GetDescriptor();
printf("%s\n", desc->DebugString ().c_str());
msg.set_harry(std::string(str, sizeof(str)));
printf("%s\n", msg.DebugString().c_str());
return 0;
}

tsecer@harry: ./a.out 
message tsecer {
  bytes harry = 1;
  int32 fry = 3;
}

harry: "\000\001\002\003HelloWorld\000"

tsecer@harry:

posted on 2023-01-12 20:12 tsecer 阅读(437) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

tsecer