protobuf协议

protobuf是Google公司推出的一种轻便、高效的结构化数据存储格式,比XML、JSON传输效率更高

一、环境配置(以windows系统下能正常运行的版本为例)

  • 下载protoc(protobuf编译器),版本需与下面安装的python protobuf库匹配(例如protobuf 4.21.x对应protoc 21.x),否则生成的_pb2.py可能无法导入

  • python安装protobuf依赖

    pip install protobuf==4.21.2

二、proto文件及对应python脚本生成示例

  • test.proto

    syntax = "proto3";
    
    
    // Info: a message with a string name and an int32 age.
    // School.students below holds a list of these.
    message Info {
      string name = 1;
      int32 age = 2;
    }
    
    // Enum type. In proto3 the first declared value must be 0;
    // it is also the value an unset enum field reads as.
    enum Status {
      A = 0;
      B = 1;
    }
    
    // School: demonstrates scalar, repeated, enum and map fields in one message.
    message School{
      string addr = 1;
      repeated int32 classes = 2; // list whose elements are int32
      repeated Info students = 3; // list whose elements are Info messages
      Status status = 4; // enum field; the named values are A (0) and B (1). proto3 enums are open, so other numeric values are preserved when parsing rather than rejected
      map<string, string> desc = 5; // map (dictionary) field; in this declaration both the key and the value are strings
    }

    Info和Status也可以定义在School内部,School可以理解为python中的一个类,里面的属性理解为类属性即可

  • 通过命令行转换成对应python脚本

    protoc --python_out=. ./test.proto

    protoc  --python_out=输出目录  proto文件路径,执行以上命令,会在当前目录下生成test_pb2.py文件

  • 测试文件:test.py

    from test_pb2 import School, Status
    from google.protobuf import json_format
    
    # Build a School message and populate one field of every kind.
    school = School()
    school.addr = "北京市海淀区"

    # repeated scalar field (int32): values are appended like a Python list
    school.classes.append(1)
    school.classes.append(2)
    school.classes.append(3)

    school.status = Status.A  # enum field: assign one of the named enum values
    # school.status = 0

    # repeated message field: <message>.<field>.add() creates a new element,
    # appends it and returns it, so it can be filled in afterwards
    student1 = school.students.add()
    student1.name = "小明"
    student1.age = 20

    student2 = school.students.add()
    student2.name = "小红"
    student2.age = 18

    # map field: used like a dict (insert, delete, update, lookup)
    school.desc["evalutaion"] = "好评!"
    school.desc["history"] = "100年"
    # school.desc["history"] = "110年"  (update)
    # del school.desc["evalutaion"]  (delete)

    # 1. Serialize: message -> bytes in the protobuf binary wire format
    school_info = school.SerializeToString()
    print(school_info)  # bytes
    # The wire format is binary, not text: tag, length and varint bytes are not
    # guaranteed to form valid UTF-8. Decode leniently so this debug print
    # cannot raise UnicodeDecodeError on other messages.
    print(school_info.decode('utf-8', errors='replace'))

    # 2. Deserialize: bytes -> protobuf message
    deserializeInfo = School()
    deserializeInfo.ParseFromString(school_info)

    # 3. Convert the message to a dict or to JSON
    # Message -> dict. By default field names are converted to lowerCamelCase;
    # preserving_proto_field_name=True keeps the names exactly as written in the .proto file.
    print(json_format.MessageToDict(deserializeInfo, preserving_proto_field_name=True))
    # {'addr': '北京市海淀区', 'classes': [1, 2, 3], 'students': [{'name': '小明', 'age': 20}, {'name': '小红', 'age': 18}], 'desc': {'evalutaion': '好评!', 'history': '100年'}}
    # ('status' is absent above because it holds the default value A = 0, which is omitted from the output)

    # Message -> JSON string; ensure_ascii=False leaves non-ASCII characters unescaped
    print(json_format.MessageToJson(deserializeInfo, ensure_ascii=False))

三、案例分析

  • 某方

    • proto文件

      syntax = "proto3";
      
      // Request message sent to the search endpoint.
      // The enums below declare only the values needed by this example.
      message Request{
        enum Order{
          A = 0;
        }
      
        enum SearchScope{
          B = 0;
        }
      
        enum SearchFilter{
          C = 0;
        }
      
        enum InterfaceType{
          D = 0;
          E = 1;
        }
      
        // Sort specification: a field name plus an Order value.
        message SearchSort{
          string field = 1;
          Order order = 2;
        }
      
        // A field/value string pair; CommonRequest carries a list of these.
        message Second{
          string field = 1;
          string value = 2;
        }
      
        // The search parameters proper (field 1 of Request).
        message CommonRequest{
          string searchType = 1; // e.g. "paper"
          string searchWord = 2; // e.g. "水稻" (rice)
          SearchSort searchSort = 3;
          repeated Second secondsList = 4;
          int32 currentPage = 5;
          int32 pageSize = 6;
          SearchScope searchScope = 7;
          repeated SearchFilter searchFilterList = 8;
          bool languageExpand = 9;
          bool topicExpand = 10;
        }
      
        CommonRequest commonRequest = 1;
        InterfaceType interfaceType = 2;
      }
      
      
      
      // Response message returned by the search endpoint.
      // Field numbers follow the site's generated JavaScript; gaps in the numbering are intentional.
      message Response {
        enum Type {
          F = 0;
        }
      
        message ThirdParty {
          string url = 1;
          string showname = 2;
          string platform = 4;
          string id = 5;
        }
        message OriginButton {
          Type type = 1;
          // The "List" suffix is what the site's JS uses for repeated fields
          // (compare resourcesList / secondsList), so this is declared repeated;
          // a singular message field would merge multiple entries into one.
          // NOTE(review): the OriginButton serializer itself is not quoted in this article - confirm.
          repeated ThirdParty thirdpartyList = 2;
        }
        message Periodical{
          string id = 1;
          repeated string titleList = 2;
          repeated string creatorList = 3;
          string firstcreator = 4;
          repeated string scholaridList = 5;
          repeated string foreigncreatorList = 6;
          repeated string creatorforsearchList = 7;
          repeated string organizationnormList = 8;
          repeated string organizationnewList = 9;
          repeated string originalorganizationList = 10;
          repeated string originalclasscodeList = 12;
          repeated string machinedclasscodeList = 13;
          repeated string classcodeforsearchList = 14;
          repeated string contentsearchList = 15;
          repeated string keywordsList = 16;
          repeated string foreignkeywordsList = 17;
          repeated string machinedkeywordsList = 18;
          repeated string keywordforsearchList = 19;
          repeated string abstractList = 20;
          int32 citedcount = 21;
          string periodicalid = 22;
          repeated string periodicaltitleList = 24;
          repeated string sourcedbList = 25;
          bool isoa = 26;
          repeated string fundList = 27;
          string publishdate = 28;
          string metadataonlinedate = 29;
          string fulltextonlinedate = 30;
          int32 servicemode = 31;
          // The site's toObject uses a default of !1 (false) for field 32, i.e. a bool.
          // bool and int32 share the varint wire type, so parsing is unaffected.
          bool hasfulltext = 32;
          int32 publishyear = 33;
          string issue = 34;
          string volum = 35;
          string page = 36;
          string pageno = 37;
          repeated string columnList = 38;
          repeated string coreperiodicalList = 39;
          string fulltextpath = 40;
          string doi = 41;
          repeated string authororgList = 42;
          repeated string thirdpartyurlList = 43;
          string language = 44;
          string issn = 45;
          string cn = 46;
          int32 sequenceinissue = 47;
          int32 metadataviewcount = 48;
          int32 thirdpartylinkclickcount = 49;
          int32 downloadcount = 50;
          string prepublishversion = 51;
          string prepublishgroupid = 52;
          int32 exportcount = 56;
          repeated string periodicalclasscodeList = 57;
          repeated string scholaridauthorList = 58;
          // Field 59 (citedscore) exists in the site's toObject but is omitted here:
          // its type cannot be determined from that dump. It is kept as an unknown field when parsing.
          // Fields 60 and 61 are read with getFloatingPointFieldWithDefault on the site,
          // so they are floating-point, not int32 (an int32 declaration has the wrong
          // wire type and the values are silently dropped).
          // NOTE(review): toObject does not distinguish float from double - check the
          // serializer for writeFloat vs writeDouble and switch to float if needed.
          double downloadscore = 60;
          double yearscore = 61;
          int32 typescore = 62;
          // NOTE(review): element type of fields 63 and 65 is not visible in the toObject dump - confirm.
          repeated int32 projectidList = 63;
          repeated string fundgroupnameList = 64;
          repeated int32 projectgrantnoList = 65;
          bool isthirdservice = 66;
          string lastmodifiedtime = 67;
          int32 delivercount = 70;
          string publishstatus = 53;
          string type = 54;
          repeated string singlesourcedbList = 55;
        }
      
        message Resources{
          string type = 1;
          // Declared repeated for the same reason as OriginButton.thirdpartyList:
          // the "List" suffix marks repeated fields in the site's JS.
          // NOTE(review): the Resource serializer is not quoted in this article - confirm.
          repeated OriginButton originbuttonsList = 2;
          string uid = 3;
          Periodical periodical = 101;
        }
      
        bool status = 1;
        string message = 2;
        int64 count = 3;
        repeated Resources resourcesList = 4;
        repeated string groupCustomList = 5;
      }

      一定要注意各字段后面对应的数字,一定要和网站的一致,可以不连续

    • 请求脚本

      import requests
      import test_pb2 as pb
      from google.protobuf import json_format
      
      # Build the search request message.
      search_request = pb.Request()
      search_request.commonRequest.searchType = "paper"
      search_request.commonRequest.searchWord = '水稻'
      search_request.commonRequest.currentPage = 1
      search_request.commonRequest.pageSize = 20
      search_request.commonRequest.searchScope = 0
      search_request.commonRequest.searchFilterList.append(0)
      search_request.interfaceType = 1

      # gRPC-Web framing: every message is prefixed with 5 bytes - one flag byte
      # (0 = uncompressed data frame) followed by the payload length as a 4-byte
      # big-endian unsigned integer. Encoding the length over all 4 bytes keeps
      # this working when the serialized request is longer than 255 bytes.
      bytes_body = search_request.SerializeToString()
      bytes_head = b"\x00" + len(bytes_body).to_bytes(4, "big")
      post_data = bytes_head + bytes_body


      headers = {
          'content-type': 'application/grpc-web+proto',
          'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
          'x-user-agent': 'grpc-web-javascript/0.1',
      }
      url = "https://s.wanfangdata.com.cn/SearchService.SearchService/search"
      res = requests.post(url, data=post_data, headers=headers)

      # Deserialize the response. It uses the same 5-byte frame prefix, so read
      # the declared message length and parse exactly that many bytes; anything
      # after the message (gRPC-Web can append a trailers frame to the body) is
      # then never handed to the protobuf parser.
      deResp = pb.Response()
      resp_len = int.from_bytes(res.content[1:5], "big")
      deResp.ParseFromString(res.content[5:5 + resp_len])
      print(json_format.MessageToJson(deResp, ensure_ascii=False).replace("<span class='highlight'>", '').replace("</span>", ''))
    • 反写proto文件的思路

      一、请求数据序列化相关
      通过xhr请求分析,跟栈,浏览器提示明文数据类型:proto.SearchService.SearchRequest,然后搜索定位,找到请求proto相关重要信息
              ## 信息1
              // Builds a plain object from the SearchRequest message t.
              // Useful for reverse engineering: it exposes the top-level field names.
              proto.SearchService.SearchRequest.toObject = function(e, t) {
                  var r, a = {
                      // nested CommonRequest message, converted via its own toObject when present
                      commonrequest: (r = t.getCommonrequest()) && proto.SearchService.CommonRequest.toObject(e, r),
                      // field number 2, default 0
                      interfaceType: o.Message.getFieldWithDefault(t, 2, 0)
                  };
                  // when e is truthy, the source message is attached to the result
                  return e && (a.$jspbMessageInstance = t),
                  a
              }
              
              ## 信息2
              // Writes the SearchRequest message e to the binary writer t.
              // The write* calls give each field's number and wire type:
              // field 1 is a CommonRequest message, field 2 is an enum.
              proto.SearchService.SearchRequest.serializeBinaryToWriter = function(e, t) {
                  var r = void 0;
                  // field 1 is written only when the nested message is set
                  null != (r = e.getCommonrequest()) && t.writeMessage(1, r, proto.SearchService.CommonRequest.serializeBinaryToWriter),
                  // field 2 is written only when it is not 0
                  0 !== (r = e.getInterfaceType()) && t.writeEnum(2, r)
              }
              
              ## 信息3(根据信息2,搜索proto.SearchService.CommonRequest.serializeBinaryToWriter即可)
              // Writes the CommonRequest message e to the binary writer t.
              // Each line shows one field: the getter gives its name, the first
              // argument of write* gives its number, and the write* method gives its
              // type (string, message, repeated message, int32, enum, packed enum, bool).
              // Every field is skipped when it is empty, 0, null or false.
              proto.SearchService.CommonRequest.serializeBinaryToWriter = function(e, t) {
                  var r = void 0;
                  (r = e.getSearchType()).length > 0 && t.writeString(1, r),
                  (r = e.getSearchWord()).length > 0 && t.writeString(2, r),
                  null != (r = e.getSearchSort()) && t.writeMessage(3, r, proto.SearchService.SearchSort.serializeBinaryToWriter),
                  (r = e.getSecondsList()).length > 0 && t.writeRepeatedMessage(4, r, proto.SearchService.Second.serializeBinaryToWriter),
                  0 !== (r = e.getCurrentPage()) && t.writeInt32(5, r),
                  0 !== (r = e.getPageSize()) && t.writeInt32(6, r),
                  0 !== (r = e.getSearchScope()) && t.writeEnum(7, r),
                  (r = e.getSearchFilterList()).length > 0 && t.writePackedEnum(8, r),
                  (r = e.getLanguageExpand()) && t.writeBool(9, r),
                  (r = e.getTopicExpand()) && t.writeBool(10, r)
              }
          
      
      二、响应数据反序列化相关
      可以根据浏览器解析后的响应数据,搜索特定的key值,然后能够定位到如下有用信息:
              
              # 1、信息1
              // Builds a plain object from the SearchResponse message t.
              // The second argument of each getter is the field number; the third,
              // where given, is the default value (!1 is false, "" is the empty string).
              proto.SearchService.SearchResponse.toObject = function(e, t) {
                  var r = {
                      status: o.Message.getFieldWithDefault(t, 1, !1),
                      message: o.Message.getFieldWithDefault(t, 2, ""),
                      count: o.Message.getFieldWithDefault(t, 3, 0),
                      // list of messages, each converted with s.Resource.toObject
                      resourcesList: o.Message.toObjectList(t.getResourcesList(), s.Resource.toObject, e),
                      groupCustomList: o.Message.getField(t, 5)
                  };
                  // when e is truthy, the source message is attached to the result
                  return e && (r.$jspbMessageInstance = t),
                  r
              }
      
              # 2、信息2
              // Writes the SearchResponse message e to the binary writer t.
              // The write* calls give the field numbers and types: 1 bool, 2 string,
              // 3 int64, 4 repeated message (s.Resource), 5 repeated string.
              // Every field is skipped when it is false, empty or 0.
              proto.SearchService.SearchResponse.serializeBinaryToWriter = function(e, t) {
                  var r = void 0;
                  (r = e.getStatus()) && t.writeBool(1, r),
                  (r = e.getMessage()).length > 0 && t.writeString(2, r),
                  0 !== (r = e.getCount()) && t.writeInt64(3, r),
                  (r = e.getResourcesList()).length > 0 && t.writeRepeatedMessage(4, r, s.Resource.serializeBinaryToWriter),
                  (r = e.getGroupCustomList()).length > 0 && t.writeRepeatedString(5, r)
              }
              
              # 3、信息3
              // Builds a plain object from the Periodical message t.
              // Each entry gives a field name and, as the getter's second argument, its
              // field number. The third argument, where present, is the default value and
              // hints at the scalar kind: "" for strings, 0 for numbers, !1 (false) for booleans.
              // Field numbers are not contiguous (11, 23, 68 and 69 do not appear).
              proto.com.wanfangdata.resource.Periodical.toObject = function(e, t) {
                  var r = {
                      id: o.Message.getFieldWithDefault(t, 1, ""),
                      titleList: o.Message.getField(t, 2),
                      creatorList: o.Message.getField(t, 3),
                      firstcreator: o.Message.getFieldWithDefault(t, 4, ""),
                      scholaridList: o.Message.getField(t, 5),
                      foreigncreatorList: o.Message.getField(t, 6),
                      creatorforsearchList: o.Message.getField(t, 7),
                      organizationnormList: o.Message.getField(t, 8),
                      organizationnewList: o.Message.getField(t, 9),
                      originalorganizationList: o.Message.getField(t, 10),
                      originalclasscodeList: o.Message.getField(t, 12),
                      machinedclasscodeList: o.Message.getField(t, 13),
                      classcodeforsearchList: o.Message.getField(t, 14),
                      contentsearchList: o.Message.getField(t, 15),
                      keywordsList: o.Message.getField(t, 16),
                      foreignkeywordsList: o.Message.getField(t, 17),
                      machinedkeywordsList: o.Message.getField(t, 18),
                      keywordforsearchList: o.Message.getField(t, 19),
                      abstractList: o.Message.getField(t, 20),
                      citedcount: o.Message.getFieldWithDefault(t, 21, 0),
                      periodicalid: o.Message.getFieldWithDefault(t, 22, ""),
                      periodicaltitleList: o.Message.getField(t, 24),
                      sourcedbList: o.Message.getField(t, 25),
                      isoa: o.Message.getFieldWithDefault(t, 26, !1),
                      fundList: o.Message.getField(t, 27),
                      publishdate: o.Message.getFieldWithDefault(t, 28, ""),
                      metadataonlinedate: o.Message.getFieldWithDefault(t, 29, ""),
                      fulltextonlinedate: o.Message.getFieldWithDefault(t, 30, ""),
                      servicemode: o.Message.getFieldWithDefault(t, 31, 0),
                      // default is !1 (false), so field 32 is a boolean
                      hasfulltext: o.Message.getFieldWithDefault(t, 32, !1),
                      publishyear: o.Message.getFieldWithDefault(t, 33, 0),
                      issue: o.Message.getFieldWithDefault(t, 34, ""),
                      volum: o.Message.getFieldWithDefault(t, 35, ""),
                      page: o.Message.getFieldWithDefault(t, 36, ""),
                      pageno: o.Message.getFieldWithDefault(t, 37, ""),
                      columnList: o.Message.getField(t, 38),
                      coreperiodicalList: o.Message.getField(t, 39),
                      fulltextpath: o.Message.getFieldWithDefault(t, 40, ""),
                      doi: o.Message.getFieldWithDefault(t, 41, ""),
                      authororgList: o.Message.getField(t, 42),
                      thirdpartyurlList: o.Message.getField(t, 43),
                      language: o.Message.getFieldWithDefault(t, 44, ""),
                      issn: o.Message.getFieldWithDefault(t, 45, ""),
                      cn: o.Message.getFieldWithDefault(t, 46, ""),
                      sequenceinissue: o.Message.getFieldWithDefault(t, 47, 0),
                      metadataviewcount: o.Message.getFieldWithDefault(t, 48, 0),
                      thirdpartylinkclickcount: o.Message.getFieldWithDefault(t, 49, 0),
                      downloadcount: o.Message.getFieldWithDefault(t, 50, 0),
                      prepublishversion: o.Message.getFieldWithDefault(t, 51, ""),
                      prepublishgroupid: o.Message.getFieldWithDefault(t, 52, ""),
                      publishstatus: o.Message.getFieldWithDefault(t, 53, ""),
                      type: o.Message.getFieldWithDefault(t, 54, ""),
                      singlesourcedbList: o.Message.getField(t, 55),
                      exportcount: o.Message.getFieldWithDefault(t, 56, 0),
                      periodicalclasscodeList: o.Message.getField(t, 57),
                      scholaridauthorList: o.Message.getField(t, 58),
                      // read without a default and not List-named; its type cannot be told from this function alone
                      citedscore: o.Message.getField(t, 59),
                      // fields 60 and 61 use the floating-point getter, unlike the integer fields around them
                      downloadscore: o.Message.getFloatingPointFieldWithDefault(t, 60, 0),
                      yearscore: o.Message.getFloatingPointFieldWithDefault(t, 61, 0),
                      typescore: o.Message.getFieldWithDefault(t, 62, 0),
                      projectidList: o.Message.getField(t, 63),
                      fundgroupnameList: o.Message.getField(t, 64),
                      projectgrantnoList: o.Message.getField(t, 65),
                      isthirdservice: o.Message.getBooleanFieldWithDefault(t, 66, !1),
                      lastmodifiedtime: o.Message.getFieldWithDefault(t, 67, ""),
                      delivercount: o.Message.getFieldWithDefault(t, 70, 0)
                  };
                  // when e is truthy, the source message is attached to the result
                  return e && (r.$jspbMessageInstance = t),
                  r
              }

      一定要注意proto文件中各字段后面对应的数字,并不一定是连续的,但一定要按照网站定义的来

posted @ 2023-09-08 01:08  eliwang  阅读(133)  评论(0编辑  收藏  举报