protobuf协议
protobuf是Google公司推出的一种轻便、高效的结构化数据存储格式,比XML、JSON传输效率更高
一、环境配置(以windows系统下能正常运行的版本为例)
-
下载protoc(protobuf编译器)
- https://github.com/protocolbuffers/protobuf/releases
- 找到指定的版本:protoc-21.12-win64.zip
- 下载并解压后,将bin目录添加至系统环境变量,然后使用如下命令进行测试
protoc --version
-
python安装protobuf依赖
pip install protobuf==4.21.2
二、proto文件及对应python脚本生成示例
-
test.proto
syntax = "proto3";

// A single student record.
message Info {
  string name = 1;
  int32 age = 2;
}

// Enum type: the first declared value must be 0.
enum Status {
  A = 0;
  B = 1;
}

message School {
  string addr = 1;

  // Array whose elements are int32 values.
  repeated int32 classes = 2;

  // Array whose elements are Info messages.
  repeated Info students = 3;

  // Enum-typed field; the declared values are 0 (A) and 1 (B).
  Status status = 4;

  // Dictionary of key/value pairs; both keys and values are strings.
  map<string, string> desc = 5;
}
Info和Status也可以定义在School内部。School可以理解为python中的一个类,里面的字段理解为该类的属性即可
-
通过命令行转换成对应python脚本
protoc --python_out=. ./test.proto
命令格式为:protoc --python_out=输出目录 proto文件路径。执行上面的命令后,会在当前目录下生成test_pb2.py文件
-
测试文件:test.py
"""Demo: build a School message, serialize it, parse it back and convert it to dict/JSON."""
from test_pb2 import School, Status
from google.protobuf import json_format

# Instantiate a School message.
school = School()
school.addr = "北京市海淀区"

# Repeated scalar field (int32 elements): values are added list-style.
school.classes.extend([1, 2, 3])

# Enum field: assign one of the declared values.
# The plain number works as well: school.status = 0
school.status = Status.A

# Repeated message field: <message>.<field>.add() appends a new element and
# returns it, so the element is filled in afterwards.
student1 = school.students.add()
student1.name = "小明"
student1.age = 20

student2 = school.students.add()
student2.name = "小红"
student2.age = 18

# Map field: used like a dict (insert, delete, update, lookup).
school.desc["evalutaion"] = "好评!"
school.desc["history"] = "100年"
# Update an entry:  school.desc["history"] = "110年"
# Delete an entry:  del school.desc["evalutaion"]

# 1. Serialize to the binary wire format.
school_info = school.SerializeToString()
print(school_info)  # bytes
# The wire format is binary, not text: tags and lengths are raw bytes that are
# not guaranteed to be valid UTF-8, so decode leniently instead of risking a
# UnicodeDecodeError. For this small message the output is unchanged.
print(school_info.decode('utf-8', errors='replace'))

# 2. Deserialize (bytes => protobuf message).
deserializeInfo = School()
deserializeInfo.ParseFromString(school_info)

# 3. Convert the protobuf message to a dict or to JSON.
# By default field names are converted to lowerCamelCase;
# preserving_proto_field_name=True keeps the names exactly as written in the .proto file.
print(json_format.MessageToDict(deserializeInfo, preserving_proto_field_name=True))
# {'addr': '北京市海淀区', 'classes': [1, 2, 3], 'students': [{'name': '小明', 'age': 20}, {'name': '小红', 'age': 18}], 'desc': {'evalutaion': '好评!', 'history': '100年'}}

# Message => JSON text; ensure_ascii=False keeps non-ASCII characters readable.
print(json_format.MessageToJson(deserializeInfo, ensure_ascii=False))
三、案例分析
-
某方
-
proto文件
syntax = "proto3";

// Field numbers and wire types below must match what the site's JavaScript
// reads and writes; numbers do not have to be contiguous.

// Request
message Request {
  enum Order {
    A = 0;
  }

  enum SearchScope {
    B = 0;
  }

  enum SearchFilter {
    C = 0;
  }

  enum InterfaceType {
    D = 0;
    E = 1;
  }

  message SearchSort {
    string field = 1;
    Order order = 2;
  }

  message Second {
    string field = 1;
    string value = 2;
  }

  message CommonRequest {
    string searchType = 1;  // e.g. "paper"
    string searchWord = 2;  // e.g. "水稻"
    SearchSort searchSort = 3;
    repeated Second secondsList = 4;
    int32 currentPage = 5;
    int32 pageSize = 6;
    SearchScope searchScope = 7;
    repeated SearchFilter searchFilterList = 8;
    bool languageExpand = 9;
    bool topicExpand = 10;
  }

  CommonRequest commonRequest = 1;
  InterfaceType interfaceType = 2;
}

// Response
message Response {
  enum Type {
    F = 0;
  }

  message ThirdParty {
    string url = 1;
    string showname = 2;
    string platform = 4;
    string id = 5;
  }

  message OriginButton {
    Type type = 1;
    // The "List" suffix is how the site's generated JS names repeated fields
    // (every other *List field here is repeated). Declaring it repeated keeps
    // all entries; a singular field would merge several entries into one.
    repeated ThirdParty thirdpartyList = 2;
  }

  message Periodical {
    string id = 1;
    repeated string titleList = 2;
    repeated string creatorList = 3;
    string firstcreator = 4;
    repeated string scholaridList = 5;
    repeated string foreigncreatorList = 6;
    repeated string creatorforsearchList = 7;
    repeated string organizationnormList = 8;
    repeated string organizationnewList = 9;
    repeated string originalorganizationList = 10;
    repeated string originalclasscodeList = 12;
    repeated string machinedclasscodeList = 13;
    repeated string classcodeforsearchList = 14;
    repeated string contentsearchList = 15;
    repeated string keywordsList = 16;
    repeated string foreignkeywordsList = 17;
    repeated string machinedkeywordsList = 18;
    repeated string keywordforsearchList = 19;
    repeated string abstractList = 20;
    int32 citedcount = 21;
    string periodicalid = 22;
    repeated string periodicaltitleList = 24;
    repeated string sourcedbList = 25;
    bool isoa = 26;
    repeated string fundList = 27;
    string publishdate = 28;
    string metadataonlinedate = 29;
    string fulltextonlinedate = 30;
    int32 servicemode = 31;
    // The site's toObject uses a boolean default (!1) for this field.
    bool hasfulltext = 32;
    int32 publishyear = 33;
    string issue = 34;
    string volum = 35;
    string page = 36;
    string pageno = 37;
    repeated string columnList = 38;
    repeated string coreperiodicalList = 39;
    string fulltextpath = 40;
    string doi = 41;
    repeated string authororgList = 42;
    repeated string thirdpartyurlList = 43;
    string language = 44;
    string issn = 45;
    string cn = 46;
    int32 sequenceinissue = 47;
    int32 metadataviewcount = 48;
    int32 thirdpartylinkclickcount = 49;
    int32 downloadcount = 50;
    string prepublishversion = 51;
    string prepublishgroupid = 52;
    string publishstatus = 53;
    string type = 54;
    repeated string singlesourcedbList = 55;
    int32 exportcount = 56;
    repeated string periodicalclasscodeList = 57;
    repeated string scholaridauthorList = 58;
    // NOTE(review): the site's toObject also reads field 59 (citedscore) via
    // getField, which does not reveal its type. It is left undeclared here, so
    // it is kept as an unknown field when parsing. Declare it once the type is known.
    //
    // Fields 60 and 61 are read with getFloatingPointFieldWithDefault, so they
    // are floating-point; an int32 declaration never matches their wire type
    // and the values are silently dropped.
    // NOTE(review): float vs double cannot be told from toObject. Confirm
    // against writeFloat/writeDouble in Periodical.serializeBinaryToWriter.
    double downloadscore = 60;
    double yearscore = 61;
    int32 typescore = 62;
    // NOTE(review): the element types of fields 63 and 65 are not visible in
    // the site's toObject (plain getField); int32 is an assumption to confirm.
    repeated int32 projectidList = 63;
    repeated string fundgroupnameList = 64;
    repeated int32 projectgrantnoList = 65;
    bool isthirdservice = 66;
    string lastmodifiedtime = 67;
    int32 delivercount = 70;
  }

  message Resources {
    string type = 1;
    // Repeated for the same reason as OriginButton.thirdpartyList.
    repeated OriginButton originbuttonsList = 2;
    string uid = 3;
    Periodical periodical = 101;
  }

  bool status = 1;
  string message = 2;
  int64 count = 3;
  repeated Resources resourcesList = 4;
  repeated string groupCustomList = 5;
}
一定要注意各字段后面对应的数字(字段编号),必须和网站定义的保持一致,编号可以不连续;字段类型对应的wire type也要一致(例如浮点字段不能写成int32),否则解析时该字段会被当作未知字段,取不到值
-
请求脚本
"""Send a search request over gRPC-Web and print the decoded response as JSON."""
import requests
import test_pb2 as pb
from google.protobuf import json_format

# gRPC-Web wraps each message in a frame:
#   1 flag byte (0 = uncompressed data) + 4-byte big-endian payload length + payload.
FRAME_HEADER_SIZE = 5

search_request = pb.Request()
search_request.commonRequest.searchType = "paper"
search_request.commonRequest.searchWord = '水稻'
search_request.commonRequest.currentPage = 1
search_request.commonRequest.pageSize = 20
search_request.commonRequest.searchScope = 0
search_request.commonRequest.searchFilterList.append(0)
search_request.interfaceType = 1

bytes_body = search_request.SerializeToString()
# Encode the length as a real 4-byte integer. Putting len() into a single byte
# only works while the body is shorter than 256 bytes and raises ValueError
# beyond that.
bytes_head = b"\x00" + len(bytes_body).to_bytes(4, "big")
post_data = bytes_head + bytes_body

headers = {
    'content-type': 'application/grpc-web+proto',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'x-user-agent': 'grpc-web-javascript/0.1',
}
url = "https://s.wanfangdata.com.cn/SearchService.SearchService/search"

res = requests.post(url, data=post_data, headers=headers, timeout=30)
# Fail with a clear HTTP error instead of an obscure protobuf parse error.
res.raise_for_status()

# Deserialize the response. It uses the same framing, so read the declared
# length and parse exactly that many bytes; anything after the first frame
# (e.g. a trailers frame) must not be fed to the message parser.
payload = res.content
body_length = int.from_bytes(payload[1:FRAME_HEADER_SIZE], "big")
deResp = pb.Response()
deResp.ParseFromString(payload[FRAME_HEADER_SIZE:FRAME_HEADER_SIZE + body_length])

# Strip the search-hit highlighting markup before printing.
print(json_format.MessageToJson(deResp, ensure_ascii=False).replace("<span class='highlight'>", '').replace("</span>", ''))
-
反写proto文件的思路
一、请求数据序列化相关 通过xhr请求分析,跟栈,浏览器提示明文数据类型:proto.SearchService.SearchRequest,然后搜索定位,找到请求proto相关重要信息 ## 信息1 proto.SearchService.SearchRequest.toObject = function(e, t) { var r, a = { commonrequest: (r = t.getCommonrequest()) && proto.SearchService.CommonRequest.toObject(e, r), interfaceType: o.Message.getFieldWithDefault(t, 2, 0) }; return e && (a.$jspbMessageInstance = t), a } ## 信息2 proto.SearchService.SearchRequest.serializeBinaryToWriter = function(e, t) { var r = void 0; null != (r = e.getCommonrequest()) && t.writeMessage(1, r, proto.SearchService.CommonRequest.serializeBinaryToWriter), 0 !== (r = e.getInterfaceType()) && t.writeEnum(2, r) } ## 信息3(跟据信息2,搜索proto.SearchService.CommonRequest.serializeBinaryToWriter即可) proto.SearchService.CommonRequest.serializeBinaryToWriter = function(e, t) { var r = void 0; (r = e.getSearchType()).length > 0 && t.writeString(1, r), (r = e.getSearchWord()).length > 0 && t.writeString(2, r), null != (r = e.getSearchSort()) && t.writeMessage(3, r, proto.SearchService.SearchSort.serializeBinaryToWriter), (r = e.getSecondsList()).length > 0 && t.writeRepeatedMessage(4, r, proto.SearchService.Second.serializeBinaryToWriter), 0 !== (r = e.getCurrentPage()) && t.writeInt32(5, r), 0 !== (r = e.getPageSize()) && t.writeInt32(6, r), 0 !== (r = e.getSearchScope()) && t.writeEnum(7, r), (r = e.getSearchFilterList()).length > 0 && t.writePackedEnum(8, r), (r = e.getLanguageExpand()) && t.writeBool(9, r), (r = e.getTopicExpand()) && t.writeBool(10, r) } 二、响应数据反序列化相关 可以根据浏览器解析后的响应数据,搜索特定的key值,然后能够定位到如下有用信息: # 1、信息1 proto.SearchService.SearchResponse.toObject = function(e, t) { var r = { status: o.Message.getFieldWithDefault(t, 1, !1), message: o.Message.getFieldWithDefault(t, 2, ""), count: o.Message.getFieldWithDefault(t, 3, 0), resourcesList: o.Message.toObjectList(t.getResourcesList(), s.Resource.toObject, e), groupCustomList: o.Message.getField(t, 5) }; return e && (r.$jspbMessageInstance = t), r } # 2、信息2 
proto.SearchService.SearchResponse.serializeBinaryToWriter = function(e, t) { var r = void 0; (r = e.getStatus()) && t.writeBool(1, r), (r = e.getMessage()).length > 0 && t.writeString(2, r), 0 !== (r = e.getCount()) && t.writeInt64(3, r), (r = e.getResourcesList()).length > 0 && t.writeRepeatedMessage(4, r, s.Resource.serializeBinaryToWriter), (r = e.getGroupCustomList()).length > 0 && t.writeRepeatedString(5, r) } # 3、信息3 proto.com.wanfangdata.resource.Periodical.toObject = function(e, t) { var r = { id: o.Message.getFieldWithDefault(t, 1, ""), titleList: o.Message.getField(t, 2), creatorList: o.Message.getField(t, 3), firstcreator: o.Message.getFieldWithDefault(t, 4, ""), scholaridList: o.Message.getField(t, 5), foreigncreatorList: o.Message.getField(t, 6), creatorforsearchList: o.Message.getField(t, 7), organizationnormList: o.Message.getField(t, 8), organizationnewList: o.Message.getField(t, 9), originalorganizationList: o.Message.getField(t, 10), originalclasscodeList: o.Message.getField(t, 12), machinedclasscodeList: o.Message.getField(t, 13), classcodeforsearchList: o.Message.getField(t, 14), contentsearchList: o.Message.getField(t, 15), keywordsList: o.Message.getField(t, 16), foreignkeywordsList: o.Message.getField(t, 17), machinedkeywordsList: o.Message.getField(t, 18), keywordforsearchList: o.Message.getField(t, 19), abstractList: o.Message.getField(t, 20), citedcount: o.Message.getFieldWithDefault(t, 21, 0), periodicalid: o.Message.getFieldWithDefault(t, 22, ""), periodicaltitleList: o.Message.getField(t, 24), sourcedbList: o.Message.getField(t, 25), isoa: o.Message.getFieldWithDefault(t, 26, !1), fundList: o.Message.getField(t, 27), publishdate: o.Message.getFieldWithDefault(t, 28, ""), metadataonlinedate: o.Message.getFieldWithDefault(t, 29, ""), fulltextonlinedate: o.Message.getFieldWithDefault(t, 30, ""), servicemode: o.Message.getFieldWithDefault(t, 31, 0), hasfulltext: o.Message.getFieldWithDefault(t, 32, !1), publishyear: 
o.Message.getFieldWithDefault(t, 33, 0), issue: o.Message.getFieldWithDefault(t, 34, ""), volum: o.Message.getFieldWithDefault(t, 35, ""), page: o.Message.getFieldWithDefault(t, 36, ""), pageno: o.Message.getFieldWithDefault(t, 37, ""), columnList: o.Message.getField(t, 38), coreperiodicalList: o.Message.getField(t, 39), fulltextpath: o.Message.getFieldWithDefault(t, 40, ""), doi: o.Message.getFieldWithDefault(t, 41, ""), authororgList: o.Message.getField(t, 42), thirdpartyurlList: o.Message.getField(t, 43), language: o.Message.getFieldWithDefault(t, 44, ""), issn: o.Message.getFieldWithDefault(t, 45, ""), cn: o.Message.getFieldWithDefault(t, 46, ""), sequenceinissue: o.Message.getFieldWithDefault(t, 47, 0), metadataviewcount: o.Message.getFieldWithDefault(t, 48, 0), thirdpartylinkclickcount: o.Message.getFieldWithDefault(t, 49, 0), downloadcount: o.Message.getFieldWithDefault(t, 50, 0), prepublishversion: o.Message.getFieldWithDefault(t, 51, ""), prepublishgroupid: o.Message.getFieldWithDefault(t, 52, ""), publishstatus: o.Message.getFieldWithDefault(t, 53, ""), type: o.Message.getFieldWithDefault(t, 54, ""), singlesourcedbList: o.Message.getField(t, 55), exportcount: o.Message.getFieldWithDefault(t, 56, 0), periodicalclasscodeList: o.Message.getField(t, 57), scholaridauthorList: o.Message.getField(t, 58), citedscore: o.Message.getField(t, 59), downloadscore: o.Message.getFloatingPointFieldWithDefault(t, 60, 0), yearscore: o.Message.getFloatingPointFieldWithDefault(t, 61, 0), typescore: o.Message.getFieldWithDefault(t, 62, 0), projectidList: o.Message.getField(t, 63), fundgroupnameList: o.Message.getField(t, 64), projectgrantnoList: o.Message.getField(t, 65), isthirdservice: o.Message.getBooleanFieldWithDefault(t, 66, !1), lastmodifiedtime: o.Message.getFieldWithDefault(t, 67, ""), delivercount: o.Message.getFieldWithDefault(t, 70, 0) }; return e && (r.$jspbMessageInstance = t), r }
一定要注意proto文件中各字段后面对应的数字,并不一定是连续的,但一定要按照网站定义的来
-