Crawler example: scraping O&M work orders
Source code:
# coding=utf-8
import requests
from lxml import etree


class ChaxunSpdier:
    def __init__(self):
        self.start_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=performQuery'
        self.part_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/'
        self.headers = {
            'Connection': 'keep-alive',
            'Cookie': 'TSJSESSIONID=0000YvxNFfPYx8EBo8lsKNrKIl6:1bkt8lo7d',  # session cookie, has to be replaced every run
            'Host': '111.40.232.237:9000',
            'Referer': 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=showQueryPage&type=interface&urlType=complaint&userName=liuhaoce&workSerial=0&isDutyMaster=false&workSerialTime=&startDuty=&endDuty=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}

    def parse_url(self, url):
        formdata = {
            'sheetIdStringExpression': 'like',
            'main.sheetId': '',  # work-order serial number
            'titleStringExpression': 'like',
            'main.title': '',
            'main.status': '',
            'statusChoiceExpression': '0',
            'task.taskName': '',
            'sendRoleIdStringExpression': 'in',
            'main.sendRoleId': '',
            'sendDeptIdStringExpression': 'in',
            'main.sendDeptId': '',
            'sendUserIdStringExpression': 'in',
            'main.sendUserId': '',
            'operateRoleIdStringExpression': 'in',
            'link.operateRoleId': '',
            'operateDeptIdStringExpression': 'in',
            'link.operateDeptId': '',
            'operateUserIdStringExpression': 'in',
            'link.operateUserId': '',
            'toDeptIdStringExpression': 'in',
            'showArea': '大庆, 铁通',  # area handling the complaint
            'main.toDeptId': '1005, 1021',
            'main.complaintType1': '',
            'complaintType1ChoiceExpression': '1010615100202',  # complaint type 1: home broadband service
            'main.complaintType2': '',
            'complaintType2ChoiceExpression': '',
            'main.complaintType': '',
            'main.complaintType4': '',
            'main.complaintType5': '',
            'main.complaintType6': '',
            'main.complaintType7': '',
            'complaintNumStringExpression': '',
            'main.complaintNum': '',
            'parentCorrelationStringExpression': '',
            'main.parentCorrelation': '',
            'customAttributionStringExpression': 'like',
            'main.customAttribution': '',
            'repeatComplaintTimesStringExpression': '>=',
            'main.repeatComplaintTimes': '',
            'complaintDescStringExpression': 'like',
            'main.complaintDesc': '',
            'main.sendTime': '',
            'sendTimeStartDateExpression': '>=',
            'sendTimeStartDate': '2020-02-02 20:13:35',  # query start time
            'sendTimeLogicExpression': 'and',
            'sendTimeEndDateExpression': '<=',
            'sendTimeEndDate': '2020-02-23 20:13:35',  # query end time
            'queryType': 'record'
        }
        response = requests.post(url, data=formdata, headers=self.headers)
        return response.content

    def get_content_list(self, html_raw):
        html = etree.HTML(html_raw)
        tr_list = html.xpath('//tbody/tr')  # each tr holds one complaint row
        content_list = []
        for content in tr_list:
            item = {}
            zineirong = content.xpath('./td')  # each row's fields are wrapped in several td tags
            item['工单主题'] = zineirong[0].xpath('.//text()')[0]
            item['工单流水号'] = zineirong[1].xpath('./a/text()')[0]
            # item['处理时限'] = zineirong[3].xpath('./text()')[0]
            detail_link = self.part_url + zineirong[1].xpath('./a/@href')[0]
            detail_dict = self.get_gongdan_detail(detail_link)
            item['xiangqing'] = detail_dict
            content_list.append(item)
        # link to the next page of the work-order list, if there is one
        next_gongdan_url = (self.part_url + html.xpath("//a[text()='下一页']/@href")[0]
                            if len(html.xpath("//a[text()='下一页']/@href")) > 0 else None)
        return content_list, next_gongdan_url

    def get_gongdan_detail(self, url):
        html_raw = self.parse_url(url)
        html = etree.HTML(html_raw)
        xiangqing_dict = {}
        xiangqing_dict['投诉内容'] = html.xpath('//*[@id="complainttext"]/text()')
        xiangqing_dict['派往对象'] = html.xpath('//div[@id="ext-gen47"]/table/tbody/tr[4]/td[4]/text()')  # lives inside an iframe, so this xpath finds nothing
        xiangqing_dict['qita'] = html.xpath('//*[@id="ext-gen47"]/text()')
        return xiangqing_dict

    def save_content_list(self, content_list):
        for i, v in enumerate(content_list, start=1):
            print(i, v)

    def run(self):
        next_url = self.start_url  # main work-order query page
        content_total_list = []
        while next_url is not None:
            html_raw = self.parse_url(next_url)  # fetch the raw HTML of each page of work orders
            content_list, next_url = self.get_content_list(html_raw)  # extract this page's items and the next-page link
            content_total_list = content_total_list + content_list  # accumulate every page's items
        self.save_content_list(content_total_list)  # print each scraped work order


if __name__ == '__main__':
    Spdier = ChaxunSpdier()
    Spdier.run()
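The original save_content_list only prints the results. As a possible extension (not part of the original code), the scraped work orders could be written to a JSON file instead; a minimal sketch, assuming the item dictionaries contain only JSON-serializable values, with a hypothetical helper name and output path:

# Hypothetical helper, not in the original spider: persist the scraped
# work orders to a JSON file instead of printing them.
import json

def save_to_json(content_list, path='gongdan.json'):
    # ensure_ascii=False keeps the Chinese field names and values readable
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(content_list, f, ensure_ascii=False, indent=2)

It could be called in place of (or after) self.save_content_list(content_total_list) at the end of run.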