2020 Study Notes 05: Crawler, Fixed Some Bugs
The previous crawler did not classify letters by type, and large swaths of the scraped data were missing the reply time and the reply content.
After optimizing the code, it produces data like the following:
Only part of the data is shown, but you can see that the scraped records are now essentially complete.
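Before running the crawler below, the target table (named aaa in the code) needs to exist. A minimal schema sketch, run here through pymysql; the column types are assumptions inferred from the crawler's INSERT statement, not taken from the original post:

import pymysql

conn = pymysql.connect(host="localhost", user="root", port=3306,
                       password="123456", database="bjxj")
cursor = conn.cursor()
# Column types are assumptions inferred from the INSERT in the crawler.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS aaa (
        reqcontent TEXT,          -- letter body
        reqname    VARCHAR(255),  -- letter title
        reqtime    VARCHAR(64),   -- creation date
        resname    VARCHAR(255),  -- replying department (org_id)
        rescontent TEXT,          -- reply body
        reqtype    VARCHAR(32),   -- 咨询 / 建议 / 投诉
        isreply    TINYINT,       -- 1 = replied, 0 = not replied
        restime    VARCHAR(64)    -- reply date, empty if unreplied
    )
""")
conn.commit()
cursor.close()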
The code is as follows:
#coding:utf-8
import json

import pymysql
import requests
from lxml import etree

# MySQL connection used to store the scraped letters
conn = pymysql.connect(
    host="localhost",
    user="root",
    port=3306,
    password="123456",
    database="bjxj")

def db(conn, reqcontent, reqname, reqtime, resname, restime, rescontent, reqtype, isreply):
    # Insert one letter record; unreplied letters get an empty reply time
    cursor = conn.cursor()
    if isreply == False:
        isreply = 0
        restime1 = ''
    else:
        isreply = 1
        restime1 = restime
    cursor.execute(
        "INSERT INTO aaa (reqcontent,reqname,reqtime,resname,rescontent,reqtype,isreply,restime) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s);",
        [reqcontent, reqname, reqtime, resname, rescontent, reqtype, isreply, restime1])
    conn.commit()
    cursor.close()

def shijinOU(json1, url, i):
    # Fetch one page of the letter list (6 letters per page) and
    # dispatch each letter to the detail scraper according to its type
    print(i)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Content-Type': 'application/json;charset=UTF-8',
    }
    data_json = json.dumps(json1)
    r = requests.post(url, data=data_json, headers=head)
    html = r.content.decode("utf-8")
    print("Status code:", r.status_code)
    new_data = json.loads(html)
    for s in range(0, 6):
        print(new_data['mailList'][s])
        reqname = new_data['mailList'][s]['letter_title']
        reqtime = new_data['mailList'][s]['create_date']
        resname = new_data['mailList'][s]['org_id']
        isreply = new_data['mailList'][s]['isReply']
        reqtype = new_data['mailList'][s]['letter_type']
        original_id = new_data['mailList'][s]['original_id']
        # Each letter type lives under its own detail-page URL segment
        if reqtype == '咨询':
            zixunTiqu(original_id, reqname, reqtime, resname, isreply, reqtype,
                      'consult', 'consultDetail')
        if reqtype == '建议':
            zixunTiqu(original_id, reqname, reqtime, resname, isreply, reqtype,
                      'suggest', 'suggesDetail')
        if reqtype == '投诉':
            zixunTiqu(original_id, reqname, reqtime, resname, isreply, reqtype,
                      'complain', 'complainDetail')

def zixunTiqu(AH, reqname, reqtime, resname, isreply, reqtype, lettertype, lettertype1):
    # Fetch a letter's detail page and extract the reply time and reply content
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    }
    url2 = ('http://www.beijing.gov.cn/hudong/hdjl/com.web.' + lettertype +
            '.' + lettertype1 + '.flow?originalId=' + AH)
    r = requests.get(url2, headers=head)
    html = r.content.decode("utf-8")
    html1 = etree.HTML(html)
    # The letter body sits in the page's Description meta tag
    reqcontent1 = html1.xpath('head/meta[@name="Description"]/@content')
    # The reply-time div's class attribute sometimes carries a trailing space,
    # which made the old exact-match XPath miss it, so query both variants
    restime1 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2 "]//text()')
    restime2 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2"]//text()')
    print(restime1)
    restime = ''
    rescontent = ''
    if len(restime1) == 0 and len(restime2) == 0:
        print("未回答")  # no reply yet
    else:
        restime = restime2[0] if len(restime1) == 0 else restime1[0]
        rescontent1 = html1.xpath('string(//div[@class="col-xs-12 col-md-12 column p-4 text-muted my-3"])')
        rescontent = rescontent1.strip()
    print(rescontent)
    db(conn, reqcontent1[0], reqname, reqtime, resname, restime, rescontent, reqtype, isreply)

if __name__ == '__main__':
    for i in range(0, 100):
        print('***************************************************')
        page = 6 * i  # the list endpoint pages through results 6 at a time
        payload = {"PageCond/begin": page,
                   "PageCond/length": 6,
                   "PageCond/isCount": "true",
                   "keywords": "", "orgids": "",
                   "startDate": "", "endDate": "",
                   "letterType": "", "letterStatue": ""}
        shijinOU(payload, "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.mailList.biz.ext", i)
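The reply time went missing in the old version because the div that holds it sometimes has a trailing space in its class attribute, which an exact @class comparison never matches. The two-query workaround above handles that; a single contains()-based XPath is a more compact alternative. This is only a sketch, and it assumes no other div on the detail page shares those class fragments:

# Sketch: tolerate the trailing space with contains() instead of two exact matches.
# Assumes the "col-sm-3 col-md-3 my-2" combination is unique to the reply-time div.
restime_nodes = html1.xpath(
    '//div[contains(@class, "col-sm-3 col-md-3 my-2")]//text()')
restime = restime_nodes[0] if restime_nodes else ''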
Summary: to extract content from page elements, use html1 = etree.HTML(html) to convert the raw HTML string into an element tree that XPath can work with, and then locate elements and read their values through XPath expressions.
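A minimal, self-contained illustration of that pattern, using a made-up HTML snippet rather than the live page:

from lxml import etree

html = ('<html><head><meta name="Description" content="letter body"/></head>'
        '<body><div class="reply">replied on 2020-02-01</div></body></html>')
html1 = etree.HTML(html)  # parse the raw string into an XPath-queryable tree
print(html1.xpath('head/meta[@name="Description"]/@content'))  # ['letter body']
print(html1.xpath('string(//div[@class="reply"])'))            # 'replied on 2020-02-01'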