# Python多协程异步爬取北京市政信件 (async crawl of Beijing municipal mailbox letters)
# 采用单线程+多协程的方式爬取 (single thread + multiple coroutines)
import asyncio
import json
import re
import aiofiles
import requests
from letter import Letter
from bs4 import BeautifulSoup
import os
import aiohttp
def get_original_id():
    """Query the letter-list endpoint and return (letterType, originalId) pairs.

    Prompts the user for search parameters, POSTs them to the Beijing
    government letter-list action, repairs the quasi-JSON response
    (unquoted keys, single quotes) into strict JSON, dumps it to
    context/letter_json/<name>.json, and returns a list of
    (letterType, originalId) tuples for every letter in the result set.
    """
    my_url = "https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!letterListJson.action"
    keyword = input("关键字:")
    startDate = input("开始日期:")
    endDate = input("结束日期:")
    letterType = input("信件类型:")
    # The original used `x is ""`, which only worked via CPython string
    # interning (and warns on 3.8+); a falsy check expresses the intent.
    if not letterType:
        letterType = 0
    page_pageNo = input("显示的页数:")
    if not page_pageNo:
        page_pageNo = 1
    page_pageSize = input("数据条数:")
    if not page_pageSize:
        page_pageSize = 6
    orgtitleLength = input("标题长度:")
    if not orgtitleLength:
        orgtitleLength = 26
    print(keyword, startDate, endDate, letterType, page_pageNo, page_pageSize, orgtitleLength)
    my_parm = {
        "keyword": keyword,
        "startDate": startDate,
        "endDate": endDate,
        "letterType": letterType,
        "page.pageNo": page_pageNo,
        "page.pageSize": page_pageSize,
        "orgtitleLength": orgtitleLength
    }
    my_header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
    }
    my_filename = input("文件名称:")
    response = requests.post(url=my_url, params=my_parm, headers=my_header)
    response.encoding = "utf-8"
    category = response.headers.get("Content-Type")
    print("返回数据类型:", category)
    text_json = response.text
    # The endpoint returns JS-style object literals. Quote the known keys
    # (order matters only in that it mirrors the original substitutions —
    # none of these names is a prefix-with-colon of another) and normalize
    # single quotes so json.loads can parse the payload.
    for key in ("page", "result", "pageNo", "totalCount", "totalPages",
                "pageSize", "originalId", "letterType", "letterTypeName",
                "letterTitle", "showLetterTitle", "writeDate", "orgNames",
                "showOrgNames"):
        text_json = text_json.replace(key + ":", '"%s":' % key)
    text_json = text_json.replace("'", '"')
    text_json = json.loads(text_json)
    text_dict = dict(text_json)
    res = [(item.get("letterType"), item.get("originalId"))
           for item in list(text_dict.get("result"))]
    # `with` closes the dump file deterministically (the original leaked it).
    with open("context/letter_json/%s.json" % my_filename, "w", encoding="utf-8") as fp:
        json.dump(text_json, fp, ensure_ascii=False)
    return res
# Original single-threaded implementation (kept commented out for reference):
# def get_letter_context(originalId):
# letter_url1="https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="
# letter_url2="https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="
# letter_headers={
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
# }
# for i in originalId:
# if i[0] == "1":
# letter_response=requests.post(letter_url1+i[1],headers=letter_headers)
# letter_response.encoding = "utf-8"
# with open("context/letter_consultation/%s.html" % i[1], "w", encoding="utf-8") as fp:
# fp.write(letter_response.text)
# else:
# letter_response = requests.post(letter_url2 + i[1], headers=letter_headers)
# letter_response.encoding = "utf-8"
# with open("context/letter_suggestion/%s.html" % i[1], "w", encoding="utf-8") as fp:
# fp.write(letter_response.text)
async def fetch_letter(session, letter_url, originalId, semaphore):
    """Download one letter detail page and return its HTML text.

    :param session: shared aiohttp.ClientSession used for the request
    :param letter_url: detail-page base URL ending in "originalId="
    :param originalId: letter id appended to the base URL
    :param semaphore: asyncio.Semaphore bounding concurrent requests
    :return: decoded page HTML
    """
    letter_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
    }
    async with semaphore:
        async with session.post(letter_url + originalId, headers=letter_headers) as response:
            # aiohttp's ClientResponse has no settable `encoding` attribute;
            # the original `response.encoding = "utf-8"` was a no-op. Pass
            # the codec to text() instead to force UTF-8 decoding.
            return await response.text(encoding="utf-8")
async def write_file(filename, content):
    """Asynchronously write *content* to *filename* as UTF-8 text."""
    async with aiofiles.open(filename, "w", encoding="utf-8") as out:
        await out.write(content)
async def get_letter_context(originalId_list):
    """Concurrently download and save every letter in *originalId_list*.

    Each entry is a (letterType, originalId) tuple; type "1" letters are
    consultations, everything else is treated as a suggestion. Pages are
    fetched through one shared aiohttp session (at most 20 requests in
    flight, enforced by the semaphore inside fetch_letter) and written to
    context/letter_consultation/ or context/letter_suggestion/.
    """
    letter_url1 = "https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="
    letter_url2 = "https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="
    semaphore = asyncio.Semaphore(20)
    async with aiohttp.ClientSession() as session:
        # One download per letter; the original if/else branches differed
        # only in the URL, and gather() wraps coroutines into tasks itself,
        # so asyncio.ensure_future was unnecessary.
        tasks = [
            fetch_letter(session,
                         letter_url1 if kind == "1" else letter_url2,
                         oid,
                         semaphore)
            for kind, oid in originalId_list
        ]
        responses = await asyncio.gather(*tasks)
        # Write every page out concurrently, pairing each response with the
        # id/type tuple it was fetched for.
        file_writing_tasks = []
        for (kind, oid), html in zip(originalId_list, responses):
            if kind == "1":
                filename = "context/letter_consultation/%s.html" % oid
            else:
                filename = "context/letter_suggestion/%s.html" % oid
            file_writing_tasks.append(write_file(filename, html))
        await asyncio.gather(*file_writing_tasks)
def parse_letter_context(filepath, kind):
    """Parse a saved letter HTML file and print it via Letter.show().

    :param filepath: path of a detail page saved by get_letter_context
    :param kind: category label passed through to Letter, e.g. "咨询"/"建议"
    """
    # `with` closes the handle deterministically (the original leaked it);
    # BeautifulSoup reads the whole file inside the block.
    with open(filepath, "r", encoding="utf-8") as letter:
        soup = BeautifulSoup(letter, "lxml")
    strong = soup.find_all("strong")
    if not strong:
        print("爬取失败!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        return
    question_tittle = strong[0].string
    # NOTE(review): split(":") assumes an ASCII colon inside labels such as
    # "来信人:xxx" — confirm the pages do not use the full-width "：".
    question_people = soup.find("div", class_="col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted").string.split(":")[1].strip()
    question_time = soup.find("div", class_="col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted").string.split(":")[1].strip()
    question_context = soup.find("div", class_="col-xs-12 col-md-12 column p-2 text-muted mx-2").string
    answer_institution = strong[1].string
    answer_time = soup.find("div", class_="col-xs-12 col-sm-3 col-md-3 my-2").string.split(":")[1].strip()
    answer_context = soup.find("div", class_="col-xs-12 col-md-12 column p-4 text-muted my-3").string
    let = Letter(kind, question_tittle, question_people, question_time,
                 question_context, answer_institution, answer_time, answer_context)
    let.show()
if __name__ == '__main__':
    id_list = get_original_id()
    # asyncio.run replaces the deprecated get_event_loop()/
    # run_until_complete()/close() sequence and closes the loop for us.
    asyncio.run(get_letter_context(id_list))
    # Parse everything just downloaded, folder by folder; the two original
    # loops differed only in directory and category label.
    for folder, kind in (("context/letter_consultation", "咨询"),
                         ("context/letter_suggestion", "建议")):
        for filename in os.listdir(folder):
            parse_letter_context(os.path.join(folder, filename), kind)
# (The stray bare `Letter` expression that used to sit on this line was
# dead code — evaluated and discarded — and has been removed.)
class Letter(object):
    """One government-mailbox letter: the citizen's question and the reply."""

    def __init__(self, kind, question_tittle, question_people, question_time,
                 question_context, answer_institution, answer_time, answer_context):
        # kind: category label, e.g. "咨询" (consultation) or "建议" (suggestion)
        self.kind = kind
        self.question_tittle = question_tittle
        self.question_people = question_people
        self.question_time = question_time
        self.question_context = question_context
        self.answer_institution = answer_institution
        self.answer_time = answer_time
        self.answer_context = answer_context

    def __repr__(self):
        return "Letter(kind=%r, question_tittle=%r)" % (self.kind, self.question_tittle)

    def show(self):
        """Pretty-print the letter to stdout."""
        print("==============================================================================================================================")
        print("问题类别:"+self.kind)
        print("问题标题:"+self.question_tittle," 来信人:"+self.question_people," 来信时间:"+self.question_time)
        print("信件内容:")
        print(self.question_context)
        print("回复机构:"+self.answer_institution," 回复时间:"+self.answer_time)
        print("回复内容:")
        print(self.answer_context)