Python多协程异步爬取北京市政信件

采用单线程+多协程的方式爬取

import asyncio
import json
import re
import aiofiles
import requests
from letter import Letter
from bs4 import BeautifulSoup
import os
import aiohttp


def get_original_id():
    """Prompt for search filters, query the letter list and return the hits.

    The user is asked on stdin for the search keyword, date range, letter
    type, paging values, title length and an output file name.  The list
    endpoint is then queried, its response is normalised into strict JSON,
    saved to ``context/letter_json/<name>.json`` and summarised.

    Returns:
        A list of ``(letterType, originalId)`` tuples, one per entry of the
        response's ``"result"`` array, in response order.

    Raises:
        json.JSONDecodeError: if the normalised response is still not JSON.
        TypeError: if the response carries no ``"result"`` array.
    """
    my_url = "https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!letterListJson.action"
    keyword = input("关键字:")
    startDate = input("开始日期:")
    endDate = input("结束日期:")
    # An empty answer selects the default.  Truthiness is used instead of
    # ``is ""``: identity comparison against a literal is not guaranteed.
    letterType = input("信件类型:") or 0
    page_pageNo = input("显示的页数:") or 1
    page_pageSize = input("数据条数:") or 6
    orgtitleLength = input("标题长度:") or 26
    print(keyword, startDate, endDate, letterType, page_pageNo, page_pageSize, orgtitleLength)
    my_parm = {
        "keyword": keyword,
        "startDate": startDate,
        "endDate": endDate,
        "letterType": letterType,
        "page.pageNo": page_pageNo,
        "page.pageSize": page_pageSize,
        "orgtitleLength": orgtitleLength,
    }
    my_header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
    }
    my_filename = input("文件名称:")
    response = requests.post(url=my_url, params=my_parm, headers=my_header)
    response.encoding = "utf-8"
    category = response.headers.get("Content-Type")
    print("返回数据类型:", category)

    # The substitutions below imply the body uses bare keys and single
    # quotes; quote each known key (same order as before), then swap the
    # quote character so json.loads accepts it.
    # NOTE(review): a value containing one of these "key:" sequences or an
    # apostrophe would be altered too - unchanged from the original logic.
    bare_keys = (
        "page", "result", "pageNo", "totalCount", "totalPages", "pageSize",
        "originalId", "letterType", "letterTypeName", "letterTitle",
        "showLetterTitle", "writeDate", "orgNames", "showOrgNames",
    )
    text_json = response.text
    for key in bare_keys:
        text_json = text_json.replace(key + ":", "\"" + key + "\":")
    text_json = text_json.replace("'", '"')
    text_json = json.loads(text_json)

    text_dict = dict(text_json)
    res = [
        (item.get("letterType"), item.get("originalId"))
        for item in list(text_dict.get("result"))
    ]

    # Make sure the target directory exists and close the file once written.
    os.makedirs("context/letter_json", exist_ok=True)
    with open("context/letter_json/%s.json" % my_filename, "w", encoding="utf-8") as fp:
        json.dump(text_json, fp, ensure_ascii=False)
    return res
#原本的单线程
# def get_letter_context(originalId):
#     letter_url1="https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="
#     letter_url2="https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="
#     letter_headers={
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
#     }
#     for i in originalId:
#         if i[0] == "1":
#             letter_response=requests.post(letter_url1+i[1],headers=letter_headers)
#             letter_response.encoding = "utf-8"
#             with open("context/letter_consultation/%s.html" % i[1], "w", encoding="utf-8") as fp:
#                 fp.write(letter_response.text)
#         else:
#             letter_response = requests.post(letter_url2 + i[1], headers=letter_headers)
#             letter_response.encoding = "utf-8"
#             with open("context/letter_suggestion/%s.html" % i[1], "w", encoding="utf-8") as fp:
#                 fp.write(letter_response.text)

async def fetch_letter(session, letter_url, originalId, semaphore):
    """Download one letter detail page and return its HTML as text.

    Args:
        session: client session exposing ``post(url, headers=...)`` as an
            async context manager (an ``aiohttp.ClientSession`` in this file).
        letter_url: detail-page URL prefix ending in ``originalId=``.
        originalId: letter identifier appended to ``letter_url``.
        semaphore: ``asyncio.Semaphore`` bounding concurrent requests.

    Returns:
        The response body decoded as UTF-8.
    """
    letter_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
    }

    # Hold a semaphore slot for the whole request/response cycle.
    async with semaphore:
        async with session.post(letter_url + originalId, headers=letter_headers) as response:
            # Assigning ``response.encoding`` (the requests idiom) has no
            # effect here; the encoding must be passed to ``text()``.
            return await response.text(encoding="utf-8")


async def write_file(filename, content):
    """Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file."""
    pending_open = aiofiles.open(filename, mode="w", encoding="utf-8")
    async with pending_open as out:
        await out.write(content)


async def get_letter_context(originalId_list):
    """Fetch every listed letter concurrently and save each page as HTML.

    Args:
        originalId_list: sequence of ``(letterType, originalId)`` pairs.
            A letterType equal to the string ``"1"`` uses the consultation
            URL and directory; anything else uses the suggestion ones.

    All downloads finish before any file is written.  Each page is stored
    as ``context/letter_consultation/<id>.html`` or
    ``context/letter_suggestion/<id>.html``.
    """
    consult_url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="
    suggest_url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="
    # At most 20 requests may be in flight at once.
    limiter = asyncio.Semaphore(20)

    async with aiohttp.ClientSession() as session:
        downloads = [
            asyncio.ensure_future(
                fetch_letter(
                    session,
                    consult_url if entry[0] == "1" else suggest_url,
                    entry[1],
                    limiter,
                )
            )
            for entry in originalId_list
        ]
        # gather() keeps input order, so pages line up with originalId_list.
        pages = await asyncio.gather(*downloads)

        writes = []
        for entry, page in zip(originalId_list, pages):
            template = (
                "context/letter_consultation/%s.html"
                if entry[0] == "1"
                else "context/letter_suggestion/%s.html"
            )
            writes.append(asyncio.ensure_future(write_file(template % entry[1], page)))

        await asyncio.gather(*writes)


def parse_letter_context(filepath,kind):
    """Parse one saved letter page and print its contents via ``Letter.show()``.

    Args:
        filepath: path of a UTF-8 HTML file saved by the downloader.
        kind: category label passed through to ``Letter`` unchanged.

    Returns:
        None.  If the page has fewer than the two ``<strong>`` elements the
        parser needs (title and replying institution), a failure message is
        printed and nothing else happens.

    Raises:
        AttributeError: if one of the expected ``<div>`` elements is missing.
        IndexError: if a "label:value" field contains no ":" separator.
    """
    # Parse inside the ``with`` so the file handle is always closed.
    with open(filepath,"r",encoding="utf-8") as letter:
        soup=BeautifulSoup(letter,"lxml")

    strong=soup.find_all("strong")
    # Both strong[0] and strong[1] are read below, so require two elements.
    if len(strong) < 2:
        print("爬取失败!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        return

    # Question part: title, sender, time and body.
    question_tittle=strong[0].string
    question_people=soup.find("div",class_="col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted").string.split(":")[1].strip()
    question_time = soup.find("div", class_="col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted").string.split(":")[1].strip()
    question_context=soup.find("div", class_="col-xs-12 col-md-12 column p-2 text-muted mx-2").string

    # Answer part: replying institution, time and body.
    answer_institution=strong[1].string
    answer_time=soup.find("div", class_="col-xs-12 col-sm-3 col-md-3 my-2").string.split(":")[1].strip()
    answer_context=soup.find("div", class_="col-xs-12 col-md-12 column p-4 text-muted my-3").string

    let=Letter(kind,question_tittle,question_people,question_time,question_context,answer_institution,answer_time,answer_context)
    let.show()


if __name__ == '__main__':
    # 1. Ask for the search filters and collect (letterType, originalId) pairs.
    id_list = get_original_id()

    # 2. Download and save every letter page.  asyncio.run() creates and
    #    closes the event loop itself; calling get_event_loop() outside a
    #    running loop is deprecated.
    asyncio.run(get_letter_context(id_list))

    # 3. Parse and print every saved page, consultations first.
    saved_pages = (
        ("context/letter_consultation", "咨询"),
        ("context/letter_suggestion", "建议"),
    )
    for folder, kind in saved_pages:
        for filename in os.listdir(folder):
            parse_letter_context(os.path.join(folder, filename), kind)

Letter

class Letter(object):
    """One citizen letter together with the official reply to it.

    Every constructor argument is stored unchanged on the attribute of the
    same name.  Values may be ``None``, and ``show()`` tolerates that.
    """

    def __init__(self,kind,question_tittle,question_people,question_time,question_context,answer_institution,answer_time,answer_context):
        # Question side.
        self.kind=kind
        self.question_tittle=question_tittle
        self.question_people=question_people
        self.question_time=question_time
        self.question_context=question_context

        # Answer side.
        self.answer_institution=answer_institution
        self.answer_time=answer_time
        self.answer_context=answer_context

    def show(self):
        """Print the letter and its reply to stdout as a labelled block.

        Fields are formatted rather than concatenated, so a ``None`` field
        prints as ``None`` instead of raising ``TypeError``.  Output for
        string fields is unchanged.
        """
        print("==============================================================================================================================")
        print("问题类别:{}".format(self.kind))
        print("问题标题:{}".format(self.question_tittle)," 来信人:{}".format(self.question_people)," 来信时间:{}".format(self.question_time))
        print("信件内容:")
        print(self.question_context)
        print("回复机构:{}".format(self.answer_institution)," 回复时间:{}".format(self.answer_time))
        print("回复内容:")
        print(self.answer_context)

 

posted @ 2023-08-04 22:23  突破铁皮  阅读(14)  评论(0)  编辑  收藏  举报