2024.09.09 Beijing Municipal Letters

 

Today was the first day of the new semester. The main task was writing a scraper for Beijing's municipal citizen letters (北京市政百姓信件) to collect data for later analysis: the list endpoint is queried page by page for letter IDs and types, and each letter's detail page is then scraped for its title, sender, dates, question, responding office, and reply.

import json

 
import demjson3
import requests
from bs4 import BeautifulSoup
import csv
 
headers = {
    'Host': 'www.beijing.gov.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'text/json',
    'X-Requested-With': 'XMLHttpRequest',
    # Content-Length is computed by requests from the body; hardcoding it can desync the request.
    'Origin': 'http://www.beijing.gov.cn',
    'Connection': 'keep-alive',
    'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'
}
 
if __name__ == "__main__":
    page = 1
    datas = json.dumps({})  # empty JSON body; the endpoint takes its parameters from the query string

    while page < 175:
        print(page)
        url = f"https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!replyLetterListJson.action?page.pageNo={page}&page.pageSize=6&orgtitleLength=26"
        r = requests.post(url, data=datas, headers=headers)
        # demjson3 tolerates non-strict JSON (e.g. unquoted keys), which json.loads would reject.
        rr = demjson3.decode(r.text)
 
 
        for item in rr.get("result", []):
            originalId = item.get("originalId")  # letter ID
            letterTypeName = item.get("letterTypeName")  # letter type ('咨询' = consultation, otherwise suggestion)

            # Build the detail-page URL; consultations and suggestions use different flows.
            detail_url = f"http://www.beijing.gov.cn/hudong/hdjl/com.web.{('consult' if letterTypeName == '咨询' else 'suggest')}.{('consultDetail' if letterTypeName == '咨询' else 'suggesDetail')}.flow?originalId={originalId}"
            r1 = requests.get(detail_url, headers={'user-agent': 'Mozilla/5.0'})
 
            if r1.status_code == 200:
                soup = BeautifulSoup(r1.text, "html.parser")

                def grab(cls, prefix=""):
                    # Text of the first <div> with the given class, minus an optional label prefix.
                    divs = soup.find_all("div", {"class": cls})
                    text = divs[0].get_text().strip() if divs else ""
                    return text.removeprefix(prefix).strip()  # str.removeprefix needs Python 3.9+

                title = soup.find("strong").get_text().replace("\n", "") if soup.find("strong") else ""
                fromPeople = grab("col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted", "来信人:")
                fromTime = grab("col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted", "时间:")
                problem = grab("col-xs-12 col-md-12 column p-2 text-muted mx-2").replace("\r", "").replace("\n", "")
                office = grab("col-xs-9 col-sm-7 col-md-5 o-font4 my-2").replace("\n", "")
                answerTime = grab("col-xs-12 col-sm-3 col-md-3 my-2", "答复时间:")
                answer = grab("col-xs-12 col-md-12 column p-4 text-muted my-3").replace("\n", "").replace("\r", "")
 
                # One pipe-delimited record per letter.
                itemm = f"{originalId}|{letterTypeName}|{title}|{fromPeople}|{fromTime}|{problem}|{office}|{answerTime}|{answer}"

                with open("yijian.txt", 'a', encoding='utf-8') as fp:
                    fp.write(itemm + '\n')
            else:
                print(f"Failed to retrieve details for ID: {originalId}")
 
        page += 1
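
Since the point of the scrape is analysis, the pipe-delimited yijian.txt can be loaded back into a table. A minimal sketch, assuming pandas is available and that no scraped field contains a literal "|" (the writer above does not escape delimiters):

import pandas as pd

# Column order mirrors the fields written by the scraper above.
columns = ["originalId", "letterTypeName", "title", "fromPeople",
           "fromTime", "problem", "office", "answerTime", "answer"]
df = pd.read_csv("yijian.txt", sep="|", header=None, names=columns, dtype=str)

# Example: how many consultations vs. suggestions were collected.
print(df["letterTypeName"].value_counts())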