Beijing Municipal Citizen Letters Analysis in Practice, Part 1 (Scraping the Data with Python)

Because my Python version is 3.12, a few of the package installation commands differ slightly from before (in particular, the original demjson package no longer installs on current Python versions, so its maintained Python 3 fork demjson3 is used instead):

pip install beautifulsoup4

pip install demjson3

pip install requests
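
A note on demjson3: unlike the standard json module, it can parse "relaxed" JSON, for example object keys without quotes, which is presumably why the script below uses it to decode the reply-list response. A minimal illustration (the input string here is made up for the example, not real API output):

import demjson3

relaxed = "{page: 1, result: []}"   # note the unquoted keys

print(demjson3.decode(relaxed))     # -> {'page': 1, 'result': []}
# json.loads(relaxed) would raise json.JSONDecodeError on the same string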

Without further ado, here is the code:

import json
 
import demjson3
import requests
from bs4 import BeautifulSoup
 
# Request headers captured from the browser's AJAX call to the reply-list endpoint
headers = {
    'Host': 'www.beijing.gov.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'text/json',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'http://www.beijing.gov.cn',
    'Connection': 'keep-alive',
    'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'
}
 
if __name__ == "__main__":
    page = 1
    datas = json.dumps({})  # empty JSON body for the POST request

    while page < 175:  # hard-coded upper bound on the number of result pages to fetch
        print(page)
        url = f"https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!replyLetterListJson.action?page.pageNo={page}&page.pageSize=6&orgtitleLength=26"
        r = requests.post(url, data=datas, headers=headers)
        rr = demjson3.decode(r.text)  # the response is not always strict JSON, hence demjson3

        for item in rr.get("result", []):
            originalId = item.get("originalId"# 编号
            letterTypeName = item.get("letterTypeName"# 信件类型
 
            # Build the detail-page URL; consultations ("咨询") and suggestions use different endpoints
            if letterTypeName == '咨询':
                flow = "com.web.consult.consultDetail.flow"
            else:
                flow = "com.web.suggest.suggesDetail.flow"
            detail_url = f"http://www.beijing.gov.cn/hudong/hdjl/{flow}?originalId={originalId}"
            r1 = requests.get(detail_url, headers={'user-agent': 'Mozilla/5.0'})
 
            if r1.status_code == 200:
                demo = r1.text
                soup = BeautifulSoup(demo, "html.parser")
 
                title = soup.find("strong").get_text().replace("\n", "") if soup.find("strong") else ""
                fromPeople = soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() if soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"}) else ""
                fromTime = soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') if soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"}) else ""
                problem = soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip().replace("\r", "").replace("\n", "") if soup.find_all("div", {"class", "col-xs-12 col-md-12 column p-2 text-muted mx-2"}) else ""
                office = soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().replace("\n", "") if soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"}) else ""
                answerTime = soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') if soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"}) else ""
                answer = soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace("\n", "").replace("\r", "") if soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"}) else ""
 
                itemm = f"{originalId}|{letterTypeName}|{title}|{fromPeople}|{fromTime}|{problem}|{office}|{answerTime}|{answer}"
 
                with open("yijian.txt", 'a', encoding='utf-8') as fp:
                    fp.write(itemm + '\n')
            else:
                print(f"Failed to retrieve details for ID: {originalId}")
 
        page += 1
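
Each record ends up in yijian.txt as one pipe-separated line, so it can be read back with Python's built-in csv module by setting the delimiter to "|". A minimal sketch (the field names simply mirror the write order above):

import csv

fields = ["originalId", "letterTypeName", "title", "fromPeople", "fromTime",
          "problem", "office", "answerTime", "answer"]

with open("yijian.txt", encoding="utf-8", newline="") as fp:
    reader = csv.reader(fp, delimiter="|")
    letters = [dict(zip(fields, row)) for row in reader]

print(len(letters), "letters loaded")

Note that this only works as long as none of the free-text fields themselves contain a "|" character; switching the export to csv.writer with proper quoting would make the format more robust.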

  
