Beijing Municipal Government Citizen Letters Analysis in Practice, Part 1 (Scraping the Data with Python)
Because my Python version is 3.12, the commands for installing a few of the packages are slightly different from before (notably demjson3, the Python 3 fork of the old demjson package):
pip install beautifulsoup4
pip install demjson3
pip install requests
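Once the three packages are installed, a quick sanity check can confirm they are present and show the installed versions. This is a minimal sketch of my own, not part of the original script; it only relies on importlib.metadata from the standard library (Python 3.8+):

from importlib.metadata import version

# Print the installed version of each dependency; a missing package raises
# PackageNotFoundError, which means the corresponding pip install did not succeed.
for pkg in ("requests", "demjson3", "beautifulsoup4"):
    print(pkg, version(pkg))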
Without further ado, here is the code. The script pages through the replied-letter list endpoint with POST requests, decodes each response with demjson3 (a lenient JSON parser), fetches every letter's detail page, extracts the fields with BeautifulSoup, and appends one pipe-delimited line per letter to yijian.txt.
import json
import csv
import demjson3
import requests
from bs4 import BeautifulSoup

# Request headers copied from the browser so the endpoint accepts the POST
headers = {
    'Host': 'www.beijing.gov.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'text/json',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '155',
    'Origin': 'http://www.beijing.gov.cn',
    'Connection': 'keep-alive',
    'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'
}

if __name__ == "__main__":
    page = 1
    datas = json.dumps({})  # empty JSON body for the POST
    while page < 175:
        print(page)
        # List endpoint: returns 6 replied letters per page
        url = f"https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!replyLetterListJson.action?page.pageNo={page}&page.pageSize=6&orgtitleLength=26"
        r = requests.post(url, data=datas, headers=headers)
        rr = demjson3.decode(r.text)
        for item in rr.get("result", []):
            originalId = item.get("originalId")          # letter ID
            letterTypeName = item.get("letterTypeName")  # letter type, e.g. 咨询 (consultation)
            # Build the detail-page URL; consultation letters and suggestion letters use different flows
            detail_url = f"http://www.beijing.gov.cn/hudong/hdjl/com.web.{('consult' if letterTypeName == '咨询' else 'suggest')}.{('consultDetail' if letterTypeName == '咨询' else 'suggesDetail')}.flow?originalId={originalId}"
            r1 = requests.get(detail_url, headers={'user-agent': 'Mozilla/5.0'})
            if r1.status_code == 200:
                demo = r1.text
                soup = BeautifulSoup(demo, "html.parser")
                # Extract each field, falling back to "" when the element is missing
                title = soup.find("strong").get_text().replace("\n", " ") if soup.find("strong") else ""
                people_divs = soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})
                fromPeople = people_divs[0].get_text().lstrip('来信人:').strip() if people_divs else ""
                time_divs = soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})
                fromTime = time_divs[0].get_text().lstrip('时间:') if time_divs else ""
                problem_divs = soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})
                problem = problem_divs[0].get_text().strip().replace("\r", " ").replace("\n", " ") if problem_divs else ""
                office_divs = soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})
                office = office_divs[0].get_text().replace("\n", " ") if office_divs else ""
                answer_time_divs = soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})
                answerTime = answer_time_divs[0].get_text().lstrip('答复时间:') if answer_time_divs else ""
                answer_divs = soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})
                answer = answer_divs[0].get_text().strip().replace("\n", " ").replace("\r", " ") if answer_divs else ""
                # One pipe-delimited record per letter
                itemm = f"{originalId}|{letterTypeName}|{title}|{fromPeople}|{fromTime}|{problem}|{office}|{answerTime}|{answer}"
                with open("yijian.txt", 'a', encoding='utf-8') as fp:
                    fp.write(itemm + '\n')
            else:
                print(f"Failed to retrieve details for ID: {originalId}")
        page += 1
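Each letter ends up as one pipe-delimited line in yijian.txt, in the same field order as the f-string that builds itemm above. As a rough sketch of how the file can be read back for the analysis step, using the csv module the script already imports (the column names below are my own labels, not something the crawler writes):

import csv

# Field order matches the f-string that builds `itemm` in the crawler above.
FIELDS = ["originalId", "letterTypeName", "title", "fromPeople",
          "fromTime", "problem", "office", "answerTime", "answer"]

with open("yijian.txt", encoding="utf-8", newline="") as fp:
    for row in csv.reader(fp, delimiter="|"):
        record = dict(zip(FIELDS, row))
        print(record["letterTypeName"], record["title"])

Note that the crawler does not escape "|" inside the letter text, so a reply that happens to contain the delimiter will split into extra columns; that is a limitation of the plain-text output format rather than of the reader sketched here.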