使用urllib.request和BeautifulSoup对北京市政百姓信件进行爬取(代码中虽然import了requests,实际发送请求用的是urllib.request)
# NOTE(review): standalone excerpt of the crawl loop from the full script below
# (`huoqu1`); `url`, `headers`, `start_page` and `end_page` are defined outside
# this fragment, so it does not run on its own.
for page in range(start_page, end_page + 1):
    # NOTE(review): the URL string contains no "{}" placeholder, so
    # str.format(page) is a no-op here — every iteration fetches the same page.
    url = url.format(page)
    # Send a GET request to fetch the page content.
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    contents = response.read()
    # Parse the page content with BeautifulSoup.
    soup = BeautifulSoup(contents, "html.parser")
    # In each <a> tag's onclick handler, the first argument is the letter
    # type and the second is the letter id.
    a_tags = soup.find_all('a', onclick=True)
整个代码为(爬取的结果,根据不同类型存入了不同的txt文件中)
import requests  # NOTE(review): imported but unused — all requests use urllib.request
from bs4 import BeautifulSoup
import urllib.request
import json  # NOTE(review): unused in the code shown
import re

# Matches the two arguments of the inline handler letterdetail('<t>', '<id>'):
# group 1 is the letter type ('1'/'2'/'3'), group 2 is the letter id.
# Compiled once instead of re.search-ing the pattern per <a> tag.
_LETTER_RE = re.compile(r"letterdetail\('(\d+)', '([^']+)'\)")


def _build_headers():
    """Return the shared request headers (User-Agent + session cookie).

    NOTE(review): the cookie is a captured browser session (it even contains
    two JSESSIONID entries) and will expire — presumably it must be refreshed
    before each run; verify against the site.
    """
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    return {"User-Agent": user_agent, "Cookie": cookie}


def huoqu():
    """Fetch the letter-list page once and return it as a BeautifulSoup tree."""
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # replace with the target site's URL
    request = urllib.request.Request(url, headers=_build_headers())
    response = urllib.request.urlopen(request)
    contents = response.read()
    return BeautifulSoup(contents, "html.parser")


def huoqu1(start_page, end_page):
    """Crawl pages start_page..end_page (inclusive) of the letter list and
    append each letter id to 1.txt / 2.txt / 3.txt according to its type.

    Fixes vs. the original: output files are now closed even if a request
    raises (try/finally), the onclick regex is compiled once, and the
    three-way if chain is replaced by a dict dispatch.
    """
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # replace with the target site's URL
    headers = _build_headers()
    # One output file per letter type, keyed by the type captured from onclick.
    paths = {
        '1': 'G:/python/pythonProject/信件爬取/1.txt',
        '2': 'G:/python/pythonProject/信件爬取/2.txt',
        '3': 'G:/python/pythonProject/信件爬取/3.txt',
    }
    files = {}
    try:
        for letter_type, path in paths.items():
            # Letter ids are ASCII, so the explicit encoding does not change
            # the written bytes but makes the file handling deterministic.
            files[letter_type] = open(path, 'a', encoding='utf-8')
        for page in range(start_page, end_page + 1):
            # NOTE(review): the URL contains no "{}" placeholder, so
            # str.format(page) is a no-op and every iteration fetches the
            # same page — real pagination presumably needs the POST form
            # parameters of this .flow endpoint; TODO confirm and fix.
            request = urllib.request.Request(url.format(page), headers=headers)
            response = urllib.request.urlopen(request)
            soup = BeautifulSoup(response.read(), "html.parser")
            # In each <a> tag's onclick, the first argument is the letter
            # type and the second is the letter id.
            for element in soup.find_all('a', onclick=True):
                match = _LETTER_RE.search(element["onclick"])
                if not match:
                    continue
                onclick_param1 = match.group(1)
                onclick_param2 = match.group(2)
                out = files.get(onclick_param1)
                if out is not None:
                    out.write(onclick_param2 + '\n')
                # Original printed for every regex match, even unknown types.
                print(f"onclick param 1: {onclick_param1}, onclick param 2: {onclick_param2}")
            # Flush after each page so partial progress survives a crash.
            for f in files.values():
                f.flush()
    finally:
        for f in files.values():
            f.close()


if __name__ == '__main__':
    huoqu1(1, 173)
__EOF__
本文作者:lss
本文链接:https://www.cnblogs.com/lss1226/p/17674262.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角【推荐】一下。您的鼓励是博主的最大动力!
本文链接:https://www.cnblogs.com/lss1226/p/17674262.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角【推荐】一下。您的鼓励是博主的最大动力!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· winform 绘制太阳,地球,月球 运作规律
· 上周热点回顾(3.3-3.9)