使用urllib.request和BeautifulSoup对北京市政百姓信件进行爬取

    for page in range(start_page, end_page + 1):
        url = url.format(page)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        # a标签中前一个是信件的类型,后面那个是信件编号
        # 发送GET请求获取网页内容
        # 使用BeautifulSoup解析网页内容
        soup = BeautifulSoup(contents, "html.parser")
        a_tags = soup.find_all('a', onclick=True)

完整代码如下(爬取到的信件编号按信件类型分别存入不同的txt文件中):

import requests
from bs4 import BeautifulSoup
import urllib.request
import json
import re

def huoqu():
    """Fetch the first mail-list page of the Beijing government letters site.

    Sends a GET request (with the session cookie and a browser User-Agent so
    the site does not reject the crawler) and parses the response HTML.

    Returns:
        BeautifulSoup: the parsed HTML document of the mail-list page.
    """
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # target site URL
    # NOTE(review): this session cookie is hard-coded and will expire —
    # requests will start failing once it does.
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    request = urllib.request.Request(url, headers=headers)
    # Fix: the original never closed the HTTP response; `with` guarantees the
    # connection is released even if read() raises.
    with urllib.request.urlopen(request) as response:
        contents = response.read()
    # Parse the raw HTML so callers can query tags directly.
    return BeautifulSoup(contents, "html.parser")


def huoqu1(start_page, end_page):
    """Crawl the mail-list pages from start_page to end_page (inclusive).

    For each page, extract the letter type and letter id from every result
    link's onclick handler (``letterdetail('<type>', '<id>')``) and append
    the id to a per-type text file (type '1' -> 1.txt, '2' -> 2.txt,
    '3' -> 3.txt).

    Args:
        start_page (int): first page number to crawl.
        end_page (int): last page number to crawl (inclusive).
    """
    # NOTE(review): the URL contains no "{}" placeholder, so ``format(page)``
    # is a no-op and every iteration fetches the same page — TODO: add the
    # page parameter to the URL template.
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # target site URL
    # NOTE(review): hard-coded session cookie; will expire.
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    # Fix: use `with` so the output files are closed (and flushed) even if a
    # request or parse step raises; the original leaked them on error.
    with open('G:/python/pythonProject/信件爬取/1.txt', 'a') as f1, \
         open('G:/python/pythonProject/信件爬取/2.txt', 'a') as f2, \
         open('G:/python/pythonProject/信件爬取/3.txt', 'a') as f3:
        outputs = {'1': f1, '2': f2, '3': f3}  # letter type -> output file
        for page in range(start_page, end_page + 1):
            # Fix: the original did `url = url.format(page)`, clobbering the
            # URL template after the first iteration; format into a local.
            page_url = url.format(page)
            request = urllib.request.Request(page_url, headers=headers)
            with urllib.request.urlopen(request) as response:
                contents = response.read()
            soup = BeautifulSoup(contents, "html.parser")
            # In each result <a>'s onclick, the first argument is the letter
            # type and the second is the letter id.
            for element in soup.find_all('a', onclick=True):
                match = re.search(r"letterdetail\('(\d+)', '([^']+)'\)",
                                  element["onclick"])
                if not match:
                    continue
                onclick_param1 = match.group(1)  # letter type
                onclick_param2 = match.group(2)  # letter id
                out = outputs.get(onclick_param1)
                if out is not None:
                    out.write(onclick_param2 + '\n')
                print(f"onclick param 1: {onclick_param1}, onclick param 2: {onclick_param2}")


if __name__ == '__main__':
    # Script entry point: crawl mail-list pages 1 through 173.
    first_page, last_page = 1, 173
    huoqu1(first_page, last_page)

__EOF__

本文作者lss
本文链接https://www.cnblogs.com/lss1226/p/17674262.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角推荐一下。您的鼓励是博主的最大动力!
posted @   lss1226  阅读(17)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· winform 绘制太阳,地球,月球 运作规律
· 上周热点回顾(3.3-3.9)
点击右上角即可分享
微信分享提示