requests really is the nicest to use. In any case, one of urllib, urllib3, or requests will usually do the job; it is worth trying several approaches (GPT can help explore them).
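
For reference, a minimal fallback sketch using the standard library's urllib.request, in case requests is unavailable (the fetch_with_urllib name and the 10-second timeout are illustrative assumptions, not part of the original script):

import urllib.request

def fetch_with_urllib(url):
    # Some sites reject urllib's default client string, so send a
    # browser-like User-Agent.
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    })
    with urllib.request.urlopen(req, timeout=10) as resp:
        return resp.read()  # raw bytes, equivalent to requests' .content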

import os

import requests
from lxml import etree

def create_request(page):
    # The first listing page has no index suffix; later pages do.
    if page == 1:
        url = 'http://www.zhb.org.cn/hbzx/news_2'
    else:
        url = 'http://www.zhb.org.cn/hbzx/news_2/index_' + str(page) + '.html'
    headers = {
        'Host': 'www.zhb.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    # Despite the function name, requests.get returns a Response object.
    response = requests.get(url, headers=headers, timeout=10)
    return response

def get_content(response):
    # Return the raw body as bytes; lxml's etree.HTML parses bytes directly.
    return response.content

def get_hreflist(content):
    tree = etree.HTML(content)
    # Collect article links from the listing page, dropping the
    # placeholder "javascript:;" entries.
    href_list = tree.xpath('//div[@class="newsbox_2"]//li/a/@href')
    href_list = [href for href in href_list if href != 'javascript:;']
    # The hrefs are site-relative, so prepend the domain.
    base_url = 'http://www.zhb.org.cn'
    return [base_url + href for href in href_list]

def download_text(url_list):
    failed_page_num = 0
    headers = {
        'Host': 'www.zhb.org.cn',
        # Browser-copied conditional headers (If-Modified-Since, If-None-Match)
        # are omitted here: they can trigger 304 Not Modified responses with
        # empty bodies, which would break the parsing below.
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    save_dir = './中国环境保护协会/新闻/'
    os.makedirs(save_dir, exist_ok=True)  # make sure the target folder exists
    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=10)
            tree = etree.HTML(response.content)
            # Use the article title as the file name; strip '/' so the
            # title remains a valid path component.
            name = tree.xpath('//div[@class="news_titlenr"]/text()')[0] + '.txt'
            name = name.replace('/', '')
            save_path = save_dir + name
            # Join the article body's paragraph texts with newlines.
            text = tree.xpath('//div[@class="news_nrbox"]//p/text()')
            result = '\n'.join(text)
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except Exception:
            failed_page_num += 1
            print('{} articles failed on this listing page'.format(failed_page_num))

if __name__ == '__main__':
    start_page = 2
    end_page = 263

    for page in range(start_page, end_page + 1):
        response = create_request(page)   # fetch listing page `page`
        content = get_content(response)   # get the page's HTML source
        url_list = get_hreflist(content)  # collect every news link on the page
        download_text(url_list)           # download the text of each linked article
        print('Page ' + str(page) + ' downloaded')
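
If many articles fail with transient network errors, wrapping the fetch in a small retry helper is one option. A minimal sketch (the get_with_retry name, retry count, and delay are illustrative assumptions, not part of the original script):

import time
import requests

def get_with_retry(url, headers, retries=3, delay=2):
    # Retry transient failures a few times before giving up.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # treat HTTP error codes as failures too
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)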

 
