# 改进了 headers 的爬虫 (Cookies) — crawler with improved browser-like request headers, including Cookies
import os
import urllib.error
import urllib.parse
import urllib.request

from lxml import etree

# Section base URL (domestic news of the MEE Center for Environmental
# Education and Communications) and the local directory articles are saved to.
BASE_URL = 'http://www.chinaeol.net/hjxw/gnxw/'
SAVE_DIR = './生态环境部宣传教育中心/国内新闻/'

# Browser-like headers shared by every request. The Cookie / User-Agent pair
# mimics a real browser session so the site serves the pages normally.
# NOTE(review): the hard-coded Hm_* cookies are session-stamped values and
# will eventually expire — refresh them if requests start failing.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Cookie': 'Hm_lvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lpvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lvt_fb9e17abd59ff9a2c324890c5a701eca=1695037543; Hm_lvt_2ed05369c38555b813edc07a4dc8e126=1695037543; Hm_lpvt_fb9e17abd59ff9a2c324890c5a701eca=1695038268; Hm_lpvt_2ed05369c38555b813edc07a4dc8e126=1695038268',
    'Host': 'www.chinaeol.net',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}


def create_request(page):
    """Build a urllib Request for list page *page*.

    Page 1 lives at the bare section URL; later pages use the
    ``index_<page>.shtml`` naming scheme.
    """
    if page == 1:
        url = 'http://www.chinaeol.net/hjxw/gnxw'
    else:
        url = 'http://www.chinaeol.net/hjxw/gnxw/index_' + str(page) + '.shtml'
    return urllib.request.Request(url=url, headers=HEADERS)


def get_content(request):
    """Fetch *request* and return the raw response body as bytes."""
    # Context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_hreflist(content):
    """Extract the article URLs from one list page's HTML.

    ``javascript:;`` placeholder anchors are dropped, and each remaining
    href is resolved against the section base URL (urljoin handles
    relative hrefs such as ``./2023/...`` correctly, unlike plain
    string concatenation).
    """
    tree = etree.HTML(content)
    href_list = tree.xpath('//ul[@class="cj_tianlibu"]//a/@href')
    return [urllib.parse.urljoin(BASE_URL, href)
            for href in href_list
            if href != 'javascript:;']


def download_text(url_list):
    """Download each article in *url_list* and save its text under SAVE_DIR.

    One failing article is counted and reported but does not abort the
    remaining downloads.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    failed_page_num = 0
    for url in url_list:
        try:
            request = urllib.request.Request(url=url, headers=HEADERS)
            with urllib.request.urlopen(request) as response:
                content = response.read()
            tree = etree.HTML(content)
            # The article title becomes the file name; "/" would be read
            # as a path separator, so strip it out.
            name = tree.xpath('//span[@class="toptitle"]/text()')[0] + '.txt'
            name = name.replace('/', '')
            save_path = SAVE_DIR + name
            text = tree.xpath('//div[@class="TRS_Editor"]//span/text()')
            # Each fragment is prefixed with a newline (same layout as the
            # original accumulation loop, but built in one pass).
            result = ''.join('\n' + t for t in text)
            # Explicit UTF-8: the articles are Chinese and the platform
            # default encoding is not guaranteed to handle them.
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except (urllib.error.URLError, IndexError, OSError):
            # Network failure, missing title node, or file-system error:
            # count it and keep going with the next article.
            failed_page_num += 1
            print("{} pages failed in this page".format(failed_page_num))


if __name__ == '__main__':
    start_page = 1
    end_page = 1
    for page in range(start_page, end_page + 1):
        request = create_request(page)    # request for list page *page*
        content = get_content(request)    # fetch that page's HTML
        url_list = get_hreflist(content)  # collect its article links
        download_text(url_list)           # save each article's text
        print('第' + str(page) + '页下载完成')