A small crawler framework that makes light use of selenium
import os
import urllib.request

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


def gethreflist(url):
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode

    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    # Open the URL
    driver.get(url)

    # Scroll to the end of the page
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

    # Click the "more content" button until it can no longer be found
    while True:
        try:
            more_content_button = driver.find_element(By.XPATH, '//a[@class="getMore"]')
            more_content_button.click()
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        except Exception:
            break

    # Get the fully loaded page after clicking every "more content" button
    content = driver.page_source

    # Close the WebDriver
    driver.quit()

    # Extract the article links from the rendered page and build absolute URLs
    url_list = []
    tree = etree.HTML(content)
    href_list = tree.xpath('//div[@class="sideL fl"]//a[@class="title"]/@href')
    href_list = [item for item in href_list if item != "javascript:;"]
    base_url = 'http://www.chinaenvironment.com'
    for href in href_list:
        url_list.append(base_url + href)
    return url_list


def download_text(url_list):
    failed_page_num = 0
    for url in url_list:
        try:
            headers = {
                'Accept': 'text/html, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'Cookie': 'ASP.NET_SessionId=ycrocaebez3wg5fvn30v1mjv',
                'Host': 'www.chinaenvironment.com',
                'Proxy-Connection': 'keep-alive',
                'Referer': 'http://www.chinaenvironment.com/zxxwlb/index_123_114250.html',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }
            request = urllib.request.Request(url=url, headers=headers)
            response = urllib.request.urlopen(request)
            content = response.read()

            # Parse the article title and body text
            tree = etree.HTML(content)
            name = tree.xpath('//div[@class="articleTit"]/text()')[0] + '.txt'
            name = name.replace("/", "")
            save_path = './环保网/新闻/' + name
            text = tree.xpath('//div[@class="edits"]//span/text()')
            result = ''
            for t in text:
                result = result + '\n' + t

            # Write the article text to disk, creating the folder if needed
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except Exception:
            failed_page_num += 1
            print("{} pages failed so far".format(failed_page_num))


if __name__ == '__main__':
    url = "http://www.chinaenvironment.com/zxxwlb/index_123.html"
    url_list = gethreflist(url)   # collect every news link on the main page
    download_text(url_list)       # download the news text behind each link
    print('download complete!!')
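
The "click until the button disappears" loop above relies on a bare try/except around find_element. Below is a minimal alternative sketch using Selenium's explicit waits, which fails faster and more predictably when the button is gone. The XPath selector is taken from the script above; the helper name click_until_exhausted and the timeout value are assumptions, not part of the original code.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def click_until_exhausted(driver, timeout=5):
    # Hypothetical helper: keep clicking the "more content" button until it
    # stops appearing within `timeout` seconds.
    while True:
        try:
            # Wait until the button exists and is clickable
            button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, '//a[@class="getMore"]'))
            )
            button.click()
            # Scroll to the bottom so the next batch of links is rendered
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        except TimeoutException:
            # No clickable button within the timeout: assume all content is loaded
            break

This could replace the while True block inside gethreflist; the rest of the script would stay unchanged.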