A small crawler framework that makes light use of Selenium
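The script works in two stages: Selenium drives headless Chrome to open the news listing page and keeps triggering the "load more" button until every article link has been rendered, then urllib and lxml fetch each article page and save its text into a local .txt file under ./环保网/新闻/.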

import os
import urllib.request

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

def gethreflist(url):
    # Selenium cannot attach request headers directly, so pass the User-Agent through Chrome options
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'

    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode
    chrome_options.add_argument("--user-agent=" + user_agent)

    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    # Open the listing page passed in by the caller
    driver.get(url)
    # Scroll to the end of the page
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

    # Click the "more content" button until it can't be found
    while True:
        try:
            more_content_button = driver.find_element(By.XPATH, '//a[@class="getMore"]')
            more_content_button.click()
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        except Exception:  # the button is gone or no longer clickable, so all content is loaded
            break

    # Get the final content after clicking all "more content" buttons
    content = driver.page_source

    # Close the WebDriver
    driver.quit()
    
    url_list = []
    tree = etree.HTML(content)
    href_list = tree.xpath('//div[@class="sideL fl"]//a[@class = "title"]/@href')
    href_list = [item for item in href_list if item != "javascript:;"]
    base_url = 'http://www.chinaenvironment.com'
    for href in href_list:
        url_list.append(base_url + href)
    return url_list
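
# The while-loop above clicks "getMore" blindly and breaks on the first exception, which
# also hides transient failures. A more patient variant could use Selenium's explicit
# waits; this is only a minimal sketch, and click_until_exhausted is an illustrative
# helper name rather than part of the original script.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_until_exhausted(driver, timeout=10):
    # Keep clicking the "getMore" button, waiting up to `timeout` seconds for it to
    # become clickable again after each click; stop once the wait times out.
    while True:
        try:
            button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, '//a[@class="getMore"]'))
            )
            button.click()
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        except TimeoutException:
            break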
    

def download_text(url_list):
    failed_page_num = 0
    for url in url_list:
        try:
            headers = {
                'Accept':'text/html, */*; q=0.01',
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'Cookie':'ASP.NET_SessionId=ycrocaebez3wg5fvn30v1mjv',
                'Host':'www.chinaenvironment.com',
                'Proxy-Connection':'keep-alive',
                'Referer':'http://www.chinaenvironment.com/zxxwlb/index_123_114250.html',
                'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                'X-Requested-With':'XMLHttpRequest'
            }
            request = urllib.request.Request(url=url,headers=headers)
            response = urllib.request.urlopen(request)
            content = response.read()
            tree = etree.HTML(content)
            name = tree.xpath('//div[@class="articleTit"]/text()')[0] + '.txt'
            name = name.replace("/", "")
            save_dir = './环保网/新闻/'
            os.makedirs(save_dir, exist_ok=True)  # make sure the output directory exists
            save_path = save_dir + name
            text = tree.xpath('//div[@class="edits"]//span/text()')
            result = '\n'.join(text)
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write(result)

        except Exception:
            failed_page_num += 1
            print("{} pages failed so far".format(failed_page_num))
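
# The per-page fetch could equally be done with the requests library instead of urllib.
# This is only a minimal sketch of that swap: requests is not used in the original
# script, and fetch_article_text is an illustrative helper name, not an existing function.
import requests

def fetch_article_text(url, headers):
    # Same extraction as download_text above, but via requests with an explicit timeout.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raise on HTTP errors instead of silently saving nothing
    tree = etree.HTML(response.content)
    title = tree.xpath('//div[@class="articleTit"]/text()')[0]
    paragraphs = tree.xpath('//div[@class="edits"]//span/text()')
    return title, '\n'.join(paragraphs)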

        
if __name__ == '__main__':
    url = "http://www.chinaenvironment.com/zxxwlb/index_123.html"  # listing page to crawl
    url_list = gethreflist(url)   # collect every news link on the listing page
    download_text(url_list)       # download the news text behind each link
    print('download complete!!')

 
