记一次使用python的selenium库爬取动态页面内容的经历

安装与配置selenium

  1. 安装selenium库(后文脚本还用到了BeautifulSoup,需一并安装)
pip install selenium beautifulsoup4
  2. 配置浏览器驱动(本次使用Google Chrome)

    1. 打开Chrome,在浏览器的地址栏输入chrome://version/,回车后即可查看到对应版本
    2. 找到对应的chromedriver版本,可参考《版本选择 | ChromeDriver | Chrome for Developers》
    3. 下载版本匹配的chromedriver,将其解压并置于自选路径下,并将chromedriver.exe所在路径加入环境变量PATH。
  3. 验证:

    编写并运行以下python文件,若不报错,则基本可认定配置成功:

    from selenium import webdriver
    # Chrome browser: starting a session is the actual configuration check
    driver = webdriver.Chrome()
    # Close the browser and stop the driver process once the check has passed
    driver.quit()
    

编写脚本

本次实现的是在OpenStack Dashboard(Horizon)里寻找符合条件的镜像,并将其名称和ID记录到images.txt中,以便后续下载。镜像要求是名称以“1_”开头,且配置了qga(QEMU Guest Agent,即属性hw_qemu_guest_agent为yes)。

脚本如下:

from selenium import webdriver
from selenium.common import ElementClickInterceptedException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Credentials typed into the Horizon login form (replace the placeholders)
username = "your_username"
password = "your_password"

# OpenStack Horizon pages: the login form and the image list
login_url = "https://192.168.0.60/horizon/auth/login/"
image_url = "https://192.168.0.60/horizon/project/images/"

# Detail-page URLs of candidate images, appended to by get_images()
may_url = list()

# Set up the webdriver
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # headless mode: do not open a real browser window
options.add_argument('--ignore-certificate-errors')  # ignore certificate errors
driver = webdriver.Chrome(options=options)

def _strip_extension(raw_name):
    """Return raw_name without surrounding whitespace and without its last ".suffix"."""
    return raw_name.strip().rsplit('.', 1)[0]


def _collect_candidate_links(page_html, seen_links):
    """Append to may_url the detail URLs of images on this page named "1_*".

    page_html is the HTML of the image list page; seen_links is a set of URLs
    already collected, used so that a page scraped twice adds nothing new.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    for row in soup.find_all('tr', class_='ng-scope'):
        columns = row.find_all('td')
        # The name link is read from the fourth cell, so four cells are required.
        if len(columns) <= 3:
            continue
        anchor = columns[3].find('a')
        if anchor is None or not anchor.get('href'):
            continue  # row without a usable name link
        if not _strip_extension(anchor.text).startswith("1_"):
            continue
        url = "https://192.168.0.60/horizon/" + anchor['href']
        if url not in seen_links:
            seen_links.add(url)
            may_url.append(url)


def _has_qemu_guest_agent(details_soup):
    """Return True if the detail page lists property hw_qemu_guest_agent == "yes"."""
    properties = details_soup.find_all(
        'div', {'ng-repeat': 'prop in ctrl.image.properties'}
    )
    for prop in properties:
        dt = prop.find('dt')
        dd = prop.find('dd')
        if dt is None or dd is None:
            continue  # malformed property entry
        if dt.text.strip() == "hw_qemu_guest_agent" and dd.text.strip() == "yes":
            return True
    return False


def get_images():
    """Log in to Horizon, find "1_*" images with the guest agent enabled, save them.

    Steps:
      1. Sign in at login_url with domain "default" and the module-level
         username/password.
      2. Walk every page of the image list at image_url and append the detail
         URL of each image whose name starts with "1_" to may_url.
      3. Open each URL in may_url and keep the images whose properties contain
         hw_qemu_guest_agent == "yes".
      4. Write one "Name: ..., ID: ..." line per kept image to images.txt.

    The shared driver is always quit at the end, even when a step fails.

    Raises:
        selenium.common.TimeoutException: if the login form, the post-login
            page, the image table or an image detail page does not appear in
            time.
    """
    try:
        # Open the login page and wait for the username field
        driver.get(login_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )

        # Fill in domain, username and password
        driver.find_element(By.NAME, "domain").send_keys("default")
        driver.find_element(By.NAME, "username").send_keys(username)
        driver.find_element(By.NAME, "password").send_keys(password)

        # Submit the form
        driver.find_element(By.XPATH, '//button[@type="submit"]').click()

        # Confirm the login by waiting for an element that only exists afterwards
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "main_content"))
        )
        print("登录成功")

        # Go to the image list
        driver.get(image_url)

        all_images = []
        processed_image_ids = set()
        seen_links = set(may_url)

        while True:
            # Wait for the table body to exist
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "tbody"))
            )

            # Give the rows time to render, but never wait forever: a page
            # with fewer than 3 rows is parsed as-is after the timeout.
            try:
                WebDriverWait(driver, 20).until(
                    lambda d: len(d.find_elements(By.XPATH, "//tbody/tr")) >= 3
                )
            except TimeoutException:
                print("Fewer than 3 table rows after 20s; parsing the page as-is")

            _collect_candidate_links(driver.page_source, seen_links)

            # Move to the next page; stop when the button cannot be clicked
            try:
                next_button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH,
                                                '//span[@ng-hide="currentPage === numPages"]/a[@ng-click="selectPage(currentPage + 1)"]'))
                )
                next_button.click()

            except (ElementClickInterceptedException, TimeoutException) as e:
                print(f"Failed to click the next button: {e}")
                break

        for link in may_url:
            print(link)
            driver.get(link)
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "dl"))
            )

            details_soup = BeautifulSoup(driver.page_source, 'html.parser')
            if not _has_qemu_guest_agent(details_soup):
                continue

            # Find the element that holds the image name
            name_element = details_soup.find('span', class_='h1 ng-binding')
            if name_element is None:
                print(f"Image name not found on {link}; skipping")
                continue
            name_column = _strip_extension(name_element.text)

            # The image ID is the last path segment; tolerate a trailing slash
            image_id = driver.current_url.rstrip('/').split('/')[-1]
            if image_id not in processed_image_ids:
                all_images.append({'name': name_column, 'id': image_id})
                processed_image_ids.add(image_id)
                print(f"Name: {name_column}, ID: {image_id}")

        # Explicit encoding so non-ASCII image names are written on any platform
        with open("images.txt", "w", encoding="utf-8") as file:
            for image in all_images:
                file.write(f"Name: {image['name']}, ID: {image['id']}\n")
    finally:
        driver.quit()

# Run the scrape only when this file is executed as a script, not on import
if __name__ == "__main__":
    get_images()

参考资源

  1. Selenium安装WebDriver最新Chrome驱动(含116/117/118/119)_chromedriver 119-CSDN博客
  2. selenium入门超详细教程——网页自动化操作-CSDN博客
posted @ 2024-06-27 17:27  月光下的犹大  阅读(12)  评论(0)  编辑  收藏  举报