Python selenium 爬虫设置

在 Linux 服务器上可以使用 Selenium 来进行网页爬取。为了在 Linux 服务器上使用 Selenium，通常需要做一些额外的配置，特别是与浏览器及其驱动程序的兼容性相关的配置。

配置步骤

1.安装 Selenium： 你需要确保安装了 Selenium 库。可以使用以下命令来安装：
```
pip install selenium
```
2.安装浏览器驱动： Selenium 需要与浏览器的驱动程序一起使用。对于 Chrome 和 Firefox，常用的驱动程序是：

　　　　Chrome: 需要安装 ChromeDriver

　　　　Firefox: 需要安装 GeckoDriver

3.安装无头浏览器（Headless）： 在服务器上，你通常需要使用无头模式的浏览器，例如 Chrome 或 Firefox 无头模式。以下是如何配置无头浏览器：

　　使用 Chrome 无头模式：

1. 下载 Google Chrome 安装包

wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

　　2. 安装 Google Chrome

　　下载完成后，使用 dpkg 命令安装 Chrome：

sudo dpkg -i google-chrome-stable_current_amd64.deb

如果安装时遇到依赖问题，可以使用以下命令来修复：

sudo apt-get install -f

　　3. 验证安装

　　安装完成后，你可以通过以下命令验证 Chrome 是否安装成功：

google-chrome-stable --version

　　4. 安装 ChromeDriver

　　安装好 Google Chrome 后，你还需要安装对应版本的 ChromeDriver。可以手动下载 ChromeDriver 或使用以下命令：

　　ChromeDriver 地址： https://googlechromelabs.github.io/chrome-for-testing/#stable

# 下载 ChromeDriver（请根据你安装的 Chrome 版本选择合适的版本）
wget https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.204/linux64/chromedriver-linux64.zip
# 解压并移动到 /usr/local/bin（压缩包解压后会得到 chromedriver-linux64 目录）
unzip chromedriver-linux64.zip
sudo mv chromedriver-linux64/chromedriver /usr/local/bin/

5. 使用无头模式启动 Selenium

完成上述步骤后，你就可以使用无头模式启动 Chrome 进行 Selenium 自动化操作了，如下所示：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configure Chrome to run without a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")  # enable headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://example.com")
    print(driver.title)
finally:
    # Quit even when navigation fails so the browser session is not left open.
    driver.quit()

完整的多线程爬虫例子

import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.common.by import By


def is_file_empty(file_path):
    """Return True when the file at ``file_path`` has a size of zero bytes."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes == 0


def setup_driver():
    """Build and return a Chrome WebDriver configured with the flags below."""
    chrome_flags = (
        "--headless",  # headless mode (optional)
        "--disable-gpu",
        "--no-sandbox",
    )
    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    return webdriver.Chrome(options=options)


def get_img_url(driver, url, title="", image_cls=""):
    """Collect image links from ``url`` and append them to a text file.

    The page is opened with ``driver`` and scrolled to the bottom repeatedly
    until its height stops changing. The ``src`` values of ``.imgbox img``
    elements found inside ``main-wrapper`` containers are then appended, one
    per line and with any query string removed, to
    ``{image_cls}/{title}/urls.txt``. Any exception is printed together with
    its traceback and is not re-raised.
    """
    height_script = "return document.body.scrollHeight"
    excluded_prefix = "https://static.zcool.cn"
    try:
        driver.get(url)
        time.sleep(2)  # give the page time to load

        # Keep scrolling to the bottom until the page height stops changing.
        pause_seconds = 4
        previous_height = driver.execute_script(height_script)
        reached_bottom = False
        while not reached_bottom:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause_seconds)
            current_height = driver.execute_script(height_script)
            reached_bottom = current_height == previous_height
            previous_height = current_height

        # Containers that hold the images of interest.
        containers = driver.find_elements(By.CLASS_NAME, "main-wrapper")

        # Make sure the destination folder exists before writing.
        target_dir = f"{image_cls}/{title}"
        os.makedirs(target_dir, exist_ok=True)

        # Append every accepted link to the per-title list file.
        with open(os.path.join(target_dir, "urls.txt"), "a+") as out:
            for container in containers:
                for img in container.find_elements(By.CSS_SELECTOR, ".imgbox img"):
                    src = img.get_attribute("src")
                    if not src or not src.startswith("http"):
                        continue
                    if src.startswith(excluded_prefix):
                        continue
                    src = src.split("?")[0]  # drop the query string
                    print("img_url:", src)
                    out.write(src + "\n")
    except Exception as e:
        print(f"Error processing {url}: {e}")
        traceback.print_exc()


def process_url(title_url_pair, image_cls):
    """Scrape one page unless its image links were already collected.

    Args:
        title_url_pair: Two-item sequence ``(title, url)``.
        image_cls: Category directory; output goes to ``{image_cls}/{title}``.

    A new WebDriver is created for each call and is always quit afterwards,
    even when scraping raises.
    """
    title, url = title_url_pair
    output_dir = f"{image_cls}/{title}"
    # This must be the file get_img_url writes ("urls.txt"). The previous
    # "url.txt" never existed, so the size check raised FileNotFoundError as
    # soon as the directory was present and finished titles were never skipped.
    output_txt = os.path.join(output_dir, "urls.txt")
    # Skip only when the list file exists and has content; a missing or
    # empty file means the page still needs to be scraped.
    if os.path.isfile(output_txt) and os.path.getsize(output_txt) > 0:
        print(f"已存在文件夹: {output_dir}")
        return

    driver = setup_driver()
    try:
        get_img_url(driver, url, title, image_cls)
    finally:
        driver.quit()


def process_file(file_name):
    """Run the scraper over every ``title----url`` record listed in a file.

    The file name without its extension is used as the category directory
    passed to ``process_url``. Records are handled by a pool of three worker
    threads; a failing record is reported and the remaining ones continue.
    """
    image_cls, _ = os.path.splitext(file_name)
    print(f"开始处理文件: {file_name}")

    # Load the pending records, skipping blank lines.
    data = []
    with open(file_name, "r") as f:
        for raw_line in f:
            record = raw_line.strip()
            if record:
                data.append(record.split("----"))

    total_lines = len(data)
    print(f"文件 {file_name} 共 {total_lines} 条记录")

    # Fan the records out to a small thread pool.
    worker_count = 3
    with ThreadPoolExecutor(worker_count) as executor:
        futures = [executor.submit(process_url, pair, image_cls) for pair in data]
        done = 0
        for future in futures:
            done += 1
            try:
                future.result()  # re-raises any exception from the worker
                print(f"文件 {file_name} 已处理 {done}/{total_lines} 条记录")
            except Exception as e:
                print(f"Thread failed: {e}")


if __name__ == "__main__":
    # Input lists to process, one after another.
    files = [
        "product_photo.txt",
        "portrait_photography.txt",
        "fashion_photo.txt",
    ]

    for pending_file in files:
        process_file(pending_file)

    print("所有文件处理完成。")

posted on 2024-12-31 10:32 星河赵阅读(38) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· CentOS 7下安装配置Nginx

· Linux服务器配置普通用户具有root权限（免密）

· selenium简单应用

· Linux 平台部署 Selenium

· 爬虫学习07之selenium

阅读排行：
· 无需6万激活码！GitHub神秘组织3小时极速复刻Manus，手把手教你使用OpenManus搭建本
· Manus爆火，是硬核还是营销？
· 终于写完轮子一部分：tcp代理了，记录一下
· 别再用vector＜bool＞了！Google高级工程师：这可能是STL最大的设计失误
· 单元测试从入门到精通

一天一点到