Python爬虫:无头浏览器爬虫

Ubuntu

使用chromium

sudo apt-get install -y chromium-browser # 安装浏览器,这步必须;如果只手动下载浏览器而不执行这一步,运行时会因缺少依赖而报错。

或者看这个安装新版浏览器并用binary_location指定位置(需要科学上网):
https://github.com/scheib/chromium-latest-linux
也可以不科学上网手动下载:
https://www.chromium.org/getting-involved/download-chromium

CentOS

使用firefox

yum -y install firefox

驱动:

下载与浏览器对应的驱动,并为其添加可执行权限(chmod +x):
chrome:http://chromedriver.chromium.org/
firefox:https://github.com/mozilla/geckodriver/releases

使用浏览器的无头模式headless

安装模块:

pip3 install selenium beautifulsoup4 lxml # ChromeDriver

chrome

#!/usr/bin/env python
#coding=utf-8
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Fetch a page with headless Chrome/Chromium and print its rendered HTML.
url = "http://www.qq.com"

# Run the browser without opening a visible window.
options = Options()
options.headless = True

# Windows alternative: point at the installed Chrome and a local chromedriver.exe.
#options.binary_location = "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe"
#driver = webdriver.Chrome(executable_path='chromedriver.exe', options=options)

# Uncomment to use a manually downloaded browser build at this path.
#options.binary_location = "/home/ubuntu/chrome-linux/chrome"
# `options=` is the supported keyword; `chrome_options=` is deprecated.
driver = webdriver.Chrome(executable_path='/home/ubuntu/chromedriver', options=options)
try:
    driver.get(url)
    # page_source is the document as the browser currently holds it.
    html = driver.page_source
    print(html)
finally:
    # Always shut the browser down, even when loading the page fails, so no
    # headless browser/driver process is left running.
    driver.quit()

firefox

#!/usr/bin/env python
#coding=utf-8
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time

# Fetch a page with headless Firefox and print its rendered HTML.
url = 'http://www.qq.com/'

# Run the browser without opening a visible window.
options = Options()
options.headless = True

# Set to True to route traffic through the local proxy configured below.
use_proxy = False

# `profile` must always be bound because it is passed to webdriver.Firefox()
# below; None means "no custom profile".
profile = None
if use_proxy:
    profile = webdriver.FirefoxProfile()
    # network.proxy.type = 1 selects manual proxy configuration.
    profile.set_preference('network.proxy.type', 1)
    profile.set_preference('network.proxy.http', "127.0.0.1")
    profile.set_preference('network.proxy.http_port', 1080)
    profile.set_preference('network.proxy.socks', "127.0.0.1")
    profile.set_preference('network.proxy.socks_port', 1080)
    profile.set_preference('network.proxy.ssl', "127.0.0.1")
    profile.set_preference('network.proxy.ssl_port', 1080)
    profile.set_preference('network.proxy.ftp', "127.0.0.1")
    profile.set_preference('network.proxy.ftp_port', 1080)

    #profile.set_preference("network.proxy.share_proxy_settings", True)
    #profile.update_preferences()

# Windows alternative: point at the installed Firefox and a local geckodriver.exe.
#options.binary_location = "D:/Program Files/Mozilla Firefox/firefox.exe"
#driver = webdriver.Firefox(executable_path='geckodriver.exe', firefox_profile=profile, options=options)

# Uncomment to use a manually downloaded Firefox build at this path.
#options.binary_location = "/root/firefox-linux/bin/firefox"
# `options=` is the supported keyword; `firefox_options=` is deprecated.
driver = webdriver.Firefox(executable_path='/root/geckodriver', firefox_profile=profile, options=options)
try:
    driver.get(url)
    # page_source is the document as the browser currently holds it.
    html = driver.page_source
    print(html)
finally:
    # Always shut the browser down, even when loading the page fails, so no
    # headless browser/driver process is left running.
    driver.quit()
posted @ 2018-12-28 09:05  xuejianbest  阅读(2826)  评论(0编辑  收藏  举报