Python爬虫:无头浏览器爬虫
Ubuntu
使用chromium
sudo apt-get install -y chromium-browser # 安装浏览器,这步必须:如果只手动下载浏览器而不通过apt安装,运行时会因缺少依赖而报错。
或者参考下面的项目安装新版浏览器,并用binary_location指定浏览器位置(需要科学上网):
https://github.com/scheib/chromium-latest-linux
也可以不科学上网手动下载:
https://www.chromium.org/getting-involved/download-chromium
CentOS
使用firefox
yum -y install firefox
驱动:
下载对应的驱动后,给驱动文件加上可执行权限(chmod +x):
chrome:http://chromedriver.chromium.org/
firefox:https://github.com/mozilla/geckodriver/releases
使用浏览器的无头模式headless
安装模块:
pip3 install selenium beautifulsoup4 lxml # ChromeDriver
chrome
#!/usr/bin/env python
#coding=utf-8
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
# Page to fetch with headless Chrome/Chromium.
url = "http://www.qq.com"

# Run the browser without opening a visible window.
options = Options()
options.headless = True

# Windows example (browser binary + driver in the working directory):
#options.binary_location = "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe"
#driver = webdriver.Chrome(executable_path='chromedriver.exe', options=options)
# Linux: uncomment to point at a manually downloaded browser binary.
#options.binary_location = "/home/ubuntu/chrome-linux/chrome"

# `options=` is the supported keyword; `chrome_options=` is deprecated.
driver = webdriver.Chrome(executable_path='/home/ubuntu/chromedriver', options=options)
try:
    driver.get(url)
    # page_source is the DOM serialized by the browser after loading the page.
    html = driver.page_source
    print(html)
finally:
    # Always shut the browser down, even when loading the page fails;
    # otherwise the headless browser and driver processes are left running.
    driver.quit()
firefox
#!/usr/bin/env python
#coding=utf-8
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
# Page to fetch with headless Firefox.
url = 'http://www.qq.com/'

# Run the browser without opening a visible window.
options = Options()
options.headless = True

# No custom profile by default. `profile` must exist because it is passed to
# the driver below; leaving it undefined raises NameError at that call.
profile = None

# To route traffic through a proxy, uncomment the following lines:
#profile = webdriver.FirefoxProfile()
#profile.set_preference('network.proxy.type', 1)
#profile.set_preference('network.proxy.http', "127.0.0.1")
#profile.set_preference('network.proxy.http_port', 1080)
#profile.set_preference('network.proxy.socks', "127.0.0.1")
#profile.set_preference('network.proxy.socks_port', 1080)
#profile.set_preference('network.proxy.ssl', "127.0.0.1")
#profile.set_preference('network.proxy.ssl_port', 1080)
#profile.set_preference('network.proxy.ftp', "127.0.0.1")
#profile.set_preference('network.proxy.ftp_port', 1080)
# Optional extras:
#profile.set_preference("network.proxy.share_proxy_settings", True)
#profile.update_preferences()

# Windows example (browser binary + driver in the working directory):
#options.binary_location = "D:/Program Files/Mozilla Firefox/firefox.exe"
#driver = webdriver.Firefox(executable_path='geckodriver.exe', firefox_profile=profile, options=options)
# Linux: uncomment to point at a manually downloaded browser binary.
#options.binary_location = "/root/firefox-linux/bin/firefox"

# `options=` is the supported keyword; `firefox_options=` is the old spelling.
driver = webdriver.Firefox(executable_path='/root/geckodriver', firefox_profile=profile, options=options)
try:
    driver.get(url)
    # page_source is the DOM serialized by the browser after loading the page.
    html = driver.page_source
    print(html)
finally:
    # Always shut the browser down, even when loading the page fails;
    # otherwise the headless browser and driver processes are left running.
    driver.quit()