driver.find_element_by_*('*')返回首个匹配的元素,有匹配时等同driver.find_elements_by_*('*')[0],返回WebElement对象,有.send_keys()、.click()等操作方法,以及.text、.get_attribute('*')提取自家标签的内容、属性值。
Egの爬租房网agoda——
class="LazyLoad",连续点翻页键到页底才捕获全。有的网站class="… invisible",也得用browser爬。
import time,random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Firefox profile whose preferences are meant to switch off images, CSS,
# Flash and JavaScript.
profile = webdriver.FirefoxProfile()
profile.set_preference('permissions.default.image', 2)  # block images
profile.set_preference('permissions.default.stylesheet', 2)  # block CSS
# NOTE(review): the next two values are the string 'false', not the boolean
# False -- confirm Firefox actually honours them.
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # block Flash
profile.set_preference('javascript.enabled', 'false')  # block JavaScript

# Optional manual proxy, left disabled:
# ip='118.119.168.172:9999' # proxy address
# ip=[int(x) if x.isdigit() else x for x in ip.split(':')] # the port must be an int
# profile.set_preference('network.proxy.type', 1) # 1 means manual proxy
# profile.set_preference('network.proxy.http',ip[0])
# profile.set_preference('network.proxy.http_port',ip[1])
# profile.set_preference('network.proxy.ssl',ip[0])
# profile.set_preference('network.proxy.ssl_port',ip[1])
# profile.update_preferences()

# If the folder holding geckodriver / chromedriver is not on the system PATH,
# pass the executable_path argument as well.
binary = 'C:/Program Files/Mozilla Firefox/firefox.exe'  # Firefox lives in a custom location

options = webdriver.firefox.options.Options()
options.add_argument('-headless')  # run without a visible window

driver = webdriver.Firefox(
    firefox_profile=profile,
    firefox_binary=binary,
    firefox_options=options,
)
def run(url):
    """Print the hotel names found on every page of an Agoda search result.

    Loads ``url`` in the module-level ``driver``, presses Page Down repeatedly
    so the lazily loaded list entries are rendered, prints how many name
    elements were found and the text of the visible ones, then clicks the
    "next page" button. The loop ends when that button can no longer be
    clicked. The browser is always quit afterwards, even if a step fails.

    Args:
        url: Address of the first search-result page.
    """
    # Imported here so the narrower handler below needs no new file-level import.
    from selenium.common.exceptions import WebDriverException

    name_selector = 'ol.hotel-list-container li div ul li h3 span'
    try:
        driver.get(url)
        # On first load a booking pop-up covers the page; a single click lets
        # the paging keys reach the page.
        ActionChains(driver).click().perform()
        while True:
            # Alternative: driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            # It is unknown how many presses reach the bottom; 20 is a trial value.
            for _ in range(20):
                # send_keys presses AND releases the key; key_down alone is
                # meant for modifier keys and would leave Page Down held.
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(random.random())
            names = driver.find_elements_by_css_selector(name_selector)
            print(len(names))
            # Only visible elements are printed: some sites plant hidden
            # fields as an anti-scraping trap.
            for ele in names:
                if ele.is_displayed():
                    print(ele.text)
            try:
                # Every page except the last one has a next-page button.
                driver.find_element_by_id('paginationNext').click()
            except WebDriverException:
                break
    finally:
        driver.quit()
# Start the crawl from the search-result page of city 16670.
run(
    'https://www.agoda.com'
    '/zh-cn/pages/agoda/default/DestinationSearchResult.aspx?city=16670'
)
****************************************分割线****************************************
局の粮药: 删除下文俩网址和一句中文里的和蟹字
import time,re
from selenium import webdriver
# Browser session shared by the functions below.
driver = webdriver.Firefox()
# The address contains deliberately inserted extra characters; strip them
# before running (see the note at the top of this section).
indexUrl = 'http://ap反p1.sf击da.g爬ov.cn/data虫search/fac虫e3/di虫r.html'
def loginByBrowser():
    """Open the index page, enter the drug category and click through its pages.

    Loads ``indexUrl`` in the module-level ``driver``, waits three seconds,
    follows the category link, reads the total page count from the
    "page 1 of N" text in the page source and then clicks the paging image
    N - 1 times.

    Raises:
        IndexError: If the page-count text is not present in the page source.
    """
    css = driver.find_element_by_css_selector
    driver.get(indexUrl)
    time.sleep(3)
    driver.find_element_by_partial_link_text('国①产①药①品(').click()
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    total_pages = re.findall(r'第1页/共(\d+)页', driver.page_source)[0]
    pages = int(total_pages) - 1
    for _ in range(pages):
        # Presumably the "next page" image button -- verify against the site.
        css('img[src="images/data删an除niu_07.gif"]').click()
# Run the crawl only when this snippet is executed as a script.
if __name__ == "__main__":
    loginByBrowser()
****************************************分割线****************************************
登录百度:
import time
from selenium import webdriver

# Target site and the account to log in with (placeholders).
indexUrl = 'https://www.baidu.com/'
userName = '……'
pwd = '……'

# Chrome options: custom browser path, no info bar, optional headless mode,
# optional image blocking, and an extension to load.
options = webdriver.ChromeOptions()
options.binary_location = 'D:/Program Files/Browser/CentBrowser/Application/chrome.exe'
options.add_argument('disable-infobars')
# A captcha has to be read and typed by hand here, so headless mode and
# image blocking stay disabled in this example.
# options.add_argument('headless')
# options.add_experimental_option('prefs',{'profile.managed_default_content_settings.images':2})
options.add_extension('D:/广告终结者 3.2.2.crx')

driver = webdriver.Chrome(chrome_options=options)
def loginByBrowser():
    """Log in to the site at ``indexUrl``, pausing for a manually typed captcha.

    Uses the module-level ``driver``, ``userName`` and ``pwd``. After the form
    is submitted the function blocks on ``input()`` until the operator has
    typed the captcha in the browser window, then tries to submit once more.
    """
    # Imported here so the narrower handler below needs no new file-level import.
    from selenium.common.exceptions import WebDriverException

    driver.get(indexUrl)
    css = driver.find_element_by_css_selector
    css('#u1> a.lb').click()  # presumably the "log in" link -- verify against the page
    time.sleep(2)
    css('.tang-pass-footerBarULogin').click()  # presumably switches to username login
    css('[id$=userName]').send_keys(userName)
    css('[id$=password]').send_keys(pwd)
    css('#TANGRAM__PSP_10__submit').click()
    input('浏览器端手动输完验证码后,在本句句尾任敲一字母:')
    try:
        css('#TANGRAM__PSP_10__submit').click()
    except WebDriverException:
        # Best effort: the second submit is allowed to fail. Only Selenium
        # errors are ignored, so Ctrl-C and programming errors still surface.
        pass
# Perform the login only when this snippet is executed as a script.
if __name__ == "__main__":
    loginByBrowser()
****************************************分割线****************************************
登录58同城:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Firefox profile whose preferences are meant to switch off images, CSS,
# Flash and JavaScript.
profile = webdriver.FirefoxProfile()
profile.set_preference('permissions.default.image', 2)  # block images
profile.set_preference('permissions.default.stylesheet', 2)  # block CSS
# NOTE(review): the next two values are the string 'false', not the boolean
# False -- confirm Firefox actually honours them.
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # block Flash
profile.set_preference('javascript.enabled', 'false')  # block JavaScript

# NOTE(review): headless mode shows no window, yet loginByBrowser prompts the
# operator to type the SMS code in the browser -- confirm this is intended.
options = webdriver.firefox.options.Options()
options.add_argument('-headless')

driver = webdriver.Firefox(firefox_profile=profile, firefox_options=options)
def loginByBrowser(userName, pwd):
    """Log in to 58.com with a password, wait for the SMS code, then log out.

    Uses the module-level ``driver``. After the password is submitted the
    function blocks on ``input()`` until the operator confirms that the SMS
    verification code has been entered, waits four seconds and clicks the
    log-out link. The browser is quit at the end, including when a step fails.

    Args:
        userName: Account name typed into the user-name field.
        pwd: Password typed into the password field, followed by Enter.
    """
    try:
        driver.get('http://passport.58.com/login')
        css = driver.find_element_by_css_selector
        css('#pwdLogin').click()
        css('#usernameUser').send_keys(userName)
        css('#passwordUserText').send_keys(pwd, Keys.ENTER)
        input('浏览器端手动输入短信验证码并点击确定后,在本句句尾任敲一字母:')
        time.sleep(4)
        driver.find_element_by_link_text('退出').click()
    finally:
        # Without this an exception above would leave the browser process running.
        driver.quit()
# Placeholder credentials; replace with a real account before running.
loginByBrowser(
    '用户名',
    '密码',
)
****************************************分割线****************************************
# 核对省校平台申请毕业或学位的学生:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from openpyxl import Workbook
# The captcha is an image, so images are not blocked in this example.
driver = webdriver.Firefox()
# Short alias used by the functions below to look up a single element.
css = driver.find_element_by_css_selector
# Rows collected by studentsInfo() across all result pages.
allPage = []
def loginByBrowser(userName, pwd):
    """Open the platform's frameset page and fill in the login form.

    Only the two fields are typed in; nothing is submitted here.

    Args:
        userName: Text typed into the field whose name contains ``j_username``.
        pwd: Text typed into the field named ``j_password``.
    """
    driver.get('http://222.19.127.21/PRTVUWeb/pages/common/frameset.jsp')
    user_field = css('[name*=j_username]')
    user_field.send_keys(userName)
    password_field = css('[name=j_password]')
    password_field.send_keys(pwd)
def graduateApplication():
    """Open the graduation query page and submit it filtered on "applied"."""
    # In the first view after login, locator add-ons such as SelectorGadget
    # would not start, so the real address found by traffic capture is loaded
    # directly.
    driver.get('http://222.19.127.21/PRTVUWeb/pages/graduate/querystugraduate.jsp')
    # Drop-down "applied for graduation": pick the option meaning "exists".
    dropdown = Select(css('[name=ifAppGraduate]'))
    dropdown.select_by_value('1')
    submit_button = css('[type=submit]')
    submit_button.click()
def studentsInfo():
    """Collect the student rows of the current and all following result pages.

    For each page the page number and the row count are printed, and selected
    whitespace-separated fields of every row are appended as a list to the
    module-level ``allPage``. Paging stops as soon as the next-page button
    cannot be clicked.

    Pages are walked in a loop instead of one recursive call per page, and
    only a failure of the next-page click ends the walk; errors raised while
    reading a page are no longer silently discarded.
    """
    # Imported here so the narrower handler below needs no new file-level import.
    from selenium.common.exceptions import WebDriverException

    while True:
        page = driver.find_element_by_css_selector('[name=curPage]').get_attribute('value')
        # Rows are matched on two attributes at once.
        students = driver.find_elements_by_css_selector('tr[align=center][bgcolor]')
        print(f'输出第{page}页,人数:{len(students)}')
        for stu in students:
            info = stu.text.split()
            # Fields 2 and 7 are skipped. `+` builds a new list, whereas
            # list.extend()/list.append() return None.
            allPage.append(info[:2] + info[3:7] + info[8:10])
        try:
            # The last page has no next-page button.
            css('[value=下一页]').click()
        except WebDriverException:
            break
def saveToExcel():
    """Write a header row followed by every row in ``allPage`` to an .xlsx file."""
    header = ['序号','学习中心','学号','姓名','专业','层次','申请毕业','申请学位']
    workbook = Workbook()
    sheet = workbook.active
    for row in [header, *allPage]:
        sheet.append(row)
    workbook.save('E:/省校平台查毕业申请.xlsx')
# Script entry: log in, wait for the manual captcha, query, collect, save.
if __name__ == "__main__":
    loginByBrowser('用户名', '密码')
    # The captcha and the login click are done by hand in the browser window.
    input('浏览器端手动输完验证码并点击登录后,在本句句尾任敲一字母:')
    graduateApplication()
    studentsInfo()
    saveToExcel()
****************************************分割线****************************************
某宝的物品搜索:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
# If the driver executable's folder is not on the system PATH, pass its path
# to Firefox() instead.
driver = webdriver.Firefox()
# Explicit wait with a 9-second timeout, shared by the functions below.
wait = WebDriverWait(driver, 9)
# Locator strategy used in every (strategy, selector) pair below.
css = By.CSS_SELECTOR
def response():
    """Wait for the result list, then print one dict per product on the page.

    Each dict holds the keys ``img``, ``price``, ``sales``, ``title`` and
    ``location``, all read from the parsed page source.
    """
    wait.until(ec.presence_of_all_elements_located((css, '#mainsrp-itemlist .item')))
    # Everything needed is present now, so stop any further loading.
    driver.execute_script('window.stop()')
    soup = BeautifulSoup(driver.page_source, 'lxml')
    item_list = soup.find('div', 'm-itemlist')
    # Per the original note: 36 items here; 12 more that the API address
    # serves are missing.
    for item in item_list.find_all('div', 'item'):
        image = item.img
        product = {
            'img': image['data-src'] + '_360x360Q90.jpg',
            'price': item.find('div', 'price').text.strip(),
            'sales': item.find('div', 'deal-cnt').text[:-3],
            'title': image['alt'].split()[0],
            'location': item.find('div', 'location').text,
        }
        print(product)
def search(commodity):
    """Search Taobao for ``commodity`` and print the first page of results.

    Args:
        commodity: The search term; it is formatted into a string and typed
            into the search box.
    """
    driver.get('https://www.taobao.com/')
    search_box = wait.until(ec.presence_of_element_located((css, '#q')))
    search_box.send_keys(f'{commodity}')
    search_button = wait.until(ec.element_to_be_clickable((css, '.btn-search')))
    search_button.click()
    response()
def nextPage(page):
    """Jump to result page ``page`` via the page-number box and print its items.

    Args:
        page: Page number typed into the input box; its string form must
            appear in the ``span.num`` element before the items are printed.
    """
    page_box_locator = (css, '[aria-label=页码输入框]')
    inputBox = wait.until(ec.presence_of_element_located(page_box_locator))
    inputBox.clear()
    inputBox.send_keys(page)
    confirm_button = wait.until(ec.element_to_be_clickable((css, 'span.btn.J_Submit')))
    confirm_button.click()
    # Continue only once the page indicator shows the requested number.
    wait.until(ec.text_to_be_present_in_element((css, 'span.num'), str(page)))
    print(f'当前是第{page}页')
    response()
# Script entry: ask for a search term, then walk result pages 1 through 9.
if __name__ == "__main__":
    commodity = input('请输入要搜索的商品:')
    search(commodity)
    # search() already printed page 1; continue with pages 2 to 9.
    for page_number in range(2, 10):
        nextPage(page_number)