史高治

新随笔 联系 管理
driver.find_element_by_*('*')只返回第一个匹配的元素,有匹配时等同于driver.find_elements_by_*('*')[0];返回的WebElement对象有.send_keys()、.click()等操作方法,以及用来提取该标签自身文本、属性值的.text、.get_attribute('*')。
 
Egの爬订房网agoda——
页面元素的class="LazyLoad"(懒加载),要连续按翻页键直到页底,才能把内容捕获完整。有的网站class="… invisible",也得用browser爬。
 
import time,random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
 
# Firefox profile for scraping; per the original notes each preference is meant
# to switch off one kind of content (images, css, flash, js).
# NOTE(review): the last two values are the string 'false', not the boolean
# False -- confirm Firefox actually honours them.
profile = webdriver.FirefoxProfile()
for pref_name, pref_value in (
    ('permissions.default.image', 2),  # disable images
    ('permissions.default.stylesheet', 2),  # disable css
    ('dom.ipc.plugins.enabled.libflashplayer.so', 'false'),  # disable flash
    ('javascript.enabled', 'false'),  # disable js
):
    profile.set_preference(pref_name, pref_value)
 
# ip='118.119.168.172:9999'   #代理
# ip=[int(x) if x.isdigit() else x for x in ip.split(':')]    #端口是int型
# profile.set_preference('network.proxy.type', 1)    #1是手动代理
# profile.set_preference('network.proxy.http',ip[0])
# profile.set_preference('network.proxy.http_port',ip[1])
# profile.set_preference('network.proxy.ssl',ip[0])
# profile.set_preference('network.proxy.ssl_port',ip[1])
# profile.update_preferences()
 
# The directory holding the browser driver (geckodriver or chromedriver) must be
# on the system PATH; otherwise pass the executable_path argument.
binary = 'C:/Program Files/Mozilla Firefox/firefox.exe'  # Firefox is installed in a custom location
options = webdriver.firefox.options.Options()
options.add_argument('-headless')  # silent (headless) mode
driver = webdriver.Firefox(
    firefox_profile=profile,
    firefox_binary=binary,
    firefox_options=options,
)
 
def run(url):
    """Print the hotel names found on every result page reachable from *url*.

    For each page the function presses Page Down 20 times (the list is
    lazy-loaded), prints how many name elements matched, prints the text of
    the visible ones, then clicks the "next page" button. It stops when that
    button cannot be clicked and always quits the shared ``driver``.

    Args:
        url: Address of the first search-result page.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)
    # On first load a booking pop-up covers the page; a single click makes the
    # paging keys usable.
    ActionChains(driver).click().perform()
    try:
        while True:
            # Alternative: driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            # It is unknown how many Page Down presses reach the bottom; try 20.
            for _ in range(20):
                # send_keys presses *and releases* the key; key_down is meant
                # for modifier keys only and would leave PAGE_DOWN held.
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(random.random())
            names = driver.find_elements_by_css_selector(
                'ol.hotel-list-container li div ul li h3 span')
            print(len(names))
            # Only handle visible elements: some sites plant hidden fields
            # (invisible unless you read the page source) as an anti-scraping trap.
            for ele in names:
                if ele.is_displayed():
                    print(ele.text)
            try:
                # Every page except the last has a next-page button.
                driver.find_element_by_id('paginationNext').click()
            except WebDriverException:
                break
    finally:
        # Quit even when a page fails, so no browser process is left behind.
        driver.quit()
   
# Start the crawl from the search-result page for city 16670.
run(
    'https://www.agoda.com'
    '/zh-cn/pages/agoda/default/DestinationSearchResult.aspx?city=16670'
)
****************************************分割线****************************************
局の粮药: 删除下文俩网址和一句中文里的和蟹字
 
import time,re
from selenium import webdriver
# Shared Firefox session used by loginByBrowser().
driver=webdriver.Firefox()
# NOTE: according to the note preceding this script, the URL below contains
# decoy characters that have to be deleted before the script is run.
indexUrl='http://ap反p1.sf击da.g爬ov.cn/data虫search/fac虫e3/di虫r.html'
 
def loginByBrowser():
    """Open ``indexUrl``, follow the listing link and click through all pages.

    The total page count N is parsed from the "第1页/共N页" text in the page
    source; the next-page image is then clicked N-1 times.

    NOTE: according to the note preceding this script, the link text and the
    image selector below contain decoy characters that have to be deleted
    before the script is run.

    Raises:
        IndexError: If the page-count text is not found in the page source.
    """
    css = driver.find_element_by_css_selector
    driver.get(indexUrl)
    time.sleep(3)  # fixed pause, presumably to let the index page load
    driver.find_element_by_partial_link_text('国①产①药①品(').click()
    # Raw string: \d must reach the regex engine, not be read as a str escape.
    pages = int(re.findall(r'第1页/共(\d+)页', driver.page_source)[0]) - 1
    for _ in range(pages):
        css('img[src="images/data删an除niu_07.gif"]').click()
 
# Script entry point: run the page-through routine when executed directly.
if __name__ == '__main__':
    loginByBrowser()
****************************************分割线****************************************
登录百度:
 
# Start page and the account used by loginByBrowser(); the credential values
# are placeholders to be filled in.
indexUrl='https://www.baidu.com/'
userName='……'
pwd='……'
 
import time
from selenium import webdriver
 
options=webdriver.ChromeOptions()   # custom binary path, no info bar, headless, no images, load an extension
options.binary_location='D:/Program Files/Browser/CentBrowser/Application/chrome.exe'
options.add_argument('disable-infobars')
#options.add_argument('headless')    # a captcha is involved, so this example neither runs headless nor blocks images
#options.add_experimental_option('prefs',{'profile.managed_default_content_settings.images':2})
options.add_extension('D:/广告终结者 3.2.2.crx')
# Shared Chrome session used by loginByBrowser().
driver=webdriver.Chrome(chrome_options=options)
 
def loginByBrowser():
    """Log in through the browser, pausing for a manually entered captcha.

    Opens ``indexUrl``, opens the login dialog, switches to the user-name
    login form, fills in ``userName`` / ``pwd`` and submits. The operator then
    types the captcha in the browser and confirms on the console, after which
    the submit button is clicked once more on a best-effort basis.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    driver.get(indexUrl)
    css = driver.find_element_by_css_selector
    css('#u1> a.lb').click()
    time.sleep(2)  # fixed pause, presumably to let the login dialog render
    css('.tang-pass-footerBarULogin').click()
    css('[id$=userName]').send_keys(userName)
    css('[id$=password]').send_keys(pwd)
    css('#TANGRAM__PSP_10__submit').click()
    input('浏览器端手动输完验证码后,在本句句尾任敲一字母:')
    try:
        css('#TANGRAM__PSP_10__submit').click()
    except WebDriverException:
        # Best effort: the button may no longer be there or clickable.
        # Only WebDriver failures are ignored, so Ctrl-C and bugs still surface.
        pass
 
# Script entry point: run the login routine when executed directly.
if __name__ == '__main__':
    loginByBrowser()
****************************************分割线****************************************
登录58同城:
 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
 
# Firefox profile; per the original notes each preference is meant to switch
# off one kind of content.
# NOTE(review): the last two values are the string 'false', not the boolean
# False -- confirm Firefox actually honours them.
profile = webdriver.FirefoxProfile()
profile.set_preference('permissions.default.image', 2)  # disable images
profile.set_preference('permissions.default.stylesheet', 2)  # disable css
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # disable flash
profile.set_preference('javascript.enabled', 'false')  # disable js
# The login routine in this script asks the operator to type an SMS code into
# the browser window, so the window must be visible: '-headless' is
# deliberately NOT added here.
options = webdriver.firefox.options.Options()
driver = webdriver.Firefox(profile, firefox_options=options)
 
def loginByBrowser(userName,pwd):
    """Log in with a password, wait for manual SMS verification, then log out.

    Opens the login page, switches to password login, submits the credentials
    with Enter, waits for the operator to confirm on the console that the SMS
    code was entered in the browser, clicks the "退出" link and finally quits
    the shared ``driver``.

    Args:
        userName: Account name typed into the user-name field.
        pwd: Password typed into the password field.
    """
    css = driver.find_element_by_css_selector
    try:
        driver.get('http://passport.58.com/login')
        css('#pwdLogin').click()
        css('#usernameUser').send_keys(userName)
        css('#passwordUserText').send_keys(pwd, Keys.ENTER)
        input('浏览器端手动输入短信验证码并点击确定后,在本句句尾任敲一字母:')
        time.sleep(4)  # fixed pause before looking for the log-out link
        driver.find_element_by_link_text('退出').click()
    finally:
        # Quit even if a step above fails, so no browser process is left behind.
        driver.quit()
 
# Run the login flow; the two arguments are placeholder credentials.
loginByBrowser('用户名','密码')
****************************************分割线****************************************
# 核对省校平台申请毕业或学位的学生:
 
from selenium import webdriver
from selenium.webdriver.support.select import Select
from openpyxl import Workbook
 
driver=webdriver.Firefox()    # the captcha is an image, so this example does not block images
# Shorthand for single-element CSS lookups on the shared driver.
css=driver.find_element_by_css_selector
# Accumulates one list of fields per student across all result pages.
allPage=[]
 
def loginByBrowser(userName,pwd):
    """Open the platform's frameset page and type the credentials.

    Only fills in the two fields; nothing is submitted here.

    Args:
        userName: Text typed into the user-name field.
        pwd: Text typed into the password field.
    """
    driver.get('http://222.19.127.21/PRTVUWeb/pages/common/frameset.jsp')
    field_values = (
        ('[name*=j_username]', userName),
        ('[name=j_password]', pwd),
    )
    for selector, text in field_values:
        css(selector).send_keys(text)
 
def graduateApplication():
    """Open the graduation query page and submit the "applied" filter."""
    # In the first view after login, locator plug-ins such as SelectorGadget
    # would not even start, so go straight to the real URL found by capturing
    # the traffic.
    driver.get('http://222.19.127.21/PRTVUWeb/pages/graduate/querystugraduate.jsp')
    applied_dropdown = Select(css('[name=ifAppGraduate]'))
    applied_dropdown.select_by_value('1')  # drop-down "applied for graduation": exists
    submit_button = css('[type=submit]')
    submit_button.click()
    
def studentsInfo():
    """Append the student rows of every result page to ``allPage``.

    For each page: read the current page number, split each matching table
    row's text on whitespace, keep fields 0-1, 3-6 and 8-9 and append them to
    ``allPage``. Then click the next-page button; stop when that click fails.

    Pages are walked in a loop rather than by recursion, and only a failed
    click ends the walk, so errors while reading a page are no longer hidden.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    while True:
        page = driver.find_element_by_css_selector('[name=curPage]').get_attribute('value')
        # Locate rows by several attributes at once.
        students = driver.find_elements_by_css_selector('tr[align=center][bgcolor]')
        print(f'输出第{page}页,人数:{len(students)}')
        for stu in students:
            info = stu.text.split()
            # list.extend()/.append() return None, whereas + yields a new list.
            allPage.append(info[:2] + info[3:7] + info[8:10])
        try:
            # The last page has no next-page button.
            css('[value=下一页]').click()
        except WebDriverException:
            break
 
def saveToExcel():
    """Write a header row plus every entry of ``allPage`` to an .xlsx file."""
    book = Workbook()
    sheet = book.active
    header = ['序号','学习中心','学号','姓名','专业','层次','申请毕业','申请学位']
    # Header first, then one spreadsheet row per collected student.
    for row in [header, *allPage]:
        sheet.append(row)
    book.save('E:/省校平台查毕业申请.xlsx')
 
# Script entry point: fill in the credentials, wait for the operator to finish
# the captcha and log in by hand, then query, collect and export.
if __name__ == '__main__':
    loginByBrowser('用户名','密码')
    # Blocks until the operator confirms on the console.
    input('浏览器端手动输完验证码并点击登录后,在本句句尾任敲一字母:')
    graduateApplication()
    studentsInfo()
    saveToExcel()
****************************************分割线****************************************
某宝的物品搜索:
 
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
 
driver = webdriver.Firefox()    # if the driver executable's directory is not on the system PATH, pass its path inside the ()
# Explicit-wait helper on the shared driver with a 9-second timeout.
wait=WebDriverWait(driver,9)
# Locator strategy used in every (strategy, selector) tuple below.
css=By.CSS_SELECTOR
 
def response():
    """Parse the current result page and print one dict per listed item.

    Waits until the item elements are present, stops further page loading,
    then extracts image URL, price, sales, title and location from each item.
    """
    wait.until(ec.presence_of_all_elements_located((css,'#mainsrp-itemlist .item')))
    # Once everything needed has loaded, stop loading the rest of the page.
    driver.execute_script('window.stop()')
    soup = BeautifulSoup(driver.page_source,'lxml')
    item_list = soup.find('div','m-itemlist')
    # Author's note: 36 items; 12 that appear via the API URL are missing here.
    for item in item_list.find_all('div','item'):
        image = item.img
        product = {
            'img': image['data-src'] + '_360x360Q90.jpg',
            'price': item.find('div','price').text.strip(),
            'sales': item.find('div','deal-cnt').text[:-3],
            'title': image['alt'].split()[0],
            'location': item.find('div','location').text,
        }
        print(product)
def search(commodity):
    """Search the home page for *commodity* and print the first result page.

    Args:
        commodity: Search term typed into the search box.
    """
    driver.get('https://www.taobao.com/')
    search_box = wait.until(ec.presence_of_element_located((css,'#q')))
    search_box.send_keys(f'{commodity}')
    search_button = wait.until(ec.element_to_be_clickable((css,'.btn-search')))
    search_button.click()
    response()
 
def nextPage(page):
    """Jump to result page *page* via the page-number box and print its items.

    Args:
        page: Page number to type into the page-number input box.
    """
    box_locator = (css,'[aria-label=页码输入框]')
    submit_locator = (css,'span.btn.J_Submit')
    current_locator = (css,'span.num')

    page_box = wait.until(ec.presence_of_element_located(box_locator))
    page_box.clear()
    page_box.send_keys(page)
    jump_button = wait.until(ec.element_to_be_clickable(submit_locator))
    jump_button.click()
    # Continue only once the page indicator shows the requested number.
    wait.until(ec.text_to_be_present_in_element(current_locator, str(page)))
    print(f'当前是第{page}页')
    response()
 
# Script entry point: ask for a search term, print page 1, then pages 2 to 9.
if __name__ == '__main__':
    commodity=input('请输入要搜索的商品:')
    search(commodity)
    for x in range(2,10):
        nextPage(x)
posted on 2017-10-19 17:49  史高治  阅读(384)  评论(0)  编辑  收藏  举报