整理好的完整用法:先使用 webdriver 模拟驱动浏览器,再用 BeautifulSoup 进行提取。
样例代码:
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Location of the local chromedriver binary. A raw string keeps the Windows
# backslashes literal instead of relying on invalid escape sequences.
CHROMEDRIVER_PATH = r'E:\Google\Driver\chromedriver.exe'
SEARCH_URL = "http://www.baidu.com"
SEARCH_KEYWORD = '博二爷'
# Fixed pause (seconds) that gives the result page time to render.
# NOTE(review): an explicit WebDriverWait would be less timing-sensitive.
RESULT_WAIT_SECONDS = 2


def main():
    """Search for SEARCH_KEYWORD on SEARCH_URL and print the first result.

    Opens Chrome through chromedriver, types the keyword into the element
    with id ``kw``, clicks the element with id ``su``, waits, and then prints
    the element with id ``1``: its repr, its text and its ``href`` attribute.
    It also prints the outer and inner HTML of the element with id
    ``baidulink``, and finally prints "成功" when the first result's text
    starts with the keyword, otherwise "失败".

    The browser is always shut down on exit, even when a lookup fails.
    """
    # To drive a Chrome binary from a non-default location, create a
    # webdriver.ChromeOptions(), set its binary_location, and pass it to
    # webdriver.Chrome via options=...
    driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
    try:
        # Open the search page.
        driver.get(SEARCH_URL)

        # Type the query into the search box.
        keyword_box = driver.find_element(By.ID, 'kw')
        keyword_box.send_keys(SEARCH_KEYWORD)

        # Submit the search.
        search_button = driver.find_element(By.ID, 'su')
        search_button.click()

        # Wait for the results before scraping them.
        time.sleep(RESULT_WAIT_SECONDS)
        first_result = driver.find_element(By.ID, '1')
        print(first_result)

        # Visible text of the element.
        result_text = first_result.text
        print(result_text)

        # Value of a single attribute.
        print(first_result.get_attribute('href'))

        link = driver.find_element(By.ID, 'baidulink')
        # Full markup of the element, including its own tag.
        print(link.get_attribute('outerHTML'))
        # Only the markup nested inside the element.
        print(link.get_attribute('innerHTML'))

        if result_text.startswith("博二爷"):
            print("成功")
        else:
            print("失败")
    finally:
        # quit() ends the whole browser session; close() would only close
        # the current window.
        driver.quit()


# ---------------------------------------------------------------------------
# Notes: pulling a value out of an element's markup
#
# Sample markup:
#   <div id="food" style="margin-top:10px;color:red">
#       <span class="vegetable good">黄瓜</span>
#       <span class="meat">牛肉</span>
#       <p class="vegetable">南瓜</p>
#       <p class="vegetable">青菜</p>
#   </div>
#
# Option 1 - plain string splitting:
#   ele = driver.find_element(By.ID, 'food')
#   food_text = ele.get_attribute('innerHTML')
#   ret1 = food_text.split('</span>')[1]
#   ret2 = ret1.split('"')[1]
#
# Option 2 - parse the markup with BeautifulSoup:
#   from bs4 import BeautifulSoup
#   ele = driver.find_element(By.ID, 'food')
#   html = ele.get_attribute('innerHTML')
#   soup = BeautifulSoup(html, 'html5lib')
#   target = soup.find_all('span')[1]['class']  # a list
#   print(target)
#
# Read an attribute value: print(soup.find('a')['class'])
# Read the text content:   soup.find('a').get_text()
# ---------------------------------------------------------------------------


if __name__ == '__main__':
    main()