04Selenium剩余部分及练习:爬取京东商品信息
昨日回顾
一、爬取豆瓣电影top250
1.爬取电影页
2.解析提取电影信息
3.保存数据
二、Selenium请求库
驱动浏览器往目标网站发送请求,获取响应数据
-不需要分析复杂的通信流程
-执行js代码
-获取动态数据
三、Selenium使用
driver = webdriver.Chrome()
隐式等待
driver.get('网站') 往某个网站发送请求
显式等待
driver.close()
四、选择器
element:查找一个
elements:查找多个
by_id
by_class_name
by_name
by_link_text
by_partial_link_text
by_css_selector
今日内容
一、Selenium剩余部分
1.元素交互操作
1.1 点击,清除
click
clear
示例:
from selenium import webdriver
from selenium.webdriver.common.by import By #按照什么方式查找,By.Id,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC #和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time
# Demo: type into JD's search box, click the search button, then clear the box
# and run a second search submitted with the Enter key.
driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    driver.get("https://www.jd.com/")
    time.sleep(5)

    # First search: fill the box (id 'key') and click the element with class 'button'.
    # The variable is deliberately not called `input`, which would shadow the builtin.
    search_box = driver.find_element_by_id('key')
    search_box.send_keys('围城')
    search_button = driver.find_element_by_class_name('button')
    search_button.click()
    time.sleep(3)

    # Second search: the box is looked up again after the click
    # (presumably because the results page replaced the original element).
    results_box = driver.find_element_by_id('key')
    results_box.clear()
    time.sleep(1)
    results_box.send_keys('墨菲定律')
    results_box.send_keys(Keys.ENTER)
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and can leave the chromedriver process running.
    driver.quit()
1.2 Action Chains
ActionChains 是一个动作链对象,用来把一系列鼠标、键盘动作串联起来,调用 perform() 后才会真正执行。
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.Id,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC #和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time
# Demo: drag the 'draggable' element onto the 'droppable' element in small
# steps using ActionChains.
driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    time.sleep(5)

    # The two elements are looked up after switching into the 'iframeResult' frame.
    driver.switch_to.frame('iframeResult')
    time.sleep(1)

    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # Approach 1 (instant move): queue the whole drag as one action; nothing
    # happens until perform() is called:
    #     ActionChains(driver).drag_and_drop(source, target).perform()

    # Approach 2 (used here): move a little at a time.
    print(source.tag_name)
    print(source.text)
    print(source.size)
    print(target.location)
    print(source.location)
    # Horizontal distance between the two elements' top-left corners.
    distance = target.location['x'] - source.location['x']

    # Press and hold the source element. Each separate action gets its own
    # ActionChains object instead of sharing one.
    ActionChains(driver).click_and_hold(source).perform()
    moved = 0
    while moved < distance:
        # A fresh ActionChains per step, moving 2 pixels to the right.
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        moved += 2
        time.sleep(0.1)
    # Drop the element.
    ActionChains(driver).release(source).perform()
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and can leave the chromedriver process running.
    driver.quit()
1.3 frame切换
driver.switch_to.frame(frame 的id名)
1.4 执行js代码
# Demo: run a JavaScript snippet in the page after loading it.
driver.get("https://www.baidu.com/")
alert_js = '''
alert("你好")
'''
driver.execute_script(alert_js)
# Pause so the alert stays visible for a few seconds.
time.sleep(5)
1.5 其他
#模拟浏览器的前进后退
import time
from selenium import webdriver
# Demo: visit three pages in order, then step back and forward through the
# browser history.
browser = webdriver.Chrome()
for page_url in (
    'https://www.baidu.com',
    'https://www.taobao.com',
    'http://www.sina.com.cn/',
):
    browser.get(page_url)
# Go back one page in history.
browser.back()
time.sleep(10)
# Go forward again.
browser.forward()
browser.close()
二、练习:爬取京东商品信息
简单版本
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #键盘按键操作
import time  # required by time.sleep below; this snippet's import header omits it

# Simple version: search JD for one keyword and append every product on the
# first results page to jd.txt.
driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')
    # Type the keyword into the search box (id 'key') and submit with Enter.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('墨菲定律')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)

    goods = driver.find_elements_by_class_name('gl-item')
    # Open the output file once for the whole page instead of once per item.
    with open('jd.txt', 'a', encoding='utf-8') as f:
        for good in goods:
            # Product name
            name = good.find_element_by_css_selector('.p-name em').text
            # Product price
            price = good.find_element_by_class_name('p-price').text
            # Product link
            url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
            # Product review summary
            commit = good.find_element_by_class_name('p-commit').text
            good_content = (
                '\n'
                f'商品名称:{name}\n'
                f'商品价格:{price}\n'
                f'商品链接:{url}\n'
                f'商品评价:{commit}\n'
                '\n'
                '\n'
            )
            print(good_content)
            f.write(good_content)
            print("写入成功")
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and can leave the chromedriver process running.
    driver.quit()
改良版本
(加入了自动下拉加载商品与点击下一页):
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #键盘按键操作
def get_goods(driver):
    """Append the products of the current and all following result pages to jd.txt.

    For each page: scroll down, read every element with class 'gl-item',
    print and append its name, price, link and review text to ``jd.txt``,
    then click the 'pn-next' element and repeat. Item numbers restart at 1
    on every page.

    The loop ends when no 'pn-next' element is found or when that element's
    class contains 'disabled' (assumed to mark the last page -- confirm
    against the live site).

    The driver is NOT closed here; the caller that created it is responsible
    for shutting it down.

    Args:
        driver: A Selenium WebDriver already showing a JD search results page.
    """
    import time  # local import: the import header above this function omits it

    js_code = '\nwindow.scrollTo(0,5000)\n'

    # Iterate over pages instead of recursing, so a long result list cannot
    # exhaust the recursion limit.
    while True:
        # Scroll down before reading the product elements.
        driver.execute_script(js_code)

        goods = driver.find_elements_by_class_name('gl-item')
        # Open the output file once per page instead of once per item.
        with open('jd.txt', 'a', encoding='utf-8') as f:
            for num, good in enumerate(goods, start=1):
                # Product name
                name = good.find_element_by_css_selector('.p-name em').text
                # Product price
                price = good.find_element_by_class_name('p-price').text
                # Product link
                url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
                # Product review summary
                commit = good.find_element_by_class_name('p-commit').text
                good_content = (
                    '\n'
                    f'num:{num}\n'
                    f'商品名称:{name}\n'
                    f'商品价格:{price}\n'
                    f'商品链接:{url}\n'
                    f'商品评价:{commit}\n'
                )
                print(good_content)
                f.write(good_content)
                print("写入成功")

        # Find the "next page" button; the plural lookup returns an empty
        # list instead of raising when there is none.
        next_tags = driver.find_elements_by_class_name('pn-next')
        if not next_tags:
            break
        next_tag = next_tags[0]
        if 'disabled' in (next_tag.get_attribute('class') or ''):
            break
        next_tag.click()
        time.sleep(5)
if __name__ == '__main__':
    # Entry point: open JD, search for the keyword, then hand the results
    # page to get_goods().
    driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')
    try:
        driver.implicitly_wait(10)
        driver.get('https://www.jd.com/')
        # Type the keyword into the search box (id 'key') and submit with Enter.
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys('墨菲定律')
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and can leave the chromedriver process running.
        driver.quit()