爬取京东商品信息 + 元素交互操作 + bs4
2019-07-03 20:13 tankyy 阅读(492) 评论(0) 编辑 收藏 举报
1、爬取京东商品信息
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys  # keyboard key constants
import time


def get_good(driver):
    """Scrape every result page of the current search and append it to jd.txt.

    For each page the function scrolls down so that more items are rendered,
    reads name, URL, price and review text from every ``gl-item`` element,
    prints each record and appends it to ``jd.txt`` (UTF-8), then clicks the
    ``pn-next`` element to move to the following page.

    The loop ends when no ``pn-next`` element can be found or when that
    element is marked as disabled.  The driver is NOT closed here: the caller
    created it and is responsible for shutting it down.

    Args:
        driver: a Selenium WebDriver already showing a search-result page.
    """
    # Scroll down 5000px so the items further down the page get a chance to load.
    js_code = ''' window.scrollTo(0,5000) '''

    # Iterate over the pages instead of recursing once per page, so a long
    # result list cannot exhaust the recursion limit.
    while True:
        time.sleep(5)
        driver.execute_script(js_code)
        time.sleep(5)

        # Item numbering restarts on every page, exactly as before.
        num = 1
        good_list = driver.find_elements_by_class_name('gl-item')

        # Open the output file once per page rather than once per item.
        with open('jd.txt', 'a', encoding='utf-8') as f:
            for good in good_list:
                good_name = good.find_element_by_css_selector('.p-name em').text
                good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
                good_price = good.find_element_by_class_name('p-price').text
                good_commit = good.find_element_by_class_name('p-commit').text

                good_content = f''' num:{num} 商品名称:{good_name} 商品连接:{good_url} 商品价格:{good_price} 商品评价:{good_commit} \n '''
                print(good_content)
                f.write(good_content)
                num += 1

        print('商品信息写入成功')

        # Look for the "next page" control; its absence means we are done.
        try:
            next_tag = driver.find_element_by_class_name('pn-next')
        except NoSuchElementException:
            break

        # NOTE(review): the last page may still render a 'pn-next' element
        # carrying an extra 'disabled' class -- confirm against the live site.
        # Stopping here avoids clicking a dead button forever.
        if 'disabled' in (next_tag.get_attribute('class') or '').split():
            break

        next_tag.click()
        time.sleep(5)


if __name__ == '__main__':
    driver = webdriver.Chrome(r'D:\Program Files\Python\Python36\Scripts\chromedriver.exe')
    try:
        driver.implicitly_wait(10)
        driver.get('https://www.jd.com/')

        # Type the search keyword into the search box and submit with ENTER.
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys('墨菲定律')
        input_tag.send_keys(Keys.ENTER)

        get_good(driver)
    finally:
        # quit() ends the whole WebDriver session; this is the single place
        # where the driver is shut down.
        driver.quit()
2、元素交互操作
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys  # keyboard key constants
import time

# Demo: drag the "draggable" square onto the "droppable" square in small steps.
driver = webdriver.Chrome(r'D:\Program Files\Python\Python36\Scripts\chromedriver.exe')
try:
    driver.implicitly_wait(5)
    driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    time.sleep(5)

    # Both squares are looked up after switching into this iframe.
    driver.switch_to.frame('iframeResult')
    time.sleep(1)

    # Source square, id: draggable
    source = driver.find_element_by_id('draggable')
    # Target square, id: droppable
    target = driver.find_element_by_id('droppable')

    print(source.size)
    print(source.tag_name)
    print(source.text)
    print(source.location)

    # Horizontal distance the source has to travel to reach the target.
    distance = target.location['x'] - source.location['x']

    # Press and hold the source square.
    ActionChains(driver).click_and_hold(source).perform()

    # Move 2px at a time until the distance is covered.  ActionChains must be
    # built from the driver; the original passed the integer `distance` here,
    # which cannot perform any action.
    s = 0
    while s < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        s += 2

    # Brief pause before letting go of the mouse button.
    time.sleep(0.1)
    ActionChains(driver).release().perform()

    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()
3、bs4
(1)bs4使用
from bs4 import BeautifulSoup

# Sample document used by the demo; the pieces below concatenate to the
# exact same text as the original single literal.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> <body> "
    '<p class="sister"><b>$37</b></p> '
    '<p class="story" id="p">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" >Elsie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)

# Parse the markup with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')

# Show the parsed document and the type of the object that holds it.
for item in (soup, type(soup)):
    print(item)

# Pretty-printing: prettify() returns the document as an indented string.
html = soup.prettify()
print(html)
(2)bs4遍历文档树
from bs4 import BeautifulSoup

# Sample document used by the demo; the pieces below concatenate to the
# exact same text as the original single literal.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> <body> "
    '<p class="sister"><b>$37</b></p> '
    '<p class="story" id="p">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" >Elsie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)

soup = BeautifulSoup(html_doc, 'lxml')

# The numbered examples below are left commented out on purpose; uncomment
# the one you want to try.

# 1. Direct access by tag name
# print(soup.html)
# print(type(soup.html))
# print(soup.a)
# print(soup.p)

# 2. Get the name of a tag
# print(soup.a.name)

# 3. Get the attributes of a tag
# print(soup.a.attrs)  # all attributes of the <a> tag
# print(soup.a.attrs['href'])

# 4. Get the text content of a tag
# print(soup.p.text)

# 5. Nested selection
# print(soup.html.body.p)

# 6. Children and descendants
# print(soup.p.children)  # returns an iterator object
# print(list(soup.p.children))

# 7. Parent and ancestors
# print(soup.b.parent)
# print(soup.b.parents)
# print(list(soup.b.parents))

# 8. Siblings
# print(soup.a)
# Next sibling
# print(soup.a.next_sibling)
# All following siblings, returned as a generator
# print(soup.a.next_siblings)
# print(list(soup.a.next_siblings))

# Active example: the siblings that come before the first <a> tag.
first_link = soup.a

# Previous sibling
print(first_link.previous_sibling)

# All preceding siblings; the generator is materialised into a list for printing
earlier_siblings = list(first_link.previous_siblings)
print(earlier_siblings)
(3)bs4搜索文档树
"""Searching the document tree with BeautifulSoup.

Lookup by tag and by attribute:
    name    match on the tag name
    attrs   match on attributes
    text    match on text

Filters usable for the tag name:
    - string filter     matches the whole string
    - regex filter      matches through the re module
    - list filter       matches any entry of the list
    - bool filter       True matches
    - function filter   for lookups that require some attributes and
                        exclude others

Attribute shortcuts:
    - class_
    - id
"""
import re

from bs4 import BeautifulSoup

# Sample document used by the demo; the pieces below concatenate to the
# exact same text as the original single literal.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> <body> "
    '<p class="sister"><b>$37</b></p> '
    '<p class="story" id="p">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" >Elsie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)

soup = BeautifulSoup(html_doc, 'lxml')

# The examples below are left commented out on purpose; uncomment the one
# you want to try.

# name only
# p = soup.find(name = 'p')
# p_s = soup.find_all(name = 'p')
# print(p)
# print(p_s)

# name + attrs
# p = soup.find(name='p',attrs={"id":"p"})
# print(p)

# name + text
# tag = soup.find(name='title',text="The Dormouse's story")
# print(tag)

# name + attrs + text
# tag = soup.find(name='a',attrs={"class":"sister"},text="Elsie")
# print(tag)

# - regex filter (uses the re module imported above)
# name: tags matched by the pattern 'a'
# a = soup.find(name=re.compile('a'))
# print(a)
# a_s = soup.find_all(name=re.compile('a'))
# print(a_s)

# attrs
# a = soup.find(attrs={"id":re.compile('link')})
# print(a)

# - list filter
# matches any entry of the list
# print(soup.find(name=['a','p','html',re.compile('a')]))
# print(soup.find_all(name=['a','p','html',re.compile('a')]))

# - bool filter
# True matches
# print(soup.find(name=True,attrs={"id":True}))

# - function filter
# for lookups that require some attributes and exclude others
# def have_id_not_class(tag):
#     if tag.name =='p'and tag.has_attr("id") and not tag.has_attr("class"):
#         return tag
# print(soup.find_all(name=have_id_not_class))

# Extra: keyword shortcuts for the two most common attributes.

# id
a = soup.find(id='link2')
print(a)

# class (spelled class_ because `class` is a reserved word in Python)
p = soup.find(class_='sister')
print(p)