XPath, the XML Path Language, is a language for addressing parts of an XML document.
Basic syntax:
    nodename    selects all child nodes of this node
    /           selects from the root node, e.g. /body/div
    //          selects matching nodes anywhere in the document, regardless of their position, e.g. //div
    .           selects the current node
    ..          selects the parent of the current node
    @           selects attributes
Example:
doc='''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' id='id_a'>Name: My image 1 <br/><img src='image1_thumb.jpg' /></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''

from lxml import etree

html = etree.HTML(doc)
# html = etree.parse('search.html', etree.HTMLParser())  # or parse from a file on disk
# 1. All nodes
# a=html.xpath('//*')
# 2. A specific node (the result is a list)
# a=html.xpath('//head')

# 3. Children and descendants
# a=html.xpath('//div/a')
# a=html.xpath('//body/a')   # no results: the a tags are not direct children of body
# a=html.xpath('//body//a')
# 4. Parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a[1]/..')
# equivalently:
# a=html.xpath('//body//a[1]/parent::*')
# a=html.xpath('//body//a[1]/parent::div')
# 5. Attribute matching
# a=html.xpath('//body//a[@href="image1.html"]')

# 6. Getting text with text() ********
# a=html.xpath('//body//a[@href="image1.html"]/text()')

# 7. Getting attributes ******
# a=html.xpath('//body//a/@href')
# a=html.xpath('//body//a/@id')
# note: positional indexing starts at 1, not 0
# a=html.xpath('//body//a[1]/@id')
# 8. Matching a multi-valued attribute
# when an a tag has several classes, an exact match no longer works; use contains()
# a=html.xpath('//body//a[@class="li"]')
# a=html.xpath('//body//a[@name="items"]')
# a=html.xpath('//body//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9. Matching on multiple attributes
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')

# 10. Selecting by position
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[3]/@href')
# the last one
# a=html.xpath('//a[last()]/@href')
# positions less than 3
# a=html.xpath('//a[position()<3]/@href')
# the second to last
# a=html.xpath('//a[last()-1]/@href')
# 11. Axis selection
# ancestor: ancestor nodes
# * selects every ancestor
# a=html.xpath('//a/ancestor::*')
# only the div ancestors
# a=html.xpath('//a/ancestor::div')
# attribute: attribute values
# a=html.xpath('//a[1]/attribute::*')
# a=html.xpath('//a[1]/attribute::href')
# child: direct children
# a=html.xpath('//a[1]/child::*')
# descendant: all descendants
# a=html.xpath('//a[6]/descendant::*')

# following: every node after the current one
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: sibling nodes after the current one
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)
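
One caveat about text(): it only returns the text nodes sitting directly under the selected element, so text inside nested tags is skipped. The sixth link above wraps part of its content in <span><h5>, which the XPath string() function flattens instead; a quick check against the same doc:

# text() misses the 'test' inside the nested <h5>
print(html.xpath('//a[6]/text()'))    # ['Name: My image 6 ']
# string() concatenates all descendant text of the first matched node
print(html.xpath('string(//a[6])'))   # 'testName: My image 6 '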

- Slider captcha

- Approach 1: a single drag-and-drop action
    actions = ActionChains(bro)
    actions.drag_and_drop(source, target)
    actions.perform()
- Approach 2: hold the slider down, then move it in small steps (a complete sketch follows below)
    ActionChains(bro).click_and_hold(source).perform()
    distance = target.location['x'] - source.location['x']
    track = 0
    while track < distance:
        ActionChains(bro).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2
    ActionChains(bro).release().perform()
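
Putting approach 2 together as a runnable sketch; the URL and the slider/target locators below are hypothetical placeholders to swap for whatever page you are actually automating:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://example.com/slider')  # hypothetical page with a slider captcha
bro.implicitly_wait(10)

source = bro.find_element(by=By.ID, value='slider')  # hypothetical locator
target = bro.find_element(by=By.ID, value='target')  # hypothetical locator

# hold the handle, inch it right in small steps (one big jump looks robotic), then release
ActionChains(bro).click_and_hold(source).perform()
distance = target.location['x'] - source.location['x']
track = 0
while track < distance:
    ActionChains(bro).move_by_offset(xoffset=2, yoffset=0).perform()
    track += 2
ActionChains(bro).release().perform()
bro.close()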

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

options = Options()
# hide the automation flag so 12306 is less likely to block the browser
options.add_argument("--disable-blink-features=AutomationControlled")
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=options)

bro.get('https://kyfw.12306.cn/otn/resources/login.html')

bro.maximize_window()

bro.implicitly_wait(10)

try:
    username = bro.find_element(by=By.ID, value='J-userName')
    username.send_keys('')
    password = bro.find_element(by=By.ID, value='J-password')
    password.send_keys('')
    time.sleep(3)
    btn = bro.find_element(by=By.ID, value='J-login')
    btn.click()
    # the draggable handle of the slider captcha
    span = bro.find_element(by=By.ID, value='nc_1_n1z')

    # hold the handle, drag it to the right, then release
    ActionChains(bro).click_and_hold(span).perform()
    ActionChains(bro).move_by_offset(xoffset=300, yoffset=0).perform()
    ActionChains(bro).release().perform()

    time.sleep(10)

except Exception as e:
    print(e)

finally:
    bro.close()
Chaojiying captcha-recognition platform
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from chaojiying import ChaojiyingClient
from PIL import Image

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.maximize_window()
try:
    username = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input')
    password = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input')
    code = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input')
    btn = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input')
    username.send_keys('306334678')
    password.send_keys('lqz123')

    # screenshot the whole page, then crop out just the captcha image
    bro.save_screenshot('main.png')

    img = bro.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img')
    location = img.location
    size = img.size
    print(location)
    print(size)

    # bounding box of the captcha inside the full-page screenshot
    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))

    img = Image.open('./main.png')
    fram = img.crop(img_tu)
    fram.save('code.png')

    # send the cropped captcha to Chaojiying for recognition
    chaojiying = ChaojiyingClient('306334678', 'lqz123', '937234')
    im = open('code.png', 'rb').read()
    res = chaojiying.PostPic(im, 1902)  # 1902 is the captcha-type code
    print(res)
    res_code = res['pic_str']
    code.send_keys(res_code)
    time.sleep(5)
    btn.click()
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    bro.close()
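
The `from chaojiying import ChaojiyingClient` line above assumes a chaojiying.py module sits next to the script; the official SDK downloaded from chaojiying.com provides the real one. As a rough sketch of the interface the code relies on, modeled on the vendor's Python demo (the endpoint and form-field names are assumptions to verify against the SDK):

import requests
from hashlib import md5


class ChaojiyingClient:
    def __init__(self, username, password, soft_id):
        self.username = username
        # the vendor demo sends an md5 hash of the password, not plain text
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id

    def PostPic(self, im, codetype):
        """im: raw image bytes; codetype: captcha type, e.g. 1902."""
        data = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
            'codetype': codetype,
        }
        files = {'userfile': ('captcha.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=data, files=files)
        # the JSON response carries the recognized text under 'pic_str'
        return r.json()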

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.keys import Keys


def get_goods(driver):
    try:
        goods = driver.find_elements(by=By.CLASS_NAME, value='gl-item')
        for good in goods:
            name = good.find_element(by=By.CSS_SELECTOR, value='.p-name em').text
            price = good.find_element(by=By.CSS_SELECTOR, value='.p-price i').text
            commit = good.find_element(by=By.CSS_SELECTOR, value='.p-commit a').text
            url = good.find_element(by=By.CSS_SELECTOR, value='.p-name a').get_attribute('href')
            img = good.find_element(by=By.CSS_SELECTOR, value='.p-img img').get_attribute('src')
            # lazily loaded images keep the real address in data-lazy-img
            if not img:
                img = 'https://' + good.find_element(by=By.CSS_SELECTOR, value='.p-img img').get_attribute('data-lazy-img')

            print('''
            Product name: %s
            Product price: %s
            Product link: %s
            Product image: %s
            Product comments: %s
            ''' % (name, price, url, img, commit))

        # click "next page" (the link text 下一页) and recurse to scrape the following page
        button = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='下一页')
        button.click()
        time.sleep(1)
        get_goods(driver)
    except Exception as e:
        print(e)


def spider(url, keyword):
    driver = webdriver.Chrome(executable_path='./chromedriver.exe')
    driver.get(url)
    driver.implicitly_wait(10)
    try:
        input_tag = driver.find_element(by=By.ID, value='key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='情趣睡衣')
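
Note that get_goods calls itself once per results page, so a very deep pagination run could eventually hit Python's default recursion limit (1000 frames); a plain while loop around the page logic avoids that.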


- Everything a crawler needs is already wrapped up; you just write the fixed pieces of code in the fixed places


- django: big and comprehensive; it covers everything web-related
- scrapy: big and comprehensive; it covers everything crawler-related



Scrapy is an open-source, collaborative framework originally designed for page scraping (more precisely, web scraping). It lets you extract the data you need from websites in a fast, simple, extensible way. Scrapy is now used far more widely, though: data mining, monitoring, automated testing, consuming data returned by APIs, and general-purpose web crawling.




- mac, linux:
    pip3 install scrapy
- win: depends on your luck
    - pip3 install scrapy
    - if your luck is bad:
        1. pip3 install wheel
        2. pip3 install lxml
        3. pip3 install pyopenssl
        4. download and install pywin32: https://sourceforge.net/projects/pywin32/files/pywin32/
        5. download the Twisted wheel: http://www.lfd.uci.edu/~gohlke/pythonlibs/
        6. pip3 install <download dir>\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
        7. pip3 install scrapy



- From now on, use the scrapy command to create crawler projects --> just as django-admin creates django projects



Create a project:
scrapy startproject myfirstscrapy

Generate a spider inside the project:
scrapy genspider cnblogs www.cnblogs.com
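
genspider drops a spider module under myfirstscrapy/spiders/; the generated file looks roughly like this (the exact template text varies with the Scrapy version):

import scrapy


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'                          # the name `scrapy crawl` refers to
    allowed_domains = ['www.cnblogs.com']     # requests outside these domains are filtered out
    start_urls = ['http://www.cnblogs.com/']  # first URLs the engine schedules

    def parse(self, response):
        # called with every downloaded response; parsing logic goes here
        pass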

Run the spider (--nolog suppresses log output):
scrapy crawl cnblogs --nolog

To run it from code instead, create run.py:
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])
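
Running run.py inside the IDE is equivalent to the crawl command above, and it makes breakpoint debugging of the spider possible.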