Web Scraping, from Getting Started to Getting Locked Up: Getting Started (2)
1 CSS selectors
bs4 can locate tags by traversing the tree, by searching (find / find_all), or with CSS selectors.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# res = soup.select('a')
# res = soup.select('#link1')     # '#' means id
# res = soup.select('.sister')    # '.' means class
# res = soup.select('body>p>a')
# once you know CSS selectors you are set: almost every parser (bs4, lxml, ...) supports CSS and XPath
# res = soup.select('body>p>a:nth-child(2)')
# res = soup.select('body>p>a:nth-last-child(1)')
# [attribute=value]
res = soup.select('a[href="http://example.com/tillie"]')
print(res)

'''
Worth remembering:
1  tagname      select by tag name
2  .classname   select by class
3  #id          select by id
4  body a       any <a> anywhere under body (children, grandchildren, ...)
5  body>a       only direct children of body, no grandchildren
6  everything else: regular CSS selector syntax
'''
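The block above only exercises select(); for completeness, here is a minimal sketch of the other two approaches mentioned at the start, tree traversal and searching with find / find_all. It reuses the soup object built from html_doc above.

# Traversal: walk down the tree attribute-style
print(soup.title.text)   # The Dormouse's story
print(soup.p.b)          # first <b> inside the first <p>

# Searching: find / find_all
print(soup.find(id='link2'))                                  # one tag, by id
print(soup.find_all('a', class_='sister'))                    # every <a class="sister">
print(soup.find_all('a', href='http://example.com/elsie'))    # filter by any attribute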
2 Selenium basics
# requests: sends HTTP requests to fetch data; when the response is HTML/XML we parse it with bs4 and pull out what we want
    - the data requests returns may differ from what you see in the browser
    - requests cannot execute JavaScript
    - with requests you have to work out how many requests the page actually fires and replay each one to piece the complete page together

# selenium: drives and controls a real browser, simulating what a human would do
    - clicking by hand: functional testing
    - automated testing of websites (interface testing, load testing) instead of clicking by hand or scripting; for mobile apps there is Appium
    - test development
Selenium started life as an automated-testing tool; in scraping we use it mainly because requests cannot execute JavaScript.
It works by driving a browser and fully simulating browser actions -- navigating, typing, clicking, scrolling -- so we get the page as it looks after rendering. Multiple browsers are supported.

# Usage:
    - install the module: pip3 install selenium
    - download a browser driver: selenium needs a browser (e.g. Chrome) plus the matching ChromeDriver
        - https://registry.npmmirror.com/binary.html?path=chromedriver/
        - pick the driver that matches your browser version, e.g. 106.0.5249.119
    - write a quick test:

from selenium import webdriver
import time

# if the driver is on your PATH you can omit executable_path
# open a browser
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# type the URL into the address bar
bro.get('http://www.baidu.com')

time.sleep(3)
bro.close()   # close the tab
bro.quit()    # close the browser

# RPA: robotic process automation -- automating the grunt work people used to do by hand
3 Headless browsers
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options

# if the driver is on your PATH you can omit executable_path
# open a browser
chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000')               # set the browser resolution
# chrome_options.add_argument('--disable-gpu')                       # Google's docs suggest this to work around a bug
# chrome_options.add_argument('--hide-scrollbars')                   # hide scrollbars, for some special pages
# chrome_options.add_argument('blink-settings=imagesEnabled=false')  # don't load images, speeds things up
chrome_options.add_argument('--headless')  # no visible window; on Linux without a display, omitting this makes startup fail
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # point at a specific Chrome binary

bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
# type the URL into the address bar
bro.get('https://www.jd.com/')
print(bro.page_source)   # the page content as seen in the browser
time.sleep(3)
bro.close()   # close the tab
bro.quit()    # close the browser
5 Other Selenium features
5.1 Mini case: auto-login to Baidu
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.baidu.com')
bro.implicitly_wait(10)   # implicit wait: if a tag hasn't loaded yet when we look for it, wait a bit
bro.maximize_window()     # maximise the window

# find a tag by the text of the <a> link
a = bro.find_element(by=By.LINK_TEXT, value='登录')
# click it
a.click()

# ids are unique on a page, so prefer id when there is one
input_name = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__userName')
# type the username
input_name.send_keys('33333@qq.com')
time.sleep(1)
input_password = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__password')
input_password.send_keys('lqz12345')
time.sleep(1)
input_submit = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__submit')
# click it
input_submit.click()
time.sleep(5)
bro.close()
5.2 Getting position, size, attributes, and text
# Finding tags
bro.find_element(by=By.ID, value='an id')
bro.find_element(by=By.LINK_TEXT, value='exact text of an <a> tag')
bro.find_element(by=By.PARTIAL_LINK_TEXT, value='partial text of an <a> tag')
bro.find_element(by=By.CLASS_NAME, value='a class name')
bro.find_element(by=By.TAG_NAME, value='a tag name')
bro.find_element(by=By.NAME, value='value of the name attribute')
# ----- the general-purpose ones -----
bro.find_element(by=By.CSS_SELECTOR, value='css selector')
bro.find_element(by=By.XPATH, value='xpath expression')

# position and size of a tag
print(code.location)
print(code.size)
# -------
print(code.tag_name)
print(code.id)
# and tag.text gives the element's text content
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import base64

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
# bro.get('https://www.jd.com/')
bro.implicitly_wait(10)
bro.maximize_window()

# find the "scan QR code to log in" tab -- any of the locator strategies listed above would work
# a = bro.find_element(by=By.LINK_TEXT, value='扫码登录')
a = bro.find_element(by=By.CSS_SELECTOR, value='.login-hd-account>a')
a.click()

# code = bro.find_element(by=By.ID, value='J-qrImg')
code = bro.find_element(by=By.CSS_SELECTOR, value='#J-qrImg')
# code = bro.find_element(by=By.CSS_SELECTOR, value='.logo_scene_img')
# print(code)

# Option 1: crop the QR image out of a full-page screenshot using its position and size
print(code.id)         # selenium's internal element id, not the HTML id attribute
print(code.location)
print(code.tag_name)   # the tag's name
print(code.size)

# Option 2: read the image straight from the src attribute (a base64 data URI here)
s = code.get_attribute('src')
print(s)
with open('code.png', 'wb') as f:
    res = base64.b64decode(s.split(',')[-1])
    f.write(res)

time.sleep(3)
bro.close()
5.3 Waiting for elements to load
# Code runs fast; some tags may not have loaded yet, so grabbing them immediately fails.
# Two kinds of wait:
    - explicit wait: rarely used here -- you must say exactly which tag to wait for, which gets tedious when there are many (a sketch follows below)
    - implicit wait: bro.implicitly_wait(10) -- whenever find_element can't find a tag yet, it keeps retrying for up to 10 seconds
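For reference, a minimal sketch of an explicit wait using WebDriverWait and expected_conditions; the id 'kw' is assumed to be Baidu's search-box id.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.baidu.com')

# block for at most 10 seconds until this specific element is present in the DOM
wait = WebDriverWait(bro, 10)
tag = wait.until(EC.presence_of_element_located((By.ID, 'kw')))   # 'kw' is an assumed id
print(tag.tag_name)
bro.quit()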
5.4 Element operations
# click a tag
tag.click()

# type text into an input
tag.send_keys('some text')

# clear an input
tag.clear()

# simulate keyboard keys
from selenium.webdriver.common.keys import Keys
input_search.send_keys(Keys.ENTER)
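Putting those operations together, a minimal sketch that runs a Baidu search; the element id 'kw' (search box) is an assumption about Baidu's page structure.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.baidu.com')
bro.implicitly_wait(10)

input_search = bro.find_element(by=By.ID, value='kw')   # 'kw' is an assumed id
input_search.send_keys('selenium')    # type the query
input_search.send_keys(Keys.ENTER)    # press Enter instead of clicking the button
time.sleep(3)

input_search = bro.find_element(by=By.ID, value='kw')   # results page keeps the same search box
input_search.clear()                  # clear the input again
time.sleep(1)
bro.quit()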
5.5 Executing JavaScript
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')

# 1 you can do a lot with this -- e.g. pop up the cookies
# bro.execute_script('alert(document.cookie)')

# 2 scroll the page to the very bottom
# scroll a little at a time
# for i in range(10):
#     y = 400 * (i + 1)
#     bro.execute_script('scrollTo(0,%s)' % y)
#     time.sleep(1)

# or jump straight to the bottom in one go
bro.execute_script('scrollTo(0,document.body.scrollHeight)')

time.sleep(3)
bro.close()
5.6 Switching tabs
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')

# open a new tab with js
bro.execute_script('window.open()')

# switch to the new tab: index 0 is the original tab, index 1 is the one we just opened
bro.switch_to.window(bro.window_handles[1])
bro.get('http://www.taobao.com')
time.sleep(2)
bro.switch_to.window(bro.window_handles[0])

time.sleep(3)
bro.close()
bro.quit()
5.7 Browser back and forward
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')
time.sleep(2)
bro.get('https://www.taobao.com/')
time.sleep(2)
bro.get('https://www.baidu.com/')

# go back one page
bro.back()
time.sleep(1)
# go forward one page
bro.forward()

time.sleep(3)
bro.close()
5.8 Exception handling
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

try:
    # ... selenium operations go here ...
    pass
except Exception as e:
    print(e)
finally:
    bro.close()   # always close the browser, even when something blows up
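A minimal runnable sketch of the same pattern: we deliberately look up an element that should not exist so the except/finally branches fire; the id 'not-a-real-id' is made up for the demo.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://www.baidu.com')
    bro.find_element(by=By.ID, value='not-a-real-id')   # hypothetical id -> NoSuchElementException
except NoSuchElementException as e:
    print('element not found:', e)
except Exception as e:
    print('something else went wrong:', e)
finally:
    bro.close()   # the browser gets closed no matter what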
6 Using XPath
# Two general-purpose ways to select tags in HTML:
    - CSS selectors
    - XPath selectors
    - XPath (XML Path Language) is a language for addressing parts of an XML document
# Quick syntax overview
    - nodename   select all child nodes of the named node
    - /          select from the root node                              /body/div
    - //         select matching nodes anywhere in the document, regardless of position   //div
    - .          select the current node
    - ..         select the parent of the current node
    - @          select an attribute
# The ultimate shortcut
    - right-click the element in the browser dev tools and copy its XPath
# Example
doc = '''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' id='id_a'>Name: My image 1 <br/><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

html = etree.HTML(doc)
# html = etree.parse('search.html', etree.HTMLParser())

# 1 all nodes
# a = html.xpath('//*')
# 2 a specific node (the result is a list)
# a = html.xpath('//head')
# 3 children and descendants
# a = html.xpath('//div/a')
# a = html.xpath('//body/a')   # no result: the <a> tags are not direct children of <body>
# a = html.xpath('//body//a')
# 4 parent node
# a = html.xpath('//body//a[@href="image1.html"]/..')
# a = html.xpath('//body//a[1]/..')
# or like this:
# a = html.xpath('//body//a[1]/parent::*')
# a = html.xpath('//body//a[1]/parent::div')
# 5 attribute matching
# a = html.xpath('//body//a[@href="image1.html"]')
# 6 getting text with text()  ********
# a = html.xpath('//body//a[@href="image1.html"]/text()')
# 7 getting attributes  ******
# a = html.xpath('//body//a/@href')
# a = html.xpath('//body//a/@id')
# note: indexing starts at 1, not 0
# a = html.xpath('//body//a[1]/@id')
# 8 matching one value of a multi-valued attribute
# when an <a> tag has several classes an exact @class match no longer works; use contains
# a = html.xpath('//body//a[@class="li"]')
# a = html.xpath('//body//a[@name="items"]')
# a = html.xpath('//body//a[contains(@class,"li")]')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 matching on several attributes
# a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# 10 selecting by position
# a = html.xpath('//a[2]/text()')
# a = html.xpath('//a[3]/@href')
# the last one
# a = html.xpath('//a[last()]/@href')
# the ones before position 3
# a = html.xpath('//a[position()<3]/@href')
# the third one from the end
# a = html.xpath('//a[last()-2]/@href')
# 11 axis selection
# ancestor: ancestor nodes
# use * to get all ancestors
# a = html.xpath('//a/ancestor::*')
# only the <div> among the ancestors
# a = html.xpath('//a/ancestor::div')
# attribute: attribute values
# a = html.xpath('//a[1]/attribute::*')
# a = html.xpath('//a[1]/attribute::href')
# child: direct children
# a = html.xpath('//a[1]/child::*')
# descendant: all descendants
# a = html.xpath('//a[6]/descendant::*')
# following: everything after the current node
# a = html.xpath('//a[1]/following::*')
# a = html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: siblings after the current node
# a = html.xpath('//a[1]/following-sibling::*')
# a = html.xpath('//a[1]/following-sibling::a')
# a = html.xpath('//a[1]/following-sibling::*[2]')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)
7 Selenium action chains
# Some sites require holding the mouse down and dragging
    - e.g. slider captchas
# Two forms (a complete runnable sketch of form 1 follows below)
    - form 1: queue the action, then run the chain
        actions = ActionChains(bro)              # get an ActionChains object
        actions.drag_and_drop(source, target)    # put the action into the chain, ready to run in order
        actions.perform()                        # run the chain
    - form 2: hold and move step by step
        ActionChains(bro).click_and_hold(source).perform()
        distance = target.location['x'] - source.location['x']
        track = 0
        while track < distance:
            ActionChains(bro).move_by_offset(xoffset=2, yoffset=0).perform()
            track += 2
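A minimal sketch of form 1, assuming the runoob drag-and-drop demo page and its element ids (#draggable, #droppable, iframe id 'iframeResult'); any page with a draggable element works the same way.

import time
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')  # assumed demo page
bro.implicitly_wait(10)
bro.maximize_window()

bro.switch_to.frame('iframeResult')                       # the demo lives inside an iframe (assumed id)
source = bro.find_element(by=By.ID, value='draggable')    # assumed ids on the demo page
target = bro.find_element(by=By.ID, value='droppable')

# form 1: queue the drag in a chain, then perform it
actions = ActionChains(bro)
actions.drag_and_drop(source, target)
actions.perform()

time.sleep(3)
bro.close()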
8 Auto-login to 12306
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")  # hide the "browser is being controlled by automation" flag

bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=options)
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
bro.maximize_window()
# 12306 detects that selenium is controlling the browser; without the option above the slider never shows up
bro.implicitly_wait(10)

try:
    username = bro.find_element(by=By.ID, value='J-userName')
    username.send_keys('')
    password = bro.find_element(by=By.ID, value='J-password')
    password.send_keys('')
    time.sleep(3)
    btn = bro.find_element(by=By.ID, value='J-login')
    btn.click()

    span = bro.find_element(by=By.ID, value='nc_1_n1z')
    ActionChains(bro).click_and_hold(span).perform()                    # press and hold the slider
    ActionChains(bro).move_by_offset(xoffset=300, yoffset=0).perform()  # drag it to the right
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    bro.close()
9 Using a captcha-solving platform
# We send the captcha image to a third-party service and they crack it for us; all it takes is money -- in one sentence, money fixes everything.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from mayun import YdmVerify   # the YdmVerify class from section 9.1 (assumed to be saved as mayun.py)
from PIL import Image

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.maximize_window()
try:
    username = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input')
    password = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input')
    code = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input')
    btn = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input')
    username.send_keys('306334678')
    password.send_keys('lqz123')

    # Getting the captcha image:
    # 1 screenshot the whole page
    bro.save_screenshot('main.png')
    # 2 use Pillow to crop the captcha (code.png) out of the full-page screenshot
    img = bro.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img')
    # location = img.location
    # size = img.size
    # print(location)
    # print(size)
    # # the crop box: (left, top, right, bottom)
    # img_tu = (
    #     int(location['x']), int(location['y']),
    #     int(location['x'] + size['width']), int(location['y'] + size['height']))
    # # open the full screenshot and crop the captcha out of it
    # img = Image.open('./main.png')
    # fram = img.crop(img_tu)   # the cropped small image
    # -- or simply let selenium screenshot the element directly:
    img.screenshot('code.png')

    # 3 have the platform crack it
    Y = YdmVerify()
    with open('code.png', 'rb') as f:
        s = f.read()
    res_code = Y.common_verify(image=s)
    code.send_keys(res_code)
    time.sleep(5)
    btn.click()
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    bro.close()
9.1 YunMa (云码) developer docs
import json
import time
import requests
import base64


class YdmVerify(object):
    _custom_url = "https://www.jfbym.com/api/YmServer/customApi"
    _token = "dWZ2Bq8zD4qIq5ydIisze6NpIvQo1YHxgLSJtp+EBCA"
    _headers = {
        'Content-Type': 'application/json'
    }

    def common_verify(self, image, verify_type="10110"):
        # Digit / letter / Chinese-character types
        # general alphanumeric, 1-4 chars          10110
        # general alphanumeric, 5-8 chars          10111
        # general alphanumeric, 9-11 chars         10112
        # general alphanumeric, 12+ chars          10113
        # general alphanumeric, 1-6 chars plus     10103
        # custom - alphanumeric 5 chars ~qcs       9001
        # custom - digits only, 4 chars            193
        # Chinese-character types
        # general Chinese, 1-2 chars               10114
        # general Chinese, 3-5 chars               10115
        # general Chinese, 6-8 chars               10116
        # general Chinese, 9+ chars                10117
        # Chinese, 1-4 chars plus                  10118
        # custom - XX西游苦行 Chinese chars          10107
        # Arithmetic types
        # general numeric arithmetic               50100
        # general Chinese arithmetic               50101
        # custom - arithmetic cni                  452
        payload = {
            "image": base64.b64encode(image).decode(),
            "token": self._token,
            "type": verify_type
        }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def slide_verify(self, slide_image, background_image, verify_type="20101"):
        # Slider types
        # general two-image slider 20111
        payload = {
            "slide_image": base64.b64encode(slide_image).decode(),
            "background_image": base64.b64encode(background_image).decode(),
            "token": self._token,
            "type": verify_type
        }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def sin_slide_verify(self, image, verify_type="20110"):
        # general single-image slider (screenshot) 20110
        payload = {
            "image": base64.b64encode(image).decode(),
            "token": self._token,
            "type": verify_type
        }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def traffic_slide_verify(self, seed, data, href, verify_type="900010"):
        # custom - slider protocol slide_traffic 900010
        payload = {
            "seed": seed,
            "data": data,
            "href": href,
            "token": self._token,
            "type": verify_type
        }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def click_verify(self, image, extra=None, verify_type="30100"):
        # general click on anything, 1-4 coordinates                               30009
        # general text click 1 (xd; extra = texts to click, comma separated; original image)  30100
        # custom - text click 2 (xy3, extra="click", original image)               30103
        # custom - single-image text click (xd)                                    30102
        # custom - icon click 1 (xd, original image)                               30104
        # custom - icon click 2 (xy3, original image, extra="icon")                30105
        # custom - word-order click 1 (xy3, original image, extra="phrase")        30106
        # custom - word-order click 2 (xd, original image)                         30107
        # custom - spatial-reasoning click 1 (xd, original image, extra="请点击xxx")  30109
        # custom - spatial-reasoning click 1 (xy3, original image, extra="请_点击_小尺寸绿色物体。")  30110
        # custom - tx spatial click (extra="请点击侧对着你的字母")                      50009
        # custom - tt_ spatial click                                               30101
        # custom - reasoning jigsaw 1 (xd, original image, extra="交换2个图块")       30108
        # custom - xy4 nine-grid click (label_image, image)                        30008
        # two-character click TX                                                   30111
        # custom - text click 3 (extra="je4_click")                                30112
        # custom - icon click 3 (extra="je4_icon")                                 30113
        # custom - word-order click 3 (extra="je4_phrase")                         30114
        payload = {
            "image": base64.b64encode(image).decode(),
            "token": self._token,
            "type": verify_type
        }
        if extra:
            payload['extra'] = extra
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def rotate(self, image):
        # custom - single-image rotation by X degrees 90007
        payload = {
            "image": base64.b64encode(image).decode(),
            "token": self._token,
            "type": "90007"
        }
        # custom - Tt two-image rotation: inner-circle image + outer-ring image 90004
        # payload = {
        #     "out_ring_image": base64.b64encode(image).decode(),
        #     "inner_circle_image": base64.b64encode(image).decode(),
        #     "token": self._token,
        #     "type": "90004"
        # }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def google_verify(self, googlekey, pageurl, invisible=1, data_s=""):
        _headers = {
            'Content-Type': 'application/json'
        }
        """
        Step 1: create the captcha task
        :return taskId: string, the id of the created task
        """
        url = "https://www.jfbym.com/api/YmServer/funnelApi"
        payload = json.dumps({
            "token": self._token,
            "type": "40010",        # v2
            # "type": "40011",      # v3
            "googlekey": googlekey,
            "enterprise": 0,        # enterprise edition or not
            "pageurl": pageurl,
            "invisible": invisible,
            "data-s": data_s,       # V2 enterprise: pass it if you can find it, otherwise an empty string
            # 'action': ""          # required for V3
            # 'min_score': ""       # optional parameter, V3 only
        })
        # send the JSON payload
        result = requests.request("POST", url, headers=_headers, data=payload).json()
        print(result)
        # {'msg': '识别成功', 'code': 10000, 'data': {'code': 0, 'captchaId': '51436618130', 'recordId': '74892'}}
        captcha_id = result.get('data').get("captchaId")
        record_id = result.get('data').get("recordId")
        times = 0
        is_solved = 0
        while times < 150:
            try:
                url = f"https://www.jfbym.com/api/YmServer/funnelApiResult"
                data = {
                    "token": self._token,
                    "captchaId": captcha_id,
                    "recordId": record_id
                }
                result = requests.post(url, headers=_headers, json=data).json()
                print(result)
                # {'msg': '结果准备中,请稍后再试', 'code': 10009, 'data': []}
                if result['msg'] == "结果准备中,请稍后再试":
                    continue
                if result['msg'] == '请求成功' and result['code'] == 10001:
                    is_solved = 1
                    return result['data']['data']
                    # {'msg': '请求成功', 'code': 10001, 'data': {'data': '03AGdBq26...'}}
            except Exception as e:
                print(e)
            finally:
                if is_solved:
                    break
                print("sleep 5s...")
                time.sleep(5)
                times += 5

    def hcaptcha_verify(self, site_key, site_url, verify_type="50001"):
        # custom interface - hCaptcha
        payload = {
            "site_key": site_key,
            "site_url": site_url,
            "token": self._token,
            "type": verify_type
        }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']

    def fun_captcha_verify(self, publickey, pageurl, verify_type="40007"):
        # custom interface - FunCaptcha
        payload = {
            "publickey": publickey,
            "pageurl": pageurl,
            "token": self._token,
            "type": verify_type
        }
        resp = requests.post(self._custom_url, headers=self._headers, data=json.dumps(payload))
        print(resp.text)
        return resp.json()['data']['data']


if __name__ == '__main__':
    Y = YdmVerify()
    with open('2.png', 'rb') as f:
        s = f.read()
    Y.common_verify(image=s)
10 Scraping JD product listings with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By   # locator strategies: By.ID, By.CSS_SELECTOR, ...
import time
from selenium.webdriver.common.keys import Keys


def get_goods(driver):
    try:
        goods = driver.find_elements(by=By.CLASS_NAME, value='gl-item')
        for good in goods:
            name = good.find_element(by=By.CSS_SELECTOR, value='.p-name em').text
            price = good.find_element(by=By.CSS_SELECTOR, value='.p-price i').text
            commit = good.find_element(by=By.CSS_SELECTOR, value='.p-commit a').text
            url = good.find_element(by=By.CSS_SELECTOR, value='.p-name a').get_attribute('href')
            img = good.find_element(by=By.CSS_SELECTOR, value='.p-img img').get_attribute('src')
            if not img:
                # lazily loaded images keep the real address in data-lazy-img
                img = 'https://' + good.find_element(by=By.CSS_SELECTOR, value='.p-img img').get_attribute('data-lazy-img')

            print('''
            product name:    %s
            product price:   %s
            product link:    %s
            product image:   %s
            product reviews: %s
            ''' % (name, price, url, img, commit))

        button = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='下一页')
        button.click()
        time.sleep(1)
        get_goods(driver)   # recurse onto the next page
    except Exception as e:
        print(e)


def spider(url, keyword):
    driver = webdriver.Chrome(executable_path='./chromedriver.exe')
    driver.get(url)
    driver.implicitly_wait(10)   # use an implicit wait
    try:
        input_tag = driver.find_element(by=By.ID, value='key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='华为手机')