| |
| -解析json:发http的请求,返回的数据,可能是xml格式,json格式 |
| requests.get().json() |
| -ssl认证 |
| -http和https的区别 |
| -https=http+ssl/tls |
| -http版本区别 |
| -0.9:底层基于tcp,每次http请求,都是建立一个tcp连接,三次握手,请求结束需要四次挥手 |
| -1.1:请求头中有个参数Keep-alive,可以保证多个http请求公用一个TCP连接 |
| -2.x:多路复用,多个请求使用同一个数据包 |
| |
| -代理: |
| -发送请求,如果使用自己的ip,可能会被封(加黑名单),需要使用代理ip, |
| -如果使用了代理ip,还能不能访问本地的django项目----》不能 |
| -res = requests.post('https://www.cnblogs.com',proxies={'http':'27.79.236.66:4001'}) |
| -高匿,透明 |
| -http请求头:X-Forwarded-For,User-Agent,Cookie,Referer,Content-Type |
| -http: |
| -请求协议: |
请求首行:请求地址,请求方式,http的版本
| 请求头:key-value |
| 请求体 |
| -响应协议: |
| 响应首行:响应状态码,响应字符串描述 |
响应头:key-value,响应状态码,cookie
| 响应体 |
| |
| -代理池:搭建免费代理池 |
| -开源的---》原理 |
| 爬取免费代理---》验证---》存到redis中 |
| 起了一个flask服务,监听5000,访问地址,就可以随机获取代理 |
| -自己的django 测试使用代理 |
| -超时 |
| -异常 |
| -上传文件 |
| |
| |
| -花生壳 |
| |
| |
| -请求头中的数据 |
| -请求回来的数据,不一定能直接用 |
| |
| |
| -bs4:find_all,find |
| |
| |
| |
| |

0 bs4介绍,遍历文档树
0.1 bs4的遍历文档树
# bs4 traversal demo: build a soup from a small HTML document and walk
# the document tree from a tag.
from bs4 import BeautifulSoup

# Sample document used for the traversal examples; the three <a> tags
# share class="sister".
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
lqz
<b>The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" name='lqz'>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# 'lxml' is a third-party parser (pip install lxml); soup.a below is the
# FIRST <a> tag in the document.
soup=BeautifulSoup(html_doc,'lxml')

# previous_siblings is a generator over everything that precedes soup.a
# inside its parent (text nodes included), hence the list() to print it.
print(list(soup.a.previous_siblings))
1 bs4搜索文档树
| |
| |
# bs4 search demo: find_all can take a plain function as the name= filter.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')


def has_class_but_no_id(tag):
    # Filter for find_all: a tag matches when it carries a class
    # attribute but no id attribute.
    return tag.has_attr('class') and not tag.has_attr('id')

# A callable passed as name= is applied to every tag in the document and
# the matching tags are returned as a list.
print(soup.find_all(name=has_class_but_no_id))
1.1 find的其他参数
| |
| name |
| class_ |
| id |
| text |
| attrs |
| ------- |
| limit:限制条数,find_all用的 find本质是find_all limit=1 |
| recursive:查找的时候,是只找第一层还是子子孙孙都找,默认是True,子子孙孙都找 |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# recursive=False searches only the DIRECT children of this <p> tag
# (the default True searches all descendants).
# NOTE(review): `soup` is the one built in the search-demo snippet above.
res=soup.html.body.p.find_all(name='b',recursive=False)
print(res)
2 css选择器
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# CSS-selector demo: soup.select() accepts any CSS selector string,
# regardless of which parser built the soup.
from bs4 import BeautifulSoup

import requests

# Fetch a live page and parse it (network I/O).
res=requests.get('https://www.w3school.com.cn/css/css_selector_attribute.asp')
soup=BeautifulSoup(res.text,'lxml')

# Selector of this exact shape is what the browser dev tools "Copy
# selector" produces; select() returns a list, hence [0].
print(soup.select('#intro > p:nth-child(1) > strong')[0].text)
| |
| |
3 selenium基本使用
| |
| selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题 |
| |
| selenium本质是通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等,来拿到网页渲染之后的结果,可支持多种浏览器 |
| |
| |
| |
| |
| 1 下载selenium |
| 2 操作浏览器:分不同浏览器,需要下载不同浏览器的驱动 |
| -用谷歌---》谷歌浏览器驱动:https://registry.npmmirror.com/binary.html?path=chromedriver/ |
| -跟谷歌浏览器版本要对应 111.0.5563.65: |
| |
| |
| 3 下载完的驱动,放在项目路径下 |
| |
| 4 写代码,控制谷歌浏览器 |
# Minimal selenium demo: drive a real Chrome, fetch a rendered page and
# dump its HTML to disk.
from selenium import webdriver
import time
# chromedriver.exe must match the installed Chrome version and sit in the
# project directory (or on PATH).
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3')
# page_source is the HTML AFTER the browser has executed JavaScript.
print(bro.page_source)
with open('1.html','w',encoding='utf-8') as f:
    f.write(bro.page_source)
time.sleep(5)
bro.close()
| |
| |
| |
| |
| |
4 无界面浏览器
| |
# Headless-Chrome demo: fetch a rendered page without opening a visible
# browser window.
from selenium import webdriver
import time

from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')          # viewport size
chrome_options.add_argument('--hide-scrollbars')              # cleaner screenshots
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images, faster loads
chrome_options.add_argument('--headless')                     # no visible window

# Fix: pass the options via `options=` — the old `chrome_options=` keyword
# is deprecated and was removed in Selenium 4; `options=` works in 3.8+ too.
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)

bro.get('https://www.cnblogs.com/')
# page_source is the HTML after JavaScript has run.
print(bro.page_source)
time.sleep(5)
bro.close()
| |
4.1 模拟登录百度
# Simulated username/password login on baidu.com driven by selenium.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.baidu.com')

# Implicit wait: every find_element below retries for up to 10 s before
# raising, so freshly rendered elements are found reliably.
bro.implicitly_wait(10)

# Open the login dialog via its visible link text.
btn = bro.find_element(by=By.LINK_TEXT, value='登录')

btn.click()

# Toggle to the SMS-code form and back to the password form (judging by
# the element ids; the exact UI flow depends on baidu's current markup).
btn_2 = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__changeSmsCodeItem')
btn_2.click()
time.sleep(1)

btn_2 = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__changePwdCodeItem')
btn_2.click()
time.sleep(1)

# Fill in credentials (demo values; never hard-code real secrets).
name = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__userName')
password = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__password')
name.send_keys('306334678@qq.com')
password.send_keys('1234')
time.sleep(1)

submit=bro.find_element(by=By.ID,value='TANGRAM__PSP_11__submit')
submit.click()
time.sleep(2)
bro.close()
| |
5 selenium其它用法
5.0 查找标签
| |
| bro.find_element 找一个 |
| bro.find_elements 找所有 |
| |
| |
| |
| |
| |
| |
| |
| |
| |
5.1 获取位置属性大小,文本
| print(tag.get_attribute('src')) |
| tag.text |
| |
| print(tag.id) |
| print(tag.location) |
| print(tag.tag_name) |
| print(tag.size) |
5.2 等待元素被加载
5.3 元素操作
| |
| tag.click() |
| |
| |
| tag.send_keys() |
| |
| |
| tag.clear() |
| |
| |
| |
| bro.maximize_window() |
| |
| bro.save_screenshot('main.png') |
| |
5.4 执行js代码
| bro.execute_script('alert("美女")') |
| |
| |
| |
| -获取当前访问的地址 window.location |
| -打开新的标签 |
| -滑动屏幕--》bro.execute_script('scrollTo(0,document.documentElement.scrollHeight)') |
| -获取cookie,获取定义的全局变量 |
5.5 切换选项卡
# Tab (window-handle) switching demo: open a second tab, drive it, then
# hop back to the first.
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')

# Open a second, empty tab via JavaScript.
driver.execute_script('window.open()')
print(driver.window_handles)

# Handle [1] is the new tab; handle [0] is the original one.
driver.switch_to.window(driver.window_handles[1])
driver.get('https://www.taobao.com')
time.sleep(2)
driver.switch_to.window(driver.window_handles[0])
driver.get('https://www.sina.com.cn')
driver.close()
5.6 浏览器前进后退
# History navigation demo: back() and forward() behave like the browser's
# back/forward buttons over the pages visited in order.
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
for url in ('https://www.baidu.com',
            'https://www.taobao.com',
            'http://www.sina.com.cn/'):
    driver.get(url)

driver.back()      # back to taobao
time.sleep(2)
driver.forward()   # forward to sina again
driver.close()
5.7 异常处理
# Exception-handling demo: always release the browser — close() runs in
# finally whether or not navigation succeeds.
import time
from selenium import webdriver

browser=webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Fix: the original note had an EMPTY try body directly followed by
    # except, which is a SyntaxError in Python; the guarded work goes here.
    browser.get('https://www.baidu.com')
except Exception as e:
    # Broad catch is acceptable at the top level of a demo script.
    print(e)

finally:
    browser.close()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY