Python 3 web-scraping basics: using the urllib, requests, and BeautifulSoup libraries
urllib
import urllib.request
import urllib.parse
import urllib.error
url = "http://localhost:3000/request.php"
# Call urlopen directly
with urllib.request.urlopen(url) as response:
    # The real URL of the fetched page
    print(response.geturl())  # http://localhost:3000/request.php
    # The HTTP status code
    print(response.getcode())  # 200
    # read(n) returns n bytes of the page; with no argument it reads the whole body
    print(response.read().decode("utf-8"))  # welcome
    # Read one line of the body
    print(response.readline())  # b'' the body can only be read once
    # Read all lines of the body into a list
    print(response.readlines())  # []
    # An HTTPMessage object holding the headers returned by the server
    info = response.info()
# Request object
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
    print(the_page)  # b'welcome'
# FTP protocol
req = urllib.request.Request("ftp://example.com/")
with urllib.request.urlopen(req) as response:
    the_page = response.read()
# GET request
data = {}
data["location"] = "Northampton"
data["language"] = "Python"
url_values = urllib.parse.urlencode(data)
print(url_values) # location=Northampton&language=Python
full_url = url + "?" + url_values
data = urllib.request.urlopen(full_url)
print(data) # <http.client.HTTPResponse object at 0x00000246FCC3E410>
# POST request with data
values = {"language": "Python"}
data = urllib.parse.urlencode(values)
data = data.encode("ascii")  # data should be bytes
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
    print(the_page.decode("utf-8"))  # {"data":{"language":"Python"},"method":"post","ua":"Python-urllib\/3.10"}
# Request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"}
data = urllib.parse.urlencode(values)
data = data.encode("ascii")
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
    print(the_page.decode("utf-8"))  # {"data":{"language":"Python"},"method":"post","ua":"Mozilla\/5.0 (Windows NT 6.1; Win64; x64)"}
# Error handling
# URLError is usually caused by network problems, including a bad URL; HTTPError means the server returned an error status code. HTTPError is a subclass of URLError.
req = urllib.request.Request(url)
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    if hasattr(e, "reason"):
        print("We failed to reach a server.")
        print("Reason: ", e.reason)
    elif hasattr(e, "code"):
        print("The server couldn't fulfill the request.")
        print("Error code: ", e.code)
else:
    pass  # everything is fine
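# Because HTTPError is a subclass of URLError, a common alternative (a minimal
# sketch against the same local URL) is to catch HTTPError first, separating
# server error codes from plain network failures.
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print("Error code: ", e.code)  # the server answered with a 4xx/5xx status
except urllib.error.URLError as e:
    print("Reason: ", e.reason)  # the server could not be reached at all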
# Parsing URLs
p_url = urllib.parse.urlparse(full_url, scheme="", allow_fragments=True)
print(p_url.scheme)  # http (scheme)
print(p_url.netloc)  # localhost:3000 (network location)
print(p_url.path)  # /request.php (path)
print(p_url.params)  # path parameters
print(p_url.query)  # location=Northampton&language=Python (query string)
print(p_url.fragment)  # fragment
print(p_url.hostname)  # localhost
query_lst = urllib.parse.parse_qsl(p_url.query)
print(query_lst) # [('location', 'Northampton'), ('language', 'Python')]
url_compos = [p_url.scheme, p_url.netloc, p_url.path, p_url.params, p_url.query, p_url.fragment]
url_str = urllib.parse.urlunparse(url_compos)
print(url_str) # http://localhost:3000/request.php?location=Northampton&language=Python
s_url = urllib.parse.urlsplit(url_str)
print(s_url.scheme)  # http (scheme)
print(s_url.netloc)  # localhost:3000 (network location)
print(s_url.path)  # /request.php (path)
print(s_url.query)  # location=Northampton&language=Python (query string)
print(s_url.fragment)  # fragment
print(s_url.hostname)  # localhost
url_compos = [s_url.scheme, s_url.netloc, s_url.path, s_url.query, s_url.fragment]
url_str = urllib.parse.urlunsplit(url_compos)
print(url_str) # http://localhost:3000/request.php?location=Northampton&language=Python
url_str = urllib.parse.urljoin(url_str, "request1.php")
print(url_str) # http://localhost:3000/request1.php
kw = urllib.parse.quote("中文")
url = "http://localhost:3000/request.php?kw={}".format(kw)
print(url) # http://localhost:3000/request.php?kw=%E4%B8%AD%E6%96%87
kw = urllib.parse.unquote(kw)
print(kw) # 中文
query_string = {"kw": "中文"}
result = urllib.parse.urlencode(query_string)
url = "http://localhost:3000/request.php?{}".format(result)
print(url) # http://localhost:3000/request.php?kw=%E4%B8%AD%E6%96%87
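# A small extra sketch: urlencode can also expand sequence values into repeated
# query parameters when doseq=True is passed.
multi = urllib.parse.urlencode({"language": ["Python", "PHP"]}, doseq=True)
print(multi)  # language=Python&language=PHP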
request.php
<?php
$ua = $_SERVER['HTTP_USER_AGENT'];
if (!empty($_GET)) {
    echo json_encode(['data' => $_GET, 'method' => 'get', 'ua' => $ua]);
} else if (!empty($_POST)) {
    echo json_encode(['data' => $_POST, 'method' => 'post', 'ua' => $ua]);
} else {
    echo 'welcome';
}
requests
This uses the API built in https://www.cnblogs.com/caroline2016/p/17007956.html.
When simulating the login, the Laravel API raised a "Session store not set on request." error. The fix is to add two lines to the api middleware group in app/Http/Kernel.php:
<?php
protected $middlewareGroups = [
    ...
    'api' => [
        ...
        \App\Http\Middleware\EncryptCookies::class, // <------- added line
        \Illuminate\Session\Middleware\StartSession::class, // <------ added line
    ],
];
demo
import requests
import concurrent.futures
import requests.adapters
from requests.auth import HTTPBasicAuth
url = "http://127.0.0.1:8000"
url_get = "http://127.0.0.1:8000/api/data/getPoem?p=1"
url_post = "http://127.0.0.1:8000/login"
# Set request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
}
headers_api = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Authorization": "Bearer 69|3eSTphCsfIrv1gOg3fGqNmwouJEpRwxymUc6Bvt3",
}
params = {}
response = requests.get(url_get, headers=headers_api, params=params)
if response.status_code == 200:
    # Status code
    print(response.status_code)  # 200
    # Full URL of the request
    print(response.url)  # http://127.0.0.1:8000/api/data/getPoem?p=1
    # Character encoding reported in the response headers
    print(response.encoding)  # utf-8
    # Response headers
    print(response.headers)
    # Cookies
    print(response.cookies)
    # Response body as Unicode text
    print(response.text)
    # Response body as raw bytes
    print(response.content)
    # Response body parsed as JSON
    print(response.json())
    # Underlying raw response object
    print(response.raw)  # <urllib3.response.HTTPResponse object at 0x000001E38C6F00A0>
# Change the encoding used to decode response.text
response.encoding = "utf-8"
# Cookies
cookiejar = response.cookies
cookiedict = requests.utils.dict_from_cookiejar(cookiejar)
print("cookiedict", cookiedict)  # {'laravel_session': 'VS6SDGagCeub2h9c4WYzODpLGBERCJNiz2r78ZE1'}
# Session management
data = {"key": "bb@bb3.com", "password": "12345678"}
session = requests.session()
session.headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
}
response1 = session.get(url)
response2 = session.post(url_post, data=data)  # simulate a login
r = session.get(url_get)
if r.status_code == 200:
    print(response2.text)  # {"token":"73|gxwyT2nV6RGLO7EuT6cncXaPryTTgouxyGTsIb7U","code":0,"userinfo":{"name":"bb","email":"bb@bb3.com"}}
# Authentication
response = requests.post(url_post, auth=HTTPBasicAuth("username", "password"), headers=headers)
if response.status_code == 200:
    print(response.text)
# Proxy settings
proxy = {"http": "http://username:password@ip:port"}  # proxy that requires authentication
proxy = {"http": "http://ip:port"}  # anonymous proxy (this later assignment is the one actually used)
response = requests.get(url_get, proxies=proxy)
# SSL verification
response = requests.get("https://www.baidu.com/", verify=True)
print("verify", response)
# Error handling
try:
    response = requests.get(url_get, timeout=1, headers=headers_api)
    response.raise_for_status()
except requests.exceptions.ConnectionError:
    print("connect error")
except requests.exceptions.Timeout:
    print("time out")
except requests.exceptions.HTTPError as err:
    print("server error:", err)
else:
    print("get data")
# Connection pooling
session = requests.session()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount("http://", adapter=adapter)
response = session.get(url_get)
# Persistent connections
response = session.get(url_get, headers={"Connection": "keep-alive"})
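# A minimal sketch: the adapter can also carry a urllib3 retry policy so that
# transient failures on pooled connections are retried automatically.
from urllib3.util.retry import Retry
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retry)
session.mount("http://", adapter)
response = session.get(url_get)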
# Concurrency
def fetch_data(url):
    response = requests.get(url)
    return response.text
urls = ["url1", "url2"]
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = executor.map(fetch_data, urls)
    for result in results:
        print(result)
# Other methods
response = requests.put(url, data=data)
response = requests.delete(url)
response = requests.head(url)
response = requests.options(url)
BeautifulSoup
Beautiful Soup is a Python library for extracting data from HTML and XML files.
Installation
pip install beautifulsoup4
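The examples below use the lxml parser, which is a separate package; if it is not installed yet, add it the same way:
pip install lxml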
Usage
from bs4 import BeautifulSoup
import re
html_doc = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Document</title>
</head>
<body>
<p><a href="baidu.com">baidu</a><a href="github.com">github</a></p>
<div><a href="cnblogs.com">cnblogs</a></div>
<div>
<p class="title" id="title">title</p>
<p class="title text" data-tid="1">second title</p>
<div class="content">content</div>
</div>
"""
soup = BeautifulSoup(html_doc, "lxml")  # the lxml parser tolerates malformed markup
res = soup.prettify()  # re-indent and display the document as a nested structure
# print(res)
print(soup.p)  # <p><a href="baidu.com">baidu</a><a href="github.com">github</a></p> returns the first match
print(soup.a)  # <a href="baidu.com">baidu</a> returns the first match
print(soup.p.name)  # p
print(soup.a.attrs)  # {'href': 'baidu.com'} the tag's attributes
print(soup.p.text)  # baidugithub all text inside the tag
print(soup.p.string)  # None .string is None when the tag has more than one child
print(soup.div.string)  # cnblogs .string works when the tag has a single child
print(soup.p.strings)  # <generator object Tag._all_strings at 0x0000024371CE61F0>
for s in soup.p.strings:
    print(s)
for line in soup.stripped_strings:  # drop the HTML tags and keep only the text content
    print(line)
print(soup.head.title.string) # Document
print(soup.body.a.string) # baidu
print(soup.p.contents) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>]
print(soup.p.children)  # <list_iterator object at 0x0000017C83EB7FA0> direct child nodes
for i, child in enumerate(soup.p.children):
    print(i, child)
print(soup.p.descendants)  # all descendant nodes
for i, child in enumerate(soup.p.descendants):
    print(i, child)
print(soup.a.parent)  # the parent node
print(soup.a.parents)  # <generator object PageElement.parents at 0x0000017C83D56180> all ancestor nodes
print(soup.a.next_sibling)  # <a href="github.com">github</a> next sibling
print(soup.a.previous_sibling)  # None previous sibling
print(list(soup.a.next_siblings)) # [<a href="github.com">github</a>]
print(soup.a.previous_siblings) # <generator object PageElement.previous_siblings at 0x0000017C83D56180>
print(soup.find("title")) # <title>Document</title>
print(soup.find("a")) # <a href="baidu.com">baidu</a>
print(soup.find("p", class_=re.compile("title"))) # <p class="title" id="title">title</p>
def has_class_but_no_id(tag):
    return tag.has_attr("class") and not tag.has_attr("id")
print(soup.find_all(has_class_but_no_id))  # [<p class="title text" data-tid="1">second title</p>, <div class="content">content</div>] a function can serve as a filter
print(soup.find_all("a")) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>, <a href="cnblogs.com">cnblogs</a>]
print(soup.find_all(re.compile("^a"))) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>, <a href="cnblogs.com">cnblogs</a>]
print(soup.find_all(["a", "b"])) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>, <a href="cnblogs.com">cnblogs</a>]
print(soup.find_all(name=re.compile("^a"))) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>, <a href="cnblogs.com">cnblogs</a>]
print(soup.find_all(id=re.compile("^t"))) # [<p class="title" id="title">title</p>]
print(soup.find_all(href=re.compile("bai"))) # [<a href="baidu.com">baidu</a>]
print(soup.find_all(id=True)) # [<p class="title" id="title">title</p>]
print(soup.find_all(attrs={"data-tid": "1"})) # [<p class="title text" data-tid="1">second title</p>]
print(soup.find_all("p", class_="title")) # [<p class="title" id="title">title</p>, <p class="title text" data-tid="1">second title</p>]
print(soup.find_all("p", class_="title text")) # [<p class="title text" data-tid="1">second title</p>]
print(soup.find_all(class_=re.compile("^text"))) # [<p class="title text" data-tid="1">second title</p>]
print(soup.find_all("p", attrs={"class": "text"})) # [<p class="title text" data-tid="1">second title</p>]
print(soup.find_all(string="baidu")) # ['baidu']
print(soup.find_all("a", string="baidu")) # [<a href="baidu.com">baidu</a>]
print(soup.find_all("a", limit=2)) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>]
print(soup.html.find_all("a")) # [<a href="baidu.com">baidu</a>, <a href="github.com">github</a>, <a href="cnblogs.com">cnblogs</a>] 默认递归搜索
print(soup.html.find_all("a", recursive=False)) # [] 只搜索直接子节点
a = soup.find("a", string="baidu")
print(a.findParent()) # <p><a href="baidu.com">baidu</a><a href="github.com">github</a></p>
# print(a.findParents())
print(a.find_next_sibling()) # <a href="github.com">github</a>
print(a.find_next_siblings()) # [<a href="github.com">github</a>]
print(a.find_previous_sibling()) # None
print(a.find_previous_siblings()) # []
print(a.find_next("p")) # <p class="title" id="title">title</p>
print(a.find_all_next("p")) # [<p class="title" id="title">title</p>, <p class="title text" data-tid="1">second title</p>]
print(a.find_previous("p")) # <p><a href="baidu.com">baidu</a><a href="github.com">github</a></p>
print(a.find_all_previous("p")) # [<p><a href="baidu.com">baidu</a><a href="github.com">github</a></p>]
print(soup.select(".title")) # [<p class="title" id="title">title</p>, <p class="title text" data-tid="1">second title</p>]
print(soup.select(".title.text")) # [<p class="title text" data-tid="1">second title</p>]
print(soup.select("#title")) # [<p class="title" id="title">title</p>]
print(soup.select("div .title")) # [<p class="title" id="title">title</p>, <p class="title text" data-tid="1">second title</p>]
print(soup.select("div")[1].select(".title")) # [<p class="title" id="title">title</p>, <p class="title text" data-tid="1">second title</p>]
print(soup.select("#title")[0].attrs) # {'class': ['title'], 'id': 'title'}
print(soup.select("#title")[0].get_text()) # title