XPath parsing in practice
from lxml import etree

# Parse a local HTML file; pass the lenient HTML parser explicitly, since the default XML parser rejects non-well-formed HTML
tree = etree.parse("./test.html", etree.HTMLParser())

# Plain location by absolute path
res = tree.xpath("/html/head/title")[0]

# Get all matching tags anywhere in the document
res = tree.xpath("//div")

# Index-based location (XPath indexes start at 1)
res = tree.xpath("//div[1]")

# Attribute-based location: //tag_name[@attr_name="value"]
res = tree.xpath('//div[@class="song"]')

# Hierarchical location: the a tags under the li tags
res = tree.xpath('//div[@class="tang"]/ul/li/a')  # / matches direct children only
res = tree.xpath('//div[@class="tang"]//li/a')    # // matches descendants at any depth
print(res)

# Data extraction
res = tree.xpath('//a[@id="feng"]/text()')  # extract the text under the a tag
print(res)
res = tree.xpath('//div[@class="song"]/p/text()')
print(res)
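The expressions above can also be tried without a test.html on disk by parsing a string with etree.HTML. The markup below is a hypothetical stand-in, modeled only on the class/id values the expressions use (song, tang, feng).

from lxml import etree

# Hypothetical markup mirroring the structure the expressions above assume
html = """
<html>
  <head><title>test page</title></head>
  <body>
    <div class="song"><p>line one</p><p>line two</p></div>
    <div class="tang">
      <ul>
        <li><a href="http://example.com/1">poem one</a></li>
        <li><a href="http://example.com/2">poem two</a></li>
      </ul>
    </div>
    <a id="feng" href="http://example.com/feng">feng link</a>
  </body>
</html>
"""
tree = etree.HTML(html)  # parse from a string instead of a file
print(tree.xpath("/html/head/title/text()"))            # ['test page']
print(tree.xpath('//div[@class="tang"]//li/a/text()'))  # ['poem one', 'poem two']
print(tree.xpath('//a[@id="feng"]/text()'))             # ['feng link']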
Scraping beauty images
import requests
from lxml import etree
import os

# Request headers that mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76"
}
# Scrape the first 5 pages
for page in range(1, 6):
    if page == 1:
        url = "http://pic.netbian.com/4kmeinv/"
    else:
        url = f"http://pic.netbian.com/4kmeinv/index_{page}.html"
    url_text = requests.get(url=url, headers=headers)
    url_text.encoding = "gbk"  # the site is GBK-encoded; set it explicitly to avoid garbled text
    # print(url_text.text)
    tree = etree.HTML(url_text.text)
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        next_url = "https://pic.netbian.com" + li.xpath("./a/@href")[0]  # build the absolute detail-page URL
        title = li.xpath("./a/b/text()")[0]
        res = requests.get(url=next_url, headers=headers).text
        next_tree = etree.HTML(res)
        image_src = "https://pic.netbian.com" + next_tree.xpath('//*[@id="img"]/img/@src')[0]
        image_bin = requests.get(url=image_src, headers=headers).content  # download the raw image bytes
        if not os.path.exists("./彼岸图库美女"):
            os.mkdir("./彼岸图库美女")
        image_path = "./彼岸图库美女/" + title + ".jpg"
        with open(image_path, "wb") as fp:
            fp.write(image_bin)
        print(f"{title}:下载成功~")
Scraping novel chapters and their content
import os
import requests
from lxml import etree

url = "https://bixuejian.5000yan.com/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76"}
response = requests.get(url, headers=headers)  # headers must be passed by keyword, or requests treats them as query params
response.encoding = "utf-8"
tree = etree.HTML(response.text)
# Parse the a tags to get each chapter's link and title
result = tree.xpath("/html/body/div[2]/div[1]/main/ul/li/a")
os.makedirs("./碧血剑章节详情内容", exist_ok=True)  # the output directory must exist before writing into it
for i in result:
    title = i.xpath("./text()")[0]
    novel_url = i.xpath("./@href")[0]
    novel_response = requests.get(novel_url, headers=headers)
    novel_response.encoding = "utf-8"
    novel_result = etree.HTML(novel_response.text)
    content = novel_result.xpath("/html/body/div[2]/div[1]/main/section/div[1]//text()")
    content_res = "".join(content).strip()
    with open(f"./碧血剑章节详情内容/{title}.txt", "w", encoding="utf-8") as f:
        f.write(content_res)
    print(f"{title}写入成功")
Scraping lazy-loaded images
import os
import requests
from lxml import etree

url = "https://sc.chinaz.com/tupian/meinvtupian.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76"
}
res = requests.get(url=url, headers=headers)
res.encoding = "utf-8"
tree = etree.HTML(res.text)
# res = tree.xpath('//div/img/@data-original')
# The images are lazy-loaded: the real URL lives in the data-original attribute, not in the usual src/href
res = tree.xpath('/html/body/div[3]/div[2]/div/img/@data-original')
print(res)
if not os.path.exists("./download"):
    os.mkdir("./download")  # the output directory must exist before writing into it
for i in res:
    image_url = "https:" + i
    # include the Referer so the image host does not treat the request as hot-linking
    image = requests.get(image_url, headers={**headers, "Referer": "https://sc.chinaz.com/"})
    image_name = i.split("/")[-1]
    with open(f"./download/{image_name}", "wb") as f:
        f.write(image.content)
    print("下载完成")
Scraping resume templates
import requests
from lxml import etree
import os

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76"
}
for i in range(1, 6):
    if i == 1:
        url = "https://sc.chinaz.com/jianli/free.html"
    else:
        url = f"https://sc.chinaz.com/jianli/free_{i}.html"
    article_res = requests.get(url, headers=headers)  # headers must be passed by keyword, or requests treats them as query params
    article_res.encoding = "utf-8"
    tree = etree.HTML(article_res.text)
    div_list = tree.xpath('//*[@id="container"]/div')
    for b in div_list:
        curriculum_vitae_name = b.xpath('./p/a/text()')[0]  # the leading ./ keeps the XPath relative to this div
        curriculum_vitae_url = b.xpath('./p/a/@href')[0]
        print(curriculum_vitae_name, curriculum_vitae_url)
        curriculum_detail = requests.get(url=curriculum_vitae_url, headers=headers)
        detail_tree = etree.HTML(curriculum_detail.text)
        download_url = detail_tree.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
        res_content = requests.get(download_url, headers=headers)
        if not os.path.exists("./站长素材简历"):
            os.mkdir("./站长素材简历/")
        with open(f"./站长素材简历/{curriculum_vitae_name}.rar", "wb") as f:
            f.write(res_content.content)
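None of the scripts above retry on transient network failures or set a timeout. A requests.Session with a mounted retry adapter, as in the sketch below, can be dropped in wherever requests.get is used; the retry counts, backoff factor, and timeout are illustrative assumptions, not part of the original code.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry transient server errors and rate limiting with exponential backoff
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))

# usage (illustrative): replace requests.get(...) calls with session.get(...)
# article_res = session.get(url, headers=headers, timeout=10)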