python 基础 6 解析之xpath
解析
xpath使用
安装lxml库
pip install lxml -i https://pypi.douban.com/simple
pip install lxml -i https://mirrors.aliyun.com/pypi/simple
导入lxml etree
from lxml import etree
etree.parse() 解析本地文件
html_tree = etree.parse('XX.html')
etree.parse() 解析本地文件
html_tree = etree.parse('XX.html')
etree.HTML() 服务器响应文件
html_tree = etree.HTML(response.read().decode('utf-8'))
html_tree.xpath(xpath路径)
xpath基本语法
1、路径查询
//:查找所有子孙节点,不考虑层级关系
/:查找子节点
2、谓词查询
//div[@id]
3、属性查询
//@class
4、模糊查询
//div[contains(@id,"he")]
//div[starts-with(@id,"he")]
5、内容查询
//div/h1/text()
6、逻辑查询
//div[@id="head" and @class="s_down"]
tree = etree.HTML(context)
result = tree.xpath("//input[@id='su']/@value")[0]
print(result)
抓取素材图片案例
# https://sc.chinaz.com/tupian/shuaigetupian.html
import urllib.request
import lxml.etree
# HTTP headers passed to urllib.request.Request by create_request; the
# User-Agent value is a desktop Chrome browser identification string.
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
}
def create_request(page):
    """Build the ``urllib.request.Request`` for one listing page.

    Page 1 uses the bare ``shuaigetupian.html`` address; any other page
    number is appended as a ``_<page>`` suffix. The chosen URL is printed
    before the request is created with the module-level ``headers``.

    Args:
        page: Page number of the listing to request.

    Returns:
        A ``urllib.request.Request`` for the page URL.
    """
    url = (
        "https://sc.chinaz.com/tupian/shuaigetupian.html"
        if page == 1
        else "https://sc.chinaz.com/tupian/shuaigetupian_" + str(page) + ".html"
    )
    print(url)
    return urllib.request.Request(url=url, headers=headers)
def get_context(requestObj):
    """Fetch the resource described by ``requestObj`` and return it as text.

    Args:
        requestObj: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if read()/decode() raises,
    # so the underlying connection is not leaked.
    with urllib.request.urlopen(requestObj) as response:
        return response.read().decode('utf-8')
def down_load(context):
    """Download every image referenced in a listing page into ``./img/``.

    The page is parsed with lxml; the ``alt`` and ``src2`` attributes of the
    ``<img>`` elements under ``div#container`` supply the file names and
    image addresses respectively.

    Args:
        context: HTML source of the listing page.
    """
    import os  # local import so this function does not rely on a file-level one

    tree = lxml.etree.HTML(context)
    name_list = tree.xpath("//div[@id='container']//a/img/@alt")
    pic_url_list = tree.xpath("//div[@id='container']//a/img/@src2")

    # urlretrieve does not create missing directories, so ensure the target
    # folder exists before the first download.
    os.makedirs("./img", exist_ok=True)

    # zip pairs names with URLs and stops at the shorter list, so a mismatch
    # between the two XPath results cannot raise IndexError.
    for name, pic_url in zip(name_list, pic_url_list):
        # "https:" is prepended to each src2 value — presumably they are
        # protocol-relative ("//host/path"); verify against the page markup.
        urllib.request.urlretrieve(url="https:" + pic_url, filename="./img/" + name + '.jpg')
if __name__ == '__main__':
    # Read an inclusive page range from the user, then for each page build the
    # request, fetch its HTML and download the images it references.
    start_page = int(input("请输入起始页"))
    end_page = int(input("请输入结束页"))
    for page in range(start_page, end_page + 1):
        down_load(get_context(create_request(page)))
import urllib.request
import urllib.parse
from lxml import etree
def create_request(page):
    """Build the ``urllib.request.Request`` for one listing page.

    Page 1 is served from ``<base>.html``; every other page from
    ``<base>_<page>.html``.

    Args:
        page: Page number of the listing to request.

    Returns:
        A ``urllib.request.Request`` for the page URL.
    """
    base_url = 'https://sc.chinaz.com/tupian/meinvtupian'
    if page == 1:
        url = base_url + '.html'
    else:
        url = base_url + '_' + str(page) + '.html'
    # Build the customised request object.
    return urllib.request.Request(url)
def get_response(request):
    """Fetch a listing page and extract image names and image URLs from it.

    Args:
        request: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Returns:
        A ``(result_name, result_url)`` tuple: the ``alt`` values and the
        ``data-original`` values of the ``<img>`` elements directly under
        ``div.item`` elements.
    """
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(request) as response:
        context = response.read().decode('utf-8')

    # Parse the HTML and query the image attributes.
    tree = etree.HTML(context)
    result_url = tree.xpath('//div[@class="item"]/img/@data-original')
    result_name = tree.xpath('//div[@class="item"]/img/@alt')
    # Alternative absolute-path expression:
    # tree.xpath('/html/body/div[3]/div[2]/div/img/@data-original')
    print(result_url, result_name)
    return result_name, result_url
def downlaod(result_name, result_url):
    """Download each image into ``./pic/`` using its paired name.

    Args:
        result_name: File names (without extension), one per image.
        result_url: Image addresses, in the same order as ``result_name``;
            each is prefixed with ``"https:"`` before downloading.
    """
    import os  # local import so this function does not rely on a file-level one

    # urlretrieve does not create missing directories, so ensure the target
    # folder exists before the first download.
    os.makedirs("./pic", exist_ok=True)

    # zip pairs names with URLs and stops at the shorter list, so lists of
    # different length cannot raise IndexError.
    for name, url in zip(result_name, result_url):
        urllib.request.urlretrieve(url="https:" + url, filename="./pic/" + name + '.jpg')
if __name__ == '__main__':
    # Read an inclusive page range from the user, then for each page build the
    # request, extract (names, urls) from the response and download the images.
    start_page = int(input('请输入起始页'))
    end_page = int(input('请输入结束页'))
    for page in range(start_page, end_page + 1):
        downlaod(*get_response(create_request(page)))