xpath解析 站长素材简历模板爬取

import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree

# Shared request configuration for the crawler.

# Listing page requested for page 1.
url = "https://sc.chinaz.com/jianli/free.html"

# printf-style pattern for every other listing page; %d takes the page number.
url1 = "https://sc.chinaz.com/jianli/free_%d.html"

# Headers sent with every request: only a browser-like User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
}

# 创建一个文件夹

# Make sure the download directory exists before anything is written into it.
# exist_ok=True makes the call succeed when the folder is already there, so no
# separate existence check is needed (such a check could race with another
# process creating the folder between the check and the mkdir).
os.makedirs('./imgLibs', exist_ok=True)

# Seconds to wait on any single HTTP request. requests applies no timeout by
# default, so without this a stalled connection would hang the crawl forever.
_REQUEST_TIMEOUT = 30


def _fetch(target_url):
    """Return the response for a GET of ``target_url`` sent with the shared headers.

    Raises:
        requests.RequestException: On connection problems, timeouts and HTTP
            error statuses, so an error page is never parsed or saved as if
            it were real content.
    """
    response = requests.get(url=target_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _save_template(detail_url):
    """Download the file behind the first download link of a detail page.

    Args:
        detail_url: Absolute URL of one item's detail page.

    Returns:
        The name of the file written under ``./imgLibs``, or ``None`` when the
        page exposes no usable download link.

    Raises:
        requests.RequestException: If the detail page or the file itself
            cannot be fetched.
    """
    detail_tree = etree.HTML(_fetch(detail_url).text)
    # li[1] selects the first entry of the page's download list.
    download_links = detail_tree.xpath(
        '//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href'
    )
    if not download_links:
        return None

    # urljoin leaves absolute links untouched and resolves relative or
    # protocol-relative ones against the page they were found on.
    a1_href = urljoin(detail_url, download_links[0])

    # Name the file after the last path segment only, so a query string or
    # fragment on the link cannot end up inside the file name.
    jianli_name = urlsplit(a1_href).path.rsplit('/', 1)[-1]
    if not jianli_name:
        return None

    jianli_data = _fetch(a1_href).content
    with open(os.path.join('./imgLibs', jianli_name), 'wb') as fp:
        fp.write(jianli_data)
    return jianli_name


# Crawl listing pages 1 and 2. Page 1 has its own URL; later pages follow the
# numbered pattern in url1.
for pageNum in range(1, 3):
    new_url = url if pageNum == 1 else url1 % pageNum
    list_tree = etree.HTML(_fetch(new_url).text)

    for div in list_tree.xpath('//div[@id="container"]/div'):
        detail_links = div.xpath('./a/@href')
        if not detail_links:
            # This child div carries no direct <a href>, so there is nothing
            # to follow; skip it instead of failing on an empty result.
            continue

        # The original code prefixed 'https:' by hand, which only works for
        # protocol-relative hrefs; urljoin gives the same result for those
        # and also copes with absolute and path-relative hrefs.
        a_href = urljoin(new_url, detail_links[0])

        try:
            jianli_name = _save_template(a_href)
        except requests.RequestException as err:
            # One unreachable item should not abort the rest of the crawl.
            print(a_href, 'download failed:', err)
            continue

        if jianli_name is None:
            print(a_href, 'has no download link, skipped')
            continue

        print(jianli_name, '下载完成!!!')

posted @ 2021-02-19 01:58  未来全栈攻城狮  阅读(96)  评论(0)  编辑  收藏  举报