爬虫基础-xpath解析样例
爬取城市列表
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
if __name__ == "__main__":
    # Scrape the city names listed on the aqistudy.cn historical-data page
    # and print how many were found.

    # Create the ./download directory if it is missing. This script itself
    # never writes into it.
    if not os.path.exists('./download'):
        os.mkdir('./download')

    # Send a desktop Chrome User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'https://www.aqistudy.cn/historydata/'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)

    # Possible selections under <div class="bottom">:
    #   all cities only : //div[@class="bottom"]/ul/div[2]/li
    #   hot cities only : //div[@class="bottom"]/ul/li
    #   both (shortcut) : //div[@class="bottom"]//li
    # The union expression below selects both lists explicitly, which is the
    # general approach and does not rely on the two paths sharing a prefix.
    list_li = tree.xpath('//div[@class="bottom"]/ul/div[2]/li | //div[@class="bottom"]/ul/li')

    all_city_names = []
    for li in list_li:
        link_texts = li.xpath('./a/text()')
        if not link_texts:
            # An <li> without link text has no city name; skip it instead of
            # failing with IndexError.
            continue
        all_city_names.append(link_texts[0])

    print(len(all_city_names))
爬取图片
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
if __name__ == "__main__":
    # Download the image referenced by every list item on the
    # pic.netbian.com "4kyouxi" listing page into ./download, naming each
    # file after the image's alt text.

    # Create the output directory if it is missing.
    if not os.path.exists('./download'):
        os.mkdir('./download')

    # Send a desktop Chrome User-Agent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'http://pic.netbian.com/4kyouxi/'
    response = requests.get(url=url, headers=headers)
    # response.encoding is not overridden here; garbled alt text is repaired
    # per item below instead.
    tree = etree.HTML(response.text)

    for li in tree.xpath('//ul[@class="clearfix"]/li'):
        alt_values = li.xpath('./a/img/@alt')
        src_values = li.xpath('./a/img/@src')
        if not alt_values or not src_values:
            # Not an image entry; skip it instead of failing with IndexError.
            continue

        # Repair garbled Chinese: re-interpret the text's ISO-8859-1 bytes
        # as GBK. If the text is not mis-decoded in that way (either step
        # fails), keep it unchanged.
        img_title = alt_values[0]
        try:
            img_title = img_title.encode('iso-8859-1').decode('gbk')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass

        # Build the file name from the alt text.
        img_name = img_title + '.jpg'
        # Build the image URL by prefixing the site root to the src value.
        img_src = 'http://pic.netbian.com' + src_values[0]
        img_path = './download/' + img_name

        img_data = requests.get(img_src, headers=headers).content
        # The context manager closes the file even if the write fails.
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '--', '下载完成')
从站长素材下载免费PPT
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
import time
if __name__ == "__main__":
    # Download the free PPT archives linked from the sc.chinaz.com free-PPT
    # listing page into ./download, one .rar file per listed entry.

    # Create the output directory if it is missing.
    if not os.path.exists('./download'):
        os.mkdir('./download')

    url = 'http://sc.chinaz.com/ppt/free.html'
    # Send a desktop Chrome User-Agent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)

    for a in tree.xpath('//div[@id="main"]//p/a'):
        name_values = a.xpath('./text()')
        href_values = a.xpath('./@href')
        if not name_values or not href_values:
            # A link without text or href cannot be downloaded or named;
            # skip it instead of failing with IndexError.
            continue
        ppt_name = name_values[0]
        ppt_src = href_values[0]

        # Fetch the detail page of this entry to find its download links.
        detail_ppt = requests.get(url=ppt_src, headers=headers)
        detail_ppt.encoding = 'utf-8'
        ppt_tree = etree.HTML(detail_ppt.text)

        # Use the first <li> of each download list on the detail page.
        detail_ppt_li = ppt_tree.xpath('//div[@class="down_wrap"]//ul[@class="clearfix"]/li[1]')
        for li in detail_ppt_li:
            link_values = li.xpath('./a/@href')
            if not link_values:
                continue
            down_url = link_values[0]
            ppt_path = './download/' + ppt_name + '.rar'

            ppt_download = requests.get(url=down_url, headers=headers).content
            # The context manager closes the file even if the write fails.
            with open(ppt_path, 'wb') as fp:
                fp.write(ppt_download)

            # Pause between downloads to throttle requests to the site.
            time.sleep(10)
            print(ppt_name, ':', '下载完成')