爬虫基础-xpath解析样例

爬取城市列表

# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os

if __name__ == "__main__":
    # Script entry point: scrape the city names from aqistudy.cn and print the count.
    # Ensure the output directory exists (shared convention across these demo scripts).
    if not os.path.exists('./download'):
        os.mkdir('./download')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'https://www.aqistudy.cn/historydata/'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)

    # All cities: li tags under the 2nd div under every ul under the div with class "bottom"
    #   (//div[@class="bottom"]/ul/div[2]/li)
    #list_li = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    # Hot cities: li tags directly under every ul under the div with class "bottom"
    #list_li = tree.xpath('//div[@class="bottom"]/ul/li')

    # All + hot cities, approach 1 - exploit the common structure
    #list_li = tree.xpath('//div[@class="bottom"]/ul//li')

    # All + hot cities, approach 2 - explicit union of both paths
    list_li = tree.xpath('//div[@class="bottom"]/ul/div[2]/li | //div[@class="bottom"]/ul/li')
    # Each matched <li> wraps a single <a> whose text is the city name.
    all_city_names = [li.xpath('./a/text()')[0] for li in list_li]
    # Fix: the original print() was missing its closing parenthesis (SyntaxError).
    print(len(all_city_names))

爬取图片

# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os

if __name__ == "__main__":
    # Script entry point: download 4K game wallpapers from pic.netbian.com into ./download.
    if not os.path.exists('./download'):
        os.mkdir('./download')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'http://pic.netbian.com/4kyouxi/'
    response = requests.get(url=url, headers=headers)
#    response.encoding = 'gbk'
    page_text = response.text
    tree = etree.HTML(page_text)
    list_li = tree.xpath('//ul[@class="clearfix"]/li')
    for li in list_li:
        # Build the file name from the image's alt attribute.
        # Fix: extension typo '.jgp' -> '.jpg'.
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # Work around mojibake: the page is GBK but requests decoded it as ISO-8859-1,
        # so round-trip the bytes to recover the Chinese characters.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # The src attribute is site-relative; prepend the host to get a full URL.
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_path = './download/' + img_name
        img_data = requests.get(img_src, headers=headers).content
        # Use a context manager so the file is closed even if the write fails.
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '--', '下载完成')

从站长素材下载免费PPT

# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
import time

if __name__ == "__main__":
    # Script entry point: download free PPT templates from sc.chinaz.com into ./download.
    if not os.path.exists('./download'):
        os.mkdir('./download')
    url = 'http://sc.chinaz.com/ppt/free.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    # Each <a> under a <p> in div#main links to one template's detail page.
    list_a = tree.xpath('//div[@id="main"]//p/a')
    for a in list_a:
        ppt_name = a.xpath('./text()')[0]
        ppt_src = a.xpath('./@href')[0]
        # Fetch the detail page to find the actual download links.
        detail_ppt = requests.get(url=ppt_src, headers=headers)
        detail_ppt.encoding = 'utf-8'
        ppt_tree = etree.HTML(detail_ppt.text)
        # First mirror in the download list.
        detail_ppt_li = ppt_tree.xpath('//div[@class="down_wrap"]//ul[@class="clearfix"]/li[1]')

        for li in detail_ppt_li:
            down_url = li.xpath('./a/@href')[0]
            ppt_path = './download/' + ppt_name + '.rar'
            ppt_download = requests.get(url=down_url, headers=headers).content
            # Use a context manager so the file is closed even if the write fails.
            with open(ppt_path, 'wb') as fp:
                fp.write(ppt_download)
            # Throttle requests to be polite to the server.
            time.sleep(10)
            print(ppt_name, ':', '下载完成')

 

 

 

posted @ 2020-08-14 13:07  消磨_时间  阅读(126)  评论(0编辑  收藏  举报