Python之xml.etree.ElementTree模块的使用

xml.etree.ElementTree模块的作用

    提供基于事件和基于文档两种API来解析XML，可以使用XPath表达式搜索已解析的文档，并且支持对文档的增删改查。需要注意的是，基于文档的解析方式会一次性把整个XML文件加载到内存，
所以如果是很大的XML文件，不推荐使用该方式解析，应该使用SAX方式。

测试解析的内容

<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>

test.xml

1、解析xml文档

from xml.etree import ElementTree

# Open the sample document as UTF-8 text and hand the file object to
# ElementTree, which builds the whole tree from it.
with open('test.xml', 'tr', encoding='utf-8') as xml_file:
    tree = ElementTree.parse(xml_file)

# Printing the result shows that parse() returned an ElementTree object.
print(tree)

ElementTree_parse_xml.py

运行结果

#返回ElementTree对象
<xml.etree.ElementTree.ElementTree object at 0x0000020A1F090088>

2、遍历解析XML树,获取节点名字

from xml.etree import ElementTree

# Build the tree from the UTF-8 encoded sample file.
with open('test.xml', 'tr', encoding='utf-8') as xml_file:
    tree = ElementTree.parse(xml_file)

# iter() without a tag filter visits every element of the tree; print the
# tag name of each one.
for element in tree.iter():
    print(element.tag)

ElementTree_dump_xml.py

运行结果

#打印所有节点名字
data
country
rank
year
gdppc
neighbor
neighbor
country
rank
year
gdppc
neighbor
country
rank
year
gdppc
neighbor
neighbor

3、遍历解析XML树,获取属性值

from xml.etree import ElementTree

with open('test.xml', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# Visit only the <neighbor> elements and print their name/direction
# attributes in left-aligned 25-character columns.
for node in tree.iter('neighbor'):
    attr_name = node.attrib.get('name')
    attr_direction = node.attrib.get('direction')

    # Keep only the attributes that are actually present. Formatting a
    # missing attribute (None) with '{:<25}' raises TypeError, so a
    # <neighbor> without a name must not reach the format call.
    present = [value for value in (attr_name, attr_direction) if value]
    if not present:
        continue

    print(''.join('{:<25}'.format(value) for value in present))

ElementTree_show_name_direction.py

运行结果

Austria                  E                        
Switzerland              W                        
Malaysia                 N                        
Costa Rica               W                        
Colombia                 E

4、利用XPath在XML文档中查找节点

from xml.etree import ElementTree

with open('test.xml', 'tr', encoding='utf-8') as xml_file:
    tree = ElementTree.parse(xml_file)

# './/neighbor' selects <neighbor> elements at any depth below the root.
for neighbor in tree.findall('.//neighbor'):
    neighbor_name = neighbor.get('name')
    # Skip elements whose name attribute is missing or empty.
    if not neighbor_name:
        continue
    print(neighbor_name)

ElementTree_find_feeds_by_tag.py

运行结果

Austria
Switzerland
Malaysia
Costa Rica
Colombia

5、利用XPath在XML文档中查找更深一层的节点

from xml.etree import ElementTree

with open('test.xml', 'tr', encoding='utf-8') as xml_file:
    tree = ElementTree.parse(xml_file)

# './/neighbor/neighbor' only matches a <neighbor> that is a direct child
# of another <neighbor>.
for nested in tree.findall('.//neighbor/neighbor'):
    nested_name = nested.get('name')
    # Skip elements whose name attribute is missing or empty.
    if not nested_name:
        continue
    print(nested_name)

ElementTree_find_feeds_by_structure.py

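运行结果（注意：按照上文给出的test.xml运行时不会打印任何内容，因为其中没有嵌套的neighbor节点；下面的结果应该来自另一个版本的test.xml，其中Singapore的neighbor节点内还嵌套了一个neighbor节点）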

Malaysia

6、利用XPath表达式，查询节点的属性名和值

from xml.etree import ElementTree

with open('test.xml', 'tr', encoding='utf-8') as xml_file:
    tree = ElementTree.parse(xml_file)

# find() returns only the first element matching the path.
node = tree.find('./country')
print('标签名:', node.tag)

# Report every attribute of that element as a name/value pair.
for attr_name, attr_value in node.attrib.items():
    line = '属性名:{name},属性值:{value}'.format(name=attr_name, value=attr_value)
    print(line)

ElementTree_node_attributes.py

运行结果

标签名: country
属性名:name,属性值:Liechtenstein

7、利用XPath表达式,查询多个路径的文本即text

from xml.etree import ElementTree

with open('test.xml', 'tr', encoding='utf-8') as xml_file:
    tree = ElementTree.parse(xml_file)

# For each path, look up the first matching element and show its tag, the
# text inside it, and the tail text that follows its closing tag.
for path in ['./country/year', './country/gdppc']:
    node = tree.find(path)
    print('节点名字', node.tag)
    print(node.text, node.tail, sep='\n')

ElementTree_node_text.py

运行结果

节点名字 year
2008

        
节点名字 gdppc
141100

8、解析监听标签的事件

from xml.etree.ElementTree import iterparse

# Current nesting depth of the element being reported.
depth = 0

# Total width reserved for the dotted indentation prefix.
prefix_width = 8

# A run of dots as long as the reserved width; a slice of it is printed
# to visualise the current depth.
prefix_dots = '.' * prefix_width

# Assemble the per-line format template.
line_template = ''.join([
    '{prefix:<0.{prefix_len}}',
    '{event:<8}',
    '{suffix:<{suffix_len}} ',
    '{tag:<12} ',
    '{node_id}',
])

EVENT_NAMES = ['start', 'end', 'start-ns', 'end-ns']

for (event, node) in iterparse('test.xml', EVENT_NAMES):

    # An 'end' event closes the current level, so step back out first.
    if event == 'end':
        depth -= 1

    # Two dots of prefix per nesting level.
    prefix_len = depth * 2

    # Only 'start'/'end' events deliver an Element. 'start-ns' delivers a
    # (prefix, uri) tuple and 'end-ns' delivers None, so reading .tag
    # unconditionally would crash on documents that declare namespaces.
    if event in ('start', 'end'):
        tag = node.tag
    else:
        tag = str(node)

    print(line_template.format(
        prefix=prefix_dots,  # text the prefix is cut from
        prefix_len=prefix_len,  # number of prefix characters shown
        suffix='',  # padding text after the event name
        # Padding = reserved width minus the prefix actually shown. It is
        # clamped at zero because a negative width is an invalid format
        # spec once nesting goes deeper than the reserved width allows.
        suffix_len=max(prefix_width - prefix_len, 0),
        event=event,  # current event name
        node_id=id(node),  # identity of the event payload object
        tag=tag,  # tag name, or the namespace payload as text
    ))

    # A 'start' event opens a new level for whatever follows.
    if event == 'start':
        depth += 1

ElementTree_show_all_events.py

运行结果

start            data         3102087901736
..start          country      3102087901816
....start        rank         3102087901896
....end          rank         3102087901896
....start        year         3102087901976
....end          year         3102087901976
....start        gdppc        3102087902056
....end          gdppc        3102087902056
....start        neighbor     3102087902136
....end          neighbor     3102087902136
....start        neighbor     3102087902216
....end          neighbor     3102087902216
..end            country      3102087901816
..start          country      3102087902296
....start        rank         3102087902376
....end          rank         3102087902376
....start        year         3102087902456
....end          year         3102087902456
....start        gdppc        3102087902536
....end          gdppc        3102087902536
....start        neighbor     3102087902616
......start      neighbor     3102087902776
......end        neighbor     3102087902776
....end          neighbor     3102087902616
..end            country      3102087902296
..start          country      3102087902936
....start        rank         3102087903016
....end          rank         3102087903016
....start        year         3102087903096
....end          year         3102087903096
....start        gdppc        3102087903176
....end          gdppc        3102087903176
....start        neighbor     3102087903336
....end          neighbor     3102087903336
....start        neighbor     3102087903496
....end          neighbor     3102087903496
..end            country      3102087902936
end              data         3102087901736

9、XML转为CSV的文件格式，这里只输出到标准输出用于测试，生产中是存到硬盘上

import csv
import sys

from xml.etree.ElementTree import iterparse

# CSV rows go straight to standard output; every non-numeric field is quoted.
writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)

# Label of the most recently seen group element.
group_name = ''

# Only 'start' events are requested, so each element is seen as it opens.
parsing = iterparse('test.xml', events=['start'])
for event, node in parsing:

    # Elements we are not interested in.
    if node.tag in ['rank', 'year', 'gdppc']:
        continue

    name = node.attrib.get('name')

    # An element with a name attribute is a child entry: emit one row.
    if name:
        row = (group_name, name, node.attrib.get('direction'))
        writer.writerow(row)
        continue

    # Otherwise it is a parent element: remember its text attribute as the
    # group label for the rows that follow.
    group_name = node.attrib.get('text')

ElementTree_write_podcast_csv.py

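测试效果（注意：这里的结果要求country节点使用text属性保存国家名，如第10节的数据源所示，并且Singapore的neighbor节点内嵌套了一个neighbor节点，所以Malaysia出现了两次；按照上文最开始给出的test.xml运行，结果会不一样）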

"Liechtenstein","Austria","E"
"Liechtenstein","Switzerland","W"
"Singapore","Malaysia","N"
"Singapore","Malaysia","N"
"Panama","Costa Rica","W"
"Panama","Colombia","E"

10、创建一个定制的树的构造器

import csv
import sys
from xml.etree.ElementTree import XMLParser


class PodcastListToCSV(object):
    """Parser target that writes one CSV row per named element.

    An element without a ``name`` attribute is treated as a group header:
    its ``text`` attribute becomes the group label for the rows that follow.
    An element with a ``name`` attribute produces the row
    ``(group label, tag, name, direction)``.
    """

    def __init__(self, output_file):
        """Create the CSV writer on *output_file* (any writable text stream)."""
        # Start with an empty group label, as the iterparse-based script
        # does, so a named element seen before any group header does not
        # raise AttributeError in start().
        self.group_name = ''
        self.writer = csv.writer(
            output_file,
            quoting=csv.QUOTE_NONNUMERIC
        )

    def start(self, tag, attrib):
        """Handle an opening tag with its attribute mapping."""
        # Tags we are not interested in.
        if tag in ['rank', 'year', 'gdppc']:
            return
        if not attrib.get('name'):
            self.group_name = attrib.get('text')
        else:
            self.writer.writerow(
                (self.group_name,
                 tag,
                 attrib['name'],
                 # direction may be missing; .get() avoids a KeyError and
                 # matches the iterparse-based script.
                 attrib.get('direction'))
            )

    def end(self, tag):
        """Ignore closing tags."""
        pass

    def data(self, data):
        """Ignore the character data inside elements."""
        pass

    def close(self):
        """Nothing special to do when parsing finishes."""
        pass


# Send the parser's events to the CSV-writing target defined above.
target = PodcastListToCSV(sys.stdout)

parser = XMLParser(target=target)

# Decode the file explicitly as UTF-8, like the other examples, instead of
# relying on the platform's default encoding.
with open('test.xml', 'rt', encoding='utf-8') as rf:
    # Feed the document to the parser one line at a time.
    for line in rf:
        parser.feed(line)

parser.close()

ElementTree_podcast_csv_treebuilder.py

数据源

<?xml version="1.0"?>

    <country text="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>

test.xml

运行效果

"Liechtenstein","neighbor","Austria","E"
"Liechtenstein","neighbor","Switzerland","W"

11、利用递归的方法，解析XML

from xml.etree.ElementTree import XML


def show_node(node):
    """Recursively print the text, tail and attributes of *node*.

    Text and tail are printed only when they contain something other than
    whitespace. Attributes are printed as ``name=value`` in sorted order,
    then every child element is processed the same way.
    """
    text, tail = node.text, node.tail
    if text is not None and text.strip():
        print('文本内容: %s' % text)
    if tail is not None and tail.strip():
        print('尾部内容: %s' % tail)

    attributes = node.attrib
    for name in sorted(attributes):
        print('%s=%s' % (name, attributes[name]))

    for child in node:
        show_node(child)


# XML() parses a document held in a string and returns its root element.
parsed = XML("""
<root>
  <group>
    <child id="a">This is child "a".</child>
    <child id="b">This is child "b".</child>
  </group>
  <group>
    <child id="c">This is child "c".</child>
  </group>
</root>
""")
print('parsed = ', parsed)

# Iterating the root yields its direct children; dump each of them.
for group in parsed:
    show_node(group)

ElementTree_XML.py

运行结果

parsed =  <Element 'root' at 0x00000240F004AB38>
文本内容: This is child "a".
id=a
文本内容: This is child "b".
id=b
文本内容: This is child "c".
id=c

12、利用属性节点为标识，解析XML子节点

from xml.etree.ElementTree import XMLID

# XMLID() parses the string and returns the root element together with a
# mapping from each id attribute value to the element carrying it.
tree, id_map = XMLID('''
<root>
  <group>
    <child id="a">This is child "a".</child>
    <child id="b">This is child "b".</child>
  </group>
  <group>
    <child id="c">This is child "c".</child>
  </group>
</root>
''')

# Show the mapping ordered by id.
for key in sorted(id_map):
    print('%s=%s' % (key, id_map[key]))

ElementTree_XMLID.py

运行结果

a=<Element 'child' at 0x000001FC1BAC0228>
b=<Element 'child' at 0x000001FC1BAE0098>
c=<Element 'child' at 0x000001FC1BAE0188>

13、创建XML节点,并且打印出来

from xml.etree.ElementTree import (Element, SubElement, Comment, tostring)

# Root element of the document being built.
top = Element('top')

# A comment node is appended like any other child.
comment = Comment('这个是创建一个XML top根节点')
top.append(comment)

# SubElement() creates the child and attaches it to its parent in one step.
child = SubElement(top, 'child')
child.text = '这个是子节点child的文本'

# text is the content inside the element; tail is the text that follows
# its closing tag.
child_with_tail = SubElement(top, 'child_with_tail')
child_with_tail.text = '这个是子节点child_with_tail的text'
child_with_tail.tail = '这个是子节点child_with_tail的tail'

# tostring() returns bytes for this encoding, so decode before printing.
serialized = tostring(top, encoding='utf-8')
print(serialized.decode('utf-8'))

ElementTree_create.py

运行结果

<top><!--这个是创建一个XML top根节点--><child>这个是子节点child的文本</child><child_with_tail>这个是子节点child_with_tail的text</child_with_tail>这个是子节点child_with_tail的tail</top>

14、创建XML节点,并且格式化打印出来

from xml.etree import ElementTree
from xml.dom import minidom

def prettify(elem):
    """Return *elem* rendered as an indented XML string.

    The element is serialised with ElementTree, re-parsed with
    ``xml.dom.minidom`` and then pretty-printed using a two-space indent.
    """
    document = minidom.parseString(ElementTree.tostring(elem, 'utf-8'))
    return document.toprettyxml(indent="  ")

ElementTree_format.py

from xml.etree.ElementTree import (Element, SubElement, Comment, tostring)
from ElementTree_format import prettify

# Build the root and attach a comment node to it.
top = Element('top')
comment = Comment('这个是创建一个XML top根节点')
top.append(comment)

# Create both children first, then fill in their text.
child = SubElement(top, 'child')
child_with_tail = SubElement(top, 'child_with_tail')

child.text = '这个是子节点child的文本'
child_with_tail.text = '这个是子节点child_with_tail的text'
# tail is the text that follows the element's closing tag.
child_with_tail.tail = '这个是子节点child_with_tail的tail'

# Render the tree through the prettify() helper for indented output.
pretty_xml = prettify(top)
print(pretty_xml)

ElementTree_pretty.py

运行结果

<?xml version="1.0" ?>
<top>
  <!--这个是创建一个XML top根节点-->
  <child>这个是子节点child的文本</child>
  <child_with_tail>这个是子节点child_with_tail的text</child_with_tail>
  这个是子节点child_with_tail的tail
</top>

15、创建XML节点并且设置节点元素的属性

from xml.etree.ElementTree import (Element, SubElement, Comment)

from ElementTree_format import prettify

# 创建根节点，并且设置属性方式一
root = Element('root')
root.set('version', '1.0')

# 增加注释
root.append(Comment('这个是测试设置属性值的Demo'))

# 设置属性方式二
head = SubElement(root, 'head', {'name': 'My Cyc'})
head.text = '这个是文本'

title = SubElement(root, 'title')
title.text = 'My Title'

print(prettify(root))

ElementTree_set_attribute.py

运行结果

<?xml version="1.0" ?>
<root version="1.0">
  <!--这个是测试设置属性值的Demo-->
  <head name="My Cyc">这个是文本</head>
  <title>My Title</title>
</root>

16、自创建XML节点的扩展

from xml.etree.ElementTree import Element

from ElementTree_format import prettify

# 创建根节点
root = Element('top')

# 列表推导式生成三个子节点
children = [
    Element('child', {'num': str(i)}) for i in range(3)
]

# 用根节点，扩展3个子节点
root.extend(children)

print(prettify(root))

ElementTree_extend.py

运行结果

<?xml version="1.0" ?>
<top>
  <child num="0"/>
  <child num="1"/>
  <child num="2"/>
</top>

17、创建XML()节点，对节点进行扩展

from xml.etree.ElementTree import (Element, SubElement, XML)

from ElementTree_format import prettify

# 创建根节点
root = Element('top')

# 将parent挂载到root节点
parent = SubElement(root, 'parent')

# 解析XML的对象
children = XML('<root><child num="0" /><child num="1" /><child num="2" /></root>')

# 在parnet节点扩展children节点
parent.extend(children)

print(prettify(root))

ElementTree_extend_node.py

运行结果

<?xml version="1.0" ?>
<top>
  <parent>
    <child num="0"/>
    <child num="1"/>
    <child num="2"/>
  </parent>
</top>

18、创建XML()节点，对节点进行扩展并不会改变现有父子节点的关系

from xml.etree.ElementTree import (Element, SubElement, XML)

from ElementTree_format import prettify

# 创建根节点
root = Element('top')

# 将parent挂载到root节点
parent_a = SubElement(root, 'parent', {'id': 'a'})
parent_b = SubElement(root, 'parent', {'id': 'b'})

# 解析XML的对象
childrens = XML('<root><child num="0" /><child num="1" /><child num="2" /></root>')

# 给所有的子节点设置属性
for child in childrens:
    child.set('id', str(id(child)))

# 给parent_a扩展子节点
print('A:')
parent_a.extend(childrens)
print(prettify(root))

# 给parent_b扩展子节点
print('B:')
parent_b.extend(childrens)
print(prettify(root))

ElementTree_extend_node_copy.py

运行结果

A:
<?xml version="1.0" ?>
<top>
  <parent id="a">
    <child id="2386560994184" num="0"/>
    <child id="2386561245080" num="1"/>
    <child id="2386561265736" num="2"/>
  </parent>
  <parent id="b"/>
</top>

B:
<?xml version="1.0" ?>
<top>
  <parent id="a">
    <child id="2386560994184" num="0"/>
    <child id="2386561245080" num="1"/>
    <child id="2386561265736" num="2"/>
  </parent>
  <parent id="b">
    <child id="2386560994184" num="0"/>
    <child id="2386561245080" num="1"/>
    <child id="2386561265736" num="2"/>
  </parent>
</top>
从上面可以看出来，内存的id都是一样的

19、将创建完成的XML序列化到标准的输出流，显示出来

import sys
from xml.etree.ElementTree import (
    Element, SubElement, Comment, ElementTree,
)

from ElementTree_format import prettify

# Root element with a comment node attached.
root = Element('root')
comment = Comment('注释的功能')
root.append(comment)

# Create the children first, then fill in their text.
child = SubElement(root, 'child')
# NOTE(review): the tag below is descriptive text rather than
# 'child_with_tail'; it looks like a copy-paste slip, but the published
# output depends on it, so it is kept as is -- confirm before changing.
child_with_tail = SubElement(root, '这个是child的tail')
child_with_entity_ref = SubElement(root, 'child_with_entity_ref')
# An element that is never given text or children.
empty_child = SubElement(root, 'empty_child')

child.text = '这个是child的text'
child_with_tail.text = '这个是child_with_tail的text'
child_with_tail.tail = '这个是child_with_tail的tail'
child_with_entity_ref.text = '这个是child_with_entity_ref的text'

# Write the indented document to standard output without an extra newline.
pretty_xml = prettify(root)
sys.stdout.write(pretty_xml)

ElementTree_write.py

运行结果

<?xml version="1.0" ?>
<root>
  <!--注释的功能-->
  <child>这个是child的text</child>
  <这个是child的tail>这个是child_with_tail的text</这个是child的tail>
  这个是child_with_tail的tail
  <child_with_entity_ref>这个是child_with_entity_ref的text</child_with_entity_ref>
  <empty_child/>
</root>

20、将创建完成的XML序列化到标准的输出流，并且设置xml,html,text不同的方法，显示不一样的效果

import sys
from xml.etree.ElementTree import (
    Element, SubElement, ElementTree,
)

# 创建根节点
root = Element('root')

# 将child节点，增加到root节点
child = SubElement(root, 'child')

# 设置child节点的文本
child.text = 'Contains text.'

# 将empty_child，增加到root节点
empty_child = SubElement(root, 'empty_child')

for method in ['xml', 'html', 'text']:
    print(method)
    sys.stdout.flush()
    ElementTree(root).write(sys.stdout.buffer, method=method)
    print('\n')

ElementTree_write_method.py

运行结果

xml
<root><child>Contains text.</child><empty_child /></root>

html
<root><child>Contains text.</child><empty_child></empty_child></root>

text
Contains text.

posted @ 2020-04-21 14:57 小粉优化大师阅读(1528) 评论(0) 收藏举报

刷新页面返回顶部

小粉优化大师

闻道有先后，术业有专攻 -《师说》

Python之xml.etree.ElementTree模块的使用

公告