xml文件_解析示例_python实现

注:文中代码的目的是解析 xml 文件的内容,并输出其中的数据。

最原始的目的是为了统计数据集中图片标签的类别数。

xml文件内容展示

<annotation>
  <folder>images</folder>
  <filename>Czech_000022.jpg</filename>
  <size>
    <depth>3</depth>
    <width>600</width>
    <height>600</height>
  </size>
  <object>
    <name>D00</name>
    <bndbox>
      <xmin>182</xmin>
      <ymin>471</ymin>
      <xmax>229</xmax>
      <ymax>512</ymax>
    </bndbox>
  </object>
</annotation>

 读取并解析 xml 文件的 python 代码(代码不具有通用性,但是一种可实现的方式)

 1 import os
 2 import xml.etree.ElementTree as ET
 3 
 4 def fun(xml_root_path, txt_save_path):
 5     xml_files = os.listdir(xml_root_path)
 6     print(type(xml_files), len(xml_files), xml_files[0])
 7 
 8     for xml_file in xml_files:
 9         xml_path = os.path.join(xml_root_path, xml_file)
10         print('xml路径:', xml_path)
11 
12         ### 解析xml
13         tree  = ET.parse(xml_path)
14         root = tree.getroot()        #获取根结点
15         print(type(root), len(root), root, '\n')
16 
17         # 子标签:文件夹名称
18         folder_name = root[0].text
19         print('folder_name: \t', folder_name)
20 
21         # 子标签:文件名称
22         file_name = root[1].text
23         print('file_name: \t', file_name)
24 
25         # 子标签: 图像尺寸
26         size_name = root[2]
27         # print( len(size_name) )     # 3
28         size_depth = size_name[0].text
29         print('size_depth: \t', size_depth)
30         size_width = size_name[1].text
31         print('size_width: \t', size_width)
32         size_height = size_name[2].text
33         print('size_height: \t', size_height)
34 
35         # 子标签: 标记框信息
36         object_name = root[3]
37         # print(len(object_name))     # 2
38         label_name = object_name[0].text
39         print('label_name:\t', label_name)  ####### 标签名称,主要统计的是这个
40         # bbox
41         bndbox = object_name[1]
42         # print(len(bndbox))     # 4
43         bbox = [bndbox[0].text, bndbox[1].text, bndbox[2].text, bndbox[3].text ]
44         print(bbox)
45 
46 
47 temp_path = '../data/JanpanRoad'
48 txt_save_path = './temp.txt'
49 
50 fun(temp_path, txt_save_path)

输出结果

 1 <class 'list'> 1 Czech_000022.xml
 2 xml路径: ../data/JanpanRoad\Czech_000022.xml
 3 <class 'xml.etree.ElementTree.Element'> 4 <Element 'annotation' at 0x00000281DA735368> 
 4 
 5 folder_name:      images
 6 file_name:      Czech_000022.jpg
 7 size_depth:      3
 8 size_width:      600
 9 size_height:      600
10 label_name:     D00
11 ['182', '471', '229', '512']

 

较为通用的代码:

 1 import os
 2 import xml.dom.minidom
 3 
 4 def xml_label_names(xml_root_path):
 5     xml_files = os.listdir(xml_root_path)
 6     for xml_file in xml_files:
 7         xml_path = os.path.join(xml_root_path, xml_file)
 8         # 打开xml文档
 9         DOMTree = xml.dom.minidom.parse(xml_path)
10         # 得到文档元素对象
11         collection = DOMTree.documentElement
12         ### 获取文件名
13         filenamelist = collection.getElementsByTagName("filename")
14         filename = filenamelist[0].childNodes[0].data
15         print('\n', len(filenamelist), filename)
16         ### 得到标签名为object的信息
17         objectlist = collection.getElementsByTagName("object")
18         for objects in objectlist:
19             ### 每个 object 中得到子标签名为 name 的信息
20             namelist = objects.getElementsByTagName('name')
21             ### 获得标记框的标签名
22             objectname = namelist[0].childNodes[0].data
23             # print('类别名为: ', objectname)       ########### 索要统计的信息
24 
25             bndbox = objects.getElementsByTagName('bndbox')
26             for box in bndbox:
27                 x1_list = box.getElementsByTagName('xmin')
28                 x1 = int(x1_list[0].childNodes[0].data)
29                 y1_list = box.getElementsByTagName('ymin')
30                 y1 = int(y1_list[0].childNodes[0].data)
31                 x2_list = box.getElementsByTagName('xmax')  # 注意坐标,看是否需要转换
32                 x2 = int(x2_list[0].childNodes[0].data)
33                 y2_list = box.getElementsByTagName('ymax')
34                 y2 = int(y2_list[0].childNodes[0].data)
35                 bbox = [x1, y1, x2, y2]
36                 print('文件名:', filename, ' ,标签名:',  objectname, ' ,标记框:' , bbox)
37                 
38 xml_root_path = '../data/JanpanRoad'
39 xml_label_names(xml_root_path)

输出结果:

1  1 Czech_000022.jpg
2 文件名: Czech_000022.jpg  ,标签名: D00  ,标记框: [182, 471, 229, 512]
3 
4  1 Czech_000031.jpg
5 文件名: Czech_000031.jpg  ,标签名: D20  ,标记框: [273, 442, 343, 488]
6 文件名: Czech_000031.jpg  ,标签名: D20  ,标记框: [262, 395, 307, 439]
7 文件名: Czech_000031.jpg  ,标签名: D00  ,标记框: [155, 396, 194, 437]

 

posted @ 2021-12-08 15:04  Bro_Li  阅读(161)  评论(0)  编辑  收藏  举报