python读xml写到txt
读取xml信息,写到txt中。
下面这个版本在当前路径下执行,只处理当前路径中的 xml 文件(不会进入子目录)。
# -*- coding: utf-8 -*-
import os
import xml.dom.minidom

# (XML attribute name, label used in the console output), in output column order.
_LINE_FIELDS = (
    ("ptLTX", "LTX"),
    ("ptLTY", "LTY"),
    ("ptLBX", "LBX"),
    ("ptLBY", "LBY"),
    ("ptRTX", "RTX"),
    ("ptRTY", "RTY"),
    ("ptRBX", "RBX"),
    ("ptRBY", "RBY"),
    ("Chars", "Chars"),
)


def _read_line_fields(lineinfo):
    """Return the nine output fields of one ``LineInfo`` element, in column order.

    A missing attribute yields an empty string, so every element is read
    independently of the ones before it.
    """
    fields = []
    for attr, label in _LINE_FIELDS:
        # minidom's getAttribute() returns "" when the attribute is absent.
        value = lineinfo.getAttribute(attr)
        if attr == "Chars":
            # A newline at either end would break the one-record-per-line output.
            value = value.strip('\n')
        if lineinfo.hasAttribute(attr):
            print(label + ":", value)
        fields.append(value)
    return fields


def extract_xml_to_txt(srcdir, dstdir):
    """Convert every ``*.xml`` file directly inside *srcdir* to a ``.txt`` file in *dstdir*.

    Each ``LineInfo`` element becomes one output line of the form
    ``ptLTX,ptLTY,ptLBX,ptLBY,ptRTX,ptRTY,ptRBX,ptRBY,Chars``.  The output
    file has the same base name as the source file and is written as UTF-8.
    Sub-directories of *srcdir* are not visited.

    Args:
        srcdir: Directory that is scanned for ``.xml`` files.
        dstdir: Existing directory that receives the ``.txt`` files.

    Raises:
        xml.parsers.expat.ExpatError: If a source file is not well-formed XML.
        OSError: If a source file cannot be read or an output file cannot be
            written.
    """
    for name in os.listdir(srcdir):
        srcfile = os.path.join(srcdir, name)
        # Require a real ".xml" extension: a looser match would let the output
        # name equal the source name and overwrite the input file.
        if not name.endswith('.xml') or not os.path.isfile(srcfile):
            continue

        dstfile = os.path.join(dstdir, os.path.splitext(name)[0] + '.txt')
        print("processing file", dstfile)

        # Parse before opening the output so a malformed XML file does not
        # leave an empty .txt behind.
        content = xml.dom.minidom.parse(srcfile).documentElement
        print(content)

        with open(dstfile, 'w', encoding='utf-8') as out:
            for lineinfo in content.getElementsByTagName('LineInfo'):
                out.write(','.join(_read_line_fields(lineinfo)) + '\n')


if __name__ == '__main__':
    src_directory = os.getcwd()
    dst_directory = os.getcwd()
    extract_xml_to_txt(src_directory, dst_directory)
下面这个版本会递归处理当前路径及其所有子目录下的 xml 文件。
# -*- coding: utf-8 -*-
import os
import xml.dom.minidom

# (XML attribute name, label used in the console output), in output column order.
_LINE_FIELDS = (
    ("ptLTX", "LTX"),
    ("ptLTY", "LTY"),
    ("ptLBX", "LBX"),
    ("ptLBY", "LBY"),
    ("ptRTX", "RTX"),
    ("ptRTY", "RTY"),
    ("ptRBX", "RBX"),
    ("ptRBY", "RBY"),
    ("Chars", "Chars"),
)


def getFiles(path, suffix):
    """Return the paths of all files below *path* whose name ends with *suffix*.

    The directory tree is walked recursively with ``os.walk``; every returned
    path starts with *path*.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if name.endswith(suffix)
    ]


def _read_line_fields(lineinfo):
    """Return the nine output fields of one ``LineInfo`` element, in column order.

    A missing attribute yields an empty string, so every element is read
    independently of the ones before it.
    """
    fields = []
    for attr, label in _LINE_FIELDS:
        # minidom's getAttribute() returns "" when the attribute is absent.
        value = lineinfo.getAttribute(attr)
        if attr == "Chars":
            # A newline at either end would break the one-record-per-line output.
            value = value.strip('\n')
        if lineinfo.hasAttribute(attr):
            print(label + ":", value)
        fields.append(value)
    return fields


def extract_xml_to_txt(srcdir, dstdir):
    """Convert every ``*.xml`` file below *srcdir* (recursively) to a ``.txt`` file.

    The directory layout below *srcdir* is mirrored below *dstdir*; missing
    output directories are created.  When both arguments name the same
    directory, each ``.txt`` file ends up next to its ``.xml`` source.

    Each ``LineInfo`` element becomes one UTF-8 output line of the form
    ``ptLTX,ptLTY,ptLBX,ptLBY,ptRTX,ptRTY,ptRBX,ptRBY,Chars``.  Elements whose
    ``Chars`` is empty or whose ``ptRBY`` equals ``-1`` are skipped.

    Args:
        srcdir: Root directory that is searched for ``.xml`` files.
        dstdir: Root directory that receives the ``.txt`` files.

    Raises:
        xml.parsers.expat.ExpatError: If a source file is not well-formed XML.
        OSError: If a source file cannot be read or an output file cannot be
            written.
    """
    for srcfile in getFiles(srcdir, '.xml'):
        # getFiles() already returns paths that include srcdir, so derive the
        # destination from the part relative to srcdir instead of joining the
        # full path onto dstdir.
        relative = os.path.relpath(srcfile, srcdir)
        dstfile = os.path.join(dstdir, os.path.splitext(relative)[0] + '.txt')
        print("processing file", dstfile)

        # Parse before opening the output so a malformed XML file does not
        # leave an empty .txt behind.
        content = xml.dom.minidom.parse(srcfile).documentElement
        print(content)

        dst_parent = os.path.dirname(dstfile)
        if dst_parent:
            os.makedirs(dst_parent, exist_ok=True)

        with open(dstfile, 'w', encoding='utf-8') as out:
            for lineinfo in content.getElementsByTagName('LineInfo'):
                fields = _read_line_fields(lineinfo)
                *_, rby, chars = fields
                if chars == "" or rby == '-1':
                    continue
                out.write(','.join(fields) + '\n')


if __name__ == '__main__':
    src_directory = os.getcwd()
    dst_directory = os.getcwd()
    extract_xml_to_txt(src_directory, dst_directory)