通过 XSD Schema 验证 XML 文档是否合法
"""Demonstrate validating an XML document against an XSD schema with lxml.

The schema is loaded from SCHEMA_URL, the inline sample document below is
parsed and validated against it, and when validation fails the fields of
the last entry in the schema's error log are printed.

The code uses only constructs that are valid on both Python 2.7 and
Python 3 (single-argument ``print(...)``, ``except ... as``, ``io.BytesIO``).
"""
import sys  # not used by this listing; kept from the original imports
from io import BytesIO

import lxml  # not used directly; kept from the original imports
from lxml import etree

# Location of the XSD the sample document is validated against.
SCHEMA_URL = 'http://xmlgw.companieshouse.gov.uk/v2-1/schema/CompanyData-v2-2.xsd'


def _print_error_details(error):
    """Print every field of one lxml error-log entry, one field per line.

    ``error`` is an entry taken from an lxml error log (here
    ``schema.error_log.last_error``); the values originate from libxml2.
    ``%s`` formatting is used so that integer fields need no explicit
    ``str()`` call.
    """
    print('domain_name: %s' % error.domain_name)  # name of the error domain
    print('domain: %s' % error.domain)            # numeric domain id
    print('filename: %s' % error.filename)        # '<string>' for in-memory input
    print('level: %s' % error.level)              # numeric severity level
    print('level_name: %s' % error.level_name)    # name of the severity level
    print('line: %s' % error.line)                # line number of the problem
    print('message: %s' % error.message)          # human-readable description
    print('type: %s' % error.type)                # numeric error type id
    print('type_name: %s' % error.type_name)      # name of the error type


# Construct XML relevant to the XML schema we're validating against. By
# altering the string, adding/removing elements we can force different errors
# to occur when validating.
# NOTE(review): the trailing "x" in <MadeUpDate> looks like a deliberate way
# to trigger a validation failure - confirm before "fixing" the sample.
# The document is kept as bytes so the parser input is identical on Python 2
# and Python 3.
xml = BytesIO(b'''
<CompanyDataRequest xmlns="http://xmlgw.companieshouse.gov.uk" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://xmlgw.companieshouse.gov.uk http://xmlgw.companieshouse.gov.uk/v2-1/schema/CompanyData-v2-2.xsd">
    <CompanyNumber>06937730</CompanyNumber>
    <CompanyAuthenticationCode>123456</CompanyAuthenticationCode>
    <MadeUpDate>2010-06-30x</MadeUpDate>
</CompanyDataRequest>
''')

# Clear any previous errors
etree.clear_error_log()

try:
    # Get the XML schema to validate against
    schema = etree.XMLSchema(file=SCHEMA_URL)
    # Parse the in-memory XML document
    xml_doc = etree.parse(xml)
    # Validate parsed XML against schema; raises DocumentInvalid on failure
    schema.assertValid(xml_doc)
    # Boolean form of the same check, shown for comparison; this line is only
    # reached when assertValid() above did not raise.
    print('schema.validate() returns "%s".' % schema.validate(xml_doc))

except etree.XMLSchemaParseError as xspe:
    # Something wrong with the schema (getting from URL/parsing)
    print("XMLSchemaParseError occurred!")
    print(xspe)

except etree.XMLSyntaxError as xse:
    # XML not well formed
    print("XMLSyntaxError occurred!")
    print(xse)

except etree.DocumentInvalid:
    # XML failed to validate against schema. `schema` is always bound here,
    # because only assertValid() raises DocumentInvalid.
    print("DocumentInvalid occurred!")

    error = schema.error_log.last_error
    if error is not None:
        # All the error properties (from libxml2) describing what went wrong
        _print_error_details(error)
封装成类的版本
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Eric.yue
"""Check that an XML file is well formed and valid against an XSD schema.

The code uses only constructs that are valid on both Python 2.7 and
Python 3 (single-argument ``print(...)``, ``except ... as``, ``io.BytesIO``).
"""

import os
from io import BytesIO

import chardet  # no longer used by run(); kept from the original imports
import lxml.etree as ET


def _to_bytes(data):
    """Return ``data`` as bytes: byte strings pass through, text becomes UTF-8."""
    if isinstance(data, bytes):
        return data
    return data.encode('utf-8')


def _print_error_details(error):
    """Print every field of one lxml error-log entry, one field per line.

    ``%s`` formatting is used so that integer fields need no explicit
    ``str()`` call.
    """
    print('domain_name: %s' % error.domain_name)  # name of the error domain
    print('domain: %s' % error.domain)            # numeric domain id
    print('filename: %s' % error.filename)        # '<string>' for in-memory input
    print('level: %s' % error.level)              # numeric severity level
    print('level_name: %s' % error.level_name)    # name of the severity level
    print('line: %s' % error.line)                # line number of the problem
    print('message: %s' % error.message)          # human-readable description
    print('type: %s' % error.type)                # numeric error type id
    print('type_name: %s' % error.type_name)      # name of the error type


class R3xmlCheck(object):
    """Validate one XML file: first well-formedness, then XSD conformance."""

    # Schema used by get_xsd(). The original note on get_xsd() ("matching
    # schemaLocation url") suggests the schema was meant to be selected from
    # the document's schemaLocation; it is currently fixed.
    XSD_PATH = "./xsd/multicacheschemas/MCCI_IN200100UV01.xsd"

    def __init__(self, element_xml):
        """Remember the path of the XML file to check.

        Args:
            element_xml: Filesystem path of the XML document.
        """
        self.elem_xml = element_xml

    def validate_xsd_xml(self, f_xml, elem_xsd, xsd_base_url=None):
        """Validate XML content against XSD content and print the outcome.

        Args:
            f_xml: The XML document, as bytes or text.
            elem_xsd: The XSD schema document, as bytes or text.
            xsd_base_url: Optional location to associate with the in-memory
                schema, so that relative references inside it are resolved
                against that location.

        Returns:
            True when the document is valid. False when the schema could not
            be compiled, either document is not well formed, or the document
            does not conform to the schema; details are printed.
        """
        try:
            # Feed bytes to the parser so an encoding declaration inside the
            # documents is honoured on both Python 2 and Python 3.
            xsd_doc = BytesIO(_to_bytes(elem_xsd))
            xml_doc = BytesIO(_to_bytes(f_xml))
            xmlschema_doc = ET.parse(xsd_doc, base_url=xsd_base_url)
            xmlschema = ET.XMLSchema(xmlschema_doc)
            xml = ET.parse(xml_doc)
            # Raises DocumentInvalid on failure
            xmlschema.assertValid(xml)
            # Only reached when assertValid() above did not raise.
            print('schema.validate() returns "%s".' % xmlschema.validate(xml))
            return True

        except ET.XMLSchemaParseError as xspe:
            # Something wrong with the schema (getting from URL/parsing)
            print("XMLSchemaParseError occurred!")
            print(xspe)

        except ET.XMLSyntaxError as xse:
            # XML not well formed
            print("XMLSyntaxError occurred!")
            print(xse)

        except ET.DocumentInvalid:
            # XML failed to validate against schema. `xmlschema` is always
            # bound here, because only assertValid() raises DocumentInvalid.
            print("DocumentInvalid occurred!")

            error = xmlschema.error_log.last_error
            if error is not None:
                # All the error properties (from libxml2) describing what went wrong
                _print_error_details(error)

        return False

    def run(self):
        """Run the well-formedness check, then the schema validation.

        Returns:
            The failure description from validate_xml() when the file is
            missing or cannot be parsed; otherwise None, with the schema
            validation outcome printed by validate_xsd_xml().
        """
        res = self.validate_xml(self.elem_xml)
        if res["result"] is not True:
            return res["info"]

        elem_xsd = self.get_xsd()

        # Read raw bytes and let lxml work out the encoding; the previous
        # re-encoding of ASCII content did not change the data.
        with open(self.elem_xml, 'rb') as f:
            f_xml = f.read()
        self.validate_xsd_xml(
            f_xml.strip(),
            elem_xsd,
            xsd_base_url=os.path.abspath(self.XSD_PATH),
        )

    def get_xsd(self):
        """Return the raw content of the schema file at XSD_PATH.

        The file is read in binary mode so non-ASCII schema content is
        passed to the parser untouched.
        """
        with open(self.XSD_PATH, 'rb') as f:
            elem_xsd = f.read()
        return elem_xsd

    def validate_xml(self, exml):
        """Check that ``exml`` exists and parses as XML.

        Args:
            exml: Filesystem path of the XML document.

        Returns:
            A dict that always has the key 'result' (True or False) and, when
            'result' is False, the key 'info' with a description of the
            failure.
        """
        rinfo = {}
        if not os.path.exists(exml):
            # Report a missing file explicitly so callers can always rely on
            # the 'result' key being present.
            rinfo['result'] = False
            rinfo['info'] = 'File not found:{0}'.format(exml)
            return rinfo

        try:
            ET.parse(exml)
            rinfo['result'] = True
        except Exception as err:
            # Deliberately broad: any failure to load the file is reported
            # through the result dict rather than raised.
            rinfo['result'] = False
            rinfo['info'] = 'Parsing error info:{0}'.format(err)
        return rinfo


if __name__ == "__main__":
    checker = R3xmlCheck("./xsd/aa.xml")
    failure = checker.run()
    if failure:
        # Surface the well-formedness failure instead of dropping it.
        print(failure)