通过xsd schema结构来验证xml是否合法

 1 import sys
 2 import StringIO
 3 import lxml
 4 
 5 from lxml import etree
 6 from StringIO import StringIO
 7 
 8 # Construct XML relevant to the XML schema we're validating against. By altering the string, adding/removing elements
 9 # we can force different errors to occur when validating.
10 xml = StringIO('''
11 <CompanyDataRequest xmlns="http://xmlgw.companieshouse.gov.uk" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://xmlgw.companieshouse.gov.uk http://xmlgw.companieshouse.gov.uk/v2-1/schema/CompanyData-v2-2.xsd">
12     <CompanyNumber>06937730</CompanyNumber>
13     <CompanyAuthenticationCode>123456</CompanyAuthenticationCode>
14     <MadeUpDate>2010-06-30x</MadeUpDate>
15 </CompanyDataRequest>
16 ''')
17 
18 # Clear any previous errors
19 lxml.etree.clear_error_log()
20 
21 try:
22     # Get the XML schema to validate against
23     schema = lxml.etree.XMLSchema(file = 'http://xmlgw.companieshouse.gov.uk/v2-1/schema/CompanyData-v2-2.xsd')
24     # Parse string of XML
25     xml_doc = lxml.etree.parse(xml)
26     # Validate parsed XML against schema returning a readable message on failure
27     schema.assertValid(xml_doc)
28     # Validate parsed XML against schema returning boolean value indicating success/failure
29     print 'schema.validate() returns "%s".' % schema.validate(xml_doc)
30 
31 except lxml.etree.XMLSchemaParseError, xspe:
32     # Something wrong with the schema (getting from URL/parsing)
33     print "XMLSchemaParseError occurred!"
34     print xspe
35 
36 except lxml.etree.XMLSyntaxError, xse:
37     # XML not well formed
38     print "XMLSyntaxError occurred!"
39     print xse
40     
41 except lxml.etree.DocumentInvalid, di:
42     # XML failed to validate against schema
43     print "DocumentInvalid occurred!"
44 
45     error = schema.error_log.last_error
46     if error:
47         # All the error properties (from libxml2) describing what went wrong
48         print 'domain_name: ' + error.domain_name
49         print 'domain: ' + str(error.domain)
50         print 'filename: ' + error.filename # '<string>' cos var is a string of xml
51         print 'level: ' + str(error.level)
52         print 'level_name: ' + error.level_name # an integer
53         print 'line: ' + str(error.line) # a unicode string that identifies the line where the error occurred.
54         print 'message: ' + error.message # a unicode string that lists the message.
55         print 'type: ' + str(error.type) # an integer
56         print 'type_name: ' + error.type_name

封装类

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 # Author:Eric.yue
 4 
 5 import os
 6 import lxml.etree as ET
 7 from StringIO import StringIO
 8 import chardet
 9 
10 
class R3xmlCheck(object):
    """Check an XML file for well-formedness and validate it against an XSD.

    The outcome of the schema validation is printed to stdout; only a failed
    well-formedness check is reported through the return value of ``run``.
    """

    # Schema used when the constructor is not given an explicit ``xsd_path``.
    xsd_path = "./xsd/multicacheschemas/MCCI_IN200100UV01.xsd"

    # Attributes of an lxml error-log entry that are printed for a schema
    # violation, in output order.
    _ERROR_FIELDS = ('domain_name', 'domain', 'filename', 'level',
                     'level_name', 'line', 'message', 'type', 'type_name')

    def __init__(self, element_xml, xsd_path=None):
        """Remember which files to work on.

        :param element_xml: path of the XML file to check.
        :param xsd_path: optional path of the XSD file; the class-level
            default is used when omitted.
        """
        self.elem_xml = element_xml
        if xsd_path is not None:
            self.xsd_path = xsd_path

    def validate_xsd_xml(self, f_xml, elem_xsd):
        """Validate XML content against XSD content and print the outcome.

        :param f_xml: content of the XML document.
        :param elem_xsd: content of the XSD schema (text or bytes).

        Schema parse errors, XML syntax errors and schema violations are
        caught and reported on stdout; nothing is returned.
        """
        # Only text needs encoding. Calling .encode() on a byte string makes
        # Python 2 decode it as ASCII first, which fails for non-ASCII schemas.
        if not isinstance(elem_xsd, bytes):
            elem_xsd = elem_xsd.encode('utf-8')

        try:
            xmlschema_doc = ET.parse(StringIO(elem_xsd))
            xmlschema = ET.XMLSchema(xmlschema_doc)
            xml = ET.parse(StringIO(f_xml))
            # Raises DocumentInvalid with a readable message on failure.
            xmlschema.assertValid(xml)
            print('schema.validate() returns "%s".' % xmlschema.validate(xml))

        except ET.XMLSchemaParseError as xspe:
            # Something wrong with the schema itself
            print("XMLSchemaParseError occurred!")
            print(xspe)

        except ET.XMLSyntaxError as xse:
            # XML (or the XSD document) not well formed
            print("XMLSyntaxError occurred!")
            print(xse)

        except ET.DocumentInvalid:
            # XML failed to validate against schema
            print("DocumentInvalid occurred!")
            self._print_schema_error(xmlschema.error_log.last_error)

    def _print_schema_error(self, error):
        """Print every property of one error-log entry, or nothing if falsy."""
        if not error:
            return
        # %-formatting keeps this safe when a field (e.g. filename) is None.
        for field in self._ERROR_FIELDS:
            print('%s: %s' % (field, getattr(error, field)))

    def run(self):
        """Run the well-formedness check, then the schema validation.

        :returns: the failure description when the XML file is missing or
            cannot be parsed; otherwise ``None`` (the schema validation
            result is printed by ``validate_xsd_xml``).
        """
        res = self.validate_xml(self.elem_xml)
        if res["result"] is not True:
            return res["info"]

        elem_xsd = self.get_xsd()

        # Read raw bytes so the parser can apply the document's own encoding
        # declaration. ASCII input is already valid UTF-8, so no re-encoding
        # step is needed before parsing.
        with open(self.elem_xml, "rb") as f:
            f_xml = f.read()

        self.validate_xsd_xml(f_xml.strip(), elem_xsd)

    def get_xsd(self):
        """Return the content of the XSD file at ``self.xsd_path``.

        NOTE(review): the schema is handed to the parser as content rather
        than as a path, so relative xs:include/xs:import locations inside it
        presumably resolve against the working directory -- confirm.
        """
        with open(self.xsd_path, "rb") as f:
            return f.read()

    def validate_xml(self, exml):
        """Check that ``exml`` exists and parses as XML.

        :param exml: path of the XML file.
        :returns: ``{'result': True}`` on success, otherwise
            ``{'result': False, 'info': <description>}``.
        """
        if not os.path.exists(exml):
            # Always return a complete result so callers can read
            # 'result' and 'info' without a KeyError.
            return {'result': False,
                    'info': 'File not found: {0}'.format(exml)}
        try:
            ET.parse(exml)
        except Exception as err:
            # Deliberately broad: any failure is turned into a result dict.
            return {'result': False,
                    'info': 'Parsing error info:{0}'.format(err)}
        return {'result': True}
83 
if __name__ == "__main__":
    # Validate the sample document; run() returns a description only when
    # the well-formedness check fails, so surface it instead of dropping it.
    checker = R3xmlCheck("./xsd/aa.xml")
    failure = checker.run()
    if failure:
        print(failure)

 

posted @ 2018-11-09 11:05  北京流浪儿  阅读(1095)  评论(0编辑  收藏  举报