1 | #! /usr/bin/env python |
---|
2 | '''XML Canonicalization |
---|
3 | |
---|
4 | Patches Applied to xml.dom.ext.c14n: |
---|
5 | http://sourceforge.net/projects/pyxml/ |
---|
6 | |
---|
7 | [ 1444526 ] c14n.py: http://www.w3.org/TR/xml-exc-c14n/ fix |
---|
8 | -- includes [ 829905 ] c14n.py fix for bug #825115, |
---|
9 | Date Submitted: 2003-10-24 23:43 |
---|
10 | -- include dependent namespace declarations declared in ancestor nodes |
---|
11 | (checking attributes and tags), |
---|
12 | -- handle InclusiveNamespaces PrefixList parameter |
---|
13 | |
---|
14 | This module generates canonical XML of a document or element. |
---|
15 | http://www.w3.org/TR/2001/REC-xml-c14n-20010315 |
---|
16 | and includes a prototype of exclusive canonicalization |
---|
17 | http://www.w3.org/Signature/Drafts/xml-exc-c14n |
---|
18 | |
---|
19 | Requires PyXML 0.7.0 or later. |
---|
20 | |
---|
21 | Known issues if using Ft.Lib.pDomlette: |
---|
22 | 1. Unicode |
---|
23 | 2. does not white space normalize attributes of type NMTOKEN and ID? |
---|
24 | 3. seems to include "\n" after importing external entities? |
---|
25 | |
---|
26 | Note, this version processes a DOM tree, and consequently it processes |
---|
27 | namespace nodes as attributes, not from a node's namespace axis. This |
---|
28 | permits simple document and element canonicalization without |
---|
29 | XPath. When XPath is used, the XPath result node list is passed and used to |
---|
30 | determine if the node is in the XPath result list, but little else. |
---|
31 | |
---|
32 | Authors: |
---|
33 | "Joseph M. Reagle Jr." <reagle@w3.org> |
---|
34 | "Rich Salz" <rsalz@zolera.com> |
---|
35 | |
---|
36 | $Date: 2006-03-30 23:47:16 +0000 (Thu, 30 Mar 2006) $ by $Author: boverhof $ |
---|
37 | ''' |
---|
38 | |
---|
39 | _copyright = '''Copyright 2001, Zolera Systems Inc. All Rights Reserved. |
---|
40 | Copyright 2001, MIT. All Rights Reserved. |
---|
41 | |
---|
42 | Distributed under the terms of: |
---|
43 | Python 2.0 License or later. |
---|
44 | http://www.python.org/2.0.1/license.html |
---|
45 | or |
---|
46 | W3C Software License |
---|
47 | http://www.w3.org/Consortium/Legal/copyright-software-19980720 |
---|
48 | ''' |
---|
49 | |
---|
50 | import string |
---|
51 | from xml.dom import Node |
---|
52 | try: |
---|
53 | from xml.ns import XMLNS |
---|
54 | except: |
---|
55 | class XMLNS: |
---|
56 | BASE = "http://www.w3.org/2000/xmlns/" |
---|
57 | XML = "http://www.w3.org/XML/1998/namespace" |
---|
58 | try: |
---|
59 | import cStringIO |
---|
60 | StringIO = cStringIO |
---|
61 | except ImportError: |
---|
62 | import StringIO |
---|
63 | |
---|
def _attrs(E):
    '''_attrs(E) -> sequence
    Return the attribute nodes of E, or an empty list when E has no
    attribute map or the map is empty.'''
    attributes = E.attributes
    if not attributes:
        return []
    return attributes.values() or []


def _children(E):
    '''_children(E) -> sequence
    Return the child nodes of E, or an empty list when it has none.'''
    return E.childNodes or []


def _IN_XML_NS(n):
    '''_IN_XML_NS(n) -> boolean
    Return true if the name of attribute n starts with "xmlns", i.e. it
    is written as a namespace declaration.'''
    return n.name.startswith("xmlns")


def _inclusive(n):
    '''_inclusive(n) -> boolean
    Return True if n.unsuppressedPrefixes is None, which is how this
    module marks inclusive (as opposed to exclusive) canonicalization.'''
    # Identity test: a prefix list must never be able to compare equal to None.
    return n.unsuppressedPrefixes is None


# Position of the node being processed relative to the document element:
# before it, at/inside it, or after it.  Comments and PIs use this to
# decide where the separating newline is written.
_LesserElement, _Element, _GreaterElement = range(3)
---|
73 | |
---|
def _sorter(n1, n2):
    '''_sorter(n1,n2) -> int
    Comparison function for ordinary (non-namespace) attribute nodes:
    order by namespace URI first and by local name second.'''

    # A non-zero namespace comparison decides the order; only a tie (0)
    # falls through to the local-name comparison.
    return (cmp(n1.namespaceURI, n2.namespaceURI)
            or cmp(n1.localName, n2.localName))
---|
81 | |
---|
82 | |
---|
def _sorter_ns(n1, n2):
    '''_sorter_ns((n,v),(n,v)) -> int
    Comparison function for (name, value) namespace declarations.  The
    default declaration "xmlns" sorts before every prefixed one ("an
    empty namespace URI is lexicographically least"); the rest are
    ordered by declaration name.'''

    first = n1[0]
    if first == 'xmlns':
        return -1
    second = n2[0]
    if second == 'xmlns':
        return 1
    return cmp(first, second)
---|
90 | |
---|
def _utilized(n, node, other_attrs, unsuppressedPrefixes):
    '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean
    Return 1 if the namespace declared by n is utilized by the node,
    else 0.

    n -- declaration name ("xmlns" or "xmlns:prefix"); the leading
         "xmlns"/"xmlns:" is stripped before comparing prefixes.
    node -- the element being rendered.
    other_attrs -- the attribute nodes that will be rendered on node.
    unsuppressedPrefixes -- prefix list for exclusive C14N, or None.
         None is accepted and simply skips the exclusive-only checks.
    '''
    if n.startswith('xmlns:'):
        n = n[6:]
    elif n.startswith('xmlns'):
        n = n[5:]

    # Used by the element itself (explicit prefix or default namespace).
    if n == node.prefix:
        return 1
    if n == "" and node.prefix in ["#default", None]:
        return 1

    # Explicitly requested through the InclusiveNamespaces prefix list.
    # Guarded so that a None list does not raise TypeError on "in".
    if unsuppressedPrefixes is not None and n in unsuppressedPrefixes:
        return 1

    # Used by one of the attributes that will be rendered.
    for attr in other_attrs:
        if n == attr.prefix:
            return 1

    # For exclusive need to look at attributes
    if unsuppressedPrefixes is not None:
        for attr in _attrs(node):
            if n == attr.prefix:
                return 1

    return 0
---|
109 | |
---|
110 | |
---|
def _inclusiveNamespacePrefixes(node, context, unsuppressedPrefixes):
    '''http://www.w3.org/TR/xml-exc-c14n/
    Split the inherited declarations in context according to the
    InclusiveNamespaces PrefixList parameter (unsuppressedPrefixes).

    Returns a pair (inclusive, unused):
      inclusive -- list of the context attributes that are named in the
          prefix list or that declare a prefix used by node or by one of
          its prefixed, non-xmlns attributes;
      unused -- dictionary {declaration name: value} of the remaining
          "xmlns:prefix" declarations.
    '''
    # Declaration names needed by the element tag and by its attributes.
    if node.prefix:
        needed = ['xmlns:%s' % node.prefix]
    else:
        needed = ['xmlns']
    needed.extend(['xmlns:%s' % a.prefix
                   for a in _attrs(node)
                   if not a.nodeName.startswith('xmlns') and a.prefix])

    kept = []
    leftover = {}
    for decl in context:
        name = decl.nodeName
        wanted = (name in unsuppressedPrefixes
                  or (name.startswith('xmlns:')
                      and name[6:] in unsuppressedPrefixes)
                  or (name.startswith('xmlns')
                      and name[5:] in unsuppressedPrefixes)
                  or name in needed)
        if wanted:
            kept.append(decl)
        elif name.startswith('xmlns:'):
            # Remember it so that a descendant can still pick it up.
            leftover[name] = decl.value

    return kept, leftover
---|
140 | |
---|
def _in_subset(subset, node):
    '''_in_subset(subset, node) -> boolean
    Return True if node is to be rendered: either no subset was given
    (subset is None) or node is a member of subset.

    An earlier version used "not subset", which also treated an empty
    subset as "everything"; only None means that now (rich's tweak).'''
    if subset is None:
        return True
    return node in subset
---|
143 | |
---|
144 | |
---|
class _implementation:
    '''Implementation class for C14N.

    An instance accompanies a node during its processing and holds the
    canonicalization parameters and the processing state.  Constructing
    an instance runs the canonicalization; all output is emitted through
    the write callable given to the constructor.'''

    # Handler for each node type; populated while the class body executes.
    handlers = {}

    def __init__(self, node, write, **kw):
        '''Create and run the implementation.

        node -- document, element or document-type node to canonicalize
            (a document-type node produces no output).
        write -- callable invoked with each piece of output text.
        Keyword parameters:
          subset -- collection of nodes to render; None (the default)
              renders every node.
          comments -- render comments if true (default is 0).
          unsuppressedPrefixes -- if not None, exclusive C14N is done
              and this is the list of prefixes handled inclusively.
          nsdict -- namespace declarations assumed to exist in the
              surrounding context.

        Raises TypeError for any other node type.'''
        self.write = write
        self.subset = kw.get('subset')
        self.comments = kw.get('comments', 0)
        self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes')
        nsdict = kw.get('nsdict', {'xml': XMLNS.XML, 'xmlns': XMLNS.BASE})

        # Processing state: (NS declarations in scope, NS declarations
        # already rendered, inherited xml: attributes, NS declarations
        # seen but not rendered -- used for exclusive C14N).
        self.state = (nsdict, {'xml': ''}, {}, {})

        if node.nodeType == Node.DOCUMENT_NODE:
            self._do_document(node)
        elif node.nodeType == Node.ELEMENT_NODE:
            self.documentOrder = _Element        # At document element
            if not _inclusive(self):
                inherited, unused = _inclusiveNamespacePrefixes(
                    node, self._inherit_context(node),
                    self.unsuppressedPrefixes)
                self._do_element(node, inherited, unused=unused)
            else:
                inherited = self._inherit_context(node)
                self._do_element(node, inherited)
        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
            pass
        else:
            raise TypeError(str(node))


    def _inherit_context(self, node):
        '''_inherit_context(self, node) -> list
        Scan the element ancestors of node and return the attributes
        selected by _IN_XML_NS (names starting with "xmlns") that node
        inherits; for each local name only the nearest ancestor's
        attribute is kept.  Used only for single element node
        canonicalization, not for subset canonicalization.'''

        # NOTE(review): this list starts out holding node's own attribute
        # *nodes* but is extended with, and tested against, local *names*,
        # so node's own declarations never suppress an ancestor's here.
        # _do_element processes node's own attributes after the inherited
        # ones, so the local declaration still wins.  Kept as is; confirm
        # before changing.
        xmlattrs = [a for a in _attrs(node) if _IN_XML_NS(a)]

        # Walk up and collect every declaration not seen closer to node.
        inherited, parent = [], node.parentNode
        while parent and parent.nodeType == Node.ELEMENT_NODE:
            for a in _attrs(parent):
                if not _IN_XML_NS(a):
                    continue
                n = a.localName
                if n not in xmlattrs:
                    xmlattrs.append(n)
                    inherited.append(a)
            parent = parent.parentNode
        return inherited


    def _do_document(self, node):
        '''_do_document(self, node) -> None
        Process a document node.  documentOrder holds whether the document
        element has been encountered such that PIs/comments can be written
        as specified.  Raises TypeError for an unexpected child node type.'''

        self.documentOrder = _LesserElement
        for child in node.childNodes:
            if child.nodeType == Node.ELEMENT_NODE:
                self.documentOrder = _Element        # At document element
                self._do_element(child)
                self.documentOrder = _GreaterElement # After document element
            elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
                self._do_pi(child)
            elif child.nodeType == Node.COMMENT_NODE:
                self._do_comment(child)
            elif child.nodeType == Node.DOCUMENT_TYPE_NODE:
                pass
            else:
                raise TypeError(str(child))
    handlers[Node.DOCUMENT_NODE] = _do_document


    def _do_text(self, node):
        '''_do_text(self, node) -> None
        Process a text or CDATA node.  Render various special characters
        as their C14N entity representations: "&", "<", ">" and carriage
        return become &amp;, &lt;, &gt; and &#xD;.'''
        if not _in_subset(self.subset, node): return
        # "&" must be escaped first so the other entities are not re-escaped.
        s = node.data.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        s = s.replace("\015", "&#xD;")
        if s: self.write(s)
    handlers[Node.TEXT_NODE] = _do_text
    handlers[Node.CDATA_SECTION_NODE] = _do_text


    def _do_pi(self, node):
        '''_do_pi(self, node) -> None
        Process a PI node.  Render a leading or trailing #xA if the
        document order of the PI is greater or lesser (respectively)
        than the document element.
        '''
        if not _in_subset(self.subset, node): return
        W = self.write
        if self.documentOrder == _GreaterElement: W('\n')
        W('<?')
        W(node.nodeName)
        s = node.data
        if s:
            W(' ')
            W(s)
        W('?>')
        if self.documentOrder == _LesserElement: W('\n')
    handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi


    def _do_comment(self, node):
        '''_do_comment(self, node) -> None
        Process a comment node.  Nothing is written unless comments were
        requested.  Render a leading or trailing #xA if the document
        order of the comment is greater or lesser (respectively) than
        the document element.
        '''
        if not _in_subset(self.subset, node): return
        if self.comments:
            W = self.write
            if self.documentOrder == _GreaterElement: W('\n')
            W('<!--')
            W(node.data)
            W('-->')
            if self.documentOrder == _LesserElement: W('\n')
    handlers[Node.COMMENT_NODE] = _do_comment


    def _do_attr(self, n, value):
        '''_do_attr(self, n, value) -> None
        Write the attribute named n with the given value, preceded by a
        space.  In the value "&", "<" and the double quote become &amp;,
        &lt; and &quot;; tab, line feed and carriage return become
        &#x9;, &#xA; and &#xD;.'''

        W = self.write
        W(' ')
        W(n)
        W('="')
        # "&" must be escaped first so the other entities are not re-escaped.
        s = value.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace('"', '&quot;')
        s = s.replace('\011', '&#x9;')
        s = s.replace('\012', '&#xA;')
        s = s.replace('\015', '&#xD;')
        W(s)
        W('"')


    def _do_element(self, node, initial_other_attrs=None, unused=None):
        '''_do_element(self, node, initial_other_attrs=None, unused=None) -> None
        Process an element (and its children).

        node -- the element to render.
        initial_other_attrs -- attribute nodes inherited from ancestors,
            processed before node's own attributes (default: none).
        unused -- dictionary of namespace declarations that are in scope
            but not yet rendered (exclusive C14N); when None, a copy of
            the one in the current state is used.  A dictionary passed
            in is updated in place.

        Raises RuntimeError in exclusive mode when no declaration can be
        found for the element's prefix.'''

        # Get state (from the stack) make local copies.
        #   ns_parent -- NS declarations in parent
        #   ns_rendered -- NS nodes rendered by ancestors
        #   ns_local -- NS declarations relevant to this element
        #   xml_attrs -- Attributes in XML namespace from parent
        #   xml_attrs_local -- Local attributes in XML namespace.
        #   ns_unused_inherited -- not rendered namespaces, used for exclusive
        ns_parent = self.state[0]
        ns_rendered = self.state[1].copy()
        xml_attrs = self.state[2].copy()

        ns_unused_inherited = unused
        if unused is None:
            ns_unused_inherited = self.state[3].copy()

        ns_local = ns_parent.copy()
        inclusive = _inclusive(self)
        xml_attrs_local = {}

        # Divide attributes into NS, XML, and others.  Inherited ones go
        # first so that the element's own declarations override them.
        all_attrs = list(initial_other_attrs or [])
        all_attrs.extend(_attrs(node))
        other_attrs = []
        in_subset = _in_subset(self.subset, node)
        for a in all_attrs:
            if a.namespaceURI == XMLNS.BASE:
                n = a.nodeName
                if n == "xmlns:": n = "xmlns"        # DOM bug workaround
                ns_local[n] = a.nodeValue
            elif a.namespaceURI == XMLNS.XML:
                # Keep only if the attribute node is in the subset.
                if inclusive or (in_subset and _in_subset(self.subset, a)):
                    xml_attrs_local[a.nodeName] = a
            else:
                # Keep only if the attribute node is in the subset.
                if _in_subset(self.subset, a):
                    other_attrs.append(a)
                # TODO: exclusive, might need to define xmlns:prefix here

        # Add local xml:foo attributes to ancestor's xml:foo attributes.
        xml_attrs.update(xml_attrs_local)

        # Render the node
        W, name = self.write, None
        if in_subset:
            name = node.nodeName
            if not inclusive:
                if node.prefix is not None:
                    prefix = 'xmlns:%s' % node.prefix
                else:
                    prefix = 'xmlns'

                # The element's own prefix has to be declared somewhere;
                # fall back to the declarations seen but never rendered.
                if prefix not in ns_rendered and prefix not in ns_local:
                    if prefix not in ns_unused_inherited:
                        raise RuntimeError(
                            'For exclusive c14n, unable to map prefix "%s" in %s' % (
                            prefix, node))

                    ns_local[prefix] = ns_unused_inherited[prefix]
                    del ns_unused_inherited[prefix]

            W('<')
            W(name)

            # Create list of NS attributes to render.
            ns_to_render = []
            for n, v in ns_local.items():

                # If default namespace is XMLNS.BASE or empty,
                # and if an ancestor was the same
                if n == "xmlns" and v in [XMLNS.BASE, ''] \
                and ns_rendered.get('xmlns') in [XMLNS.BASE, '', None]:
                    continue

                # "omit namespace node with local name xml, which defines
                # the xml prefix, if its string value is
                # http://www.w3.org/XML/1998/namespace."
                if n in ["xmlns:xml", "xml"] \
                and v == 'http://www.w3.org/XML/1998/namespace':
                    continue

                # If not previously rendered (with the same value)
                # and it's inclusive or utilized
                if not (n in ns_rendered and ns_rendered[n] == v):
                    if inclusive or _utilized(n, node, other_attrs,
                                              self.unsuppressedPrefixes):
                        ns_to_render.append((n, v))
                    elif not inclusive:
                        ns_unused_inherited[n] = v

            # Sort and render the ns, marking what was rendered.
            ns_to_render.sort(_sorter_ns)
            for n, v in ns_to_render:
                self._do_attr(n, v)
                ns_rendered[n] = v

            # If exclusive or the parent is in the subset, add the local xml attributes
            # Else, add all local and ancestor xml attributes
            # Sort and render the attributes.
            if not inclusive or _in_subset(self.subset, node.parentNode):
                other_attrs.extend(xml_attrs_local.values())
            else:
                other_attrs.extend(xml_attrs.values())
            other_attrs.sort(_sorter)
            for a in other_attrs:
                self._do_attr(a.nodeName, a.value)
            W('>')

        # Push state, recurse, pop state.
        state, self.state = \
            self.state, (ns_local, ns_rendered, xml_attrs, ns_unused_inherited)
        for c in _children(node):
            _implementation.handlers[c.nodeType](self, c)
        self.state = state

        if name: W('</%s>' % name)
    handlers[Node.ELEMENT_NODE] = _do_element
---|
411 | |
---|
412 | |
---|
def Canonicalize(node, output=None, **kw):
    '''Canonicalize(node, output=None, **kw) -> UTF-8

    Canonicalize a DOM document/element node and all descendents.
    Return the text; if output is specified then output.write will
    be called to output the text and None will be returned
    Keyword parameters:
        nsdict: a dictionary of prefix:uri namespace entries
                assumed to exist in the surrounding context
        comments: keep comments if non-zero (default is 0)
        subset: Canonical XML subsetting resulting from XPath;
                only nodes in it are rendered (default is None,
                which renders every node)
        unsuppressedPrefixes: do exclusive C14N, and this specifies the
                prefixes that should be inherited.
    '''
    # Compare against None: a writer object that happens to be falsy
    # (e.g. an empty container-like buffer) must still be honoured.
    if output is not None:
        _implementation(node, output.write, **kw)
        return None

    s = StringIO.StringIO()
    _implementation(node, s.write, **kw)
    return s.getvalue()
---|