1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41 """
42 Provides general XML-related functionality.
43
44 What I'm trying to do here is abstract much of the functionality that directly
45 accesses the DOM tree. This is not so much to "protect" the other code from
46 the DOM, but to standardize the way it's used. It will also help extension
47 authors write code that easily looks more like the rest of Cedar Backup.
48
49 @sort: createInputDom, createOutputDom, serializeDom, isElement, readChildren,
50 readFirstChild, readStringList, readString, readInteger, readBoolean,
51 addContainerNode, addStringNode, addIntegerNode, addBooleanNode,
52 TRUE_BOOLEAN_VALUES, FALSE_BOOLEAN_VALUES, VALID_BOOLEAN_VALUES
53
54 @var TRUE_BOOLEAN_VALUES: List of boolean values in XML representing C{True}.
55 @var FALSE_BOOLEAN_VALUES: List of boolean values in XML representing C{False}.
56 @var VALID_BOOLEAN_VALUES: List of valid boolean values in XML.
57
58 @author: Kenneth J. Pronovici <pronovic@ieee.org>
59 """
60
61
62
63
64
65
66
67 import sys
68 import re
69 import logging
70 from io import StringIO
71
72
73 from xml.parsers.expat import ExpatError
74 from xml.dom.minidom import Node
75 from xml.dom.minidom import getDOMImplementation
76 from xml.dom.minidom import parseString
77
78
79
80
81
82
# Module-level logger used by the XML utilities.
logger = logging.getLogger("CedarBackup3.log.xml")

# Text values accepted for booleans in XML.  VALID_BOOLEAN_VALUES lists the
# true values first, followed by the false values.
TRUE_BOOLEAN_VALUES = ["Y", "y"]
FALSE_BOOLEAN_VALUES = ["N", "n"]
VALID_BOOLEAN_VALUES = [*TRUE_BOOLEAN_VALUES, *FALSE_BOOLEAN_VALUES]
88
89
90
91
92
93
107
109 """
110 Creates a DOM tree used for writing an XML document.
111 @param name: Base name of the document (root node name).
112 @return: Tuple C{(xmlDom, parentNode)} for the new document, where C{parentNode} is the document's root element.
113 """
114 impl = getDOMImplementation()
115 xmlDom = impl.createDocument(None, name, None)
116 return (xmlDom, xmlDom.documentElement)
117
118
119
120
121
122
def isElement(node):
    """
    Indicates whether an XML node is an element node.

    @param node: DOM node to check.

    @return: C{True} if the node's type is C{Node.ELEMENT_NODE}, C{False} otherwise.
    """
    nodeKind = node.nodeType
    return nodeKind == Node.ELEMENT_NODE
128
def readChildren(parent, name):
    """
    Returns the nodes with a given name that are direct children of the parent.

    C{getElementsByTagName} returns matching descendants at any depth, so
    the result is filtered down to the entries whose C{parentNode} is the
    passed-in parent itself.

    @param parent: Parent node to search beneath (may be C{None}).
    @param name: Name of nodes to search for.

    @return: List of matching direct children, or an empty list if the
             parent is C{None} or nothing matches.
    """
    if parent is None:
        return []
    return [node for node in parent.getElementsByTagName(name) if node.parentNode is parent]
156
def readFirstChild(parent, name):
    """
    Returns the first direct child of the parent that has the given name.

    Only direct children are considered, as found by L{readChildren}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: First matching child of parent, or C{None} if there is none.
    """
    matches = readChildren(parent, name)
    return matches[0] if matches else None
173
def readStringList(parent, name):
    """
    Returns the string contents of the named direct children of the parent.

    The matching nodes come from L{readChildren}.  For each of them, the
    value of its first C{TEXT_NODE} child is taken as its string contents.
    Matching nodes that have no C{TEXT_NODE} child contribute nothing to
    the result.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: List of strings as described above, or C{None} if the list
             would be empty.
    """
    values = []
    for node in readChildren(parent, name):
        textChildren = [child for child in node.childNodes if child.nodeType == Node.TEXT_NODE]
        if textChildren:
            values.append(textChildren[0].nodeValue)
    return values or None
205
def readString(parent, name):
    """
    Returns the string contents of the first named direct child of the parent.

    This is the first entry of the list produced by L{readStringList}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: String contents of node, or C{None} if L{readStringList} finds nothing.
    """
    values = readStringList(parent, name)
    return None if values is None else values[0]
225
def readInteger(parent, name):
    """
    Returns the integer contents of the first named direct child of the parent.

    The text is obtained via L{readString} and converted with C{int()}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Integer contents of node, or C{None} if no text was found.
    @raise ValueError: If the text can't be converted to an integer.
    """
    text = readString(parent, name)
    return int(text) if text is not None else None
245
def readLong(parent, name):
    """
    Returns the long integer contents of the first named direct child of the parent.

    The text is obtained via L{readString} and converted with C{int()}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Long integer contents of node, or C{None} if no text was found.
    @raise ValueError: If the text can't be converted to an integer.
    """
    text = readString(parent, name)
    return int(text) if text is not None else None
265
def readFloat(parent, name):
    """
    Returns the float contents of the first named direct child of the parent.

    The text is obtained via L{readString} and converted with C{float()}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Float contents of node, or C{None} if no text was found.
    @raise ValueError: If the text can't be converted to a float value.
    """
    text = readString(parent, name)
    return float(text) if text is not None else None
286
def readBoolean(parent, name):
    """
    Returns the boolean contents of the first named direct child of the parent.

    The text is obtained via L{readString} and must be one of the values
    in L{VALID_BOOLEAN_VALUES}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Boolean contents of node, or C{None} if no text was found.
    @raise ValueError: If the text is not one of the valid boolean values.
    """
    text = readString(parent, name)
    if text is None:
        return None
    if text in TRUE_BOOLEAN_VALUES:
        return True
    if text in FALSE_BOOLEAN_VALUES:
        return False
    raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES)
313
314
315
316
317
318
def addContainerNode(xmlDom, parentNode, nodeName):
    """
    Creates a new element and appends it as the last child of a parent node.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.

    @return: Reference to the newly-created node.
    """
    child = xmlDom.createElement(nodeName)
    parentNode.appendChild(child)
    return child
332
def addStringNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a node holding a string as the next child of a parent.

    The node itself is created through L{addContainerNode}.  When
    C{nodeValue} is C{None} the node is left empty, i.e. it gets no text
    node child.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    container = addContainerNode(xmlDom, parentNode, nodeName)
    if nodeValue is None:
        return container
    container.appendChild(xmlDom.createTextNode(nodeValue))
    return container
352
def addIntegerNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a node holding an integer as the next child of a parent.

    The integer is formatted with "%d" and handed to L{addStringNode}.
    When C{nodeValue} is C{None} the node is created empty.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    text = None if nodeValue is None else "%d" % nodeValue
    return addStringNode(xmlDom, parentNode, nodeName, text)
374
def addLongNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a node holding a long integer as the next child of a parent.

    The integer is formatted with "%d" and handed to L{addStringNode}.
    When C{nodeValue} is C{None} the node is created empty.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    text = None if nodeValue is None else "%d" % nodeValue
    return addStringNode(xmlDom, parentNode, nodeName, text)
396
def addBooleanNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a node holding a boolean as the next child of a parent.

    Any value Python treats as true is written as "Y" and any other
    non-C{None} value as "N"; the text is handed to L{addStringNode}.
    When C{nodeValue} is C{None} the node is created empty.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    if nodeValue is None:
        flag = None
    else:
        flag = "Y" if nodeValue else "N"
    return addStringNode(xmlDom, parentNode, nodeName, flag)
422
423
424
425
426
427
429 """
430 Serializes a DOM tree and returns the result in a string.
431 @param xmlDom: XML DOM tree to serialize
432 @param indent: Number of spaces to indent, as an integer
433 @return: String form of DOM tree, as written by L{Serializer} into an in-memory buffer with "UTF-8" passed as the encoding.
434 """
435 xmlBuffer = StringIO()
436 serializer = Serializer(xmlBuffer, "UTF-8", indent=indent)
437 serializer.serialize(xmlDom)
438 xmlData = xmlBuffer.getvalue()
439 xmlBuffer.close()
440 return xmlData
441
443
444 """
445 XML serializer class.
446
447 This is a customized serializer that I hacked together based on what I found
448 in the PyXML distribution. Basically, around release 2.7.0, the only reason
449 I still had around a dependency on PyXML was for the PrettyPrint
450 functionality, and that seemed pointless. So, I stripped the PrettyPrint
451 code out of PyXML and hacked bits of it off until it did just what I needed
452 and no more.
453
454 This code started out being called PrintVisitor, but I decided it makes more
455 sense just calling it a serializer. I've made nearly all of the methods
456 private, and I've added a new high-level serialize() method rather than
457 having clients call C{visit()}.
458
459 Anyway, as a consequence of my hacking with it, this can't quite be called a
460 complete XML serializer any more. I ripped out support for HTML and XHTML,
461 and there is also no longer any support for namespaces (which I took out
462 because this dragged along a lot of extra code, and Cedar Backup doesn't use
463 namespaces). However, everything else should pretty much work as expected.
464
465 @copyright: This code, prior to customization, was part of the PyXML
466 codebase, and before that was part of the 4DOM suite developed by
467 Fourthought, Inc. In its original form, it was Copyright (c) 2000
468 Fourthought Inc, USA; All Rights Reserved.
469 """
470
def __init__(self, stream=None, encoding="UTF-8", indent=3):
    """
    Initialize a serializer.
    @param stream: Stream to write output to.  C{None} (the default)
                   selects whatever C{sys.stdout} is at construction time.
    @param encoding: Output encoding.
    @param indent: Number of spaces to indent, as an integer
    """
    # Resolve the default lazily.  A C{stream=sys.stdout} default is bound
    # once, when the function is defined, and so would ignore any later
    # redirection of sys.stdout.
    self.stream = sys.stdout if stream is None else stream
    self.encoding = encoding
    self._indent = indent * " "  # indentation unit; empty string disables indenting
    self._depth = 0              # current nesting depth
    self._inText = 0             # nonzero right after text has been written
483
def serialize(self, xmlDom):
    """
    Serialize the passed-in XML document to the configured stream.

    The document is walked via C{_visit()} and a trailing newline is
    written to the stream afterwards.

    @param xmlDom: XML DOM tree to serialize
    @raise ValueError: If there's an unknown node type in the document.
    """
    self._visit(xmlDom)
    output = self.stream
    output.write("\n")
492
497
def _tryIndent(self):
    """
    Starts a new, indented line unless text was just written or indenting is off.
    """
    if self._inText or not self._indent:
        return
    self._write('\n' + self._indent*self._depth)
502
def _visit(self, node):
    """
    Dispatches a node to the visit method that matches its node type.

    @param node: DOM node to serialize.
    @return: Whatever the selected visit method returns.
    @raise ValueError: If there's an unknown node type in the document.
    """
    handlerNames = {
        Node.ELEMENT_NODE: "_visitElement",
        Node.ATTRIBUTE_NODE: "_visitAttr",
        Node.TEXT_NODE: "_visitText",
        Node.CDATA_SECTION_NODE: "_visitCDATASection",
        Node.ENTITY_REFERENCE_NODE: "_visitEntityReference",
        Node.ENTITY_NODE: "_visitEntity",
        Node.PROCESSING_INSTRUCTION_NODE: "_visitProcessingInstruction",
        Node.COMMENT_NODE: "_visitComment",
        Node.DOCUMENT_NODE: "_visitDocument",
        Node.DOCUMENT_TYPE_NODE: "_visitDocumentType",
        Node.DOCUMENT_FRAGMENT_NODE: "_visitDocumentFragment",
        Node.NOTATION_NODE: "_visitNotation",
    }
    handlerName = handlerNames.get(node.nodeType)
    if handlerName is None:
        raise ValueError("Unknown node type: %s" % repr(node))
    return getattr(self, handlerName)(node)
545
def _visitNodeList(self, node, exclude=None):
    """
    Visits every entry of a node list except the one identical to C{exclude}.

    @param node: Iterable of DOM nodes to visit, in order.
    @param exclude: Node to skip (compared by identity), if any.
    """
    for child in node:
        if child is not exclude:
            self._visit(child)
550
def _visitNamedNodeMap(self, node):
    """
    Visits every value held by a named node map.

    @param node: Map-like object whose C{values()} are the nodes to visit.
    """
    # Snapshot the values first so the visit iterates a fixed list.
    entries = list(node.values())
    for entry in entries:
        self._visit(entry)
555
563
# Writes the XML declaration; 'utf-8' is substituted when self.encoding is empty or None.
565 self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8'))
566 self._inText = 0
567 return
568
574
578
def _visitElement(self, node):
    """
    Writes an element: start tag, attributes, children and end tag.

    An element without child nodes is written in the self-closing form.
    Children are visited one nesting level deeper, and the end tag goes on
    its own indented line unless text was written just before it.

    @param node: Element node to serialize.
    """
    self._tryIndent()
    self._write('<%s' % node.tagName)
    for attribute in list(node.attributes.values()):
        self._visitAttr(attribute)
    if len(node.childNodes) == 0:
        self._write('/>')
    else:
        self._write('>')
        self._depth += 1
        self._visitNodeList(node.childNodes)
        self._depth -= 1
        if not self._inText:
            self._tryIndent()
        self._write('</%s>' % node.tagName)
    self._inText = 0
595
def _visitText(self, node):
    """
    Writes the escaped contents of a text node.

    When indentation is enabled, text consisting only of whitespace is
    dropped, so that whitespace between elements does not suppress the
    serializer's own indentation.  All other text is written unchanged
    apart from escaping.

    @param node: Text node to serialize.
    """
    text = node.data
    # The previous code called text.strip() and discarded the result, which
    # had no effect because strings are immutable.
    if self._indent and not text.strip():
        text = ''
    if text:
        text = _translateCDATA(text, self.encoding)
        self.stream.write(text)
        self._inText = 1
    return
605
def _visitDocumentType(self, doctype):
    """
    Writes a DOCTYPE declaration, including any internal subset.

    Nothing is written when the doctype has neither a system nor a public
    identifier.  An identifier is wrapped in single quotes when it
    contains a double quote, and in double quotes otherwise.

    @param doctype: Document type node to serialize.
    """
    systemId = doctype.systemId
    publicId = doctype.publicId
    if not systemId and not publicId:
        return

    def quoted(identifier):
        if identifier and '"' in identifier:
            return "'%s'" % identifier
        return '"%s"' % identifier

    self._tryIndent()
    self._write('<!DOCTYPE %s' % doctype.name)
    system = quoted(doctype.systemId)
    public = quoted(doctype.publicId)
    if doctype.publicId and doctype.systemId:
        self._write(' PUBLIC %s %s' % (public, system))
    elif doctype.systemId:
        self._write(' SYSTEM %s' % system)
    if doctype.entities or doctype.notations:
        self._write(' [')
        self._depth += 1
        self._visitNamedNodeMap(doctype.entities)
        self._visitNamedNodeMap(doctype.notations)
        self._depth -= 1
        self._tryIndent()
        self._write(']>')
    else:
        self._write('>')
    self._inText = 0
637
def _visitEntity(self, node):
    """
    Writes an ENTITY declaration (visited from a NamedNodeMap in DocumentType).

    The PUBLIC, SYSTEM and NDATA parts are written only for the identifiers
    that are set on the node.

    @param node: Entity node to serialize.
    """
    self._tryIndent()
    self._write('<!ENTITY %s' % (node.nodeName))
    if node.publicId:
        self._write(' PUBLIC %s' % node.publicId)
    if node.systemId:
        self._write(' SYSTEM %s' % node.systemId)
    if node.notationName:
        self._write(' NDATA %s' % node.notationName)
    self._write('>')
647
def _visitNotation(self, node):
    """
    Writes a NOTATION declaration (visited from a NamedNodeMap in DocumentType).

    The PUBLIC and SYSTEM parts are written only for the identifiers that
    are set on the node.

    @param node: Notation node to serialize.
    """
    self._tryIndent()
    self._write('<!NOTATION %s' % node.nodeName)
    if node.publicId:
        self._write(' PUBLIC %s' % node.publicId)
    if node.systemId:
        self._write(' SYSTEM %s' % node.systemId)
    self._write('>')
656
def _visitCDATASection(self, node):
    """
    Writes a CDATA section on its own indented line.

    @param node: CDATA section node; its C{data} is written verbatim.
    """
    self._tryIndent()
    section = '<![CDATA[%s]]>' % (node.data)
    self._write(section)
    self._inText = 0
662
668
def _visitEntityReference(self, node):
    """
    Writes an entity reference inline and marks the output as being in text.

    @param node: Entity reference node; its C{nodeName} names the entity.
    """
    reference = '&%s;' % node.nodeName
    self._write(reference)
    self._inText = 1
673
def _visitProcessingInstruction(self, node):
    """
    Writes a processing instruction on its own indented line.

    @param node: Processing instruction node providing C{target} and C{data}.
    """
    self._tryIndent()
    instruction = '<?%s %s?>' % (node.target, node.data)
    self._write(instruction)
    self._inText = 0
679
def _encodeText(text, encoding):
    """
    Safely converts the passed-in text to a Unicode string.

    C{bytes} input is decoded as UTF-8.  Anything else, including C{None}
    and C{str}, is returned unchanged.

    @param text: Text to convert; may be C{None}, C{str} or C{bytes}.
    @param encoding: Unused; decoding is always done as UTF-8.  Kept so
                     existing callers keep working.

    @return: The text as a string, or the original object if it was not C{bytes}.
    @raise ValueError: If C{bytes} input is not valid UTF-8.
    """
    if not isinstance(text, bytes):
        return text
    try:
        return str(text, "utf-8")
    except UnicodeError as e:
        # Chain the decode failure explicitly so the root cause stays visible.
        raise ValueError("Path could not be safely encoded as utf-8.") from e
690
def _translateCDATAAttr(characters):
    """
    Handles normalization and some intelligence about quoting.

    Picks the delimiter for an attribute value and escapes whatever would
    conflict with it.  A value containing an apostrophe is delimited with
    double quotes and its double quotes become C{&quot;}.  Any other value
    is delimited with apostrophes.  Newlines are written as the character
    reference C{&#10;}, so that they are not normalized away as literal
    newlines would be.

    @param characters: Attribute value to prepare (may be empty or C{None}).

    @return: Tuple C{(text, delimiter)}; C{('', "'")} for an empty value.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc.  In its original form, it was Copyright (c) 2000
    Fourthought Inc, USA; All Rights Reserved.
    """
    if not characters:
        return '', "'"
    if "'" in characters:
        delimiter = '"'
        new_chars = characters.replace('"', '&quot;')
    else:
        delimiter = "'"
        # No apostrophe is present here, so this is a no-op kept for symmetry.
        new_chars = characters.replace("'", '&apos;')
    if "\n" in characters:
        new_chars = new_chars.replace('\n', '&#10;')
    return new_chars, delimiter
714
715
# Markup that must be escaped in character data, and its replacements.
_CDATA_CHAR_PATTERN = re.compile(r'[&<]|]]>')
_CHAR_TO_ENTITY = {'&': '&amp;', '<': '&lt;', ']]>': ']]&gt;'}

# Characters that may not appear in XML 1.0 text: most C0 control characters
# and the non-characters U+FFFE and U+FFFF.
_XML_ILLEGAL_CHAR_PATTERN = re.compile(r'[\x01-\x08\x0B-\x0C\x0E-\x1F\uFFFE\uFFFF]')


def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
    """
    Escapes character data for inclusion in XML text or attribute values.

    Unless C{markupSafe} is set, C{&} and C{<} are replaced with their
    entities, and the C{>} that would complete a C{]]>} sequence is
    escaped, whether the sequence is inside C{characters} or is split
    between C{prev_chars} and C{characters}.  Characters that are illegal
    in XML are then written as numeric character references.

    @param characters: Text to escape; C{bytes} input is decoded as UTF-8.
    @param encoding: Unused; kept so existing callers keep working.
    @param prev_chars: Text written immediately before C{characters}.
    @param markupSafe: If true, markup characters are left unescaped.

    @return: Escaped text, or an empty string for empty input.
    @raise ValueError: If C{bytes} input is not valid UTF-8.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc.  In its original form, it was Copyright (c) 2000
    Fourthought Inc, USA; All Rights Reserved.
    """
    if not characters:
        return ''
    if isinstance(characters, bytes):
        # The patterns are str patterns, so normalize bytes up front.
        try:
            characters = characters.decode("utf-8")
        except UnicodeError as e:
            raise ValueError("Text could not be safely decoded as utf-8.") from e
    if markupSafe:
        new_string = characters
    else:
        new_string = _CDATA_CHAR_PATTERN.sub(lambda m: _CHAR_TO_ENTITY[m.group()], characters)
        # A leading '>' is not touched by the pattern above, so it is still
        # the first character of new_string here.
        if prev_chars[-2:] == ']]' and characters[0] == '>':
            new_string = '&gt;' + new_string[1:]
    return _XML_ILLEGAL_CHAR_PATTERN.sub(lambda m: '&#%i;' % ord(m.group()), new_string)
746