Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion Doc/library/xml.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,19 @@ This module also defines utility functions.
"!", "?", and "=" are forbidden.
The name cannot start with a digit or a character like "-", ".", and "·".

..versionadded:: next
.. versionadded:: next


.. function:: is_valid_text(data)

Return ``True`` if the string is a sequence of legal XML 1.0 characters,
``False`` otherwise.

Almost all characters are permitted in XML 1.0 documents, except C0 control
characters (excluding TAB, CR and LF), surrogate characters and special
Unicode characters U+FFFE and U+FFFF.

.. versionadded:: next


.. _xml-security:
Expand Down
4 changes: 4 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,10 @@ xml
whether a string can be used as an element or attribute name in XML.
(Contributed by Serhiy Storchaka in :gh:`139489`.)

* Add the :func:`xml.is_valid_text` function, which allows checking
whether a string can be used as text in an XML document.
(Contributed by Serhiy Storchaka in :gh:`139489`.)


xml.parsers.expat
-----------------
Expand Down
16 changes: 16 additions & 0 deletions Lib/test/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,22 @@ def test_is_valid_name(self):
for c in '<>/!?=\x00\x01\x7f\ud800\udfff\ufffe\uffff\U000F0000':
self.assertFalse(is_valid_name('name' + c))

def test_is_valid_text(self):
    # Ordered (input, expected verdict) pairs; the order matches the
    # sequence in which the individual checks are performed.
    cases = (
        ('', True),
        ('!0Aa_~ \r\n\t\x85\xa0', True),
        ('\ud7ff\ue000\ufffd\U00010000\U0010ffff', True),
        # C0 control characters are rejected.
        ('\x00', False),
        ('\x01', False),
        ('\x1f', False),
        # DEL and the C1 range are accepted.
        ('\x7f', True),
        ('\x80', True),
        ('\x9f', True),
        # Both ends of the surrogate range are rejected.
        ('\ud800', False),
        ('\udfff', False),
        # U+FFFE and U+FFFF are rejected.
        ('\ufffe', False),
        ('\uffff', False),
    )
    is_valid_text = xml.is_valid_text
    for data, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(is_valid_text(data))


# Run the tests in this file when it is executed directly as a script.
if __name__ == '__main__':
unittest.main()
12 changes: 12 additions & 0 deletions Lib/xml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,15 @@ def is_valid_name(name):
'\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF'
']*+',
name) is not None

# Character class matching every character that is not allowed in an
# XML 1.0 document; see https://www.w3.org/TR/xml/#charsets
_ILLEGAL_XML_CHAR = (
    '['
    '\x00-\x08\x0B\x0C\x0E-\x1F'  # C0 controls other than TAB, LF and CR
    '\uD800-\uDFFF'  # surrogate code points
    '\uFFFE\uFFFF'  # U+FFFE and U+FFFF
    ']')


def is_valid_text(data):
    """Return True if *data* consists only of legal XML 1.0 characters.

    The string is scanned for the first character matching
    _ILLEGAL_XML_CHAR; it is valid exactly when no such character is
    found, so the empty string is valid.
    """
    illegal = _re.search(_ILLEGAL_XML_CHAR, data)
    return illegal is None
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add the :func:`xml.is_valid_text` function, which allows checking whether
a string can be used as text in an XML document.
Loading