diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f85f7dd0..2fdf0a2f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,11 @@ CHANGELOG ********* +`v2.0.1`_ (2020-08-24) +====================== +* Add regex transpiler (for XPath/XQuery and XML Schema regular expressions) +* Hotfix for issue #30 + `v2.0.0`_ (2020-08-13) ====================== * Extensive testing with W3C XPath 2.0 tests (~98% passed) @@ -232,3 +237,5 @@ CHANGELOG .. _v1.4.5: https://github.com/sissaschool/elementpath/compare/v1.4.4...v1.4.5 .. _v1.4.6: https://github.com/sissaschool/elementpath/compare/v1.4.5...v1.4.6 .. _v2.0.0: https://github.com/sissaschool/elementpath/compare/v1.4.6...v2.0.0 +.. _v2.0.1: https://github.com/sissaschool/elementpath/compare/v2.0.0...v2.0.1 + diff --git a/MANIFEST.in b/MANIFEST.in index 3ca5dccd..b101b193 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,6 +7,8 @@ include setup.cfg include requirements-dev.txt include tox.ini include doc/* + +recursive-include elementpath *.py recursive-include tests *.py global-exclude *.py[cod] diff --git a/elementpath/regex/character_classes.py b/elementpath/regex/character_classes.py index e3ac09ff..13eae7c3 100644 --- a/elementpath/regex/character_classes.py +++ b/elementpath/regex/character_classes.py @@ -7,9 +7,6 @@ # # @author Davide Brunato # -""" -Parse and translate XML Schema regular expressions to Python regex syntax. -""" import re from itertools import chain from sys import maxunicode @@ -92,7 +89,7 @@ def __init__(self, charset=None, is_syntax=True): self.add(charset) def __repr__(self): - return '<%s at %d>' % (self.__class__.__name__, id(self)) + return '%s(%s)' % (self.__class__.__name__, str(self)) def __str__(self): if not self.negative: diff --git a/elementpath/regex/codepoints.py b/elementpath/regex/codepoints.py index 14831d08..c7dfe3ba 100644 --- a/elementpath/regex/codepoints.py +++ b/elementpath/regex/codepoints.py @@ -69,21 +69,24 @@ def iter_code_points(code_points, reverse=False): yield start_cp -def check_code_point(cp): +def get_code_point_range(cp): """ - Checks a code point or code point range. + Returns a code point range. - :return: a valid code point range. + :param cp: a single code point or a code point range. + :return: a code point range or `None` if the argument is not a \ + code point or a code point range. """ if isinstance(cp, int): - if not (0 <= cp <= maxunicode): - raise ValueError("not a Unicode code point: %r" % cp) - return cp, cp + 1 + if 0 <= cp <= maxunicode: + return cp, cp + 1 else: - if not (0 <= cp[0] < cp[1] <= maxunicode + 1) \ - or not isinstance(cp[0], int) or not isinstance(cp[1], int): - raise ValueError("not a Unicode code point range: %r" % cp) - return cp + try: + if isinstance(cp[0], int) and isinstance(cp[1], int): + if 0 <= cp[0] < cp[1] <= maxunicode + 1: + return cp + except (IndexError, TypeError): + pass def code_point_repr(cp): diff --git a/elementpath/regex/unicode_subsets.py b/elementpath/regex/unicode_subsets.py index 8e6761f4..b3a17c60 100644 --- a/elementpath/regex/unicode_subsets.py +++ b/elementpath/regex/unicode_subsets.py @@ -14,18 +14,18 @@ from collections.abc import Iterable, MutableSet from .unicode_categories import RAW_UNICODE_CATEGORIES -from .codepoints import code_point_order, code_point_repr, iter_code_points, check_code_point +from .codepoints import code_point_order, code_point_repr, iter_code_points, get_code_point_range class RegexError(Exception): pass -def iterparse_character_class(s, expand_ranges=False): +def iterparse_character_subset(s, expand_ranges=False): """ - Parse a regex character group part, generating a sequence of code points - and code points ranges. An unescaped hyphen (-) that is not at the start - or at the and is interpreted as range specifier. + Parses a regex character subset, generating a sequence of code points + and code points ranges. An unescaped hyphen (-) that is not at the + start or at the and is interpreted as range specifier. :param s: a string representing a character group part. :param expand_ranges: if set to `True` then expands character ranges. @@ -116,10 +116,10 @@ class UnicodeSubset(MutableSet): """ Represents a subset of Unicode code points, implemented with an ordered list of integer values and ranges. Codepoints can be added or discarded using sequences - of integer values and ranges or with strings equivalent to regex character class. + of integer values and ranges or with strings equivalent to regex character set. :param codepoints: a sequence of integer values and ranges, another UnicodeSubset \ - instance ora a string equivalent of a regex character class. + instance ora a string equivalent of a regex character set. """ def __init__(self, codepoints=None): @@ -191,9 +191,7 @@ def __contains__(self, value): try: value = ord(value) except TypeError: - raise TypeError( - "%r: argument must be a code point or a character." % value - ) + return False for cp in self._codepoints: if not isinstance(cp, int): @@ -225,14 +223,18 @@ def __len__(self): def update(self, *others): for value in others: if isinstance(value, str): - for cp in iter_code_points(iterparse_character_class(value), reverse=True): + for cp in iter_code_points(iterparse_character_subset(value), reverse=True): self.add(cp) else: for cp in iter_code_points(value, reverse=True): self.add(cp) def add(self, value): - start_value, end_value = check_code_point(value) + try: + start_value, end_value = get_code_point_range(value) + except TypeError: + raise ValueError("{!r} is not a Unicode code point value/range".format(value)) + code_points = self._codepoints last_index = len(code_points) - 1 for k, cp in enumerate(code_points): @@ -264,14 +266,18 @@ def add(self, value): def difference_update(self, *others): for value in others: if isinstance(value, str): - for cp in iter_code_points(iterparse_character_class(value), reverse=True): + for cp in iter_code_points(iterparse_character_subset(value), reverse=True): self.discard(cp) else: for cp in iter_code_points(value, reverse=True): self.discard(cp) def discard(self, value): - start_cp, end_cp = check_code_point(value) + try: + start_cp, end_cp = get_code_point_range(value) + except TypeError: + raise ValueError("{!r} is not a Unicode code point value/range".format(value)) + code_points = self._codepoints for k in reversed(range(len(code_points))): cp = code_points[k] diff --git a/publiccode.yml b/publiccode.yml index e649bd70..d743b073 100644 --- a/publiccode.yml +++ b/publiccode.yml @@ -6,8 +6,8 @@ publiccodeYmlVersion: '0.2' name: elementpath url: 'https://github.com/sissaschool/elementpath' landingURL: 'https://github.com/sissaschool/elementpath' -releaseDate: '2020-08-13' -softwareVersion: v2.0.0 +releaseDate: '2020-08-24' +softwareVersion: v2.0.1 developmentStatus: stable platforms: - linux diff --git a/requirements-dev.txt b/requirements-dev.txt index 1d021fad..1cee8c71 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,6 +3,6 @@ setuptools tox coverage lxml -xmlschema~=1.2.0 +xmlschema~=1.2.3 Sphinx -e . diff --git a/setup.py b/setup.py index e06e19dd..d53ef4a7 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ # # @author Davide Brunato # -from setuptools import setup +from setuptools import setup, find_packages with open("README.rst") as readme: long_description = readme.read() @@ -16,7 +16,7 @@ setup( name='elementpath', version='2.0.1', - packages=['elementpath'], + packages=find_packages(include=['elementpath', 'elementpath.*']), author='Davide Brunato', author_email='brunato@sissa.it', url='https://github.com/sissaschool/elementpath', diff --git a/tests/test_regex.py b/tests/test_regex.py index a6be6a26..9da96046 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -18,8 +18,9 @@ from unicodedata import category from elementpath.regex import RegexError, CharacterClass, get_python_pattern +from elementpath.regex.codepoints import get_code_point_range from elementpath.regex.unicode_subsets import code_point_repr, \ - iterparse_character_class, iter_code_points, UnicodeSubset, \ + iterparse_character_subset, iter_code_points, UnicodeSubset, \ UNICODE_CATEGORIES @@ -44,6 +45,18 @@ def test_iter_code_points(self): [25, (8, 23), 0] ) + def test_get_code_point_range(self): + self.assertEqual(get_code_point_range(97), (97, 98)) + self.assertEqual(get_code_point_range((97, 100)), (97, 100)) + self.assertEqual(get_code_point_range([97, 100]), [97, 100]) + + self.assertIsNone(get_code_point_range(-1)) + self.assertIsNone(get_code_point_range(sys.maxunicode + 1)) + self.assertIsNone(get_code_point_range((-1, 100))) + self.assertIsNone(get_code_point_range((97, sys.maxunicode + 2))) + self.assertIsNone(get_code_point_range(97.0)) + self.assertIsNone(get_code_point_range((97.0, 100))) + class TestUnicodeSubset(unittest.TestCase): @@ -136,38 +149,53 @@ def test_code_point_repr_function(self): class TestCharacterClass(unittest.TestCase): - def test_char_group_split(self): + def test_char_class_init(self): + char_class = CharacterClass() + self.assertEqual(char_class.positive, []) + self.assertEqual(char_class.negative, []) + + char_class = CharacterClass('a-z') + self.assertEqual(char_class.positive, [(97, 123)]) + self.assertEqual(char_class.negative, []) + + def test_char_class_repr(self): + char_class = CharacterClass('a-z') + self.assertEqual(repr(char_class), 'CharacterClass([a-z])') + char_class.complement() + self.assertEqual(repr(char_class), 'CharacterClass([^a-z])') + + def test_char_class_split(self): self.assertListEqual(CharacterClass._re_char_set.split(r'2-\\'), [r'2-\\']) def test_complement(self): - char_group = CharacterClass('a-z') - char_group.complement() - self.assertEqual(str(char_group), '[^a-z]') + char_class = CharacterClass('a-z') + char_class.complement() + self.assertEqual(str(char_class), '[^a-z]') def test_isub_operator(self): - char_group = CharacterClass('A-Za-z') - char_group -= CharacterClass('a-z') - self.assertEqual(str(char_group), '[A-Z]') + char_class = CharacterClass('A-Za-z') + char_class -= CharacterClass('a-z') + self.assertEqual(str(char_class), '[A-Z]') - char_group = CharacterClass('a-z') + char_class = CharacterClass('a-z') other = CharacterClass('A-Za-c') other.complement() - char_group -= other - self.assertEqual(str(char_group), '[a-c]') + char_class -= other + self.assertEqual(str(char_class), '[a-c]') - char_group = CharacterClass('a-z') + char_class = CharacterClass('a-z') other = CharacterClass('A-Za-c') other.complement() other.add('b') - char_group -= other - self.assertEqual(str(char_group), '[ac]') + char_class -= other + self.assertEqual(str(char_class), '[ac]') - char_group = CharacterClass('a-c') - char_group.complement() + char_class = CharacterClass('a-c') + char_class.complement() other = CharacterClass('a-z') other.complement() - char_group -= other - self.assertEqual(str(char_group), '[d-z]') + char_class -= other + self.assertEqual(str(char_class), '[d-z]') class TestUnicodeCategories(unittest.TestCase): @@ -327,10 +355,10 @@ def test_character_class_reordering(self): self.assertIsNone(pattern.search('xx:y')) def test_iterparse_character_group(self): - self.assertListEqual(list(iterparse_character_class('a-c-1-4x-z-7-9')), + self.assertListEqual(list(iterparse_character_subset('a-c-1-4x-z-7-9')), [(ord('a'), ord('c') + 1), ord('-'), (ord('1'), ord('4') + 1), (ord('x'), ord('z') + 1), ord('-'), (55, 58)]) - self.assertListEqual(list(iterparse_character_class('2-\\')), [(ord('2'), ord('\\') + 1)]) + self.assertListEqual(list(iterparse_character_subset('2-\\')), [(ord('2'), ord('\\') + 1)]) def test_occurrences_qualifiers(self): regex = get_python_pattern('#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?', anchors=False)