Update to release v2.0.1

sissaschool · Aug 24, 2020 · 3276ca2 · 3276ca2
1 parent 055285d
commit 3276ca2
Show file tree

Hide file tree

Showing 9 changed files with 96 additions and 53 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,11 @@
 CHANGELOG
 *********
 
+`v2.0.1`_ (2020-08-24)
+======================
+* Add regex transpiler (for XPath/XQuery and XML Schema regular expressions)
+* Hotfix for issue #30
+
 `v2.0.0`_ (2020-08-13)
 ======================
 * Extensive testing with W3C XPath 2.0 tests (~98% passed)
@@ -232,3 +237,5 @@ CHANGELOG
 .. _v1.4.5: https://github.com/sissaschool/elementpath/compare/v1.4.4...v1.4.5
 .. _v1.4.6: https://github.com/sissaschool/elementpath/compare/v1.4.5...v1.4.6
 .. _v2.0.0: https://github.com/sissaschool/elementpath/compare/v1.4.6...v2.0.0
+.. _v2.0.1: https://github.com/sissaschool/elementpath/compare/v2.0.0...v2.0.1
+
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -7,6 +7,8 @@ include setup.cfg
 include requirements-dev.txt
 include tox.ini
 include doc/*
+
+recursive-include elementpath *.py
 recursive-include tests *.py
 
 global-exclude *.py[cod]
diff --git a/elementpath/regex/character_classes.py b/elementpath/regex/character_classes.py
@@ -7,9 +7,6 @@
 #
 # @author Davide Brunato <brunato@sissa.it>
 #
-"""
-Parse and translate XML Schema regular expressions to Python regex syntax.
-"""
 import re
 from itertools import chain
 from sys import maxunicode
@@ -92,7 +89,7 @@ def __init__(self, charset=None, is_syntax=True):
             self.add(charset)
 
     def __repr__(self):
-        return '<%s at %d>' % (self.__class__.__name__, id(self))
+        return '%s(%s)' % (self.__class__.__name__, str(self))
 
     def __str__(self):
         if not self.negative:

diff --git a/elementpath/regex/codepoints.py b/elementpath/regex/codepoints.py
@@ -69,21 +69,24 @@ def iter_code_points(code_points, reverse=False):
                 yield start_cp
 
 
-def check_code_point(cp):
+def get_code_point_range(cp):
     """
-    Checks a code point or code point range.
+    Returns a code point range.
 
-    :return: a valid code point range.
+    :param cp: a single code point or a code point range.
+    :return: a code point range or `None` if the argument is not a \
+    code point or a code point range.
     """
     if isinstance(cp, int):
-        if not (0 <= cp <= maxunicode):
-            raise ValueError("not a Unicode code point: %r" % cp)
-        return cp, cp + 1
+        if 0 <= cp <= maxunicode:
+            return cp, cp + 1
     else:
-        if not (0 <= cp[0] < cp[1] <= maxunicode + 1) \
-                or not isinstance(cp[0], int) or not isinstance(cp[1], int):
-            raise ValueError("not a Unicode code point range: %r" % cp)
-        return cp
+        try:
+            if isinstance(cp[0], int) and isinstance(cp[1], int):
+                if 0 <= cp[0] < cp[1] <= maxunicode + 1:
+                    return cp
+        except (IndexError, TypeError):
+            pass
 
 
 def code_point_repr(cp):

diff --git a/elementpath/regex/unicode_subsets.py b/elementpath/regex/unicode_subsets.py
@@ -14,18 +14,18 @@
 from collections.abc import Iterable, MutableSet
 
 from .unicode_categories import RAW_UNICODE_CATEGORIES
-from .codepoints import code_point_order, code_point_repr, iter_code_points, check_code_point
+from .codepoints import code_point_order, code_point_repr, iter_code_points, get_code_point_range
 
 
 class RegexError(Exception):
     pass
 
 
-def iterparse_character_class(s, expand_ranges=False):
+def iterparse_character_subset(s, expand_ranges=False):
     """
-    Parse a regex character group part, generating a sequence of code points
-    and code points ranges. An unescaped hyphen (-) that is not at the start
-    or at the and is interpreted as range specifier.
+    Parses a regex character subset, generating a sequence of code points
+    and code point ranges. An unescaped hyphen (-) that is not at the
+    start or at the end is interpreted as a range specifier.
 
     :param s: a string representing a character group part.
     :param expand_ranges: if set to `True` then expands character ranges.
@@ -116,10 +116,10 @@ class UnicodeSubset(MutableSet):
     """
     Represents a subset of Unicode code points, implemented with an ordered list of
     integer values and ranges. Codepoints can be added or discarded using sequences
-    of integer values and ranges or with strings equivalent to regex character class.
+    of integer values and ranges or with strings equivalent to regex character set.
 
     :param codepoints: a sequence of integer values and ranges, another UnicodeSubset \
-    instance ora a string equivalent of a regex character class.
+    instance or a string equivalent of a regex character set.
     """
 
     def __init__(self, codepoints=None):
@@ -191,9 +191,7 @@ def __contains__(self, value):
             try:
                 value = ord(value)
             except TypeError:
-                raise TypeError(
-                    "%r: argument must be a code point or a character." % value
-                )
+                return False
 
         for cp in self._codepoints:
             if not isinstance(cp, int):
@@ -225,14 +223,18 @@ def __len__(self):
     def update(self, *others):
         for value in others:
             if isinstance(value, str):
-                for cp in iter_code_points(iterparse_character_class(value), reverse=True):
+                for cp in iter_code_points(iterparse_character_subset(value), reverse=True):
                     self.add(cp)
             else:
                 for cp in iter_code_points(value, reverse=True):
                     self.add(cp)
 
     def add(self, value):
-        start_value, end_value = check_code_point(value)
+        try:
+            start_value, end_value = get_code_point_range(value)
+        except TypeError:
+            raise ValueError("{!r} is not a Unicode code point value/range".format(value))
+
         code_points = self._codepoints
         last_index = len(code_points) - 1
         for k, cp in enumerate(code_points):
@@ -264,14 +266,18 @@ def add(self, value):
     def difference_update(self, *others):
         for value in others:
             if isinstance(value, str):
-                for cp in iter_code_points(iterparse_character_class(value), reverse=True):
+                for cp in iter_code_points(iterparse_character_subset(value), reverse=True):
                     self.discard(cp)
             else:
                 for cp in iter_code_points(value, reverse=True):
                     self.discard(cp)
 
     def discard(self, value):
-        start_cp, end_cp = check_code_point(value)
+        try:
+            start_cp, end_cp = get_code_point_range(value)
+        except TypeError:
+            raise ValueError("{!r} is not a Unicode code point value/range".format(value))
+
         code_points = self._codepoints
         for k in reversed(range(len(code_points))):
             cp = code_points[k]

diff --git a/publiccode.yml b/publiccode.yml
@@ -6,8 +6,8 @@ publiccodeYmlVersion: '0.2'
 name: elementpath
 url: 'https://github.com/sissaschool/elementpath'
 landingURL: 'https://github.com/sissaschool/elementpath'
-releaseDate: '2020-08-13'
-softwareVersion: v2.0.0
+releaseDate: '2020-08-24'
+softwareVersion: v2.0.1
 developmentStatus: stable
 platforms:
   - linux

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -3,6 +3,6 @@ setuptools
 tox
 coverage
 lxml
-xmlschema~=1.2.0
+xmlschema~=1.2.3
 Sphinx
 -e .
diff --git a/setup.py b/setup.py
@@ -8,15 +8,15 @@
 #
 # @author Davide Brunato <brunato@sissa.it>
 #
-from setuptools import setup
+from setuptools import setup, find_packages
 
 with open("README.rst") as readme:
     long_description = readme.read()
 
 setup(
     name='elementpath',
     version='2.0.1',
-    packages=['elementpath'],
+    packages=find_packages(include=['elementpath', 'elementpath.*']),
     author='Davide Brunato',
     author_email='brunato@sissa.it',
     url='https://github.com/sissaschool/elementpath',

diff --git a/tests/test_regex.py b/tests/test_regex.py
@@ -18,8 +18,9 @@
 from unicodedata import category
 
 from elementpath.regex import RegexError, CharacterClass, get_python_pattern
+from elementpath.regex.codepoints import get_code_point_range
 from elementpath.regex.unicode_subsets import code_point_repr, \
-    iterparse_character_class, iter_code_points, UnicodeSubset, \
+    iterparse_character_subset, iter_code_points, UnicodeSubset, \
     UNICODE_CATEGORIES
 
 
@@ -44,6 +45,18 @@ def test_iter_code_points(self):
             [25, (8, 23), 0]
         )
 
+    def test_get_code_point_range(self):
+        self.assertEqual(get_code_point_range(97), (97, 98))
+        self.assertEqual(get_code_point_range((97, 100)), (97, 100))
+        self.assertEqual(get_code_point_range([97, 100]), [97, 100])
+
+        self.assertIsNone(get_code_point_range(-1))
+        self.assertIsNone(get_code_point_range(sys.maxunicode + 1))
+        self.assertIsNone(get_code_point_range((-1, 100)))
+        self.assertIsNone(get_code_point_range((97, sys.maxunicode + 2)))
+        self.assertIsNone(get_code_point_range(97.0))
+        self.assertIsNone(get_code_point_range((97.0, 100)))
+
 
 class TestUnicodeSubset(unittest.TestCase):
 
@@ -136,38 +149,53 @@ def test_code_point_repr_function(self):
 
 class TestCharacterClass(unittest.TestCase):
 
-    def test_char_group_split(self):
+    def test_char_class_init(self):
+        char_class = CharacterClass()
+        self.assertEqual(char_class.positive, [])
+        self.assertEqual(char_class.negative, [])
+
+        char_class = CharacterClass('a-z')
+        self.assertEqual(char_class.positive, [(97, 123)])
+        self.assertEqual(char_class.negative, [])
+
+    def test_char_class_repr(self):
+        char_class = CharacterClass('a-z')
+        self.assertEqual(repr(char_class), 'CharacterClass([a-z])')
+        char_class.complement()
+        self.assertEqual(repr(char_class), 'CharacterClass([^a-z])')
+
+    def test_char_class_split(self):
         self.assertListEqual(CharacterClass._re_char_set.split(r'2-\\'), [r'2-\\'])
 
     def test_complement(self):
-        char_group = CharacterClass('a-z')
-        char_group.complement()
-        self.assertEqual(str(char_group), '[^a-z]')
+        char_class = CharacterClass('a-z')
+        char_class.complement()
+        self.assertEqual(str(char_class), '[^a-z]')
 
     def test_isub_operator(self):
-        char_group = CharacterClass('A-Za-z')
-        char_group -= CharacterClass('a-z')
-        self.assertEqual(str(char_group), '[A-Z]')
+        char_class = CharacterClass('A-Za-z')
+        char_class -= CharacterClass('a-z')
+        self.assertEqual(str(char_class), '[A-Z]')
 
-        char_group = CharacterClass('a-z')
+        char_class = CharacterClass('a-z')
         other = CharacterClass('A-Za-c')
         other.complement()
-        char_group -= other
-        self.assertEqual(str(char_group), '[a-c]')
+        char_class -= other
+        self.assertEqual(str(char_class), '[a-c]')
 
-        char_group = CharacterClass('a-z')
+        char_class = CharacterClass('a-z')
         other = CharacterClass('A-Za-c')
         other.complement()
         other.add('b')
-        char_group -= other
-        self.assertEqual(str(char_group), '[ac]')
+        char_class -= other
+        self.assertEqual(str(char_class), '[ac]')
 
-        char_group = CharacterClass('a-c')
-        char_group.complement()
+        char_class = CharacterClass('a-c')
+        char_class.complement()
         other = CharacterClass('a-z')
         other.complement()
-        char_group -= other
-        self.assertEqual(str(char_group), '[d-z]')
+        char_class -= other
+        self.assertEqual(str(char_class), '[d-z]')
 
 
 class TestUnicodeCategories(unittest.TestCase):
@@ -327,10 +355,10 @@ def test_character_class_reordering(self):
         self.assertIsNone(pattern.search('xx:y'))
 
     def test_iterparse_character_group(self):
-        self.assertListEqual(list(iterparse_character_class('a-c-1-4x-z-7-9')),
+        self.assertListEqual(list(iterparse_character_subset('a-c-1-4x-z-7-9')),
                              [(ord('a'), ord('c') + 1), ord('-'), (ord('1'), ord('4') + 1),
                               (ord('x'), ord('z') + 1), ord('-'), (55, 58)])
-        self.assertListEqual(list(iterparse_character_class('2-\\')), [(ord('2'), ord('\\') + 1)])
+        self.assertListEqual(list(iterparse_character_subset('2-\\')), [(ord('2'), ord('\\') + 1)])
 
     def test_occurrences_qualifiers(self):
         regex = get_python_pattern('#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?', anchors=False)