Skip to content

Commit

Permalink
Update to release v2.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
brunato committed Aug 24, 2020
1 parent 055285d commit 3276ca2
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 53 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
CHANGELOG
*********

`v2.0.1`_ (2020-08-24)
======================
* Add regex transpiler (for XPath/XQuery and XML Schema regular expressions)
* Hotfix for issue #30

`v2.0.0`_ (2020-08-13)
======================
* Extensive testing with W3C XPath 2.0 tests (~98% passed)
Expand Down Expand Up @@ -232,3 +237,5 @@ CHANGELOG
.. _v1.4.5: https://github.com/sissaschool/elementpath/compare/v1.4.4...v1.4.5
.. _v1.4.6: https://github.com/sissaschool/elementpath/compare/v1.4.5...v1.4.6
.. _v2.0.0: https://github.com/sissaschool/elementpath/compare/v1.4.6...v2.0.0
.. _v2.0.1: https://github.com/sissaschool/elementpath/compare/v2.0.0...v2.0.1

2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ include setup.cfg
include requirements-dev.txt
include tox.ini
include doc/*

recursive-include elementpath *.py
recursive-include tests *.py

global-exclude *.py[cod]
5 changes: 1 addition & 4 deletions elementpath/regex/character_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
Parse and translate XML Schema regular expressions to Python regex syntax.
"""
import re
from itertools import chain
from sys import maxunicode
Expand Down Expand Up @@ -92,7 +89,7 @@ def __init__(self, charset=None, is_syntax=True):
self.add(charset)

def __repr__(self):
return '<%s at %d>' % (self.__class__.__name__, id(self))
return '%s(%s)' % (self.__class__.__name__, str(self))

def __str__(self):
if not self.negative:
Expand Down
23 changes: 13 additions & 10 deletions elementpath/regex/codepoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,24 @@ def iter_code_points(code_points, reverse=False):
yield start_cp


def check_code_point(cp):
def get_code_point_range(cp):
"""
Checks a code point or code point range.
Returns a code point range.
:return: a valid code point range.
:param cp: a single code point or a code point range.
:return: a code point range or `None` if the argument is not a \
code point or a code point range.
"""
if isinstance(cp, int):
if not (0 <= cp <= maxunicode):
raise ValueError("not a Unicode code point: %r" % cp)
return cp, cp + 1
if 0 <= cp <= maxunicode:
return cp, cp + 1
else:
if not (0 <= cp[0] < cp[1] <= maxunicode + 1) \
or not isinstance(cp[0], int) or not isinstance(cp[1], int):
raise ValueError("not a Unicode code point range: %r" % cp)
return cp
try:
if isinstance(cp[0], int) and isinstance(cp[1], int):
if 0 <= cp[0] < cp[1] <= maxunicode + 1:
return cp
except (IndexError, TypeError):
pass


def code_point_repr(cp):
Expand Down
34 changes: 20 additions & 14 deletions elementpath/regex/unicode_subsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@
from collections.abc import Iterable, MutableSet

from .unicode_categories import RAW_UNICODE_CATEGORIES
from .codepoints import code_point_order, code_point_repr, iter_code_points, check_code_point
from .codepoints import code_point_order, code_point_repr, iter_code_points, get_code_point_range


class RegexError(Exception):
pass


def iterparse_character_class(s, expand_ranges=False):
def iterparse_character_subset(s, expand_ranges=False):
"""
Parse a regex character group part, generating a sequence of code points
and code points ranges. An unescaped hyphen (-) that is not at the start
or at the and is interpreted as range specifier.
Parses a regex character subset, generating a sequence of code points
and code points ranges. An unescaped hyphen (-) that is not at the
start or at the end is interpreted as a range specifier.
:param s: a string representing a character group part.
:param expand_ranges: if set to `True` then expands character ranges.
Expand Down Expand Up @@ -116,10 +116,10 @@ class UnicodeSubset(MutableSet):
"""
Represents a subset of Unicode code points, implemented with an ordered list of
integer values and ranges. Codepoints can be added or discarded using sequences
of integer values and ranges or with strings equivalent to regex character class.
of integer values and ranges or with strings equivalent to regex character set.
:param codepoints: a sequence of integer values and ranges, another UnicodeSubset \
instance ora a string equivalent of a regex character class.
instance or a string equivalent of a regex character set.
"""

def __init__(self, codepoints=None):
Expand Down Expand Up @@ -191,9 +191,7 @@ def __contains__(self, value):
try:
value = ord(value)
except TypeError:
raise TypeError(
"%r: argument must be a code point or a character." % value
)
return False

for cp in self._codepoints:
if not isinstance(cp, int):
Expand Down Expand Up @@ -225,14 +223,18 @@ def __len__(self):
def update(self, *others):
for value in others:
if isinstance(value, str):
for cp in iter_code_points(iterparse_character_class(value), reverse=True):
for cp in iter_code_points(iterparse_character_subset(value), reverse=True):
self.add(cp)
else:
for cp in iter_code_points(value, reverse=True):
self.add(cp)

def add(self, value):
start_value, end_value = check_code_point(value)
try:
start_value, end_value = get_code_point_range(value)
except TypeError:
raise ValueError("{!r} is not a Unicode code point value/range".format(value))

code_points = self._codepoints
last_index = len(code_points) - 1
for k, cp in enumerate(code_points):
Expand Down Expand Up @@ -264,14 +266,18 @@ def add(self, value):
def difference_update(self, *others):
for value in others:
if isinstance(value, str):
for cp in iter_code_points(iterparse_character_class(value), reverse=True):
for cp in iter_code_points(iterparse_character_subset(value), reverse=True):
self.discard(cp)
else:
for cp in iter_code_points(value, reverse=True):
self.discard(cp)

def discard(self, value):
start_cp, end_cp = check_code_point(value)
try:
start_cp, end_cp = get_code_point_range(value)
except TypeError:
raise ValueError("{!r} is not a Unicode code point value/range".format(value))

code_points = self._codepoints
for k in reversed(range(len(code_points))):
cp = code_points[k]
Expand Down
4 changes: 2 additions & 2 deletions publiccode.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ publiccodeYmlVersion: '0.2'
name: elementpath
url: 'https://github.com/sissaschool/elementpath'
landingURL: 'https://github.com/sissaschool/elementpath'
releaseDate: '2020-08-13'
softwareVersion: v2.0.0
releaseDate: '2020-08-24'
softwareVersion: v2.0.1
developmentStatus: stable
platforms:
- linux
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ setuptools
tox
coverage
lxml
xmlschema~=1.2.0
xmlschema~=1.2.3
Sphinx
-e .
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
#
# @author Davide Brunato <brunato@sissa.it>
#
from setuptools import setup
from setuptools import setup, find_packages

with open("README.rst") as readme:
long_description = readme.read()

setup(
name='elementpath',
version='2.0.1',
packages=['elementpath'],
packages=find_packages(include=['elementpath', 'elementpath.*']),
author='Davide Brunato',
author_email='brunato@sissa.it',
url='https://github.com/sissaschool/elementpath',
Expand Down
68 changes: 48 additions & 20 deletions tests/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
from unicodedata import category

from elementpath.regex import RegexError, CharacterClass, get_python_pattern
from elementpath.regex.codepoints import get_code_point_range
from elementpath.regex.unicode_subsets import code_point_repr, \
iterparse_character_class, iter_code_points, UnicodeSubset, \
iterparse_character_subset, iter_code_points, UnicodeSubset, \
UNICODE_CATEGORIES


Expand All @@ -44,6 +45,18 @@ def test_iter_code_points(self):
[25, (8, 23), 0]
)

def test_get_code_point_range(self):
self.assertEqual(get_code_point_range(97), (97, 98))
self.assertEqual(get_code_point_range((97, 100)), (97, 100))
self.assertEqual(get_code_point_range([97, 100]), [97, 100])

self.assertIsNone(get_code_point_range(-1))
self.assertIsNone(get_code_point_range(sys.maxunicode + 1))
self.assertIsNone(get_code_point_range((-1, 100)))
self.assertIsNone(get_code_point_range((97, sys.maxunicode + 2)))
self.assertIsNone(get_code_point_range(97.0))
self.assertIsNone(get_code_point_range((97.0, 100)))


class TestUnicodeSubset(unittest.TestCase):

Expand Down Expand Up @@ -136,38 +149,53 @@ def test_code_point_repr_function(self):

class TestCharacterClass(unittest.TestCase):

def test_char_group_split(self):
def test_char_class_init(self):
char_class = CharacterClass()
self.assertEqual(char_class.positive, [])
self.assertEqual(char_class.negative, [])

char_class = CharacterClass('a-z')
self.assertEqual(char_class.positive, [(97, 123)])
self.assertEqual(char_class.negative, [])

def test_char_class_repr(self):
char_class = CharacterClass('a-z')
self.assertEqual(repr(char_class), 'CharacterClass([a-z])')
char_class.complement()
self.assertEqual(repr(char_class), 'CharacterClass([^a-z])')

def test_char_class_split(self):
self.assertListEqual(CharacterClass._re_char_set.split(r'2-\\'), [r'2-\\'])

def test_complement(self):
char_group = CharacterClass('a-z')
char_group.complement()
self.assertEqual(str(char_group), '[^a-z]')
char_class = CharacterClass('a-z')
char_class.complement()
self.assertEqual(str(char_class), '[^a-z]')

def test_isub_operator(self):
char_group = CharacterClass('A-Za-z')
char_group -= CharacterClass('a-z')
self.assertEqual(str(char_group), '[A-Z]')
char_class = CharacterClass('A-Za-z')
char_class -= CharacterClass('a-z')
self.assertEqual(str(char_class), '[A-Z]')

char_group = CharacterClass('a-z')
char_class = CharacterClass('a-z')
other = CharacterClass('A-Za-c')
other.complement()
char_group -= other
self.assertEqual(str(char_group), '[a-c]')
char_class -= other
self.assertEqual(str(char_class), '[a-c]')

char_group = CharacterClass('a-z')
char_class = CharacterClass('a-z')
other = CharacterClass('A-Za-c')
other.complement()
other.add('b')
char_group -= other
self.assertEqual(str(char_group), '[ac]')
char_class -= other
self.assertEqual(str(char_class), '[ac]')

char_group = CharacterClass('a-c')
char_group.complement()
char_class = CharacterClass('a-c')
char_class.complement()
other = CharacterClass('a-z')
other.complement()
char_group -= other
self.assertEqual(str(char_group), '[d-z]')
char_class -= other
self.assertEqual(str(char_class), '[d-z]')


class TestUnicodeCategories(unittest.TestCase):
Expand Down Expand Up @@ -327,10 +355,10 @@ def test_character_class_reordering(self):
self.assertIsNone(pattern.search('xx:y'))

def test_iterparse_character_group(self):
self.assertListEqual(list(iterparse_character_class('a-c-1-4x-z-7-9')),
self.assertListEqual(list(iterparse_character_subset('a-c-1-4x-z-7-9')),
[(ord('a'), ord('c') + 1), ord('-'), (ord('1'), ord('4') + 1),
(ord('x'), ord('z') + 1), ord('-'), (55, 58)])
self.assertListEqual(list(iterparse_character_class('2-\\')), [(ord('2'), ord('\\') + 1)])
self.assertListEqual(list(iterparse_character_subset('2-\\')), [(ord('2'), ord('\\') + 1)])

def test_occurrences_qualifiers(self):
regex = get_python_pattern('#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?', anchors=False)
Expand Down

0 comments on commit 3276ca2

Please sign in to comment.