Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Another approach to parse tag string with spaces to avoid long running time on regex matching. #7625

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 98 additions & 7 deletions samcli/cli/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,31 @@
LOG = logging.getLogger(__name__)


def _generate_quoted_match_regex(match_pattern):
"""
Creates a regex on a quoted string based on a match pattern (also a regex) that is to be
run on a string (which may contain escaped quotes) that is separated by delimiters.

Parameters
----------
match_pattern: (str) regex pattern to match
delim: (str) delimiter that is respected when identifying matching groups with generated regex.

Returns
-------
str: regex expression

Examples
-------
match_pattern: [A-Za-z0-9\\"_:\\.\\/\\+-\\@=]
input: createdBy=\"Test user\" ProjectName='Test App'
output: ['"Test user"', "'Test App'"]

"""

return f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|""" + f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\')"""


def _generate_match_regex(match_pattern, delim):
"""
Creates a regex string based on a match pattern (also a regex) that is to be
Expand Down Expand Up @@ -194,6 +219,7 @@ def __init__(self, multiple_values_per_key=False):
TAG_REGEX = '[A-Za-z0-9\\"_:\\.\\/\\+-\\@=]'

_pattern = r"{tag}={tag}".format(tag=_generate_match_regex(match_pattern=TAG_REGEX, delim=" "))
_quoted_pattern = _generate_quoted_match_regex(match_pattern=TAG_REGEX)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we reuse or refactor the old _generate_match_regex? They share similar return values.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They have different regex, hence creating a new function.


name = "string,list"

Expand Down Expand Up @@ -222,13 +248,36 @@ def convert(self, value, param, ctx):
for k in tags:
self._add_value(result, _unquote_wrapped_quotes(k), _unquote_wrapped_quotes(tags[k]))
else:
groups = re.findall(self._pattern, val)

if not groups:
fail = True
for group in groups:
key, v = group
self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(v))
# Instead of parsing a full {tag}={tag} pattern, we will try to look for quoted string with spaces
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we refactor this new logic out to a separate method to unit test? There's a lot of moving parts here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is tricky because the block below

groups = re.findall(self._pattern, val)

take longer time to return

# first, remove all the spaces and start over again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there missing code here, or maybe duplicate comment? This comment and the one under say "first"

# First, we need to unquote a full string
modified_val = _unquote_wrapped_quotes(val)

# Next, looking for a quote strings that contain spaces and proceed to replace them
quoted_strings_with_spaces = re.findall(self._quoted_pattern, modified_val)
quoted_strings_without_spaces = [self._replace_spaces(s) for s in quoted_strings_with_spaces]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: no single character variable names

for s, replacement in zip(quoted_strings_with_spaces, quoted_strings_without_spaces):
modified_val = modified_val.replace(s, replacement[0])

# Finally, restart the parsing with key=value separated by (multiple) spaces.
parsed, tags = self._multiple_space_separated_key_value_parser(modified_val)
if parsed:
for k in tags:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
for k in tags:
for key, value in tags.items():

v = tags[k]
m = [i for i in quoted_strings_without_spaces if i[0] == v]
if len(m) > 0:
v = self._restore_spaces(*m[0])
self._add_value(result, _unquote_wrapped_quotes(k), _unquote_wrapped_quotes(v))
else:
# Otherwise, fall back to the original mechanism.
groups = re.findall(self._pattern, val)

if not groups:
fail = True
for group in groups:
key, v = group
self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(v))

if fail:
return self.fail(
Expand Down Expand Up @@ -286,6 +335,48 @@ def _space_separated_key_value_parser(tag_value):
tags_dict = {**tags_dict, **parsed_tag}
return True, tags_dict

@staticmethod
def _multiple_space_separated_key_value_parser(tag_value):
"""
Method to parse space separated `Key1=Value1 Key2=Value2` type tags without using regex.
Parameters
----------
tag_value
"""
tags_dict = {}
for value in tag_value.split():
parsed, parsed_tag = CfnTags._standard_key_value_parser(value)
if not parsed:
return False, None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How come we're also returning a boolean here? Can we just return None, or an empty dict instead of returning a tuple?

tags_dict = {**tags_dict, **parsed_tag}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit (if parsed_tag is of type dict)

Suggested change
tags_dict = {**tags_dict, **parsed_tag}
tags_dict.update(parsed_tag)

return True, tags_dict

@staticmethod
def _replace_spaces(text, replacement="_"):
"""
Replace spaces in a text with a replacement together with its original locations.
Input: "test 1"
Output: "test_1" [4]
"""
space_positions = [i for i, char in enumerate(text) if char == " "]
modified = text.replace(" ", replacement)

return modified, space_positions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thoughts on using dataclasses for return objects, instead of returning tuples and performing array indexes to get the modified strings/replacement index array?

I imagine that changing things to dataclasses will make it easier for future readers to understand what is happening quickly.


@staticmethod
def _restore_spaces(modified_text, space_positions, replacement="_"):
"""
Restore spaces in a text from a original space locations.
Input: "test_1" [4]
Output: "test 1"
"""
text_list = list(modified_text)

for pos in space_positions:
text_list[pos] = " "

return "".join(text_list)


class SigningProfilesOptionType(click.ParamType):
"""
Expand Down
8 changes: 8 additions & 0 deletions tests/unit/cli/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,14 @@ def test_must_fail_on_invalid_format(self, input):
["stage=int", "company:application=awesome-service", "company:department=engineering"],
{"stage": "int", "company:application": "awesome-service", "company:department": "engineering"},
),
# input as string with multiple key-values including spaces
(('tag1="son of anton" tag2="company abc"',), {"tag1": "son of anton", "tag2": "company abc"}),
(('tag1="son of anton" tag2="company abc"',), {"tag1": "son of anton", "tag2": "company abc"}),
(('\'tag1="son of anton" tag2="company abc"\'',), {"tag1": "son of anton", "tag2": "company abc"}),
(
('tag1="son of anton" tag2="company abc" tag:3="dummy tag"',),
{"tag1": "son of anton", "tag2": "company abc", "tag:3": "dummy tag"},
),
]
)
def test_successful_parsing(self, input, expected):
Expand Down
Loading