From 8f4f6940af49393d6ff0b9b04d855a09aa606d83 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 12 Dec 2023 18:16:53 -0500 Subject: [PATCH] Changes to structured_data to respect uniqueItems for arrays. --- CHANGELOG.rst | 1 + dcicutils/misc_utils.py | 17 +++++++++++------ dcicutils/structured_data.py | 29 +++++++++++++++++------------ test/test_misc_utils.py | 10 ++++++++-- 4 files changed, 37 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8c0206d6a..cc20fc041 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,7 @@ Change Log ===== * Minor fix to misc_utils.to_integer to handle float strings. * Minor fix to structured_data to accumulate unique resolved_refs across schemas. +* Changes to structured_data to respect uniqueItems for arrays. 8.5.0 diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index ebc5d4b7e..307c66ca3 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1469,28 +1469,33 @@ def string_list(s): return [p for p in [part.strip() for part in s.split(",")] if p] -def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> List[str]: +def split_string(value: str, delimiter: str, escape: Optional[str] = None, unique: bool = False) -> List[str]: """ Splits the given string into an array of string based on the given delimiter, and an optional escape character. """ if not isinstance(value, str) or not (value := value.strip()): return [] - if not isinstance(escape, str) or not escape: - return [item.strip() for item in value.split(delimiter)] result = [] + if not isinstance(escape, str) or not escape: + for item in value.split(delimiter): + if (item := item.strip()) and (unique is not True or item not in result): + result.append(item) + return result item = r"" escaped = False for c in value: if c == delimiter and not escaped: - result.append(item.strip()) + if (item := item.strip()) and (unique is not True or item not in result): + result.append(item) item = r"" elif c == escape and not escaped: escaped = True else: item += c escaped = False - result.append(item.strip()) - return [item for item in result if item] + if (item := item.strip()) and (unique is not True or item not in result): + result.append(item) + return result def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]], diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index eab7e7112..71400929a 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -237,7 +237,7 @@ def parse_components(column_components: List[str], path: List[Union[str, int]]) return {array_name: array} if array_name else {column_component: value} def set_value_internal(data: Union[dict, list], value: Optional[Any], src: Optional[str], - path: List[Union[str, int]], mapv: Optional[Callable]) -> None: + path: List[Union[str, int]], typeinfo: Optional[dict], mapv: Optional[Callable]) -> None: def set_value_backtrack_object(path_index: int, path_element: str) -> None: nonlocal data, path, original_data @@ -257,7 +257,7 @@ def set_value_backtrack_object(path_index: int, path_element: str) -> None: set_value_backtrack_object(i, p) data = data[p] if (p := path[-1]) == -1 and isinstance(value, str): - values = _split_array_string(value) + values = _split_array_string(value, unique=typeinfo.get("unique") if typeinfo else False) if mapv: values = [mapv(value, src) for value in values] merge_objects(data, values) @@ -288,11 +288,13 @@ def ensure_column_consistency(column_name: str) -> None: for column_name in column_names or []: ensure_column_consistency(column_name) rational_column_name = self._schema.rationalize_column_name(column_name) if self._schema else column_name - map_value_function = self._schema.get_map_value_function(rational_column_name) if self._schema else None + column_typeinfo = self._schema.get_typeinfo(rational_column_name) if self._schema else None + map_value_function = column_typeinfo.get("map") if column_typeinfo else None if (column_components := _split_dotted_string(rational_column_name)): merge_objects(structured_row_template, parse_components(column_components, path := []), True) - self._set_value_functions[column_name] = (lambda data, value, src, path=path, mapv=map_value_function: - set_value_internal(data, value, src, path, mapv)) + self._set_value_functions[column_name] = ( + lambda data, value, src, path=path, typeinfo=column_typeinfo, mapv=map_value_function: + set_value_internal(data, value, src, path, typeinfo, mapv)) return structured_row_template @@ -331,10 +333,7 @@ def unresolved_refs(self) -> List[dict]: def resolved_refs(self) -> List[str]: return list(self._resolved_refs) - def get_map_value_function(self, column_name: str) -> Optional[Any]: - return (self._get_typeinfo(column_name) or {}).get("map") - - def _get_typeinfo(self, column_name: str) -> Optional[dict]: + def get_typeinfo(self, column_name: str) -> Optional[dict]: if isinstance(info := self._typeinfo.get(column_name), str): info = self._typeinfo.get(info) if not info and isinstance(info := self._typeinfo.get(self.unadorn_column_name(column_name)), str): @@ -467,9 +466,15 @@ def _create_typeinfo(self, schema_json: dict, parent_key: Optional[str] = None) raise Exception(f"Array of undefined or multiple types in JSON schema NOT supported: {key}") raise Exception(f"Invalid array type specifier in JSON schema: {key}") key = key + ARRAY_NAME_SUFFIX_CHAR + if unique := (property_value.get("uniqueItems") is True): + pass property_value = array_property_items property_value_type = property_value.get("type") - result.update(self._create_typeinfo(array_property_items, parent_key=key)) + typeinfo = self._create_typeinfo(array_property_items, parent_key=key) + if unique: + typeinfo[key]["unique"] = True + result.update(typeinfo) +# result.update(self._create_typeinfo(array_property_items, parent_key=key)) continue result[key] = {"type": property_value_type, "map": self._map_function({**property_value, "column": key})} if ARRAY_NAME_SUFFIX_CHAR in key: @@ -615,5 +620,5 @@ def _split_dotted_string(value: str): return split_string(value, DOTTED_NAME_DELIMITER_CHAR) -def _split_array_string(value: str): - return split_string(value, ARRAY_VALUE_DELIMITER_CHAR, ARRAY_VALUE_DELIMITER_ESCAPE_CHAR) +def _split_array_string(value: str, unique: bool = False): + return split_string(value, ARRAY_VALUE_DELIMITER_CHAR, ARRAY_VALUE_DELIMITER_ESCAPE_CHAR, unique=unique) diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 8dfa8454c..fabf661f6 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -3593,8 +3593,8 @@ def test_json_lines_reader_lists(): def test_split_array_string(): - def split_array_string(value: str) -> List[str]: - return split_string(value, "|", "\\") + def split_array_string(value: str, unique: bool = False) -> List[str]: + return split_string(value, "|", "\\", unique=unique) assert split_array_string(r"abc|def|ghi") == ["abc", "def", "ghi"] assert split_array_string(r"abc\|def|ghi") == ["abc|def", "ghi"] assert split_array_string(r"abc\\|def|ghi") == ["abc\\", "def", "ghi"] @@ -3609,6 +3609,12 @@ def split_array_string(value: str) -> List[str]: assert split_array_string(r"|") == [] assert split_array_string(r"\|") == ["|"] assert split_array_string(r"\\|") == ["\\"] + assert split_array_string(r"abc|def|abc|ghi", unique=False) == ["abc", "def", "abc", "ghi"] + assert split_array_string(r"abc|def|abc|ghi", unique=True) == ["abc", "def", "ghi"] + assert split_array_string(r"abc\\\|def\|ghi|jkl|mno|jkl", unique=False) == ["abc\\|def|ghi", "jkl", "mno", "jkl"] + assert split_array_string(r"abc\\\|def\|ghi|jkl|mno|jkl", unique=True) == ["abc\\|def|ghi", "jkl", "mno"] + assert split_string(r"abc|def|ghi|def", delimiter="|", unique=False) == ["abc", "def", "ghi", "def"] + assert split_string(r"abc|def|ghi|def", delimiter="|", unique=True) == ["abc", "def", "ghi"] def test_merge_objects_1():