Skip to content

Commit

Permalink
Changes to structured_data to respect uniqueItems for arrays.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmichaels-harvard committed Dec 12, 2023
1 parent a447f3f commit 8f4f694
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 20 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Change Log
=====
* Minor fix to misc_utils.to_integer to handle float strings.
* Minor fix to structured_data to accumulate unique resolved_refs across schemas.
* Changes to structured_data to respect uniqueItems for arrays.


8.5.0
Expand Down
17 changes: 11 additions & 6 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1469,28 +1469,33 @@ def string_list(s):
return [p for p in [part.strip() for part in s.split(",")] if p]


def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> List[str]:
def split_string(value: str, delimiter: str, escape: Optional[str] = None, unique: bool = False) -> List[str]:
"""
Splits the given string into an array of strings based on the given delimiter and an optional escape character.
"""
if not isinstance(value, str) or not (value := value.strip()):
return []
if not isinstance(escape, str) or not escape:
return [item.strip() for item in value.split(delimiter)]
result = []
if not isinstance(escape, str) or not escape:
for item in value.split(delimiter):
if (item := item.strip()) and (unique is not True or item not in result):
result.append(item)
return result
item = r""
escaped = False
for c in value:
if c == delimiter and not escaped:
result.append(item.strip())
if (item := item.strip()) and (unique is not True or item not in result):
result.append(item)
item = r""
elif c == escape and not escaped:
escaped = True
else:
item += c
escaped = False
result.append(item.strip())
return [item for item in result if item]
if (item := item.strip()) and (unique is not True or item not in result):
result.append(item)
return result


def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]],
Expand Down
29 changes: 17 additions & 12 deletions dcicutils/structured_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def parse_components(column_components: List[str], path: List[Union[str, int]])
return {array_name: array} if array_name else {column_component: value}

def set_value_internal(data: Union[dict, list], value: Optional[Any], src: Optional[str],
path: List[Union[str, int]], mapv: Optional[Callable]) -> None:
path: List[Union[str, int]], typeinfo: Optional[dict], mapv: Optional[Callable]) -> None:

def set_value_backtrack_object(path_index: int, path_element: str) -> None:
nonlocal data, path, original_data
Expand All @@ -257,7 +257,7 @@ def set_value_backtrack_object(path_index: int, path_element: str) -> None:
set_value_backtrack_object(i, p)
data = data[p]
if (p := path[-1]) == -1 and isinstance(value, str):
values = _split_array_string(value)
values = _split_array_string(value, unique=typeinfo.get("unique") if typeinfo else False)
if mapv:
values = [mapv(value, src) for value in values]
merge_objects(data, values)
Expand Down Expand Up @@ -288,11 +288,13 @@ def ensure_column_consistency(column_name: str) -> None:
for column_name in column_names or []:
ensure_column_consistency(column_name)
rational_column_name = self._schema.rationalize_column_name(column_name) if self._schema else column_name
map_value_function = self._schema.get_map_value_function(rational_column_name) if self._schema else None
column_typeinfo = self._schema.get_typeinfo(rational_column_name) if self._schema else None
map_value_function = column_typeinfo.get("map") if column_typeinfo else None
if (column_components := _split_dotted_string(rational_column_name)):
merge_objects(structured_row_template, parse_components(column_components, path := []), True)
self._set_value_functions[column_name] = (lambda data, value, src, path=path, mapv=map_value_function:
set_value_internal(data, value, src, path, mapv))
self._set_value_functions[column_name] = (
lambda data, value, src, path=path, typeinfo=column_typeinfo, mapv=map_value_function:
set_value_internal(data, value, src, path, typeinfo, mapv))
return structured_row_template


Expand Down Expand Up @@ -331,10 +333,7 @@ def unresolved_refs(self) -> List[dict]:
def resolved_refs(self) -> List[str]:
return list(self._resolved_refs)

def get_map_value_function(self, column_name: str) -> Optional[Any]:
return (self._get_typeinfo(column_name) or {}).get("map")

def _get_typeinfo(self, column_name: str) -> Optional[dict]:
def get_typeinfo(self, column_name: str) -> Optional[dict]:
if isinstance(info := self._typeinfo.get(column_name), str):
info = self._typeinfo.get(info)
if not info and isinstance(info := self._typeinfo.get(self.unadorn_column_name(column_name)), str):
Expand Down Expand Up @@ -467,9 +466,15 @@ def _create_typeinfo(self, schema_json: dict, parent_key: Optional[str] = None)
raise Exception(f"Array of undefined or multiple types in JSON schema NOT supported: {key}")
raise Exception(f"Invalid array type specifier in JSON schema: {key}")
key = key + ARRAY_NAME_SUFFIX_CHAR
if unique := (property_value.get("uniqueItems") is True):
pass
property_value = array_property_items
property_value_type = property_value.get("type")
result.update(self._create_typeinfo(array_property_items, parent_key=key))
typeinfo = self._create_typeinfo(array_property_items, parent_key=key)
if unique:
typeinfo[key]["unique"] = True
result.update(typeinfo)
# result.update(self._create_typeinfo(array_property_items, parent_key=key))
continue
result[key] = {"type": property_value_type, "map": self._map_function({**property_value, "column": key})}
if ARRAY_NAME_SUFFIX_CHAR in key:
Expand Down Expand Up @@ -615,5 +620,5 @@ def _split_dotted_string(value: str):
return split_string(value, DOTTED_NAME_DELIMITER_CHAR)


def _split_array_string(value: str):
return split_string(value, ARRAY_VALUE_DELIMITER_CHAR, ARRAY_VALUE_DELIMITER_ESCAPE_CHAR)
def _split_array_string(value: str, unique: bool = False):
return split_string(value, ARRAY_VALUE_DELIMITER_CHAR, ARRAY_VALUE_DELIMITER_ESCAPE_CHAR, unique=unique)
10 changes: 8 additions & 2 deletions test/test_misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3593,8 +3593,8 @@ def test_json_lines_reader_lists():


def test_split_array_string():
def split_array_string(value: str) -> List[str]:
return split_string(value, "|", "\\")
def split_array_string(value: str, unique: bool = False) -> List[str]:
return split_string(value, "|", "\\", unique=unique)
assert split_array_string(r"abc|def|ghi") == ["abc", "def", "ghi"]
assert split_array_string(r"abc\|def|ghi") == ["abc|def", "ghi"]
assert split_array_string(r"abc\\|def|ghi") == ["abc\\", "def", "ghi"]
Expand All @@ -3609,6 +3609,12 @@ def split_array_string(value: str) -> List[str]:
assert split_array_string(r"|") == []
assert split_array_string(r"\|") == ["|"]
assert split_array_string(r"\\|") == ["\\"]
assert split_array_string(r"abc|def|abc|ghi", unique=False) == ["abc", "def", "abc", "ghi"]
assert split_array_string(r"abc|def|abc|ghi", unique=True) == ["abc", "def", "ghi"]
assert split_array_string(r"abc\\\|def\|ghi|jkl|mno|jkl", unique=False) == ["abc\\|def|ghi", "jkl", "mno", "jkl"]
assert split_array_string(r"abc\\\|def\|ghi|jkl|mno|jkl", unique=True) == ["abc\\|def|ghi", "jkl", "mno"]
assert split_string(r"abc|def|ghi|def", delimiter="|", unique=False) == ["abc", "def", "ghi", "def"]
assert split_string(r"abc|def|ghi|def", delimiter="|", unique=True) == ["abc", "def", "ghi"]


def test_merge_objects_1():
Expand Down

0 comments on commit 8f4f694

Please sign in to comment.