diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 507f919d1..eb4ce6e0d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,9 @@ Change Log for running Foursight checks locally (with local ssh tunnel to ES proxy); came up in foursight/checks/audit_checks (2024-04-23). * Allow Python 3.12 (pyproject.toml). +* Added remove_empty_objects_from_lists option to structured_data.StructuredDataSet, defaulting + to True, which deletes empty objects from lists; however, only from the *end* of a list; if + this flag is True and there are non-empty objects following empty objects then we flag an error. * Few general things initially related to and factored out of rclone support in smaht-submitr: - Added extract_file_from_zip to zip_utils. - Added http_utils with download function. diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 060ca95ec..0d719c421 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1155,7 +1155,8 @@ def remove_suffix(suffix: str, text: str, required: bool = False): def remove_empty_properties(data: Optional[Union[list, dict]], isempty: Optional[Callable] = None, - isempty_array_element: Optional[Callable] = None) -> None: + isempty_array_element: Optional[Callable] = None, + raise_exception_on_nonempty_array_element_after_empty: bool = False) -> None: def _isempty(value: Any) -> bool: # noqa return isempty(value) if callable(isempty) else value in [None, "", {}, []] if isinstance(data, dict): @@ -1163,11 +1164,22 @@ def _isempty(value: Any) -> bool: # noqa if _isempty(value := data[key]): del data[key] else: - remove_empty_properties(value, isempty=isempty, isempty_array_element=isempty_array_element) + remove_empty_properties(value, isempty=isempty, isempty_array_element=isempty_array_element, + raise_exception_on_nonempty_array_element_after_empty= # noqa + raise_exception_on_nonempty_array_element_after_empty) elif isinstance(data, list): for item in data: - remove_empty_properties(item, isempty=isempty, 
isempty_array_element=isempty_array_element) + remove_empty_properties(item, isempty=isempty, isempty_array_element=isempty_array_element, + raise_exception_on_nonempty_array_element_after_empty= # noqa + raise_exception_on_nonempty_array_element_after_empty) if callable(isempty_array_element): + if raise_exception_on_nonempty_array_element_after_empty is True: + empty_element_seen = False + for item in data: + if not empty_element_seen and isempty_array_element(item): + empty_element_seen = True + elif empty_element_seen and not isempty_array_element(item): + raise Exception("Non-empty element found after empty element.") data[:] = [item for item in data if not isempty_array_element(item)] diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index 36ee036f8..e80caf637 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -53,6 +53,7 @@ class StructuredDataSet: def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None, schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None, order: Optional[List[str]] = None, prune: bool = True, + remove_empty_objects_from_lists: bool = True, ref_lookup_strategy: Optional[Callable] = None, ref_lookup_nocache: bool = False, norefs: bool = False, @@ -65,7 +66,8 @@ def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp ref_lookup_nocache=ref_lookup_nocache) if portal else None self._ref_lookup_strategy = ref_lookup_strategy self._order = order - self._prune = prune + self._prune = prune is True + self._remove_empty_objects_from_lists = remove_empty_objects_from_lists is True self._warnings = {} self._errors = {} self._resolved_refs = set() @@ -93,12 +95,14 @@ def portal(self) -> Optional[Portal]: def load(file: str, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None, schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None, order: Optional[List[str]] = None, prune: bool = True, + 
remove_empty_objects_from_lists: bool = True, ref_lookup_strategy: Optional[Callable] = None, ref_lookup_nocache: bool = False, norefs: bool = False, progress: Optional[Callable] = None, debug_sleep: Optional[str] = None) -> StructuredDataSet: return StructuredDataSet(file=file, portal=portal, schemas=schemas, autoadd=autoadd, order=order, prune=prune, + remove_empty_objects_from_lists=remove_empty_objects_from_lists, ref_lookup_strategy=ref_lookup_strategy, ref_lookup_nocache=ref_lookup_nocache, norefs=norefs, progress=progress, debug_sleep=debug_sleep) @@ -368,7 +372,11 @@ def _load_reader(self, reader: RowReader, type_name: str) -> None: structured_row_template.set_value(structured_row, column_name, value, reader.file, reader.row_number) if self._autoadd_properties: self._add_properties(structured_row, self._autoadd_properties, schema) - self._add(type_name, structured_row) + if (prune_error := self._prune_structured_row(structured_row)) is not None: + self._note_error({"src": create_dict(type=schema_name, row=reader.row_number), + "error": prune_error}, "validation") + else: + self._add(type_name, structured_row) if self._progress: self._progress({ PROGRESS.LOAD_ITEM: self._nrows, @@ -385,9 +393,20 @@ def _load_reader(self, reader: RowReader, type_name: str) -> None: self._note_error(schema._unresolved_refs, "ref") self._resolved_refs.update(schema._resolved_refs) - def _add(self, type_name: str, data: Union[dict, List[dict]]) -> None: - if self._prune: + def _prune_structured_row(self, data: dict) -> Optional[str]: + if not self._prune: + return None + if not self._remove_empty_objects_from_lists: remove_empty_properties(data) + return None + try: + remove_empty_properties(data, isempty_array_element=lambda element: element == {}, + raise_exception_on_nonempty_array_element_after_empty=True) + except Exception as e: + return str(e) + return None + + def _add(self, type_name: str, data: Union[dict, List[dict]]) -> None: if type_name in self._data: 
self._data[type_name].extend([data] if isinstance(data, dict) else data) else: diff --git a/pyproject.toml b/pyproject.toml index f0bef193a..421985fb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.8.4.1b35" # TODO: To become 8.8.5 +version = "8.8.4.1b36" # TODO: To become 8.8.5 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_structured_data.py b/test/test_structured_data.py index a1b713762..453ce32bc 100644 --- a/test/test_structured_data.py +++ b/test/test_structured_data.py @@ -943,9 +943,37 @@ def _pytest_kwargs(kwargs: List[dict]) -> List[dict]: } } ], - # "expected": {"Test": [{"arrayofobject": [{"name": "anastasiia", "id": 1234}]}]}, "expected": {"Test": [{"arrayofobject": [{}, {}, {"name": "olha", "id": 5678}, - {}, {"name": "anastasiia", "id": 1234}]}]} + {}, {"name": "anastasiia", "id": 1234}]}]}, + "remove_empty_objects_from_lists": False + }, + # ---------------------------------------------------------------------------------------------- + { + "rows": [ + "arrayofobject#0.name,arrayofobject#0.id,arrayofobject#1.name,arrayofobject#1.id," + "arrayofobject#2.name,arrayofobject#2.id", + "anastasiia,1234,olha,5678,," + ], + "as_file_name": "test.csv", + "schemas": [ + { + "title": "Test", + "properties": { + "arrayofobject": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "id": {"type": "integer"} + } + } + } + } + } + ], + "expected": {"Test": [{"arrayofobject": [{"name": "anastasiia", "id": 1234}, + {"name": "olha", "id": 5678}]}]} }, # ---------------------------------------------------------------------------------------------- { @@ -1332,6 +1360,7 @@ def _test_parse_structured_data(testapp, schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None, prune: bool = True, + remove_empty_objects_from_lists: bool 
= True, ignore: bool = False, debug: bool = False) -> None: @@ -1360,12 +1389,13 @@ def _test_parse_structured_data(testapp, def assert_parse_structured_data(): def call_parse_structured_data(file: str): - nonlocal portal, novalidate, autoadd, prune, debug + nonlocal portal, novalidate, autoadd, prune, remove_empty_objects_from_lists, debug if debug: # import pdb ; pdb.set_trace() pass return parse_structured_data(file=file, portal=portal, novalidate=novalidate, - autoadd=autoadd, prune=True if prune is not False else False) + autoadd=autoadd, prune=True if prune is not False else False, + remove_empty_objects_from_lists=remove_empty_objects_from_lists) nonlocal file, expected, expected_errors, schemas, noschemas, debug portal = Portal(testapp, schemas=schemas) if not noschemas else None # But see mocked_schemas. @@ -1511,6 +1541,7 @@ def is_accession(value: str) -> bool: # Same as in smaht-portal/../ingestion_processors.py def parse_structured_data(file: str, portal: Optional[Union[VirtualApp, TestApp, Portal]], novalidate: bool = False, autoadd: Optional[dict] = None, prune: bool = True, + remove_empty_objects_from_lists: bool = True, ref_nocache: bool = False) -> StructuredDataSet: def ref_lookup_strategy(portal: Portal, type_name: str, schema: dict, value: str) -> Tuple[int, Optional[str]]: @@ -1528,6 +1559,7 @@ def ref_lookup_strategy(portal: Portal, type_name: str, schema: dict, value: str structured_data = StructuredDataSet.load(file=file, portal=portal, autoadd=autoadd, order=ITEM_INDEX_ORDER, prune=prune, + remove_empty_objects_from_lists=remove_empty_objects_from_lists, ref_lookup_strategy=ref_lookup_strategy, ref_lookup_nocache=ref_nocache) if not novalidate: