Skip to content

Commit

Permalink
Added remove_empty_objects_from_lists support in structured_data.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmichaels-harvard committed May 22, 2024
1 parent c2fefc4 commit dbea701
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ Change Log
for running Foursight checks locally (with local ssh tunnel to ES proxy);
came up in foursight/checks/audit_checks (2024-04-23).
* Allow Python 3.12 (pyproject.toml).
* Added remove_empty_objects_from_lists option to structured_data.StructuredDataSet, defaulting
to True, which deletes empty objects from lists, but only from the *end* of a list; if this
flag is True and a non-empty object follows an empty object in a list then an error is flagged.
* Few general things initially related to and factored out of rclone support in smaht-submitr:
- Added extract_file_from_zip to zip_utils.
- Added http_utils with download function.
Expand Down
18 changes: 15 additions & 3 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1155,19 +1155,31 @@ def remove_suffix(suffix: str, text: str, required: bool = False):

def remove_empty_properties(data: Optional[Union[list, dict]],
                            isempty: Optional[Callable] = None,
                            isempty_array_element: Optional[Callable] = None,
                            raise_exception_on_nonempty_array_element_after_empty: bool = False) -> None:
    """
    Recursively removes "empty" properties from the given dictionary or list, in place.

    For a dictionary, every key whose value is empty is deleted; non-empty values are
    recursed into. For a list, every element is recursed into and then, only if the
    isempty_array_element argument is callable, the elements for which it returns a
    true value are removed from the list. Any other type of data is left untouched.

    Note that a value is tested for emptiness BEFORE it is recursed into; a nested
    dictionary or list which only becomes empty as a result of the recursion is
    therefore retained by its parent dictionary.

    :param data: The dictionary or list to prune (modified in place).
    :param isempty: Optional predicate deciding whether a dictionary value is empty;
        if not callable then None, "", {} and [] are the values considered empty.
    :param isempty_array_element: Optional predicate deciding whether a list element
        is to be removed; if not callable then no list elements are removed.
    :param raise_exception_on_nonempty_array_element_after_empty: If exactly True, and
        isempty_array_element is callable, then empty list elements are only tolerated
        at the end of a list.
    :raises ValueError: If the above flag is True and a non-empty list element follows
        an empty one; in this case the offending list itself is left unfiltered.
    """
    def _isempty(value: Any) -> bool:  # noqa
        return isempty(value) if callable(isempty) else value in [None, "", {}, []]

    if isinstance(data, dict):
        # Iterate over a snapshot of the keys as we delete from the dictionary as we go.
        for key in list(data.keys()):
            if _isempty(value := data[key]):
                del data[key]
            else:
                remove_empty_properties(
                    value, isempty=isempty, isempty_array_element=isempty_array_element,
                    raise_exception_on_nonempty_array_element_after_empty=(
                        raise_exception_on_nonempty_array_element_after_empty))
    elif isinstance(data, list):
        for item in data:
            remove_empty_properties(
                item, isempty=isempty, isempty_array_element=isempty_array_element,
                raise_exception_on_nonempty_array_element_after_empty=(
                    raise_exception_on_nonempty_array_element_after_empty))
        if callable(isempty_array_element):
            # Evaluate the predicate exactly once per element (after the recursion above,
            # which may itself have emptied an element) and reuse the results below.
            empty_flags = [bool(isempty_array_element(item)) for item in data]
            if raise_exception_on_nonempty_array_element_after_empty is True and True in empty_flags:
                # Everything from the first empty element onwards must also be empty.
                if not all(empty_flags[empty_flags.index(True):]):
                    raise ValueError("Non-empty element found after empty element.")
            # Slice assignment so that the caller's list object itself is updated.
            data[:] = [item for item, empty in zip(data, empty_flags) if not empty]


Expand Down
27 changes: 23 additions & 4 deletions dcicutils/structured_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class StructuredDataSet:
def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
order: Optional[List[str]] = None, prune: bool = True,
remove_empty_objects_from_lists: bool = True,
ref_lookup_strategy: Optional[Callable] = None,
ref_lookup_nocache: bool = False,
norefs: bool = False,
Expand All @@ -65,7 +66,8 @@ def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp
ref_lookup_nocache=ref_lookup_nocache) if portal else None
self._ref_lookup_strategy = ref_lookup_strategy
self._order = order
self._prune = prune
self._prune = prune is True
self._remove_empty_objects_from_lists = remove_empty_objects_from_lists is True
self._warnings = {}
self._errors = {}
self._resolved_refs = set()
Expand Down Expand Up @@ -93,12 +95,14 @@ def portal(self) -> Optional[Portal]:
def load(file: str, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
order: Optional[List[str]] = None, prune: bool = True,
remove_empty_objects_from_lists: bool = True,
ref_lookup_strategy: Optional[Callable] = None,
ref_lookup_nocache: bool = False,
norefs: bool = False,
progress: Optional[Callable] = None,
debug_sleep: Optional[str] = None) -> StructuredDataSet:
return StructuredDataSet(file=file, portal=portal, schemas=schemas, autoadd=autoadd, order=order, prune=prune,
remove_empty_objects_from_lists=remove_empty_objects_from_lists,
ref_lookup_strategy=ref_lookup_strategy, ref_lookup_nocache=ref_lookup_nocache,
norefs=norefs, progress=progress, debug_sleep=debug_sleep)

Expand Down Expand Up @@ -368,7 +372,11 @@ def _load_reader(self, reader: RowReader, type_name: str) -> None:
structured_row_template.set_value(structured_row, column_name, value, reader.file, reader.row_number)
if self._autoadd_properties:
self._add_properties(structured_row, self._autoadd_properties, schema)
self._add(type_name, structured_row)
if (prune_error := self._prune_structured_row(structured_row)) is not None:
self._note_error({"src": create_dict(type=schema_name, row=reader.row_number),
"error": prune_error}, "validation")
else:
self._add(type_name, structured_row)
if self._progress:
self._progress({
PROGRESS.LOAD_ITEM: self._nrows,
Expand All @@ -385,9 +393,20 @@ def _load_reader(self, reader: RowReader, type_name: str) -> None:
self._note_error(schema._unresolved_refs, "ref")
self._resolved_refs.update(schema._resolved_refs)

def _add(self, type_name: str, data: Union[dict, List[dict]]) -> None:
if self._prune:
def _prune_structured_row(self, data: dict) -> Optional[str]:
    """
    Prunes, in place, the empty properties of the given structured row, if pruning is
    enabled for this object (self._prune); otherwise the row is left untouched.

    If self._remove_empty_objects_from_lists is set then empty objects ({}) are also
    removed from lists, with a non-empty list element following an empty one treated
    as an error. Returns the error message as a string if the pruning raised an
    exception, otherwise None.
    """
    if self._prune:
        if self._remove_empty_objects_from_lists:
            def is_empty_object(element):
                return element == {}
            try:
                remove_empty_properties(data, isempty_array_element=is_empty_object,
                                        raise_exception_on_nonempty_array_element_after_empty=True)
            except Exception as problem:
                # Report the problem to the caller as a message rather than propagating it.
                return str(problem)
        else:
            remove_empty_properties(data)
    return None

def _add(self, type_name: str, data: Union[dict, List[dict]]) -> None:
if type_name in self._data:
self._data[type_name].extend([data] if isinstance(data, dict) else data)
else:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "8.8.4.1b35" # TODO: To become 8.8.5
version = "8.8.4.1b36" # TODO: To become 8.8.5
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
license = "MIT"
Expand Down
40 changes: 36 additions & 4 deletions test/test_structured_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -943,9 +943,37 @@ def _pytest_kwargs(kwargs: List[dict]) -> List[dict]:
}
}
],
# "expected": {"Test": [{"arrayofobject": [{"name": "anastasiia", "id": 1234}]}]},
"expected": {"Test": [{"arrayofobject": [{}, {}, {"name": "olha", "id": 5678},
{}, {"name": "anastasiia", "id": 1234}]}]}
{}, {"name": "anastasiia", "id": 1234}]}]},
"remove_empty_objects_from_lists": False
},
# ----------------------------------------------------------------------------------------------
{
"rows": [
"arrayofobject#0.name,arrayofobject#0.id,arrayofobject#1.name,arrayofobject#1.id,"
"arrayofobject#2.name,arrayofobject#2.id",
"anastasiia,1234,olha,5678,,"
],
"as_file_name": "test.csv",
"schemas": [
{
"title": "Test",
"properties": {
"arrayofobject": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"id": {"type": "integer"}
}
}
}
}
}
],
"expected": {"Test": [{"arrayofobject": [{"name": "anastasiia", "id": 1234},
{"name": "olha", "id": 5678}]}]}
},
# ----------------------------------------------------------------------------------------------
{
Expand Down Expand Up @@ -1332,6 +1360,7 @@ def _test_parse_structured_data(testapp,
schemas: Optional[List[dict]] = None,
autoadd: Optional[dict] = None,
prune: bool = True,
remove_empty_objects_from_lists: bool = True,
ignore: bool = False,
debug: bool = False) -> None:

Expand Down Expand Up @@ -1360,12 +1389,13 @@ def _test_parse_structured_data(testapp,
def assert_parse_structured_data():

def call_parse_structured_data(file: str):
nonlocal portal, novalidate, autoadd, prune, debug
nonlocal portal, novalidate, autoadd, prune, remove_empty_objects_from_lists, debug
if debug:
# import pdb ; pdb.set_trace()
pass
return parse_structured_data(file=file, portal=portal, novalidate=novalidate,
autoadd=autoadd, prune=True if prune is not False else False)
autoadd=autoadd, prune=True if prune is not False else False,
remove_empty_objects_from_lists=remove_empty_objects_from_lists)

nonlocal file, expected, expected_errors, schemas, noschemas, debug
portal = Portal(testapp, schemas=schemas) if not noschemas else None # But see mocked_schemas.
Expand Down Expand Up @@ -1511,6 +1541,7 @@ def is_accession(value: str) -> bool:
# Same as in smaht-portal/../ingestion_processors.py
def parse_structured_data(file: str, portal: Optional[Union[VirtualApp, TestApp, Portal]], novalidate: bool = False,
autoadd: Optional[dict] = None, prune: bool = True,
remove_empty_objects_from_lists: bool = True,
ref_nocache: bool = False) -> StructuredDataSet:

def ref_lookup_strategy(portal: Portal, type_name: str, schema: dict, value: str) -> Tuple[int, Optional[str]]:
Expand All @@ -1528,6 +1559,7 @@ def ref_lookup_strategy(portal: Portal, type_name: str, schema: dict, value: str

structured_data = StructuredDataSet.load(file=file, portal=portal,
autoadd=autoadd, order=ITEM_INDEX_ORDER, prune=prune,
remove_empty_objects_from_lists=remove_empty_objects_from_lists,
ref_lookup_strategy=ref_lookup_strategy,
ref_lookup_nocache=ref_nocache)
if not novalidate:
Expand Down

0 comments on commit dbea701

Please sign in to comment.