Skip to content

Commit

Permalink
Merge pull request #308 from 4dn-dcic/misc-minor-updates-20240525
Browse files Browse the repository at this point in the history
Added merge capability to structured_data; to handle ingesting partial objects.
  • Loading branch information
dmichaels-harvard authored Jun 11, 2024
2 parents d2670cc + 2e123e9 commit 36d98fc
Show file tree
Hide file tree
Showing 10 changed files with 462 additions and 188 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ dcicutils
Change Log
----------

8.10.0
======

* Added merge capabilities to structured_data.
* Added Question class to command_utils (factored out of smaht-submitr).
* Refactored out some identifying property related code from portal_object_utils to portal_utils.
* Internalized lookup_strategy related code to structured_data/portal_object_utils/portal_utils.


8.9.0
=====
Expand Down
70 changes: 69 additions & 1 deletion dcicutils/command_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import annotations
import contextlib
import functools
import glob
Expand All @@ -7,7 +8,7 @@
import requests
import subprocess

from typing import Optional
from typing import Callable, Optional
from .exceptions import InvalidParameterError
from .lang_utils import there_are
from .misc_utils import INPUT, PRINT, environ_bool, print_error_message, decorator
Expand Down Expand Up @@ -384,3 +385,70 @@ def fail(*message):
message = str(e) # Note: We ignore the type, which isn't intended to be shown.
PRINT(message)
exit(1)


class Question:
    """
    Supports asking the user (via stdin) a yes/no question, possibly repeatedly; and after
    some maximum number of times of the same answer in a row (consecutively), then asks them
    if they want to automatically give that same answer to any/all subsequent questions.
    Supports static/global list of such Question instances, hashed (only) by the question text.
    """
    # Shared registry of Question instances, keyed by the question text.
    _static_instances = {}

    @staticmethod
    def instance(question: Optional[str] = None,
                 max: Optional[int] = None, printf: Optional[Callable] = None) -> "Question":
        """
        Returns the shared Question instance registered for the given question text, creating
        and registering it on first use. A non-string question is treated as the empty string.
        The max and printf arguments are used only when the instance is first created; they
        are ignored if an instance for the question text already exists.
        """
        question = question if isinstance(question, str) else ""
        if not (instance := Question._static_instances.get(question)):
            Question._static_instances[question] = (instance := Question(question, max=max, printf=printf))
        return instance

    @staticmethod
    def yes(question: Optional[str] = None,
            max: Optional[int] = None, printf: Optional[Callable] = None) -> bool:
        """
        Asks the given question using the shared instance for its text (see the instance method)
        and returns True for a YES answer or False for a NO answer.
        """
        return Question.instance(question, max=max, printf=printf).ask()

    def __init__(self, question: Optional[str] = None,
                 max: Optional[int] = None, printf: Optional[Callable] = None) -> None:
        """
        The question is the default text to ask; a non-string is treated as the empty string.
        The max is the number of consecutive identical answers after which the user is offered
        the option to give that answer automatically from then on; anything other than a positive
        integer disables this. The printf is stored (defaulting to the builtin print if not
        callable) but is not otherwise used by this class.
        """
        self._question = question if isinstance(question, str) else ""
        self._max = max if isinstance(max, int) and max > 0 else None
        self._print = printf if callable(printf) else print
        self._yes_consecutive_count = 0
        self._no_consecutive_count = 0
        self._yes_automatic = False
        self._no_automatic = False

    def ask(self, question: Optional[str] = None) -> bool:
        """
        Asks the given question, or the one given at construction if the given one is not a
        non-empty string, and returns True for YES or False for NO. If the user has previously
        opted to answer automatically then that answer is returned without asking anything.
        """

        def question_automatic(value: str) -> bool:
            # Offers to give the given answer (YES or NO) to all subsequent questions. If the
            # offer is declined then the consecutive counts start over, so that the offer
            # is only made again after another full run of identical answers.
            RARROW = "▶"
            LARROW = "◀"
            if yes_or_no(f"{RARROW}{RARROW}{RARROW}"
                         f" Do you want to answer {value} to all such questions?"
                         f" {LARROW}{LARROW}{LARROW}"):
                return True
            self._yes_consecutive_count = 0
            self._no_consecutive_count = 0
            return False

        if self._yes_automatic:
            return True
        elif self._no_automatic:
            return False
        elif yes_or_no((question if isinstance(question, str) else "") or self._question or "Undefined question"):
            self._yes_consecutive_count += 1
            self._no_consecutive_count = 0
            if self._max and (self._yes_consecutive_count >= self._max):
                # Have reached the maximum number of consecutive YES answers; ask if YES to all subsequent.
                if question_automatic("YES"):
                    self._yes_automatic = True
            return True
        else:
            self._no_consecutive_count += 1
            self._yes_consecutive_count = 0
            if self._max and (self._no_consecutive_count >= self._max):
                # Have reached the maximum number of consecutive NO answers; ask if NO to all subsequent.
                if question_automatic("NO"):
                    self._no_automatic = True
            return False
51 changes: 41 additions & 10 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from collections import namedtuple
import appdirs
from copy import deepcopy
import contextlib
import datetime
import functools
Expand Down Expand Up @@ -2199,28 +2200,58 @@ def merge_key_value_dict_lists(x, y):
return [key_value_dict(k, v) for k, v in merged.items()]


def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]],
                  full: bool = False,  # deprecated
                  expand_lists: Optional[bool] = None,
                  primitive_lists: bool = False,
                  copy: bool = False, _recursing: bool = False) -> Union[dict, List[Any]]:
    """
    Merges the given source dictionary or list into the target dictionary or list and returns the
    result. This MAY well change the given target (dictionary or list) IN PLACE ... UNLESS the copy
    argument is True, then the given target will not change as a local copy is made (and returned).
    If the given target is None then the given source itself is returned.
    If the expand_lists argument is True then any target lists longer than the
    source will be filled out with the last element of the source; the full
    argument (is deprecated and) is a synonym for this. The default is False.
    If the primitive_lists argument is True then lists of primitives (i.e. lists in which
    NONE of its elements are dictionaries, lists, or tuples) which are dictionary values
    (at any level of nesting) will themselves be treated like primitives, meaning the whole
    of a source list will replace the corresponding target; otherwise they will be merged
    normally, meaning each element of a source list will be merged, recursively, into the
    corresponding target list. The default is False.
    The _recursing argument is for internal use only; it ensures that the copy of
    the target (if requested) is made just once, by the outermost call.
    """
    def is_primitive_list(value: Any) -> bool:  # noqa
        return isinstance(value, list) and not any(isinstance(item, (dict, list, tuple)) for item in value)

    if target is None:
        return source
    if expand_lists not in (True, False):
        # Not explicitly specified; fall back to the deprecated full argument.
        expand_lists = full is True
    if (copy is True) and (_recursing is not True):
        target = deepcopy(target)
    if isinstance(target, dict) and isinstance(source, dict) and source:
        for key, value in source.items():
            if key not in target:
                target[key] = value
            elif (primitive_lists is True) and is_primitive_list(target[key]) and is_primitive_list(value):
                # Lists of primitives are replaced wholesale rather than merged element by element.
                target[key] = value
            else:
                # Pass primitive_lists along so that it is also honored for nested dictionaries.
                target[key] = merge_objects(target[key], value, expand_lists=expand_lists,
                                            primitive_lists=primitive_lists, _recursing=True)
    elif isinstance(target, list) and isinstance(source, list) and source:
        for i in range(max(len(source), len(target))):
            if i < len(target):
                if i < len(source):
                    target[i] = merge_objects(target[i], source[i], expand_lists=expand_lists,
                                              primitive_lists=primitive_lists, _recursing=True)
                elif expand_lists is True:
                    # The target is longer than the source; merge in the last source element.
                    target[i] = merge_objects(target[i], source[len(source) - 1], expand_lists=expand_lists,
                                              primitive_lists=primitive_lists, _recursing=True)
            else:
                target.append(source[i])
    elif source not in (None, {}, []):
        # Any other (non-empty) source, including falsy primitives like 0 or False, replaces the target.
        target = source
    return target

Expand Down
113 changes: 24 additions & 89 deletions dcicutils/portal_object_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from copy import deepcopy
from functools import lru_cache
import re
from typing import Any, Callable, List, Optional, Tuple, Type, Union
from dcicutils.data_readers import RowReader
from dcicutils.misc_utils import create_readonly_object
Expand All @@ -14,11 +13,9 @@ class PortalObject:

_PROPERTY_DELETION_SENTINEL = RowReader.CELL_DELETION_SENTINEL

def __init__(self, data: dict, portal: Optional[Portal] = None, type: Optional[str] = None) -> None:
    """
    Wraps the given Portal object data. Each argument is normalized if it is not of the
    expected kind: data to an empty dictionary, portal to None, and type to the empty string.
    """
    if not isinstance(data, dict):
        data = {}
    if not isinstance(portal, Portal):
        portal = None
    if not isinstance(type, str):
        type = ""
    self._data = data
    self._portal = portal
    self._type = type

@property
Expand All @@ -32,7 +29,7 @@ def portal(self) -> Optional[Portal]:
@property
@lru_cache(maxsize=1)
def type(self) -> str:
    """
    Returns the type name of this object: the type given at construction if any, otherwise
    whatever Portal.get_schema_type yields for the object data, otherwise the empty string.
    """
    explicit_type = self._type
    if explicit_type:
        return explicit_type
    return Portal.get_schema_type(self._data) or ""

@property
@lru_cache(maxsize=1)
Expand All @@ -47,7 +44,7 @@ def uuid(self) -> Optional[str]:
@property
@lru_cache(maxsize=1)
def schema(self) -> Optional[dict]:
    """
    Returns the schema for the type of this object as obtained from
    its associated Portal, or None if this object has no associated Portal.
    """
    if not self._portal:
        return None
    return self._portal.get_schema(self.type)

def copy(self) -> PortalObject:
    """
    Returns a new PortalObject with a deep copy of the data
    of this one, and with the same associated Portal and type.
    """
    copied_data = deepcopy(self.data)
    return PortalObject(copied_data, portal=self.portal, type=self.type)
Expand All @@ -59,39 +56,29 @@ def identifying_properties(self) -> Optional[List[str]]:
Returns the list of all identifying property names of this Portal object which actually have values.
Implicitly include "uuid" and "identifier" properties as identifying properties if they are actually
properties in the object schema, and favor these (first); defavor "aliases"; no other ordering defined.
Changed (2024-05-26) to use portal_utils.get_identifying_property_names; migrating some intricate stuff there.
"""
if not (schema := self.schema) or not (schema_identifying_properties := schema.get("identifyingProperties")):
return None
identifying_properties = []
for identifying_property in schema_identifying_properties:
if identifying_property not in ["uuid", "identifier", "aliases"]:
if self._data.get(identifying_property):
identifying_properties.append(identifying_property)
if self._data.get("identifier"):
identifying_properties.insert(0, "identifier")
if self._data.get("uuid"):
identifying_properties.insert(0, "uuid")
if "aliases" in schema_identifying_properties and self._data.get("aliases"):
identifying_properties.append("aliases")
return identifying_properties or None
# Migrating to and unifying this in portal_utils.Portal.get_identifying_paths (2024-05-26).
return self._portal.get_identifying_property_names(self.type, portal_object=self._data) if self._portal else []

@lru_cache(maxsize=8192)
def lookup(self, raw: bool = False,
           ref_lookup_strategy: Optional[Callable] = None) -> Tuple[Optional[PortalObject], Optional[str], int]:
    """
    Looks up this object in its associated Portal by trying, in order, each of its identifying paths.
    Returns a tuple with: the found object (or None if not found); the identifying path by which
    it was found, or if not found then the first identifying path tried (or None if there are no
    identifying paths at all); and the number of identifying paths tried.
    Any exception raised during the lookups is suppressed and treated as not found.
    """
    candidate_paths = self._get_identifying_paths(ref_lookup_strategy=ref_lookup_strategy)
    if not candidate_paths:
        return None, None, 0
    attempts = 0
    first_path = None
    try:
        for attempts, path in enumerate(candidate_paths, start=1):
            first_path = first_path or path
            if not self._portal:
                continue
            response = self._portal.get(path, raw=raw)
            if response and (response.status_code == 200):
                found = PortalObject(response.json(), portal=self._portal, type=self.type if raw else None)
                return found, path, attempts
    except Exception:
        # Best effort; any failure here is simply reported as not found.
        pass
    return None, first_path, attempts
Expand Down Expand Up @@ -159,64 +146,12 @@ def diff_deleting(value: Any) -> object: # noqa

@lru_cache(maxsize=1)
def _get_identifying_paths(self, ref_lookup_strategy: Optional[Callable] = None) -> Optional[List[str]]:
    """
    Returns a list of the possible Portal URL paths identifying this Portal object.
    With an associated Portal this is delegated to its get_identifying_paths method; without
    one, the only possible path is that of the uuid of this object, and None is returned if
    this object has no uuid.
    """
    portal = self._portal
    if not portal:
        uuid = self.uuid
        if uuid:
            return [f"/{uuid}"]
        return None
    # Migrating to and unifying this in portal_utils.Portal.get_identifying_paths (2024-05-26).
    return portal.get_identifying_paths(self._data, portal_type=self.schema, lookup_strategy=ref_lookup_strategy)

def _normalized_refs(self, refs: List[dict]) -> Tuple[PortalObject, int]:
"""
Expand Down
Loading

0 comments on commit 36d98fc

Please sign in to comment.